i386.md (UNSPEC_SSE_PROLOGUE_SAVE_LOW): New.

	* i386.md (UNSPEC_SSE_PROLOGUE_SAVE_LOW): New.
	(sse_prologue_save_insn expander): Use new pattern.
	(sse_prologue_save_insn1): New pattern and splitter.
	(sse_prologue_save_insn): Update to deal also with 64bit aligned
	blocks.
	* i386.c (setup_incoming_varargs_64): Do not compute jump destination here.
	(ix86_gimplify_va_arg): Update alignment needed.
	(ix86_local_alignment): Do not align all local arrays
	to 128bit.

From-SVN: r158483
Jan Hubicka 2010-04-18 12:52:26 +02:00 committed by Jan Hubicka
parent 0d29aedcb8
commit 07b3ef2e78
3 changed files with 148 additions and 42 deletions
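
Background for the diffs below, as a minimal sketch rather than GCC source: the
x86-64 varargs prologue stores the SSE argument registers with one save
instruction per register behind a label, and enters that block with an indirect
jump so that registers already consumed by named arguments are skipped.  The
target address, which the patch now computes in the splitter instead of in
setup_incoming_varargs_64, follows this arithmetic (names here are purely
illustrative):

#include <stdint.h>

/* label_addr - address of the label in front of the save block
   al         - register count the caller passed in %al
   named_sse  - SSE registers already used by named arguments
   insn_size  - bytes per save instruction (4 for movaps, 5 for vmovaps/movsd) */
static uintptr_t
sse_save_jump_target (uintptr_t label_addr, unsigned al,
                      unsigned named_sse, unsigned insn_size)
{
  return label_addr + (uintptr_t) named_sse * insn_size
                    - (uintptr_t) al * insn_size;
}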

gcc/ChangeLog

@@ -1,3 +1,15 @@
2010-04-18  Jan Hubicka  <jh@suse.cz>

	* i386.md (UNSPEC_SSE_PROLOGUE_SAVE_LOW): New.
	(sse_prologue_save_insn expander): Use new pattern.
	(sse_prologue_save_insn1): New pattern and splitter.
	(sse_prologue_save_insn): Update to deal also with 64bit aligned
	blocks.
	* i386.c (setup_incoming_varargs_64): Do not compute jump destination here.
	(ix86_gimplify_va_arg): Update alignment needed.
	(ix86_local_alignment): Do not align all local arrays
	to 128bit.

2010-04-17  Jan Hubicka  <jh@suse.cz>

	* ipa-inline.c (cgraph_early_inlining): Handle flattening too.

gcc/config/i386/i386.c

@@ -6790,7 +6790,6 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
{
rtx save_area, mem;
rtx label;
rtx label_ref;
rtx tmp_reg;
rtx nsse_reg;
alias_set_type set;
@@ -6841,35 +6840,9 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
SSE saves. We need some preparation work to get this working. */
label = gen_label_rtx ();
label_ref = gen_rtx_LABEL_REF (Pmode, label);
/* Compute address to jump to :
label - eax*4 + nnamed_sse_arguments*4 Or
label - eax*5 + nnamed_sse_arguments*5 for AVX. */
tmp_reg = gen_reg_rtx (Pmode);
nsse_reg = gen_reg_rtx (Pmode);
emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, AX_REG)));
emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
gen_rtx_MULT (Pmode, nsse_reg,
GEN_INT (4))));
/* vmovaps is one byte longer than movaps. */
if (TARGET_AVX)
emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
gen_rtx_PLUS (Pmode, tmp_reg,
nsse_reg)));
if (cum->sse_regno)
emit_move_insn
(nsse_reg,
gen_rtx_CONST (DImode,
gen_rtx_PLUS (DImode,
label_ref,
GEN_INT (cum->sse_regno
* (TARGET_AVX ? 5 : 4)))));
else
emit_move_insn (nsse_reg, label_ref);
emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg));
/* Compute address of memory block we save into. We always use pointer
pointing 127 bytes after first byte to store - this is needed to keep
@@ -6882,11 +6855,12 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127));
MEM_NOTRAP_P (mem) = 1;
set_mem_alias_set (mem, set);
set_mem_align (mem, BITS_PER_WORD);
set_mem_align (mem, 64);
/* And finally do the dirty job! */
emit_insn (gen_sse_prologue_save (mem, nsse_reg,
GEN_INT (cum->sse_regno), label));
GEN_INT (cum->sse_regno), label,
gen_reg_rtx (Pmode)));
}
}
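
The base register handed to the save block points 127 bytes past the first
slot (the plus_constant (tmp_reg, -127) above).  A self-contained check of why
that bias works, assuming the eight 16-byte XMM save slots of the psABI; this
is an illustration, not GCC code:

#include <assert.h>

int
main (void)
{
  /* With the base biased by +127, slot i sits at displacement -127 + i*16;
     every displacement stays inside the signed 8-bit range, so each save
     instruction keeps its short encoding and a fixed size.  */
  for (int i = 0; i < 8; i++)
    {
      int disp = -127 + i * 16;
      assert (disp >= -128 && disp <= 127);
    }
  return 0;
}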
@@ -7047,7 +7021,7 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
int indirect_p = 0;
tree ptrtype;
enum machine_mode nat_mode;
int arg_boundary;
unsigned int arg_boundary;
/* Only 64bit target needs something special. */
if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
@@ -7279,6 +7253,8 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
size_int (-align));
t = fold_convert (TREE_TYPE (ovf), t);
if (crtl->stack_alignment_needed < arg_boundary)
crtl->stack_alignment_needed = arg_boundary;
}
gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
gimplify_assign (addr, t, pre_p);
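
The two added lines above make the va_arg expansion record that the function
needs at least arg_boundary bits of stack alignment whenever an over-aligned
type is fetched from the overflow area.  A hedged usage example (not from the
patch) of the case that triggers this, fetching a 16-byte vector:

#include <stdarg.h>
#include <emmintrin.h>

__m128d
first_vector_arg (int count, ...)
{
  va_list ap;
  __m128d v;

  va_start (ap, count);
  v = va_arg (ap, __m128d);  /* needs 16-byte alignment, so arg_boundary is 128 */
  va_end (ap);
  return v;
}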
@@ -20099,10 +20075,26 @@ ix86_local_alignment (tree exp, enum machine_mode mode,
}
/* x86-64 ABI requires arrays greater than 16 bytes to be aligned
to 16byte boundary. */
if (TARGET_64BIT)
to 16byte boundary. Exact wording is:
An array uses the same alignment as its elements, except that a local or
global array variable of length at least 16 bytes or
a C99 variable-length array variable always has alignment of at least 16 bytes.
This was added to allow use of aligned SSE instructions on arrays. This
rule is meant for static storage (where the compiler cannot do the analysis
by itself). We follow it for automatic variables only when convenient.
We fully control everything in the function being compiled, and functions from
other units cannot rely on the alignment.
Exclude the va_list type. It is the common case of a local array where
we cannot benefit from the alignment. */
if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
&& TARGET_SSE)
{
if (AGGREGATE_TYPE_P (type)
&& (TYPE_MAIN_VARIANT (type)
!= TYPE_MAIN_VARIANT (va_list_type_node))
&& TYPE_SIZE (type)
&& TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
&& (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
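
Under the psABI wording quoted in the comment, only local aggregates of at
least 16 bytes are bumped to 128-bit alignment, and after this patch only when
optimizing for speed with SSE enabled and for types other than va_list.  A
hedged illustration (not GCC source) of which locals are affected:

void
example (void)
{
  char small_buf[8];     /* < 16 bytes: keeps its natural alignment.        */
  char big_buf[32];      /* >= 16 bytes: may be raised to 16-byte alignment
                            so aligned SSE moves can be used on it.         */
  __builtin_va_list ap;  /* va_list is explicitly excluded by the patch.    */

  (void) small_buf;
  (void) big_buf;
  (void) ap;
}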

gcc/config/i386/i386.md

@@ -85,6 +85,7 @@
(UNSPEC_SET_RIP 16)
(UNSPEC_SET_GOT_OFFSET 17)
(UNSPEC_MEMORY_BLOCKAGE 18)
(UNSPEC_SSE_PROLOGUE_SAVE_LOW 19)
; TLS support
(UNSPEC_TP 20)
@@ -18441,13 +18442,110 @@
(reg:DI XMM5_REG)
(reg:DI XMM6_REG)
(reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE))
(use (match_operand:DI 1 "register_operand" ""))
(clobber (match_operand:DI 1 "register_operand" ""))
(use (match_operand:DI 2 "immediate_operand" ""))
(use (label_ref:DI (match_operand 3 "" "")))])]
(use (label_ref:DI (match_operand 3 "" "")))
(clobber (match_operand:DI 4 "register_operand" ""))
(use (match_dup 1))])]
"TARGET_64BIT"
"")
(define_insn "*sse_prologue_save_insn"
;; Pre-reload version of the prologue save.  Until after prologue generation we don't
;; know what the size of the save instruction will be.
;; Operand 0+operand 6 is the memory save area
;; Operand 1 is number of registers to save (will get overwritten to operand 5)
;; Operand 2 is number of non-vaargs SSE arguments
;; Operand 3 is label starting the save block
;; Operand 4 is used for temporary computation of jump address
(define_insn "*sse_prologue_save_insn1"
[(set (mem:BLK (plus:DI (match_operand:DI 0 "register_operand" "R")
(match_operand:DI 6 "const_int_operand" "n")))
(unspec:BLK [(reg:DI XMM0_REG)
(reg:DI XMM1_REG)
(reg:DI XMM2_REG)
(reg:DI XMM3_REG)
(reg:DI XMM4_REG)
(reg:DI XMM5_REG)
(reg:DI XMM6_REG)
(reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE))
(clobber (match_operand:DI 1 "register_operand" "=r"))
(use (match_operand:DI 2 "const_int_operand" "i"))
(use (label_ref:DI (match_operand 3 "" "X")))
(clobber (match_operand:DI 4 "register_operand" "=&r"))
(use (match_operand:DI 5 "register_operand" "1"))]
"TARGET_64BIT
&& INTVAL (operands[6]) + X86_64_SSE_REGPARM_MAX * 16 - 16 < 128
&& INTVAL (operands[6]) + INTVAL (operands[2]) * 16 >= -128"
"#"
[(set_attr "type" "other")
(set_attr "memory" "store")
(set_attr "mode" "DI")])
;; We know the size of the save instruction; expand the computation of the jump
;; address in the jump table.
(define_split
[(parallel [(set (match_operand:BLK 0 "" "")
(unspec:BLK [(reg:DI XMM0_REG)
(reg:DI XMM1_REG)
(reg:DI XMM2_REG)
(reg:DI XMM3_REG)
(reg:DI XMM4_REG)
(reg:DI XMM5_REG)
(reg:DI XMM6_REG)
(reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE))
(clobber (match_operand:DI 1 "register_operand" ""))
(use (match_operand:DI 2 "const_int_operand" ""))
(use (match_operand 3 "" ""))
(clobber (match_operand:DI 4 "register_operand" ""))
(use (match_operand:DI 5 "register_operand" ""))])]
"reload_completed"
[(parallel [(set (match_dup 0)
(unspec:BLK [(reg:DI XMM0_REG)
(reg:DI XMM1_REG)
(reg:DI XMM2_REG)
(reg:DI XMM3_REG)
(reg:DI XMM4_REG)
(reg:DI XMM5_REG)
(reg:DI XMM6_REG)
(reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE_LOW))
(use (match_dup 1))
(use (match_dup 2))
(use (match_dup 3))
(use (match_dup 5))])]
{
/* movaps is 4 bytes; vmovaps (AVX) and movsd are 5 bytes. */
int size = 4 + (TARGET_AVX || crtl->stack_alignment_needed < 128);
/* Compute address to jump to:
label - eax*size + nnamed_sse_arguments*size. */
if (size == 5)
emit_insn (gen_rtx_SET (VOIDmode, operands[4],
gen_rtx_PLUS
(Pmode,
gen_rtx_MULT (Pmode, operands[1],
GEN_INT (4)),
operands[1])));
else if (size == 4)
emit_insn (gen_rtx_SET (VOIDmode, operands[4],
gen_rtx_MULT (Pmode, operands[1],
GEN_INT (4))));
else
gcc_unreachable ();
if (INTVAL (operands[2]))
emit_move_insn
(operands[1],
gen_rtx_CONST (DImode,
gen_rtx_PLUS (DImode,
operands[3],
GEN_INT (INTVAL (operands[2])
* size))));
else
emit_move_insn (operands[1], operands[3]);
emit_insn (gen_subdi3 (operands[1], operands[1], operands[4]));
operands[5] = GEN_INT (size);
})
(define_insn "sse_prologue_save_insn"
[(set (mem:BLK (plus:DI (match_operand:DI 0 "register_operand" "R")
(match_operand:DI 4 "const_int_operand" "n")))
(unspec:BLK [(reg:DI XMM0_REG)
@@ -18457,10 +18555,11 @@
(reg:DI XMM4_REG)
(reg:DI XMM5_REG)
(reg:DI XMM6_REG)
(reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE))
(reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE_LOW))
(use (match_operand:DI 1 "register_operand" "r"))
(use (match_operand:DI 2 "const_int_operand" "i"))
(use (label_ref:DI (match_operand 3 "" "X")))]
(use (label_ref:DI (match_operand 3 "" "X")))
(use (match_operand:DI 5 "const_int_operand" "i"))]
"TARGET_64BIT
&& INTVAL (operands[4]) + X86_64_SSE_REGPARM_MAX * 16 - 16 < 128
&& INTVAL (operands[4]) + INTVAL (operands[2]) * 16 >= -128"
@@ -18480,7 +18579,10 @@
PUT_MODE (operands[4], TImode);
if (GET_CODE (XEXP (operands[0], 0)) != PLUS)
output_asm_insn ("rex", operands);
output_asm_insn ("%vmovaps\t{%5, %4|%4, %5}", operands);
if (crtl->stack_alignment_needed < 128)
output_asm_insn ("%vmovsd\t{%5, %4|%4, %5}", operands);
else
output_asm_insn ("%vmovaps\t{%5, %4|%4, %5}", operands);
}
(*targetm.asm_out.internal_label) (asm_out_file, "L",
CODE_LABEL_NUMBER (operands[3]));
@@ -18489,11 +18591,11 @@
[(set_attr "type" "other")
(set_attr "length_immediate" "0")
(set_attr "length_address" "0")
;; 2 bytes for the jump and operands[5] bytes for each save.
(set (attr "length")
(if_then_else
(eq (symbol_ref "TARGET_AVX") (const_int 0))
(const_string "34")
(const_string "42")))
(plus (const_int 2)
(mult (symbol_ref ("INTVAL (operands[5])"))
(symbol_ref ("X86_64_SSE_REGPARM_MAX - INTVAL (operands[2])")))))
(set_attr "memory" "store")
(set_attr "modrm" "0")
(set_attr "prefix" "maybe_vex")