diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index addc0942f8d..912670d50dd 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2010-04-18  Jan Hubicka
+
+	* i386.md (UNSPEC_SSE_PROLOGUE_SAVE_LOW): New.
+	(sse_prologue_save_insn expander): Use new pattern.
+	(sse_prologue_save_insn1): New pattern and splitter.
+	(sse_prologue_save_insn): Update to deal also with 64bit aligned
+	blocks.
+	* i386.c (setup_incoming_varargs_64): Do not compute jump destination here.
+	(ix86_gimplify_va_arg): Update alignment needed.
+	(ix86_local_alignment): Do not align all local arrays
+	to 128bit.
+
 2010-04-17  Jan Hubicka
 
 	* ipa-inline.c (cgraph_early_inlining): Handle flattening too.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b99fe2ae345..7376d1b48e7 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -6790,7 +6790,6 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
 {
   rtx save_area, mem;
   rtx label;
-  rtx label_ref;
   rtx tmp_reg;
   rtx nsse_reg;
   alias_set_type set;
@@ -6841,35 +6840,9 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
	 SSE saves.  We need some preparation work to get this working.  */
 
      label = gen_label_rtx ();
-      label_ref = gen_rtx_LABEL_REF (Pmode, label);
-      /* Compute address to jump to :
-	 label - eax*4 + nnamed_sse_arguments*4 Or
-	 label - eax*5 + nnamed_sse_arguments*5 for AVX.  */
-      tmp_reg = gen_reg_rtx (Pmode);
      nsse_reg = gen_reg_rtx (Pmode);
      emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, AX_REG)));
-      emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
-			      gen_rtx_MULT (Pmode, nsse_reg,
-					    GEN_INT (4))));
-
-      /* vmovaps is one byte longer than movaps.  */
-      if (TARGET_AVX)
-	emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
-				gen_rtx_PLUS (Pmode, tmp_reg,
-					      nsse_reg)));
-
-      if (cum->sse_regno)
-	emit_move_insn
-	  (nsse_reg,
-	   gen_rtx_CONST (DImode,
-			  gen_rtx_PLUS (DImode,
-					label_ref,
-					GEN_INT (cum->sse_regno
						 * (TARGET_AVX ? 5 : 4)))));
-      else
-	emit_move_insn (nsse_reg, label_ref);
-      emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg));
 
      /* Compute address of memory block we save into.  We always use pointer
	 pointing 127 bytes after first byte to store - this is needed to keep
@@ -6882,11 +6855,12 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
      mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127));
      MEM_NOTRAP_P (mem) = 1;
      set_mem_alias_set (mem, set);
-      set_mem_align (mem, BITS_PER_WORD);
+      set_mem_align (mem, 64);
 
      /* And finally do the dirty job!  */
      emit_insn (gen_sse_prologue_save (mem, nsse_reg,
-					GEN_INT (cum->sse_regno), label));
+					GEN_INT (cum->sse_regno), label,
+					gen_reg_rtx (Pmode)));
    }
 }
 
@@ -7047,7 +7021,7 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
   int indirect_p = 0;
   tree ptrtype;
   enum machine_mode nat_mode;
-  int arg_boundary;
+  unsigned int arg_boundary;
 
   /* Only 64bit target needs something special.  */
   if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
@@ -7279,6 +7253,8 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
		  size_int (-align));
      t = fold_convert (TREE_TYPE (ovf), t);
+      if (crtl->stack_alignment_needed < arg_boundary)
+	crtl->stack_alignment_needed = arg_boundary;
    }
   gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
   gimplify_assign (addr, t, pre_p);
@@ -20099,10 +20075,26 @@ ix86_local_alignment (tree exp, enum machine_mode mode,
    }
 
   /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
-     to 16byte boundary.  */
-  if (TARGET_64BIT)
+     to 16byte boundary.  Exact wording is:
+
+       An array uses the same alignment as its elements, except that a local or
+       global array variable of length at least 16 bytes or
+       a C99 variable-length array variable always has alignment of at least 16 bytes.
+
+     This was added to allow use of aligned SSE instructions on arrays.  The
+     rule is meant for static storage (where the compiler cannot do the analysis
+     by itself).  We follow it for automatic variables only when it is convenient:
+     we fully control everything in the function being compiled, and functions
+     from other units cannot rely on the alignment.
+
+     Exclude the va_list type.  It is the common case of a local array where
+     we cannot benefit from the alignment.  */
+  if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
+      && TARGET_SSE)
    {
      if (AGGREGATE_TYPE_P (type)
+	  && (TYPE_MAIN_VARIANT (type)
+	      != TYPE_MAIN_VARIANT (va_list_type_node))
	  && TYPE_SIZE (type)
	  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
	  && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d08a7ea1081..fbc15522673 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -85,6 +85,7 @@
   (UNSPEC_SET_RIP		16)
   (UNSPEC_SET_GOT_OFFSET	17)
   (UNSPEC_MEMORY_BLOCKAGE	18)
+  (UNSPEC_SSE_PROLOGUE_SAVE_LOW	19)
 
  ; TLS support
   (UNSPEC_TP			20)
@@ -18441,13 +18442,110 @@
		     (reg:DI XMM5_REG)
		     (reg:DI XMM6_REG)
		     (reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE))
-	      (use (match_operand:DI 1 "register_operand" ""))
+	      (clobber (match_operand:DI 1 "register_operand" ""))
	      (use (match_operand:DI 2 "immediate_operand" ""))
-	      (use (label_ref:DI (match_operand 3 "" "")))])]
+	      (use (label_ref:DI (match_operand 3 "" "")))
+	      (clobber (match_operand:DI 4 "register_operand" ""))
+	      (use (match_dup 1))])]
  "TARGET_64BIT"
  "")
 
-(define_insn "*sse_prologue_save_insn"
+;; Pre-reload version of the prologue save.  Until after prologue generation
+;; we do not know what the size of each save instruction will be.
+;; Operand 0 plus operand 6 is the address of the memory save area.
+;; Operand 1 is the number of registers to save (will get overwritten to operand 5).
+;; Operand 2 is the number of non-varargs SSE arguments.
+;; Operand 3 is the label starting the save block.
+;; Operand 4 is used for temporary computation of the jump address.
+(define_insn "*sse_prologue_save_insn1"
+  [(set (mem:BLK (plus:DI (match_operand:DI 0 "register_operand" "R")
+			  (match_operand:DI 6 "const_int_operand" "n")))
+	(unspec:BLK [(reg:DI XMM0_REG)
+		     (reg:DI XMM1_REG)
+		     (reg:DI XMM2_REG)
+		     (reg:DI XMM3_REG)
+		     (reg:DI XMM4_REG)
+		     (reg:DI XMM5_REG)
+		     (reg:DI XMM6_REG)
+		     (reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE))
+   (clobber (match_operand:DI 1 "register_operand" "=r"))
+   (use (match_operand:DI 2 "const_int_operand" "i"))
+   (use (label_ref:DI (match_operand 3 "" "X")))
+   (clobber (match_operand:DI 4 "register_operand" "=&r"))
+   (use (match_operand:DI 5 "register_operand" "1"))]
+  "TARGET_64BIT
+   && INTVAL (operands[6]) + X86_64_SSE_REGPARM_MAX * 16 - 16 < 128
+   && INTVAL (operands[6]) + INTVAL (operands[2]) * 16 >= -128"
+  "#"
+  [(set_attr "type" "other")
+   (set_attr "memory" "store")
+   (set_attr "mode" "DI")])
+
+;; Once we know the size of the save instruction, expand the computation of
+;; the jump address into the jump table.
+(define_split
+  [(parallel [(set (match_operand:BLK 0 "" "")
+		   (unspec:BLK [(reg:DI XMM0_REG)
+				(reg:DI XMM1_REG)
+				(reg:DI XMM2_REG)
+				(reg:DI XMM3_REG)
+				(reg:DI XMM4_REG)
+				(reg:DI XMM5_REG)
+				(reg:DI XMM6_REG)
+				(reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE))
+	      (clobber (match_operand:DI 1 "register_operand" ""))
+	      (use (match_operand:DI 2 "const_int_operand" ""))
+	      (use (match_operand 3 "" ""))
+	      (clobber (match_operand:DI 4 "register_operand" ""))
+	      (use (match_operand:DI 5 "register_operand" ""))])]
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (unspec:BLK [(reg:DI XMM0_REG)
+				(reg:DI XMM1_REG)
+				(reg:DI XMM2_REG)
+				(reg:DI XMM3_REG)
+				(reg:DI XMM4_REG)
+				(reg:DI XMM5_REG)
+				(reg:DI XMM6_REG)
+				(reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE_LOW))
+	      (use (match_dup 1))
+	      (use (match_dup 2))
+	      (use (match_dup 3))
+	      (use (match_dup 5))])]
+{
+  /* movaps is 4 bytes; vmovaps (AVX) and movsd are 5 bytes.  */
+  int size = 4 + (TARGET_AVX || crtl->stack_alignment_needed < 128);
+
+  /* Compute address to jump to:
+     label - eax*size + nnamed_sse_arguments*size.  */
+  if (size == 5)
+    emit_insn (gen_rtx_SET (VOIDmode, operands[4],
+			    gen_rtx_PLUS
+			      (Pmode,
+			       gen_rtx_MULT (Pmode, operands[1],
+					     GEN_INT (4)),
+			       operands[1])));
+  else if (size == 4)
+    emit_insn (gen_rtx_SET (VOIDmode, operands[4],
+			    gen_rtx_MULT (Pmode, operands[1],
+					  GEN_INT (4))));
+  else
+    gcc_unreachable ();
+  if (INTVAL (operands[2]))
+    emit_move_insn
+      (operands[1],
+       gen_rtx_CONST (DImode,
+		      gen_rtx_PLUS (DImode,
+				    operands[3],
+				    GEN_INT (INTVAL (operands[2])
+					     * size))));
+  else
+    emit_move_insn (operands[1], operands[3]);
+  emit_insn (gen_subdi3 (operands[1], operands[1], operands[4]));
+  operands[5] = GEN_INT (size);
+})
+
+(define_insn "sse_prologue_save_insn"
  [(set (mem:BLK (plus:DI (match_operand:DI 0 "register_operand" "R")
			  (match_operand:DI 4 "const_int_operand" "n")))
	(unspec:BLK [(reg:DI XMM0_REG)
@@ -18457,10 +18555,11 @@
		     (reg:DI XMM4_REG)
		     (reg:DI XMM5_REG)
		     (reg:DI XMM6_REG)
-		     (reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE))
+		     (reg:DI XMM7_REG)] UNSPEC_SSE_PROLOGUE_SAVE_LOW))
   (use (match_operand:DI 1 "register_operand" "r"))
   (use (match_operand:DI 2 "const_int_operand" "i"))
-   (use (label_ref:DI (match_operand 3 "" "X")))]
+   (use (label_ref:DI (match_operand 3 "" "X")))
+   (use (match_operand:DI 5 "const_int_operand" "i"))]
  "TARGET_64BIT
   && INTVAL (operands[4]) + X86_64_SSE_REGPARM_MAX * 16 - 16 < 128
   && INTVAL (operands[4]) + INTVAL (operands[2]) * 16 >= -128"
@@ -18480,7 +18579,10 @@
      PUT_MODE (operands[4], TImode);
      if (GET_CODE (XEXP (operands[0], 0)) != PLUS)
	output_asm_insn ("rex", operands);
-      output_asm_insn ("%vmovaps\t{%5, %4|%4, %5}", operands);
+      if (crtl->stack_alignment_needed < 128)
+	output_asm_insn ("%vmovsd\t{%5, %4|%4, %5}", operands);
+      else
+	output_asm_insn ("%vmovaps\t{%5, %4|%4, %5}", operands);
    }
  (*targetm.asm_out.internal_label) (asm_out_file, "L",
				     CODE_LABEL_NUMBER (operands[3]));
@@ -18489,11 +18591,11 @@
  [(set_attr "type" "other")
   (set_attr "length_immediate" "0")
   (set_attr "length_address" "0")
+   ;; 2 bytes for the jump and operands[5] bytes for each save.
   (set (attr "length")
-     (if_then_else
-       (eq (symbol_ref "TARGET_AVX") (const_int 0))
-       (const_string "34")
-       (const_string "42")))
+     (plus (const_int 2)
+	   (mult (symbol_ref ("INTVAL (operands[5])"))
+		 (symbol_ref ("X86_64_SSE_REGPARM_MAX - INTVAL (operands[2])")))))
   (set_attr "memory" "store")
   (set_attr "modrm" "0")
   (set_attr "prefix" "maybe_vex")
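
A note on the new "length" attribute (not part of the patch, just a sanity check on the
arithmetic): the removed constants fall out of the new formula.  With no named SSE
arguments, all X86_64_SSE_REGPARM_MAX = 8 argument registers are saved, so

  old: 34 = 2 + 8 * 4    (4-byte movaps saves, non-AVX)
       42 = 2 + 8 * 5    (5-byte vmovaps saves, AVX)
  new: length = 2 + INTVAL (operands[5]) * (X86_64_SSE_REGPARM_MAX - INTVAL (operands[2]))

that is, 2 bytes for the initial jump plus one save per unnamed SSE argument register,
where operand 5 is the per-save instruction size (4 or 5) chosen by the splitter and
operand 2 is the number of named SSE arguments whose saves are skipped.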
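For reference, a minimal C example (not part of the patch; the function and variable
names are illustrative only) of the two situations the i386.c changes affect.  The
variadic function makes the prologue dump the unnamed SSE argument registers into the
register save area via the sse_prologue_save pattern; with this patch those saves can use
8-byte movsd stores when 128-bit stack alignment is not otherwise needed, so the save
area itself only requires 64-bit alignment.  The second function has a local aggregate of
at least 16 bytes, which ix86_local_alignment now bumps to 16-byte alignment only when
optimizing for speed with SSE enabled, and never for va_list objects.

  #include <stdarg.h>

  /* Variadic callee: the x86-64 prologue must spill the unnamed xmm argument
     registers into the register save area so that va_arg can read them back.  */
  double
  sum_doubles (int count, ...)
  {
    va_list ap;		/* a local array type, explicitly excluded from the
			   16-byte local-array alignment rule */
    double sum = 0.0;
    int i;

    va_start (ap, count);
    for (i = 0; i < count; i++)
      sum += va_arg (ap, double);	/* reads the SSE part of the save area */
    va_end (ap);
    return sum;
  }

  /* Local aggregate of 16+ bytes: 16-byte alignment is now applied only when
     it is likely to pay off (optimizing for speed, SSE enabled).  */
  int
  use_buffer (void)
  {
    char buf[32];
    __builtin_memset (buf, 0, sizeof buf);
    return buf[0];
  }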