From 73e9d63709f920702a5fbaa5129e6216158b4420 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Tue, 19 Jun 2012 11:19:26 -0700
Subject: [PATCH] Handle const_vector in mulv4si3 for pre-sse4.1.

From-SVN: r188787
---
 gcc/ChangeLog                 |  9 +++++
 gcc/config/i386/i386-protos.h |  1 +
 gcc/config/i386/i386.c        | 76 +++++++++++++++++++++++++++++++++++
 gcc/config/i386/predicates.md |  7 ++++
 gcc/config/i386/sse.md        | 72 ++++++---------------------------
 5 files changed, 106 insertions(+), 59 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 0aed15f86e2..8b328197cd6 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,12 @@
+2012-06-19  Richard Henderson
+
+	* config/i386/i386-protos.h (ix86_expand_sse2_mulv4si3): Declare.
+	* config/i386/i386.c (ix86_expand_sse2_mulv4si3): New.
+	* config/i386/predicates.md (nonimmediate_or_const_vector_operand): New.
+	* config/i386/sse.md (sse2_mulv4si3): Delete.
+	(mul<mode>3): Use ix86_expand_sse2_mulv4si3 and
+	nonimmediate_or_const_vector_operand.
+
 2012-06-19  Richard Henderson
 
 	* expmed.c (struct init_expmed_rtl): Split ...
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index f300a56834d..431db6c8cf5 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 
 extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
 extern bool ix86_expand_pinsr (rtx *);
+extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 578a7565dea..0dc08f3aa3a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   expand_vec_perm_even_odd_1 (&d, odd);
 }
 
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+  rtx op1_m1, op1_m2;
+  rtx op2_m1, op2_m2;
+  rtx res_1, res_2;
+
+  /* Shift both input vectors down one element, so that elements 3
+     and 1 are now in the slots for elements 2 and 0.  For K8, at
+     least, this is faster than using a shuffle.  */
+  op1_m1 = op1 = force_reg (V4SImode, op1);
+  op1_m2 = gen_reg_rtx (V4SImode);
+  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
+				 gen_lowpart (V1TImode, op1),
+				 GEN_INT (32)));
+
+  if (GET_CODE (op2) == CONST_VECTOR)
+    {
+      rtvec v;
+
+      /* Constant propagate the vector shift, leaving the dont-care
+	 vector elements as zero.  */
+      v = rtvec_alloc (4);
+      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
+      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
+      RTVEC_ELT (v, 1) = const0_rtx;
+      RTVEC_ELT (v, 3) = const0_rtx;
+      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
+      op2_m1 = force_reg (V4SImode, op2_m1);
+
+      v = rtvec_alloc (4);
+      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
+      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
+      RTVEC_ELT (v, 1) = const0_rtx;
+      RTVEC_ELT (v, 3) = const0_rtx;
+      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
+      op2_m2 = force_reg (V4SImode, op2_m2);
+    }
+  else
+    {
+      op2_m1 = op2 = force_reg (V4SImode, op2);
+      op2_m2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
+				     gen_lowpart (V1TImode, op2),
+				     GEN_INT (32)));
+    }
+
+  /* Widening multiply of elements 0+2, and 1+3.  */
+  res_1 = gen_reg_rtx (V4SImode);
+  res_2 = gen_reg_rtx (V4SImode);
+  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
+				     op1_m1, op2_m1));
+  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
+				     op1_m2, op2_m2));
+
+  /* Move the results in element 2 down to element 1; we don't care
+     what goes in elements 2 and 3.  Then we can merge the parts
+     back together with an interleave.
+
+     Note that two other sequences were tried:
+     (1) Use interleaves at the start instead of psrldq, which allows
+	 us to use a single shufps to merge things back at the end.
+     (2) Use shufps here to combine the two vectors, then pshufd to
+	 put the elements in the correct order.
+     In both cases the cost of the reformatting stall was too high
+     and the overall sequence slower.  */
+
+  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+				const0_rtx, const0_rtx));
+  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+				const0_rtx, const0_rtx));
+  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+
+  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+}
+
 /* Expand an insert into a vector register through pinsr insn.
    Return true if successful.  */
 
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 92db80912be..f23e932d9f1 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -816,6 +816,13 @@
   return false;
 })
 
+;; Return true when OP is a nonimmediate or a vector constant.  Note
+;; that most vector constants are not legitimate operands, so we need
+;; to special-case this.
+(define_predicate "nonimmediate_or_const_vector_operand"
+  (ior (match_code "const_vector")
+       (match_operand 0 "nonimmediate_operand")))
+
 ;; Return true if OP is a register or a zero.
 (define_predicate "reg_or_0_operand"
   (ior (match_operand 0 "register_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 38ade496f17..6c54d33113b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5610,12 +5610,22 @@
 
 (define_expand "mul<mode>3"
   [(set (match_operand:VI4_AVX2 0 "register_operand")
-	(mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
-		       (match_operand:VI4_AVX2 2 "register_operand")))]
+	(mult:VI4_AVX2
+	  (match_operand:VI4_AVX2 1 "nonimmediate_operand")
+	  (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1 || TARGET_AVX)
-    ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    {
+      if (CONSTANT_P (operands[2]))
+	operands[2] = force_const_mem (<MODE>mode, operands[2]);
+      ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    }
+  else
+    {
+      ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
+      DONE;
+    }
 })
 
 (define_insn "*<sse4_1_avx2>_mul<mode>3"
@@ -5633,62 +5643,6 @@
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn_and_split "*sse2_mulv4si3"
-  [(set (match_operand:V4SI 0 "register_operand")
-	(mult:V4SI (match_operand:V4SI 1 "register_operand")
-		   (match_operand:V4SI 2 "register_operand")))]
-  "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
-   && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
-  [(const_int 0)]
-{
-  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
-  rtx op0, op1, op2;
-
-  op0 = operands[0];
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-  t3 = gen_reg_rtx (V4SImode);
-  t4 = gen_reg_rtx (V4SImode);
-  t5 = gen_reg_rtx (V4SImode);
-  t6 = gen_reg_rtx (V4SImode);
-  thirtytwo = GEN_INT (32);
-
-  /* Multiply elements 2 and 0.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
-				     op1, op2));
-
-  /* Shift both input vectors down one element, so that elements 3
-     and 1 are now in the slots for elements 2 and 0.  For K8, at
-     least, this is faster than using a shuffle.  */
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
-				 gen_lowpart (V1TImode, op1),
-				 thirtytwo));
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
-				 gen_lowpart (V1TImode, op2),
-				 thirtytwo));
-  /* Multiply elements 3 and 1.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
-				     t2, t3));
-
-  /* Move the results in element 2 down to element 1; we don't care
-     what goes in elements 2 and 3.  */
-  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-
-  /* Merge the parts back together.  */
-  emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
-
-  set_unique_reg_note (get_last_insn (), REG_EQUAL,
-		       gen_rtx_MULT (V4SImode, operands[1], operands[2]));
-  DONE;
-})
-
 (define_insn_and_split "mul<mode>3"
   [(set (match_operand:VI8_AVX2 0 "register_operand")
 	(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
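
As a rough sketch (not part of the patch), the sequence ix86_expand_sse2_mulv4si3 emits for two register operands corresponds to the SSE2 intrinsics below.  The helper name mulv4si_sse2 is invented for illustration; the real code works on RTL through gen_sse2_umulv2siv2di3 (pmuludq), gen_sse2_lshrv1ti3 (psrldq by 4 bytes), gen_sse2_pshufd_1 (pshufd) and gen_vec_interleave_lowv4si (punpckldq).

#include <emmintrin.h>

/* SSE2-only 32-bit packed multiply, mirroring the RTL sequence above;
   pmulld is not available before SSE4.1.  */
static __m128i
mulv4si_sse2 (__m128i a, __m128i b)
{
  /* 64-bit products of elements 0 and 2 (pmuludq).  */
  __m128i even = _mm_mul_epu32 (a, b);

  /* Shift both inputs down one element so elements 1 and 3 land in
     the slots for elements 0 and 2 (psrldq by 4 bytes).  */
  __m128i a_odd = _mm_srli_si128 (a, 4);
  __m128i b_odd = _mm_srli_si128 (b, 4);

  /* 64-bit products of elements 1 and 3.  */
  __m128i odd = _mm_mul_epu32 (a_odd, b_odd);

  /* Keep only the low 32 bits of each product, packed into elements
     0 and 1 (pshufd with selector 0,2,0,0).  */
  even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  odd = _mm_shuffle_epi32 (odd, _MM_SHUFFLE (0, 0, 2, 0));

  /* Interleave the low halves to restore element order 0,1,2,3
     (punpckldq).  */
  return _mm_unpacklo_epi32 (even, odd);
}

Because the low 32 bits of the unsigned widening product equal the low 32 bits of the signed product, the same sequence serves both signednesses.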
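
The CONST_VECTOR branch is what the subject line refers to: when operand 2 is a constant {c0, c1, c2, c3}, the psrldq on that operand is folded at expand time into two fresh constants, {c0, 0, c2, 0} and {c1, 0, c3, 0}, which are then forced into registers (normally constant-pool loads).  Continuing the sketch above, with an arbitrary multiplier {3, 5, 7, 9} chosen only for illustration:

/* Multiply each element of A by {3, 5, 7, 9}; the shift of the constant
   operand has been folded away, as in the CONST_VECTOR branch.  The two
   pre-split halves load straight from memory.  */
static __m128i
mul_by_3_5_7_9 (__m128i a)
{
  const __m128i c_even = _mm_setr_epi32 (3, 0, 7, 0);  /* elements 0 and 2 */
  const __m128i c_odd = _mm_setr_epi32 (5, 0, 9, 0);   /* elements 1 and 3 */

  __m128i even = _mm_mul_epu32 (a, c_even);
  __m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a, 4), c_odd);

  even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  odd = _mm_shuffle_epi32 (odd, _MM_SHUFFLE (0, 0, 2, 0));
  return _mm_unpacklo_epi32 (even, odd);
}

On the TARGET_SSE4_1/TARGET_AVX path the expander instead spills the constant with force_const_mem so pmulld can take it as a memory operand.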