Handle const_vector in mulv4si3 for pre-sse4.1.

From-SVN: r188787
2012-06-19 11:19:26 -07:00 · 2012-06-19 11:19:26 -07:00 · 73e9d63709
parent 84ddb6810c
commit 73e9d63709
5 changed files with 106 additions and 59 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,12 @@
 2012-06-19  Richard Henderson  <rth@redhat.com>
 	* config/i386/i386-protos.h (ix86_expand_sse2_mulv4si3): Declare.
 	* config/i386/i386.c (ix86_expand_sse2_mulv4si3): New.
 	* config/i386/predicates.md (nonimmediate_or_const_vector_operand): New.
 	* config/i386/sse.md (sse2_mulv4si3): Delete.
 	(mul<VI4_AVX2>3): Use ix86_expand_sse2_mulv4si3 and
 	nonimmediate_or_const_vector_operand.
 2012-06-19  Richard Henderson  <rth@redhat.com>
 	* expmed.c (struct init_expmed_rtl): Split ...
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
 extern bool ix86_expand_pinsr (rtx *);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
  expand_vec_perm_even_odd_1 (&d, odd);
 }
 /* Expand a V4SImode multiply OP0 = OP1 * OP2 using only SSE2 insns.
    The mul<mode>3 expander calls this when neither TARGET_SSE4_1 nor
    TARGET_AVX is set.

    The product is built from two widening unsigned multiplies, one
    over elements 0 and 2 and one over elements 1 and 3; elements 0
    and 2 of each result are then gathered and interleaved into OP0.

    OP1 is forced into a register.  OP2 may be a CONST_VECTOR, in
    which case the element shift is performed at expand time instead
    of with an emitted shift insn; otherwise it is forced into a
    register as well.  The final insn receives a REG_EQUAL note
    describing the whole operation as a MULT of OP1 and OP2.  */
 void
 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
 {
  rtx op1_m1, op1_m2;
  rtx op2_m1, op2_m2;
  rtx res_1, res_2;
  /* Shift both input vectors down one element, so that elements 3
     and 1 are now in the slots for elements 2 and 0.  For K8, at
     least, this is faster than using a shuffle.  The unshifted
     copies (*_m1) feed the multiply of elements 0 and 2; the shifted
     copies (*_m2) feed the multiply of elements 1 and 3.  */
  op1_m1 = op1 = force_reg (V4SImode, op1);
  op1_m2 = gen_reg_rtx (V4SImode);
  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
 				 gen_lowpart (V1TImode, op1),
 				 GEN_INT (32)));
  if (GET_CODE (op2) == CONST_VECTOR)
    {
      rtvec v;
      /* Constant propagate the vector shift, leaving the don't-care
 	 vector elements as zero.  */
      /* op2_m1: elements 0 and 2 of OP2, left in place.  */
      v = rtvec_alloc (4);
      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
      RTVEC_ELT (v, 1) = const0_rtx;
      RTVEC_ELT (v, 3) = const0_rtx;
      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
      op2_m1 = force_reg (V4SImode, op2_m1);
      /* op2_m2: elements 1 and 3 of OP2, moved into slots 0 and 2.  */
      v = rtvec_alloc (4);
      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
      RTVEC_ELT (v, 1) = const0_rtx;
      RTVEC_ELT (v, 3) = const0_rtx;
      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
      op2_m2 = force_reg (V4SImode, op2_m2);
    }
  else
    {
      op2_m1 = op2 = force_reg (V4SImode, op2);
      op2_m2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
 				     gen_lowpart (V1TImode, op2),
 				     GEN_INT (32)));
    }
  /* Widening multiply of elements 0+2, and 1+3.  */
  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
 				     op1_m1, op2_m1));
  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
 				     op1_m2, op2_m2));
  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.
     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */
  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
 				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
 				const0_rtx, const0_rtx));
  /* res_1 is reused here to hold the insn returned by emit_insn, so
     that the REG_EQUAL note can be attached to the final interleave.  */
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
 }
 /* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@ -816,6 +816,13 @@
  return false;
 })
 ;; Return true when OP is a nonimmediate operand or a vector constant
 ;; (const_vector).  Note that most vector constants are not legitimate
 ;; operands, so we need to special-case this: an expander that uses
 ;; this predicate must deal with an accepted constant itself, e.g. by
 ;; forcing it into a register or into the constant pool.
 (define_predicate "nonimmediate_or_const_vector_operand"
  (ior (match_code "const_vector")
       (match_operand 0 "nonimmediate_operand")))
 ;; Return true if OP is a register or a zero.
 (define_predicate "reg_or_0_operand"
  (ior (match_operand 0 "register_operand")
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@ -5610,12 +5610,22 @@
 ;; Expand a VI4_AVX2 vector multiply.  Operand 2 may be a constant
 ;; vector.  With SSE4.1 or AVX a constant operand 2 is first forced
 ;; into the constant pool and the operands are then fixed up with
 ;; ix86_fixup_binary_operands_no_copy; otherwise the whole multiply
 ;; is expanded by ix86_expand_sse2_mulv4si3.
 (define_expand "mul<mode>3"
  [(set (match_operand:VI4_AVX2 0 "register_operand")
-	(mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
+	(mult:VI4_AVX2
-		       (match_operand:VI4_AVX2 2 "register_operand")))]
+	  (match_operand:VI4_AVX2 1 "nonimmediate_operand")
 	  (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
  "TARGET_SSE2"
 {
  if (TARGET_SSE4_1 || TARGET_AVX)
-    ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    {
      /* The predicate lets a const_vector through; turn it into a
 	 constant-pool reference before fixing up the operands.  */
      if (CONSTANT_P (operands[2]))
 	operands[2] = force_const_mem (<MODE>mode, operands[2]);
      ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
    }
  else
    {
      ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
      DONE;
    }
 })
 (define_insn "*<sse4_1_avx2>_mul<mode>3"
@ -5633,62 +5643,6 @@
   (set_attr "prefix" "orig,vex")
   (set_attr "mode" "<sseinsnmode>")])
 ;; V4SI multiply for SSE2 targets without SSE4.1 or AVX.  The pattern
 ;; has no real output template ("#") and is always split ("&& 1")
 ;; into two widening multiplies plus shuffles.  The split allocates
 ;; six fresh pseudos, hence the can_create_pseudo_p () requirement in
 ;; the insn condition.
 (define_insn_and_split "*sse2_mulv4si3"
  [(set (match_operand:V4SI 0 "register_operand")
 	(mult:V4SI (match_operand:V4SI 1 "register_operand")
 		   (match_operand:V4SI 2 "register_operand")))]
  "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
   && can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(const_int 0)]
 {
  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
  rtx op0, op1, op2;
  op0 = operands[0];
  op1 = operands[1];
  op2 = operands[2];
  t1 = gen_reg_rtx (V4SImode);
  t2 = gen_reg_rtx (V4SImode);
  t3 = gen_reg_rtx (V4SImode);
  t4 = gen_reg_rtx (V4SImode);
  t5 = gen_reg_rtx (V4SImode);
  t6 = gen_reg_rtx (V4SImode);
  thirtytwo = GEN_INT (32);
  /* Multiply elements 2 and 0.  */
  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
 				     op1, op2));
  /* Shift both input vectors down one element, so that elements 3
     and 1 are now in the slots for elements 2 and 0.  For K8, at
     least, this is faster than using a shuffle.  */
  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
 				 gen_lowpart (V1TImode, op1),
 				 thirtytwo));
  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
 				 gen_lowpart (V1TImode, op2),
 				 thirtytwo));
  /* Multiply elements 3 and 1.  */
  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
 				     t2, t3));
  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  */
  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
 				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
 				const0_rtx, const0_rtx));
  /* Merge the parts back together.  */
  emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
  /* Record on the final insn that the sequence as a whole computes
     operand 1 * operand 2.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
 		       gen_rtx_MULT (V4SImode, operands[1], operands[2]));
  DONE;
 })
 (define_insn_and_split "mul<mode>3"
  [(set (match_operand:VI8_AVX2 0 "register_operand")
 	(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")