Handle const_vector in mulv4si3 for pre-sse4.1.

From-SVN: r188787
Richard Henderson, 2012-06-19 11:19:26 -07:00 (committed by Richard Henderson)
parent 84ddb6810c
commit 73e9d63709
5 changed files with 106 additions and 59 deletions

gcc/ChangeLog

@@ -1,3 +1,12 @@
2012-06-19  Richard Henderson  <rth@redhat.com>

	* config/i386/i386-protos.h (ix86_expand_sse2_mulv4si3): Declare.
	* config/i386/i386.c (ix86_expand_sse2_mulv4si3): New.
	* config/i386/predicates.md (nonimmediate_or_const_vector_operand): New.
	* config/i386/sse.md (sse2_mulv4si3): Delete.
	(mul<VI4_AVX2>3): Use ix86_expand_sse2_mulv4si3 and
	nonimmediate_or_const_vector_operand.

2012-06-19  Richard Henderson  <rth@redhat.com>

	* expmed.c (struct init_expmed_rtl): Split ...

gcc/config/i386/i386-protos.h

@@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
extern bool ix86_expand_pinsr (rtx *);
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);

/* In i386-c.c */
extern void ix86_target_macros (void);

gcc/config/i386/i386.c

@@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
  expand_vec_perm_even_odd_1 (&d, odd);
}

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx op1_m1, op1_m2;
  rtx op2_m1, op2_m2;
  rtx res_1, res_2;

  /* Shift both input vectors down one element, so that elements 3
     and 1 are now in the slots for elements 2 and 0.  For K8, at
     least, this is faster than using a shuffle.  */
  op1_m1 = op1 = force_reg (V4SImode, op1);
  op1_m2 = gen_reg_rtx (V4SImode);
  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
                                 gen_lowpart (V1TImode, op1),
                                 GEN_INT (32)));

  if (GET_CODE (op2) == CONST_VECTOR)
    {
      rtvec v;

      /* Constant propagate the vector shift, leaving the dont-care
         vector elements as zero.  */
      v = rtvec_alloc (4);
      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
      RTVEC_ELT (v, 1) = const0_rtx;
      RTVEC_ELT (v, 3) = const0_rtx;
      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
      op2_m1 = force_reg (V4SImode, op2_m1);

      v = rtvec_alloc (4);
      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
      RTVEC_ELT (v, 1) = const0_rtx;
      RTVEC_ELT (v, 3) = const0_rtx;
      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
      op2_m2 = force_reg (V4SImode, op2_m2);
    }
  else
    {
      op2_m1 = op2 = force_reg (V4SImode, op2);
      op2_m2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
                                     gen_lowpart (V1TImode, op2),
                                     GEN_INT (32)));
    }

  /* Widening multiply of elements 0+2, and 1+3.  */
  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
                                     op1_m1, op2_m1));
  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
                                     op1_m2, op2_m2));

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
         us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
         put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));

  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
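
For reference, the sequence this expander emits in the non-constant case corresponds roughly to the following SSE2 intrinsics. This is an illustrative sketch only, not part of the patch; the helper name mulv4si_sse2 is made up.

#include <emmintrin.h>

/* Illustration of the pmuludq/psrldq/pshufd/punpckldq sequence the
   expander emits when neither operand is a constant vector.  */
static __m128i
mulv4si_sse2 (__m128i a, __m128i b)
{
  /* Widening unsigned multiply of elements 0 and 2 (pmuludq).  */
  __m128i even = _mm_mul_epu32 (a, b);
  /* Shift both inputs down one element (psrldq by 4 bytes) so that
     elements 3 and 1 land in the slots for 2 and 0, then multiply.  */
  __m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a, 4),
                               _mm_srli_si128 (b, 4));
  /* Move the low 32 bits of the product in element 2 down to
     element 1 (pshufd), then interleave the two halves.  */
  even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  odd  = _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0));
  return _mm_unpacklo_epi32 (even, odd);
}

When op2 is a CONST_VECTOR {c0,c1,c2,c3}, the psrldq of the second operand is folded away: the two half-vectors {c0,0,c2,0} and {c1,0,c3,0} are built directly as constants and forced into registers, which is what the CONST_VECTOR branch above does.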

/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

gcc/config/i386/predicates.md

@@ -816,6 +816,13 @@
  return false;
})

;; Return true when OP is a nonimmediate or a vector constant.  Note
;; that most vector constants are not legitimate operands, so we need
;; to special-case this.
(define_predicate "nonimmediate_or_const_vector_operand"
  (ior (match_code "const_vector")
       (match_operand 0 "nonimmediate_operand")))
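
In rough terms, the new predicate accepts everything nonimmediate_operand accepts, plus bare CONST_VECTORs, which would otherwise be rejected because most vector constants are not legitimate operands. The following C sketch shows the intended behavior; it is not the code genpreds actually generates from the define_predicate above.

/* Behavioral sketch only; the real predicate function is generated
   by genpreds from the define_predicate.  */
int
nonimmediate_or_const_vector_operand (rtx op, enum machine_mode mode)
{
  return GET_CODE (op) == CONST_VECTOR
         || nonimmediate_operand (op, mode);
}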

;; Return true if OP is a register or a zero.
(define_predicate "reg_or_0_operand"
  (ior (match_operand 0 "register_operand")

gcc/config/i386/sse.md

@@ -5610,12 +5610,22 @@
 
 (define_expand "mul<mode>3"
   [(set (match_operand:VI4_AVX2 0 "register_operand")
-	(mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
-		       (match_operand:VI4_AVX2 2 "register_operand")))]
+	(mult:VI4_AVX2
+	  (match_operand:VI4_AVX2 1 "nonimmediate_operand")
+	  (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1 || TARGET_AVX)
-    ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    {
+      if (CONSTANT_P (operands[2]))
+	operands[2] = force_const_mem (<MODE>mode, operands[2]);
+      ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    }
+  else
+    {
+      ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
+      DONE;
+    }
 })
 
 (define_insn "*<sse4_1_avx2>_mul<mode>3"
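
A hypothetical example of the case this expander change targets (not a test case from this commit): built with -O2 -msse2 and without -msse4.1/-mavx, the constant operand below is now accepted by nonimmediate_or_const_vector_operand and reaches the CONST_VECTOR path of ix86_expand_sse2_mulv4si3, rather than first being forced into a register.

/* Hypothetical GNU C example, compiled for plain SSE2.  */
typedef int v4si __attribute__ ((vector_size (16)));

v4si
scale (v4si x)
{
  /* Multiply by a literal vector constant; the expander's non-SSE4.1
     branch handles the constant directly.  */
  return x * (v4si) { 2, 3, 4, 5 };
}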
@@ -5633,62 +5643,6 @@
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn_and_split "*sse2_mulv4si3"
-  [(set (match_operand:V4SI 0 "register_operand")
-	(mult:V4SI (match_operand:V4SI 1 "register_operand")
-		   (match_operand:V4SI 2 "register_operand")))]
-  "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
-   && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
-  [(const_int 0)]
-{
-  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
-  rtx op0, op1, op2;
-
-  op0 = operands[0];
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-  t3 = gen_reg_rtx (V4SImode);
-  t4 = gen_reg_rtx (V4SImode);
-  t5 = gen_reg_rtx (V4SImode);
-  t6 = gen_reg_rtx (V4SImode);
-  thirtytwo = GEN_INT (32);
-
-  /* Multiply elements 2 and 0.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
-				     op1, op2));
-
-  /* Shift both input vectors down one element, so that elements 3
-     and 1 are now in the slots for elements 2 and 0.  For K8, at
-     least, this is faster than using a shuffle.  */
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
-				 gen_lowpart (V1TImode, op1),
-				 thirtytwo));
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
-				 gen_lowpart (V1TImode, op2),
-				 thirtytwo));
-
-  /* Multiply elements 3 and 1.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
-				     t2, t3));
-
-  /* Move the results in element 2 down to element 1; we don't care
-     what goes in elements 2 and 3.  */
-  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-
-  /* Merge the parts back together.  */
-  emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
-
-  set_unique_reg_note (get_last_insn (), REG_EQUAL,
-		       gen_rtx_MULT (V4SImode, operands[1], operands[2]));
-  DONE;
-})
-
 (define_insn_and_split "mul<mode>3"
   [(set (match_operand:VI8_AVX2 0 "register_operand")
	 (mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")