Handle const_vector in mulv4si3 for pre-sse4.1.
From-SVN: r188787
This commit is contained in:
parent
84ddb6810c
commit
73e9d63709
|
@ -1,3 +1,12 @@
|
||||||
|
2012-06-19 Richard Henderson <rth@redhat.com>
|
||||||
|
|
||||||
|
* config/i386/i386-protos.h (ix86_expand_sse2_mulv4si3): Declare.
|
||||||
|
* config/i386/i386.c (ix86_expand_sse2_mulv4si3): New.
|
||||||
|
* config/i386/predicates.md (nonimmediate_or_const_vector_operand): New.
|
||||||
|
* config/i386/sse.md (sse2_mulv4si3): Delete.
|
||||||
|
(mul<VI4_AVX2>3): Use ix86_expand_sse2_mulv4si3 and
|
||||||
|
nonimmediate_or_const_vector_operand.
|
||||||
|
|
||||||
2012-06-19 Richard Henderson <rth@redhat.com>
|
2012-06-19 Richard Henderson <rth@redhat.com>
|
||||||
|
|
||||||
* expmed.c (struct init_expmed_rtl): Split ...
|
* expmed.c (struct init_expmed_rtl): Split ...
|
||||||
|
|
|
@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
|
||||||
|
|
||||||
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
|
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
|
||||||
extern bool ix86_expand_pinsr (rtx *);
|
extern bool ix86_expand_pinsr (rtx *);
|
||||||
|
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
|
||||||
|
|
||||||
/* In i386-c.c */
|
/* In i386-c.c */
|
||||||
extern void ix86_target_macros (void);
|
extern void ix86_target_macros (void);
|
||||||
|
|
|
@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
|
||||||
expand_vec_perm_even_odd_1 (&d, odd);
|
expand_vec_perm_even_odd_1 (&d, odd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
|
||||||
|
{
|
||||||
|
rtx op1_m1, op1_m2;
|
||||||
|
rtx op2_m1, op2_m2;
|
||||||
|
rtx res_1, res_2;
|
||||||
|
|
||||||
|
/* Shift both input vectors down one element, so that elements 3
|
||||||
|
and 1 are now in the slots for elements 2 and 0. For K8, at
|
||||||
|
least, this is faster than using a shuffle. */
|
||||||
|
op1_m1 = op1 = force_reg (V4SImode, op1);
|
||||||
|
op1_m2 = gen_reg_rtx (V4SImode);
|
||||||
|
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
|
||||||
|
gen_lowpart (V1TImode, op1),
|
||||||
|
GEN_INT (32)));
|
||||||
|
|
||||||
|
if (GET_CODE (op2) == CONST_VECTOR)
|
||||||
|
{
|
||||||
|
rtvec v;
|
||||||
|
|
||||||
|
/* Constant propagate the vector shift, leaving the dont-care
|
||||||
|
vector elements as zero. */
|
||||||
|
v = rtvec_alloc (4);
|
||||||
|
RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
|
||||||
|
RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
|
||||||
|
RTVEC_ELT (v, 1) = const0_rtx;
|
||||||
|
RTVEC_ELT (v, 3) = const0_rtx;
|
||||||
|
op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
|
||||||
|
op2_m1 = force_reg (V4SImode, op2_m1);
|
||||||
|
|
||||||
|
v = rtvec_alloc (4);
|
||||||
|
RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
|
||||||
|
RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
|
||||||
|
RTVEC_ELT (v, 1) = const0_rtx;
|
||||||
|
RTVEC_ELT (v, 3) = const0_rtx;
|
||||||
|
op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
|
||||||
|
op2_m2 = force_reg (V4SImode, op2_m2);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
op2_m1 = op2 = force_reg (V4SImode, op2);
|
||||||
|
op2_m2 = gen_reg_rtx (V4SImode);
|
||||||
|
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
|
||||||
|
gen_lowpart (V1TImode, op2),
|
||||||
|
GEN_INT (32)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Widening multiply of elements 0+2, and 1+3. */
|
||||||
|
res_1 = gen_reg_rtx (V4SImode);
|
||||||
|
res_2 = gen_reg_rtx (V4SImode);
|
||||||
|
emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
|
||||||
|
op1_m1, op2_m1));
|
||||||
|
emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
|
||||||
|
op1_m2, op2_m2));
|
||||||
|
|
||||||
|
/* Move the results in element 2 down to element 1; we don't care
|
||||||
|
what goes in elements 2 and 3. Then we can merge the parts
|
||||||
|
back together with an interleave.
|
||||||
|
|
||||||
|
Note that two other sequences were tried:
|
||||||
|
(1) Use interleaves at the start instead of psrldq, which allows
|
||||||
|
us to use a single shufps to merge things back at the end.
|
||||||
|
(2) Use shufps here to combine the two vectors, then pshufd to
|
||||||
|
put the elements in the correct order.
|
||||||
|
In both cases the cost of the reformatting stall was too high
|
||||||
|
and the overall sequence slower. */
|
||||||
|
|
||||||
|
emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
|
||||||
|
const0_rtx, const0_rtx));
|
||||||
|
emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
|
||||||
|
const0_rtx, const0_rtx));
|
||||||
|
res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
|
||||||
|
|
||||||
|
set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
|
||||||
|
}
|
||||||
|
|
||||||
/* Expand an insert into a vector register through pinsr insn.
|
/* Expand an insert into a vector register through pinsr insn.
|
||||||
Return true if successful. */
|
Return true if successful. */
|
||||||
|
|
||||||
|
|
|
@ -816,6 +816,13 @@
|
||||||
return false;
|
return false;
|
||||||
})
|
})
|
||||||
|
|
||||||
|
;; Return true when OP satisfies nonimmediate_operand or is a
;; CONST_VECTOR.  The vector-constant case has to be listed explicitly
;; because most vector constants are not legitimate operands.
(define_predicate "nonimmediate_or_const_vector_operand"
  (ior (match_operand 0 "nonimmediate_operand")
       (match_code "const_vector")))
|
||||||
|
|
||||||
;; Return true if OP is a register or a zero.
|
;; Return true if OP is a register or a zero.
|
||||||
(define_predicate "reg_or_0_operand"
|
(define_predicate "reg_or_0_operand"
|
||||||
(ior (match_operand 0 "register_operand")
|
(ior (match_operand 0 "register_operand")
|
||||||
|
|
|
@ -5610,12 +5610,22 @@
|
||||||
|
|
||||||
(define_expand "mul<mode>3"
|
(define_expand "mul<mode>3"
|
||||||
[(set (match_operand:VI4_AVX2 0 "register_operand")
|
[(set (match_operand:VI4_AVX2 0 "register_operand")
|
||||||
(mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
|
(mult:VI4_AVX2
|
||||||
(match_operand:VI4_AVX2 2 "register_operand")))]
|
(match_operand:VI4_AVX2 1 "nonimmediate_operand")
|
||||||
|
(match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
|
||||||
"TARGET_SSE2"
|
"TARGET_SSE2"
|
||||||
{
|
{
|
||||||
if (TARGET_SSE4_1 || TARGET_AVX)
|
if (TARGET_SSE4_1 || TARGET_AVX)
|
||||||
ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
|
{
|
||||||
|
if (CONSTANT_P (operands[2]))
|
||||||
|
operands[2] = force_const_mem (<MODE>mode, operands[2]);
|
||||||
|
ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
|
||||||
|
DONE;
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
(define_insn "*<sse4_1_avx2>_mul<mode>3"
|
(define_insn "*<sse4_1_avx2>_mul<mode>3"
|
||||||
|
@ -5633,62 +5643,6 @@
|
||||||
(set_attr "prefix" "orig,vex")
|
(set_attr "prefix" "orig,vex")
|
||||||
(set_attr "mode" "<sseinsnmode>")])
|
(set_attr "mode" "<sseinsnmode>")])
|
||||||
|
|
||||||
;; V4SImode multiply for SSE2 without SSE4.1 or AVX.  The insn is always
;; split, into two widening multiplies (elements 0+2 and elements 1+3)
;; followed by shuffles and an interleave that merge the results.
(define_insn_and_split "*sse2_mulv4si3"
  [(set (match_operand:V4SI 0 "register_operand")
        (mult:V4SI (match_operand:V4SI 1 "register_operand")
                   (match_operand:V4SI 2 "register_operand")))]
  "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
   && can_create_pseudo_p ()"
  "#"
  "&& 1"
  [(const_int 0)]
{
  /* Temporaries, allocated in the order they were originally created.  */
  rtx prod_even = gen_reg_rtx (V4SImode);
  rtx op1_odd = gen_reg_rtx (V4SImode);
  rtx op2_odd = gen_reg_rtx (V4SImode);
  rtx prod_odd = gen_reg_rtx (V4SImode);
  rtx packed_even = gen_reg_rtx (V4SImode);
  rtx packed_odd = gen_reg_rtx (V4SImode);
  rtx shift = GEN_INT (32);

  /* Multiply elements 2 and 0.  */
  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, prod_even),
                                     operands[1], operands[2]));

  /* Shift both input vectors down one element, so that elements 3
     and 1 land in the slots for elements 2 and 0.  For K8, at least,
     this is faster than using a shuffle.  */
  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_odd),
                                 gen_lowpart (V1TImode, operands[1]),
                                 shift));
  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_odd),
                                 gen_lowpart (V1TImode, operands[2]),
                                 shift));

  /* Multiply elements 3 and 1.  */
  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, prod_odd),
                                     op1_odd, op2_odd));

  /* Move the results in element 2 down to element 1; the contents of
     elements 2 and 3 do not matter.  */
  emit_insn (gen_sse2_pshufd_1 (packed_even, prod_even,
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (packed_odd, prod_odd,
                                const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));

  /* Merge the parts back together.  */
  emit_insn (gen_vec_interleave_lowv4si (operands[0],
                                         packed_even, packed_odd));

  /* Record on the final insn that operand 0 equals the full product.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
                       gen_rtx_MULT (V4SImode, operands[1], operands[2]));
  DONE;
})
|
|
||||||
|
|
||||||
(define_insn_and_split "mul<mode>3"
|
(define_insn_and_split "mul<mode>3"
|
||||||
[(set (match_operand:VI8_AVX2 0 "register_operand")
|
[(set (match_operand:VI8_AVX2 0 "register_operand")
|
||||||
(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
|
(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
|
||||||
|
|
Loading…
Reference in New Issue