i386: Expand mul<VI8_AVX2> earlier
Move the expansion code to i386.c next to mulv4si3.  Eliminate one shift by adding the highparts before shifting.  Correct costs.

	* config/i386/sse.md (mul<VI8_AVX2>3): Change from insn_and_split
	to expander; move guts to ...
	* config/i386/i386.c (ix86_expand_sse2_mulvxdi3): ... here.  Add
	highparts before shifting up.
	* config/i386/i386-protos.h: Update.

From-SVN: r189005
This commit is contained in:
parent
e2a3a0987f
commit
298301d9e7
|
@ -1,3 +1,11 @@
|
|||
2012-06-26  Richard Henderson  <rth@redhat.com>

	* config/i386/sse.md (mul<VI8_AVX2>3): Change from insn_and_split
	to expander; move guts to ...
	* config/i386/i386.c (ix86_expand_sse2_mulvxdi3): ... here.  Add
	highparts before shifting up.
	* config/i386/i386-protos.h: Update.

2012-06-26  Steven Bosscher  <steven@gcc.gnu.org>

	* system.h (USE_COMMON_FOR_ONE_ONLY): Poison.
|
|
@ -227,6 +227,7 @@ extern bool ix86_expand_pinsr (rtx *);
|
|||
extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
|
||||
extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
|
||||
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
|
||||
extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
|
||||
|
||||
/* In i386-c.c */
|
||||
extern void ix86_target_macros (void);
|
||||
|
|
|
@ -32293,6 +32293,14 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
|
|||
extra = 6;
|
||||
*total = cost->fmul * 2 + cost->fabs * extra;
|
||||
}
|
||||
/* V*DImode is emulated with 5-8 insns. */
|
||||
else if (mode == V2DImode || mode == V4DImode)
|
||||
{
|
||||
if (TARGET_XOP && mode == V2DImode)
|
||||
*total = cost->fmul * 2 + cost->fabs * 3;
|
||||
else
|
||||
*total = cost->fmul * 3 + cost->fabs * 5;
|
||||
}
|
||||
/* Without sse4.1, we don't have PMULLD; it's emulated with 7
|
||||
insns, including two PMULUDQ. */
|
||||
else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
|
||||
|
@ -38915,6 +38923,88 @@ ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
|
|||
set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
|
||||
}
|
||||
|
||||
void
|
||||
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
|
||||
{
|
||||
enum machine_mode mode = GET_MODE (op0);
|
||||
rtx t1, t2, t3, t4, t5, t6;
|
||||
|
||||
if (TARGET_XOP && mode == V2DImode)
|
||||
{
|
||||
/* op1: A,B,C,D, op2: E,F,G,H */
|
||||
op1 = gen_lowpart (V4SImode, op1);
|
||||
op2 = gen_lowpart (V4SImode, op2);
|
||||
|
||||
t1 = gen_reg_rtx (V4SImode);
|
||||
t2 = gen_reg_rtx (V4SImode);
|
||||
t3 = gen_reg_rtx (V2DImode);
|
||||
t4 = gen_reg_rtx (V2DImode);
|
||||
|
||||
/* t1: B,A,D,C */
|
||||
emit_insn (gen_sse2_pshufd_1 (t1, op1,
|
||||
GEN_INT (1),
|
||||
GEN_INT (0),
|
||||
GEN_INT (3),
|
||||
GEN_INT (2)));
|
||||
|
||||
/* t2: (B*E),(A*F),(D*G),(C*H) */
|
||||
emit_insn (gen_mulv4si3 (t2, t1, op2));
|
||||
|
||||
/* t3: (B*E)+(A*F), (D*G)+(C*H) */
|
||||
emit_insn (gen_xop_phadddq (t3, t2));
|
||||
|
||||
/* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
|
||||
emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
|
||||
|
||||
/* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
|
||||
emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
|
||||
}
|
||||
else
|
||||
{
|
||||
enum machine_mode nmode;
|
||||
rtx (*umul) (rtx, rtx, rtx);
|
||||
|
||||
if (mode == V2DImode)
|
||||
{
|
||||
umul = gen_sse2_umulv2siv2di3;
|
||||
nmode = V4SImode;
|
||||
}
|
||||
else if (mode == V4DImode)
|
||||
{
|
||||
umul = gen_avx2_umulv4siv4di3;
|
||||
nmode = V8SImode;
|
||||
}
|
||||
else
|
||||
gcc_unreachable ();
|
||||
|
||||
|
||||
/* Multiply low parts. */
|
||||
t1 = gen_reg_rtx (mode);
|
||||
emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
|
||||
|
||||
/* Shift input vectors right 32 bits so we can multiply high parts. */
|
||||
t6 = GEN_INT (32);
|
||||
t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
|
||||
t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
|
||||
|
||||
/* Multiply high parts by low parts. */
|
||||
t4 = gen_reg_rtx (mode);
|
||||
t5 = gen_reg_rtx (mode);
|
||||
emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
|
||||
emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
|
||||
|
||||
/* Combine and shift the highparts back. */
|
||||
t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
|
||||
t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
|
||||
|
||||
/* Combine high and low parts. */
|
||||
force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
|
||||
}
|
||||
|
||||
set_unique_reg_note (get_last_insn (), REG_EQUAL,
|
||||
gen_rtx_MULT (mode, op1, op2));
|
||||
}
|
||||
|
||||
/* Expand an insert into a vector register through pinsr insn.
|
||||
Return true if successful. */
|
||||
|
||||
|
|
|
@ -5592,91 +5592,13 @@
|
|||
(set_attr "prefix" "orig,vex")
|
||||
(set_attr "mode" "<sseinsnmode>")])
|
||||
|
||||
(define_insn_and_split "mul<mode>3"
|
||||
(define_expand "mul<mode>3"
|
||||
[(set (match_operand:VI8_AVX2 0 "register_operand")
|
||||
(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
|
||||
(match_operand:VI8_AVX2 2 "register_operand")))]
|
||||
"TARGET_SSE2
|
||||
&& can_create_pseudo_p ()"
|
||||
"#"
|
||||
"&& 1"
|
||||
[(const_int 0)]
|
||||
"TARGET_SSE2"
|
||||
{
|
||||
rtx t1, t2, t3, t4, t5, t6, thirtytwo;
|
||||
rtx op0, op1, op2;
|
||||
|
||||
op0 = operands[0];
|
||||
op1 = operands[1];
|
||||
op2 = operands[2];
|
||||
|
||||
if (TARGET_XOP && <MODE>mode == V2DImode)
|
||||
{
|
||||
/* op1: A,B,C,D, op2: E,F,G,H */
|
||||
op1 = gen_lowpart (V4SImode, op1);
|
||||
op2 = gen_lowpart (V4SImode, op2);
|
||||
|
||||
t1 = gen_reg_rtx (V4SImode);
|
||||
t2 = gen_reg_rtx (V4SImode);
|
||||
t3 = gen_reg_rtx (V2DImode);
|
||||
t4 = gen_reg_rtx (V2DImode);
|
||||
|
||||
/* t1: B,A,D,C */
|
||||
emit_insn (gen_sse2_pshufd_1 (t1, op1,
|
||||
GEN_INT (1),
|
||||
GEN_INT (0),
|
||||
GEN_INT (3),
|
||||
GEN_INT (2)));
|
||||
|
||||
/* t2: (B*E),(A*F),(D*G),(C*H) */
|
||||
emit_insn (gen_mulv4si3 (t2, t1, op2));
|
||||
|
||||
/* t4: (B*E)+(A*F), (D*G)+(C*H) */
|
||||
emit_insn (gen_xop_phadddq (t3, t2));
|
||||
|
||||
/* t5: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
|
||||
emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
|
||||
|
||||
/* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
|
||||
emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
|
||||
}
|
||||
else
|
||||
{
|
||||
t1 = gen_reg_rtx (<MODE>mode);
|
||||
t2 = gen_reg_rtx (<MODE>mode);
|
||||
t3 = gen_reg_rtx (<MODE>mode);
|
||||
t4 = gen_reg_rtx (<MODE>mode);
|
||||
t5 = gen_reg_rtx (<MODE>mode);
|
||||
t6 = gen_reg_rtx (<MODE>mode);
|
||||
thirtytwo = GEN_INT (32);
|
||||
|
||||
/* Multiply low parts. */
|
||||
emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
|
||||
(t1, gen_lowpart (<ssepackmode>mode, op1),
|
||||
gen_lowpart (<ssepackmode>mode, op2)));
|
||||
|
||||
/* Shift input vectors right 32 bits so we can multiply high parts. */
|
||||
emit_insn (gen_lshr<mode>3 (t2, op1, thirtytwo));
|
||||
emit_insn (gen_lshr<mode>3 (t3, op2, thirtytwo));
|
||||
|
||||
/* Multiply high parts by low parts. */
|
||||
emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
|
||||
(t4, gen_lowpart (<ssepackmode>mode, op1),
|
||||
gen_lowpart (<ssepackmode>mode, t3)));
|
||||
emit_insn (gen_<sse2_avx2>_umulv<ssescalarnum>si<mode>3
|
||||
(t5, gen_lowpart (<ssepackmode>mode, op2),
|
||||
gen_lowpart (<ssepackmode>mode, t2)));
|
||||
|
||||
/* Shift them back. */
|
||||
emit_insn (gen_ashl<mode>3 (t4, t4, thirtytwo));
|
||||
emit_insn (gen_ashl<mode>3 (t5, t5, thirtytwo));
|
||||
|
||||
/* Add the three parts together. */
|
||||
emit_insn (gen_add<mode>3 (t6, t1, t4));
|
||||
emit_insn (gen_add<mode>3 (op0, t6, t5));
|
||||
}
|
||||
|
||||
set_unique_reg_note (get_last_insn (), REG_EQUAL,
|
||||
gen_rtx_MULT (<MODE>mode, operands[1], operands[2]));
|
||||
ix86_expand_sse2_mulvxdi3 (operands[0], operands[1], operands[2]);
|
||||
DONE;
|
||||
})
|
||||
|
||||
|
|
Loading…
Reference in New Issue