i386: Implement widen_smul_*_v4si for plain sse2

If we don't implement this pattern, the vectorizer is happy to
unpack the v4si and use the full mulv2di3.  This results in
more element shuffling than is required.
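
For illustration only (this example is not part of the commit): the kind of loop
affected is a signed 32- to 64-bit widening multiply such as the hypothetical
function below, which with plain -msse2 can now be vectorized through the
even/odd widening patterns instead of unpacking each v4si and using mulv2di3.

    /* Hypothetical example -- with -O3 -msse2 the widening multiply can now
       use vec_widen_smult_{even,odd}_v4si rather than a full V2DI multiply
       after unpacking.  */
    void
    widen_mul (long long *restrict c, const int *restrict a,
               const int *restrict b, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        c[i] = (long long) a[i] * b[i];
    }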

        * config/i386/i386.c (bdesc_args): Update.  Change
        IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI to OPTION_MASK_ISA_SSE2.
        (IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI): New.
        (ix86_builtin_mul_widen_even): Use it.
        (ix86_builtin_mul_widen_odd): Relax SMUL_ODD from sse4 to sse2.
        (ix86_expand_mul_widen_evenodd): Handle signed for sse2.
        * config/i386/sse.md (vec_widen_<s>mult_hi_<V124_AVX2>): Allow
        for all SSE2.
        (vec_widen_<s>mult_lo_<V124_AVX2>): Likewise.
        (vec_widen_<s>mult_odd_<VI4_AVX2>): Likewise.  Relax from V124_AVX2.
        (vec_widen_smult_even_v4si): New.

From-SVN: r189006
Richard Henderson, 2012-06-26 20:19:59 -07:00 (committed by Richard Henderson)
parent 298301d9e7
commit 87b7851620
3 changed files with 79 additions and 41 deletions

gcc/ChangeLog

@@ -1,3 +1,17 @@
+2012-06-26  Richard Henderson  <rth@redhat.com>
+
+        * config/i386/i386.c (bdesc_args): Update.  Change
+        IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI to OPTION_MASK_ISA_SSE2.
+        (IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI): New.
+        (ix86_builtin_mul_widen_even): Use it.
+        (ix86_builtin_mul_widen_odd): Relax SMUL_ODD from sse4 to sse2.
+        (ix86_expand_mul_widen_evenodd): Handle signed for sse2.
+        * config/i386/sse.md (vec_widen_<s>mult_hi_<V124_AVX2>): Allow
+        for all SSE2.
+        (vec_widen_<s>mult_lo_<V124_AVX2>): Likewise.
+        (vec_widen_<s>mult_odd_<VI4_AVX2>): Likewise.  Relax from V124_AVX2.
+        (vec_widen_smult_even_v4si): New.
+
 2012-06-26  Richard Henderson  <rth@redhat.com>
 
         * config/i386/sse.md (mul<VI8_AVX2>3): Change from insn_and_split

gcc/config/i386/i386.c

@@ -25758,6 +25758,7 @@ enum ix86_builtins
   IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI,
+  IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI,
@@ -26620,7 +26621,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_vw_umul_even_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_smult_even_v4si, "__builtin_ia32_vw_smul_even_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_odd_v4si, "__builtin_ia32_vw_umul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
@@ -26747,7 +26750,6 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
-  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
 
 /* SSE4.1 */
@@ -31067,18 +31069,10 @@ ix86_builtin_mul_widen_even (tree type)
   switch (TYPE_MODE (type))
     {
     case V4SImode:
-      if (uns_p)
-        {
-          if (!TARGET_SSE2)
-            return NULL;
-          code = IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI;
-        }
-      else
-        {
-          if (!TARGET_SSE4_1)
-            return NULL;
-          code = IX86_BUILTIN_PMULDQ128;
-        }
+      if (!TARGET_SSE2)
+        return NULL;
+      code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI
+              : IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI);
       break;
     case V8SImode:
@@ -31103,18 +31097,10 @@ ix86_builtin_mul_widen_odd (tree type)
   switch (TYPE_MODE (type))
     {
     case V4SImode:
-      if (uns_p)
-        {
-          if (!TARGET_SSE2)
-            return NULL;
-          code = IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI;
-        }
-      else
-        {
-          if (!TARGET_SSE4_1)
-            return NULL;
-          code = IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI;
-        }
+      if (!TARGET_SSE2)
+        return NULL;
+      code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI
+              : IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI);
       break;
     case V8SImode:
@@ -38774,12 +38760,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
           emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
           return;
         }
+      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
       op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
-                          GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
-                          1, OPTAB_DIRECT);
+                          x, NULL, 1, OPTAB_DIRECT);
       op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
-                          GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
-                          1, OPTAB_DIRECT);
+                          x, NULL, 1, OPTAB_DIRECT);
       op1 = gen_lowpart (mode, op1);
       op2 = gen_lowpart (mode, op2);
     }
@@ -38801,7 +38787,38 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
         x = gen_xop_pmacsdql (dest, op1, op2, x);
     }
   else
-    gcc_unreachable ();
+    {
+      rtx s1, s2, t0, t1, t2;
+
+      /* The easiest way to implement this without PMULDQ is to go through
+         the motions as if we are performing a full 64-bit multiply.  With
+         the exception that we need to do less shuffling of the elements.  */
+
+      /* Compute the sign-extension, aka highparts, of the two operands.  */
+      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+                                op1, pc_rtx, pc_rtx);
+      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+                                op2, pc_rtx, pc_rtx);
+
+      /* Multiply LO(A) * HI(B), and vice-versa.  */
+      t1 = gen_reg_rtx (wmode);
+      t2 = gen_reg_rtx (wmode);
+      emit_insn (gen_sse2_umulv2siv2di3 (t1, s1, op2));
+      emit_insn (gen_sse2_umulv2siv2di3 (t2, s2, op1));
+
+      /* Multiply LO(A) * LO(B).  */
+      t0 = gen_reg_rtx (wmode);
+      emit_insn (gen_sse2_umulv2siv2di3 (t0, op1, op2));
+
+      /* Combine and shift the highparts into place.  */
+      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
+      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
+                         1, OPTAB_DIRECT);
+
+      /* Combine high and low parts.  */
+      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
+      return;
+    }
+
   emit_insn (x);
 }
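
The block added above is the vector form of a simple scalar identity; as an
illustrative sketch (not code from the commit, and with hypothetical names),
the same computation on one element pair looks like this, with the sign mask
acting as the highpart of each sign-extended operand and unsigned 32x32->64
products standing in for pmuludq:

    #include <stdint.h>

    /* Scalar model of the SSE2 fallback: build the signed 64-bit product
       from unsigned 32x32->64 partial products.  */
    static int64_t
    smul_widen_model (int32_t a, int32_t b)
    {
      uint32_t hi_a = a < 0 ? 0xffffffffu : 0;  /* highpart of sign-extended a */
      uint32_t hi_b = b < 0 ? 0xffffffffu : 0;  /* highpart of sign-extended b */
      uint64_t lo = (uint64_t) (uint32_t) a * (uint32_t) b;   /* LO(A) * LO(B) */
      uint64_t cross = (uint64_t) hi_a * (uint32_t) b         /* HI(A) * LO(B) */
                       + (uint64_t) hi_b * (uint32_t) a;      /* HI(B) * LO(A) */
      return (int64_t) (lo + (cross << 32));  /* combine modulo 2^64 */
    }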

gcc/config/i386/sse.md

@@ -5607,9 +5607,7 @@
    (any_extend:<sseunpackmode>
      (match_operand:VI124_AVX2 1 "register_operand"))
    (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
                               <u_bool>, true);
@@ -5621,23 +5619,32 @@
    (any_extend:<sseunpackmode>
      (match_operand:VI124_AVX2 1 "register_operand"))
    (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
                               <u_bool>, false);
   DONE;
 })
 
+;; Most widen_<s>mult_even_<mode> can be handled directly from other
+;; named patterns, but signed V4SI needs special help for plain SSE2.
+(define_expand "vec_widen_smult_even_v4si"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand")
+   (match_operand:V4SI 2 "register_operand")]
+  "TARGET_SSE2"
+{
+  ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
+                                 false, false);
+  DONE;
+})
+
 (define_expand "vec_widen_<s>mult_odd_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
    (any_extend:<sseunpackmode>
-     (match_operand:VI124_AVX2 1 "register_operand"))
-   (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_AVX || TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+     (match_operand:VI4_AVX2 1 "register_operand"))
+   (match_operand:VI4_AVX2 2 "register_operand")]
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
                                  <u_bool>, true);
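
To make the new expander concrete, an illustrative SSE2 intrinsics sketch of
the sequence it ultimately relies on (not code from this commit; the function
name is hypothetical) is:

    #include <emmintrin.h>  /* SSE2 */

    /* Signed widening multiply of the even V4SI elements on plain SSE2,
       mirroring ix86_expand_mul_widen_evenodd: sign masks serve as the
       highparts, pmuludq provides the unsigned 32x32->64 products.  */
    static __m128i
    smul_even_v4si (__m128i a, __m128i b)
    {
      __m128i zero = _mm_setzero_si128 ();
      __m128i sa = _mm_cmpgt_epi32 (zero, a);   /* all-ones where a < 0 */
      __m128i sb = _mm_cmpgt_epi32 (zero, b);   /* all-ones where b < 0 */
      __m128i lo = _mm_mul_epu32 (a, b);        /* LO(A) * LO(B) */
      __m128i c1 = _mm_mul_epu32 (sa, b);       /* HI(A) * LO(B) */
      __m128i c2 = _mm_mul_epu32 (sb, a);       /* HI(B) * LO(A) */
      __m128i hi = _mm_slli_epi64 (_mm_add_epi64 (c1, c2), 32);
      return _mm_add_epi64 (lo, hi);            /* signed 64-bit products */
    }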