i386.md (UNSPEC_VPERMDI): Remove.

* config/i386/i386.md (UNSPEC_VPERMDI): Remove.
	* config/i386/i386.c (ix86_expand_vec_perm): Handle
	V16QImode and V32QImode for TARGET_AVX2.
	(MAX_VECT_LEN): Increase to 32.
	(expand_vec_perm_blend): Add support for 32-byte integer
	vectors with TARGET_AVX2.
	(valid_perm_using_mode_p): New function.
	(expand_vec_perm_pshufb): Add support for 32-byte integer
	vectors with TARGET_AVX2.
	(expand_vec_perm_vpshufb2_vpermq): New function.
	(expand_vec_perm_vpshufb2_vpermq_even_odd): New function.
	(expand_vec_perm_even_odd_1): Handle 32-byte integer vectors
	with TARGET_AVX2.
	(ix86_expand_vec_perm_builtin_1): Try expand_vec_perm_vpshufb2_vpermq
	and expand_vec_perm_vpshufb2_vpermq_even_odd.
	* config/i386/sse.md (VEC_EXTRACT_EVENODD_MODE): Add for TARGET_AVX2
	32-byte integer vector modes.
	(vec_pack_trunc_<mode>): Use VI248_AVX2 instead of VI248_128.
	(avx2_interleave_highv32qi, avx2_interleave_lowv32qi): Remove pasto.
	(avx2_pshufdv3, avx2_pshuflwv3, avx2_pshufhwv3): Generate
	4 new operands.
	(avx2_pshufd_1, avx2_pshuflw_1, avx2_pshufhw_1): Don't use
	match_dup, instead add 4 new operands and require they have
	right cross-lane values.
	(avx2_permv4di): Change into define_expand.
	(avx2_permv4di_1): New instruction.
	(avx2_permv2ti): Use nonimmediate_operand instead of register_operand
	for "xm" constrained operand.
	(VEC_PERM_AVX2): Add V32QI and V16QI for TARGET_AVX2.

From-SVN: r179870
This commit is contained in:
Jakub Jelinek 2011-10-13 00:05:58 +02:00 committed by Jakub Jelinek
parent 9d901b0e8f
commit 0c7189ae2d
4 changed files with 707 additions and 118 deletions

View File

@ -1,5 +1,35 @@
2011-10-12 Jakub Jelinek <jakub@redhat.com>
* config/i386/i386.md (UNSPEC_VPERMDI): Remove.
* config/i386/i386.c (ix86_expand_vec_perm): Handle
V16QImode and V32QImode for TARGET_AVX2.
(MAX_VECT_LEN): Increase to 32.
(expand_vec_perm_blend): Add support for 32-byte integer
vectors with TARGET_AVX2.
(valid_perm_using_mode_p): New function.
(expand_vec_perm_pshufb): Add support for 32-byte integer
vectors with TARGET_AVX2.
(expand_vec_perm_vpshufb2_vpermq): New function.
(expand_vec_perm_vpshufb2_vpermq_even_odd): New function.
(expand_vec_perm_even_odd_1): Handle 32-byte integer vectors
with TARGET_AVX2.
(ix86_expand_vec_perm_builtin_1): Try expand_vec_perm_vpshufb2_vpermq
and expand_vec_perm_vpshufb2_vpermq_even_odd.
* config/i386/sse.md (VEC_EXTRACT_EVENODD_MODE): Add for TARGET_AVX2
32-byte integer vector modes.
(vec_pack_trunc_<mode>): Use VI248_AVX2 instead of VI248_128.
(avx2_interleave_highv32qi, avx2_interleave_lowv32qi): Remove pasto.
(avx2_pshufdv3, avx2_pshuflwv3, avx2_pshufhwv3): Generate
4 new operands.
(avx2_pshufd_1, avx2_pshuflw_1, avx2_pshufhw_1): Don't use
match_dup, instead add 4 new operands and require they have
right cross-lane values.
(avx2_permv4di): Change into define_expand.
(avx2_permv4di_1): New instruction.
(avx2_permv2ti): Use nonimmediate_operand instead of register_operand
for "xm" constrained operand.
(VEC_PERM_AVX2): Add V32QI and V16QI for TARGET_AVX2.
* config/i386/sse.md (avx2_gathersi<mode>,
avx2_gatherdi<mode>, avx2_gatherdi<mode>256): Add clobber of
match_scratch, change memory_operand to register_operand,

View File

@ -19334,7 +19334,7 @@ ix86_expand_vec_perm (rtx operands[])
rtx op0 = operands[1];
rtx op1 = operands[2];
rtx mask = operands[3];
rtx t1, t2, vt, vec[16];
rtx t1, t2, t3, t4, vt, vt2, vec[32];
enum machine_mode mode = GET_MODE (op0);
enum machine_mode maskmode = GET_MODE (mask);
int w, e, i;
@ -19343,50 +19343,68 @@ ix86_expand_vec_perm (rtx operands[])
/* Number of elements in the vector. */
w = GET_MODE_NUNITS (mode);
e = GET_MODE_UNIT_SIZE (mode);
gcc_assert (w <= 16);
gcc_assert (w <= 32);
if (TARGET_AVX2)
{
if (mode == V4DImode || mode == V4DFmode)
if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
{
/* Unfortunately, the VPERMQ and VPERMPD instructions only support
an constant shuffle operand. With a tiny bit of effort we can
use VPERMD instead. A re-interpretation stall for V4DFmode is
unfortunate but there's no avoiding it. */
t1 = gen_reg_rtx (V8SImode);
unfortunate but there's no avoiding it.
Similarly for V16HImode we don't have instructions for variable
shuffling, while for V32QImode we can use after preparing suitable
masks vpshufb; vpshufb; vpermq; vpor. */
if (mode == V16HImode)
{
maskmode = mode = V32QImode;
w = 32;
e = 1;
}
else
{
maskmode = mode = V8SImode;
w = 8;
e = 4;
}
t1 = gen_reg_rtx (maskmode);
/* Replicate the low bits of the V4DImode mask into V8SImode:
mask = { A B C D }
t1 = { A A B B C C D D }. */
for (i = 0; i < 4; ++i)
for (i = 0; i < w / 2; ++i)
vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec));
vt = force_reg (V8SImode, vt);
mask = gen_lowpart (V8SImode, mask);
vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
vt = force_reg (maskmode, vt);
mask = gen_lowpart (maskmode, mask);
if (maskmode == V8SImode)
emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
else
emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
/* Multiply the shuffle indicies by two. */
emit_insn (gen_avx2_lshlv8si3 (t1, t1, const1_rtx));
t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
OPTAB_DIRECT);
/* Add one to the odd shuffle indicies:
t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
for (i = 0; i < 4; ++i)
for (i = 0; i < w / 2; ++i)
{
vec[i * 2] = const0_rtx;
vec[i * 2 + 1] = const1_rtx;
}
vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec));
vt = force_const_mem (V8SImode, vt);
emit_insn (gen_addv8si3 (t1, t1, vt));
vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
vt = force_const_mem (maskmode, vt);
t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
OPTAB_DIRECT);
/* Continue as if V8SImode was used initially. */
/* Continue as if V8SImode (resp. V32QImode) was used initially. */
operands[3] = mask = t1;
target = gen_lowpart (V8SImode, target);
op0 = gen_lowpart (V8SImode, op0);
op1 = gen_lowpart (V8SImode, op1);
maskmode = mode = V8SImode;
w = 8;
e = 4;
target = gen_lowpart (mode, target);
op0 = gen_lowpart (mode, op0);
op1 = gen_lowpart (mode, op1);
}
switch (mode)
@ -19443,6 +19461,92 @@ ix86_expand_vec_perm (rtx operands[])
emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
return;
case V32QImode:
t1 = gen_reg_rtx (V32QImode);
t2 = gen_reg_rtx (V32QImode);
t3 = gen_reg_rtx (V32QImode);
vt2 = GEN_INT (128);
for (i = 0; i < 32; i++)
vec[i] = vt2;
vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
vt = force_reg (V32QImode, vt);
for (i = 0; i < 32; i++)
vec[i] = i < 16 ? vt2 : const0_rtx;
vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
vt2 = force_reg (V32QImode, vt2);
/* From mask create two adjusted masks, which contain the same
bits as mask in the low 7 bits of each vector element.
The first mask will have the most significant bit clear
if it requests element from the same 128-bit lane
and MSB set if it requests element from the other 128-bit lane.
The second mask will have the opposite values of the MSB,
and additionally will have its 128-bit lanes swapped.
E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
stands for other 12 bytes. */
/* The bit whether element is from the same lane or the other
lane is bit 4, so shift it up by 3 to the MSB position. */
emit_insn (gen_avx2_lshlv4di3 (gen_lowpart (V4DImode, t1),
gen_lowpart (V4DImode, mask),
GEN_INT (3)));
/* Clear MSB bits from the mask just in case it had them set. */
emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
/* After this t1 will have MSB set for elements from other lane. */
emit_insn (gen_xorv32qi3 (t1, t1, vt2));
/* Clear bits other than MSB. */
emit_insn (gen_andv32qi3 (t1, t1, vt));
/* Or in the lower bits from mask into t3. */
emit_insn (gen_iorv32qi3 (t3, t1, t2));
/* And invert MSB bits in t1, so MSB is set for elements from the same
lane. */
emit_insn (gen_xorv32qi3 (t1, t1, vt));
/* Swap 128-bit lanes in t3. */
emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
gen_lowpart (V4DImode, t3),
const2_rtx, GEN_INT (3),
const0_rtx, const1_rtx));
/* And or in the lower bits from mask into t1. */
emit_insn (gen_iorv32qi3 (t1, t1, t2));
if (one_operand_shuffle)
{
/* Each of these shuffles will put 0s in places where
element from the other 128-bit lane is needed, otherwise
will shuffle in the requested value. */
emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
/* For t3 the 128-bit lanes are swapped again. */
emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
gen_lowpart (V4DImode, t3),
const2_rtx, GEN_INT (3),
const0_rtx, const1_rtx));
/* And oring both together leads to the result. */
emit_insn (gen_iorv32qi3 (target, t1, t3));
return;
}
t4 = gen_reg_rtx (V32QImode);
/* Similarly to the above one_operand_shuffle code,
just for repeated twice for each operand. merge_two:
code will merge the two results together. */
emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
gen_lowpart (V4DImode, t4),
const2_rtx, GEN_INT (3),
const0_rtx, const1_rtx));
emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
gen_lowpart (V4DImode, t3),
const2_rtx, GEN_INT (3),
const0_rtx, const1_rtx));
emit_insn (gen_iorv32qi3 (t4, t2, t4));
emit_insn (gen_iorv32qi3 (t3, t1, t3));
t1 = t4;
t2 = t3;
goto merge_two;
default:
gcc_assert (GET_MODE_SIZE (mode) <= 16);
break;
@ -31773,9 +31877,9 @@ x86_emit_floatuns (rtx operands[2])
emit_label (donelab);
}
/* AVX does not support 32-byte integer vector operations,
thus the longest vector we are faced with is V16QImode. */
#define MAX_VECT_LEN 16
/* AVX2 does support 32-byte integer vector operations,
thus the longest vector we are faced with is V32QImode. */
#define MAX_VECT_LEN 32
struct expand_vec_perm_d
{
@ -34582,7 +34686,7 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb. */
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
@ -34590,10 +34694,17 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
enum machine_mode vmode = d->vmode;
unsigned i, mask, nelt = d->nelt;
rtx target, op0, op1, x;
rtx rperm[32], vperm;
if (!TARGET_SSE4_1 || d->op0 == d->op1)
if (d->op0 == d->op1)
return false;
if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode))
if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
;
else
return false;
/* This is a blend, not a permute. Elements must stay in their
@ -34611,30 +34722,6 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
/* ??? Without SSE4.1, we could implement this with and/andn/or. This
decision should be extracted elsewhere, so that we only try that
sequence once all budget==3 options have been tried. */
/* For bytes, see if bytes move in pairs so we can use pblendw with
an immediate argument, rather than pblendvb with a vector argument. */
if (vmode == V16QImode)
{
bool pblendw_ok = true;
for (i = 0; i < 16 && pblendw_ok; i += 2)
pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]);
if (!pblendw_ok)
{
rtx rperm[16], vperm;
for (i = 0; i < nelt; ++i)
rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
vperm = force_reg (V16QImode, vperm);
emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm));
return true;
}
}
target = d->target;
op0 = d->op0;
op1 = d->op1;
@ -34647,6 +34734,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
case V2DFmode:
case V4SFmode:
case V8HImode:
case V8SImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@ -34654,24 +34742,122 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
case V2DImode:
for (i = 0; i < 2; ++i)
mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
vmode = V8HImode;
goto do_subreg;
case V4SImode:
for (i = 0; i < 4; ++i)
mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
vmode = V8HImode;
goto do_subreg;
case V16QImode:
/* See if bytes move in pairs so we can use pblendw with
an immediate argument, rather than pblendvb with a vector
argument. */
for (i = 0; i < 16; i += 2)
if (d->perm[i] + 1 != d->perm[i + 1])
{
use_pblendvb:
for (i = 0; i < nelt; ++i)
rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
finish_pblendvb:
vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
vperm = force_reg (vmode, vperm);
if (GET_MODE_SIZE (vmode) == 16)
emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
else
emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
return true;
}
for (i = 0; i < 8; ++i)
mask |= (d->perm[i * 2] >= 16) << i;
vmode = V8HImode;
/* FALLTHRU */
do_subreg:
vmode = V8HImode;
target = gen_lowpart (vmode, target);
op0 = gen_lowpart (vmode, op0);
op1 = gen_lowpart (vmode, op1);
break;
case V32QImode:
/* See if bytes move in pairs. If not, vpblendvb must be used. */
for (i = 0; i < 32; i += 2)
if (d->perm[i] + 1 != d->perm[i + 1])
goto use_pblendvb;
/* See if bytes move in quadruplets. If yes, vpblendd
with immediate can be used. */
for (i = 0; i < 32; i += 4)
if (d->perm[i] + 2 != d->perm[i + 2])
break;
if (i < 32)
{
/* See if bytes move the same in both lanes. If yes,
vpblendw with immediate can be used. */
for (i = 0; i < 16; i += 2)
if (d->perm[i] + 16 != d->perm[i + 16])
goto use_pblendvb;
/* Use vpblendw. */
for (i = 0; i < 16; ++i)
mask |= (d->perm[i * 2] >= 32) << i;
vmode = V16HImode;
goto do_subreg;
}
/* Use vpblendd. */
for (i = 0; i < 8; ++i)
mask |= (d->perm[i * 4] >= 32) << i;
vmode = V8SImode;
goto do_subreg;
case V16HImode:
/* See if words move in pairs. If yes, vpblendd can be used. */
for (i = 0; i < 16; i += 2)
if (d->perm[i] + 1 != d->perm[i + 1])
break;
if (i < 16)
{
/* See if words move the same in both lanes. If not,
vpblendvb must be used. */
for (i = 0; i < 8; i++)
if (d->perm[i] + 8 != d->perm[i + 8])
{
/* Use vpblendvb. */
for (i = 0; i < 32; ++i)
rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
vmode = V32QImode;
nelt = 32;
target = gen_lowpart (vmode, target);
op0 = gen_lowpart (vmode, op0);
op1 = gen_lowpart (vmode, op1);
goto finish_pblendvb;
}
/* Use vpblendw. */
for (i = 0; i < 16; ++i)
mask |= (d->perm[i] >= 16) << i;
break;
}
/* Use vpblendd. */
for (i = 0; i < 8; ++i)
mask |= (d->perm[i * 2] >= 16) << i;
vmode = V8SImode;
goto do_subreg;
case V4DImode:
/* Use vpblendd. */
for (i = 0; i < 4; ++i)
mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
vmode = V8SImode;
goto do_subreg;
default:
gcc_unreachable ();
}
@ -34732,43 +34918,164 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
return true;
}
/* Return true if permutation D can be performed as VMODE permutation
instead. */
static bool
valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
{
unsigned int i, j, chunk;
if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
|| GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
|| GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
return false;
if (GET_MODE_NUNITS (vmode) >= d->nelt)
return true;
chunk = d->nelt / GET_MODE_NUNITS (vmode);
for (i = 0; i < d->nelt; i += chunk)
if (d->perm[i] & (chunk - 1))
return false;
else
for (j = 1; j < chunk; ++j)
if ((d->perm[i] & (d->nelt - 1)) + j
!= (d->perm[i + j] & (d->nelt - 1)))
return false;
return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
in terms of pshufb or vpperm. */
in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
unsigned i, nelt, eltsz;
rtx rperm[16], vperm, target, op0, op1;
unsigned i, nelt, eltsz, mask;
unsigned char perm[32];
enum machine_mode vmode = V16QImode;
rtx rperm[32], vperm, target, op0, op1;
if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP))
nelt = d->nelt;
if (d->op0 != d->op1)
{
if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
{
if (TARGET_AVX2
&& valid_perm_using_mode_p (V2TImode, d))
{
if (d->testing_p)
return true;
/* Use vperm2i128 insn. The pattern uses
V4DImode instead of V2TImode. */
target = gen_lowpart (V4DImode, d->target);
op0 = gen_lowpart (V4DImode, d->op0);
op1 = gen_lowpart (V4DImode, d->op1);
rperm[0]
= GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
|| ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
return true;
}
return false;
if (GET_MODE_SIZE (d->vmode) != 16)
}
}
else
{
if (GET_MODE_SIZE (d->vmode) == 16)
{
if (!TARGET_SSSE3)
return false;
}
else if (GET_MODE_SIZE (d->vmode) == 32)
{
if (!TARGET_AVX2)
return false;
/* V4DImode should be already handled through
expand_vselect by vpermq instruction. */
gcc_assert (d->vmode != V4DImode);
vmode = V32QImode;
if (d->vmode == V8SImode
|| d->vmode == V16HImode
|| d->vmode == V32QImode)
{
/* First see if vpermq can be used for
V8SImode/V16HImode/V32QImode. */
if (valid_perm_using_mode_p (V4DImode, d))
{
for (i = 0; i < 4; i++)
perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
if (d->testing_p)
return true;
return expand_vselect (gen_lowpart (V4DImode, d->target),
gen_lowpart (V4DImode, d->op0),
perm, 4);
}
/* Next see if vpermd can be used. */
if (valid_perm_using_mode_p (V8SImode, d))
vmode = V8SImode;
}
if (vmode == V32QImode)
{
/* vpshufb only works intra lanes, it is not
possible to shuffle bytes in between the lanes. */
for (i = 0; i < nelt; ++i)
if ((d->perm[i] ^ i) & (nelt / 2))
return false;
}
}
else
return false;
}
if (d->testing_p)
return true;
nelt = d->nelt;
if (vmode == V8SImode)
for (i = 0; i < 8; ++i)
rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
else
{
eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
if (d->op0 != d->op1)
mask = 2 * nelt - 1;
else if (vmode == V16QImode)
mask = nelt - 1;
else
mask = nelt / 2 - 1;
for (i = 0; i < nelt; ++i)
{
unsigned j, e = d->perm[i];
unsigned j, e = d->perm[i] & mask;
for (j = 0; j < eltsz; ++j)
rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
}
}
vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
vperm = force_reg (V16QImode, vperm);
vperm = gen_rtx_CONST_VECTOR (vmode,
gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
vperm = force_reg (vmode, vperm);
target = gen_lowpart (V16QImode, d->target);
op0 = gen_lowpart (V16QImode, d->op0);
target = gen_lowpart (vmode, d->target);
op0 = gen_lowpart (vmode, d->op0);
if (d->op0 == d->op1)
{
if (vmode == V16QImode)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
}
else
{
op1 = gen_lowpart (V16QImode, d->op1);
op1 = gen_lowpart (vmode, d->op1);
emit_insn (gen_xop_pperm (target, op0, op1, vperm));
}
@ -34856,7 +35163,8 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_vpermil (d))
return true;
/* Try the SSSE3 pshufb or XOP vpperm variable permutation. */
/* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
vpshufb, vpermd or vpermq variable permutation. */
if (expand_vec_perm_pshufb (d))
return true;
@ -35156,6 +35464,150 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
return true;
}
/* Implement arbitrary permutation of one V32QImode and V16QImode operand
with two vpshufb insns, vpermq and vpor. We should have already failed
all two or three instruction sequences. */
static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
rtx rperm[2][32], vperm, l, h, hp, op, m128;
unsigned int i, nelt, eltsz;
if (!TARGET_AVX2
|| d->op0 != d->op1
|| (d->vmode != V32QImode && d->vmode != V16HImode))
return false;
nelt = d->nelt;
eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
/* Generate two permutation masks. If the required element is within
the same lane, it is shuffled in. If the required element from the
other lane, force a zero by setting bit 7 in the permutation mask.
In the other mask the mask has non-negative elements if element
is requested from the other lane, but also moved to the other lane,
so that the result of vpshufb can have the two V2TImode halves
swapped. */
m128 = GEN_INT (-128);
for (i = 0; i < nelt; ++i)
{
unsigned j, e = d->perm[i] & (nelt / 2 - 1);
unsigned which = ((d->perm[i] ^ i) & (nelt / 2));
for (j = 0; j < eltsz; ++j)
{
rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
rperm[!which][(i * eltsz + j) ^ (which ^ (nelt / 2))] = m128;
}
}
vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
vperm = force_reg (V32QImode, vperm);
h = gen_reg_rtx (V32QImode);
op = gen_lowpart (V32QImode, d->op0);
emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
/* Swap the 128-byte lanes of h into hp. */
hp = gen_reg_rtx (V32QImode);
op = gen_lowpart (V4DImode, h);
emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, hp), op,
const2_rtx, GEN_INT (3), const0_rtx,
const1_rtx));
vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
vperm = force_reg (V32QImode, vperm);
l = gen_reg_rtx (V32QImode);
op = gen_lowpart (V32QImode, d->op0);
emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
op = gen_lowpart (V32QImode, d->target);
emit_insn (gen_iorv32qi3 (op, l, hp));
return true;
}
/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
and extract-odd permutations of two V32QImode and V16QImode operand
with two vpshufb insns, vpor and vpermq. We should have already
failed all two or three instruction sequences. */
static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
rtx rperm[2][32], vperm, l, h, ior, op, m128;
unsigned int i, nelt, eltsz;
if (!TARGET_AVX2
|| d->op0 == d->op1
|| (d->vmode != V32QImode && d->vmode != V16HImode))
return false;
for (i = 0; i < d->nelt; ++i)
if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
return false;
if (d->testing_p)
return true;
nelt = d->nelt;
eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
/* Generate two permutation masks. In the first permutation mask
the first quarter will contain indexes for the first half
of the op0, the second quarter will contain bit 7 set, third quarter
will contain indexes for the second half of the op0 and the
last quarter bit 7 set. In the second permutation mask
the first quarter will contain bit 7 set, the second quarter
indexes for the first half of the op1, the third quarter bit 7 set
and last quarter indexes for the second half of the op1.
I.e. the first mask e.g. for V32QImode extract even will be:
0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
(all values masked with 0xf except for -128) and second mask
for extract even will be
-128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
m128 = GEN_INT (-128);
for (i = 0; i < nelt; ++i)
{
unsigned j, e = d->perm[i] & (nelt / 2 - 1);
unsigned which = d->perm[i] >= nelt;
unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
for (j = 0; j < eltsz; ++j)
{
rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
}
}
vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
vperm = force_reg (V32QImode, vperm);
l = gen_reg_rtx (V32QImode);
op = gen_lowpart (V32QImode, d->op0);
emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
vperm = force_reg (V32QImode, vperm);
h = gen_reg_rtx (V32QImode);
op = gen_lowpart (V32QImode, d->op0);
emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
ior = gen_reg_rtx (V32QImode);
emit_insn (gen_iorv32qi3 (ior, l, h));
/* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
op = gen_lowpart (V4DImode, d->target);
ior = gen_lowpart (V4DImode, ior);
emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
and extract-odd permutations. */
@ -35265,6 +35717,61 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
}
break;
case V16HImode:
case V32QImode:
return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
case V4DImode:
t1 = gen_reg_rtx (V4DImode);
t2 = gen_reg_rtx (V4DImode);
/* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
/* Now an vpunpck[lh]qdq will produce the result required. */
if (odd)
t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
else
t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
emit_insn (t3);
break;
case V8SImode:
t1 = gen_reg_rtx (V8SImode);
t2 = gen_reg_rtx (V8SImode);
/* Shuffle the lanes around into
{ 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
gen_lowpart (V4DImode, d->op0),
gen_lowpart (V4DImode, d->op1),
GEN_INT (0x20)));
emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
gen_lowpart (V4DImode, d->op0),
gen_lowpart (V4DImode, d->op1),
GEN_INT (0x31)));
/* Swap the 2nd and 3rd position in each lane into
{ 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
emit_insn (gen_avx2_pshufdv3 (t1, t1,
GEN_INT (2 * 2 + 1 * 16 + 3 * 64)));
emit_insn (gen_avx2_pshufdv3 (t2, t2,
GEN_INT (2 * 2 + 1 * 16 + 3 * 64)));
/* Now an vpunpck[lh]qdq will produce
{ 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
if (odd)
t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
gen_lowpart (V4DImode, t1),
gen_lowpart (V4DImode, t2));
else
t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
gen_lowpart (V4DImode, t1),
gen_lowpart (V4DImode, t2));
emit_insn (t3);
break;
default:
gcc_unreachable ();
}
@ -35399,6 +35906,14 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_pshufb2 (d))
return true;
/* Try sequences of four instructions. */
if (expand_vec_perm_vpshufb2_vpermq (d))
return true;
if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
return true;
/* ??? Look for narrow permutations whose element orderings would
allow the promotion to a wider mode. */

View File

@ -235,7 +235,6 @@
UNSPEC_VPERMSI
UNSPEC_VPERMDF
UNSPEC_VPERMSF
UNSPEC_VPERMDI
UNSPEC_VPERMTI
UNSPEC_GATHER

View File

@ -4330,10 +4330,10 @@
;; Modes handled by vec_extract_even/odd pattern.
(define_mode_iterator VEC_EXTRACT_EVENODD_MODE
[(V16QI "TARGET_SSE2")
(V8HI "TARGET_SSE2")
(V4SI "TARGET_SSE2")
(V2DI "TARGET_SSE2")
[(V32QI "TARGET_AVX2") (V16QI "TARGET_SSE2")
(V16HI "TARGET_AVX2") (V8HI "TARGET_SSE2")
(V8SI "TARGET_AVX2") (V4SI "TARGET_SSE2")
(V4DI "TARGET_AVX2") (V2DI "TARGET_SSE2")
(V8SF "TARGET_AVX") V4SF
(V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
@ -6196,11 +6196,9 @@
DONE;
})
;; ??? Irritatingly, the 256-bit VPSHUFB only shuffles within the 128-bit
;; lanes. For now, we don't try to support V32QI or V16HImode. So we
;; don't want to use VI_AVX2.
(define_mode_iterator VEC_PERM_AVX2
[V16QI V8HI V4SI V2DI V4SF V2DF
(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")])
@ -6431,8 +6429,8 @@
(define_expand "vec_pack_trunc_<mode>"
[(match_operand:<ssepackmode> 0 "register_operand" "")
(match_operand:VI248_128 1 "register_operand" "")
(match_operand:VI248_128 2 "register_operand" "")]
(match_operand:VI248_AVX2 1 "register_operand" "")
(match_operand:VI248_AVX2 2 "register_operand" "")]
"TARGET_SSE2"
{
rtx op1 = gen_lowpart (<ssepackmode>mode, operands[1]);
@ -6513,8 +6511,7 @@
(const_int 28) (const_int 60)
(const_int 29) (const_int 61)
(const_int 30) (const_int 62)
(const_int 31) (const_int 63)
(const_int 32) (const_int 64)])))]
(const_int 31) (const_int 63)])))]
"TARGET_AVX2"
"vpunpckhbw\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sselog")
@ -6559,7 +6556,6 @@
(const_int 5) (const_int 37)
(const_int 6) (const_int 38)
(const_int 7) (const_int 39)
(const_int 15) (const_int 47)
(const_int 16) (const_int 48)
(const_int 17) (const_int 49)
(const_int 18) (const_int 50)
@ -6919,7 +6915,11 @@
GEN_INT ((mask >> 0) & 3),
GEN_INT ((mask >> 2) & 3),
GEN_INT ((mask >> 4) & 3),
GEN_INT ((mask >> 6) & 3)));
GEN_INT ((mask >> 6) & 3),
GEN_INT (((mask >> 0) & 3) + 4),
GEN_INT (((mask >> 2) & 3) + 4),
GEN_INT (((mask >> 4) & 3) + 4),
GEN_INT (((mask >> 6) & 3) + 4)));
DONE;
})
@ -6931,11 +6931,15 @@
(match_operand 3 "const_0_to_3_operand" "")
(match_operand 4 "const_0_to_3_operand" "")
(match_operand 5 "const_0_to_3_operand" "")
(match_dup 2)
(match_dup 3)
(match_dup 4)
(match_dup 5)])))]
"TARGET_AVX2"
(match_operand 6 "const_4_to_7_operand" "")
(match_operand 7 "const_4_to_7_operand" "")
(match_operand 8 "const_4_to_7_operand" "")
(match_operand 9 "const_4_to_7_operand" "")])))]
"TARGET_AVX2
&& INTVAL (operands[2]) + 4 == INTVAL (operands[6])
&& INTVAL (operands[3]) + 4 == INTVAL (operands[7])
&& INTVAL (operands[4]) + 4 == INTVAL (operands[8])
&& INTVAL (operands[5]) + 4 == INTVAL (operands[9])"
{
int mask = 0;
mask |= INTVAL (operands[2]) << 0;
@ -7002,7 +7006,11 @@
GEN_INT ((mask >> 0) & 3),
GEN_INT ((mask >> 2) & 3),
GEN_INT ((mask >> 4) & 3),
GEN_INT ((mask >> 6) & 3)));
GEN_INT ((mask >> 6) & 3),
GEN_INT (((mask >> 0) & 3) + 8),
GEN_INT (((mask >> 2) & 3) + 8),
GEN_INT (((mask >> 4) & 3) + 8),
GEN_INT (((mask >> 6) & 3) + 8)));
DONE;
})
@ -7018,15 +7026,19 @@
(const_int 5)
(const_int 6)
(const_int 7)
(match_dup 2)
(match_dup 3)
(match_dup 4)
(match_dup 5)
(match_operand 6 "const_8_to_11_operand" "")
(match_operand 7 "const_8_to_11_operand" "")
(match_operand 8 "const_8_to_11_operand" "")
(match_operand 9 "const_8_to_11_operand" "")
(const_int 12)
(const_int 13)
(const_int 14)
(const_int 15)])))]
"TARGET_AVX2"
"TARGET_AVX2
&& INTVAL (operands[2]) + 8 == INTVAL (operands[6])
&& INTVAL (operands[3]) + 8 == INTVAL (operands[7])
&& INTVAL (operands[4]) + 8 == INTVAL (operands[8])
&& INTVAL (operands[5]) + 8 == INTVAL (operands[9])"
{
int mask = 0;
mask |= INTVAL (operands[2]) << 0;
@ -7098,7 +7110,11 @@
GEN_INT (((mask >> 0) & 3) + 4),
GEN_INT (((mask >> 2) & 3) + 4),
GEN_INT (((mask >> 4) & 3) + 4),
GEN_INT (((mask >> 6) & 3) + 4)));
GEN_INT (((mask >> 6) & 3) + 4),
GEN_INT (((mask >> 0) & 3) + 12),
GEN_INT (((mask >> 2) & 3) + 12),
GEN_INT (((mask >> 4) & 3) + 12),
GEN_INT (((mask >> 6) & 3) + 12)));
DONE;
})
@ -7118,11 +7134,15 @@
(const_int 9)
(const_int 10)
(const_int 11)
(match_dup 2)
(match_dup 3)
(match_dup 4)
(match_dup 5)])))]
"TARGET_AVX2"
(match_operand 6 "const_12_to_15_operand" "")
(match_operand 7 "const_12_to_15_operand" "")
(match_operand 8 "const_12_to_15_operand" "")
(match_operand 9 "const_12_to_15_operand" "")])))]
"TARGET_AVX2
&& INTVAL (operands[2]) + 8 == INTVAL (operands[6])
&& INTVAL (operands[3]) + 8 == INTVAL (operands[7])
&& INTVAL (operands[4]) + 8 == INTVAL (operands[8])
&& INTVAL (operands[5]) + 8 == INTVAL (operands[9])"
{
int mask = 0;
mask |= (INTVAL (operands[2]) - 4) << 0;
@ -11526,14 +11546,39 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
(define_insn "avx2_permv4di"
[(set (match_operand:V4DI 0 "register_operand" "=x")
(unspec:V4DI
[(match_operand:V4DI 1 "register_operand" "xm")
(match_operand:SI 2 "const_0_to_255_operand" "n")]
UNSPEC_VPERMDI))]
(define_expand "avx2_permv4di"
[(match_operand:V4DI 0 "register_operand" "")
(match_operand:V4DI 1 "nonimmediate_operand" "")
(match_operand:SI 2 "const_0_to_255_operand" "")]
"TARGET_AVX2"
"vpermq\t{%2, %1, %0|%0, %1, %2}"
{
int mask = INTVAL (operands[2]);
emit_insn (gen_avx2_permv4di_1 (operands[0], operands[1],
GEN_INT ((mask >> 0) & 3),
GEN_INT ((mask >> 2) & 3),
GEN_INT ((mask >> 4) & 3),
GEN_INT ((mask >> 6) & 3)));
DONE;
})
(define_insn "avx2_permv4di_1"
[(set (match_operand:V4DI 0 "register_operand" "=x")
(vec_select:V4DI
(match_operand:V4DI 1 "nonimmediate_operand" "xm")
(parallel [(match_operand 2 "const_0_to_3_operand" "")
(match_operand 3 "const_0_to_3_operand" "")
(match_operand 4 "const_0_to_3_operand" "")
(match_operand 5 "const_0_to_3_operand" "")])))]
"TARGET_AVX2"
{
int mask = 0;
mask |= INTVAL (operands[2]) << 0;
mask |= INTVAL (operands[3]) << 2;
mask |= INTVAL (operands[4]) << 4;
mask |= INTVAL (operands[5]) << 6;
operands[2] = GEN_INT (mask);
return "vpermq\t{%2, %1, %0|%0, %1, %2}";
}
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
@ -11542,7 +11587,7 @@
[(set (match_operand:V4DI 0 "register_operand" "=x")
(unspec:V4DI
[(match_operand:V4DI 1 "register_operand" "x")
(match_operand:V4DI 2 "register_operand" "xm")
(match_operand:V4DI 2 "nonimmediate_operand" "xm")
(match_operand:SI 3 "const_0_to_255_operand" "n")]
UNSPEC_VPERMTI))]
"TARGET_AVX2"