sse.md (vec_interleave_high<mode>, [...]): Add AVX2 expanders for VI_256 modes.

* config/i386/sse.md (vec_interleave_high<mode>,
	vec_interleave_low<mode>): Add AVX2 expanders for VI_256
	modes.
	* config/i386/i386.c (expand_vec_perm_interleave3): New function.
	(ix86_expand_vec_perm_builtin_1): Call it.

From-SVN: r179995
This commit is contained in:
Jakub Jelinek 2011-10-14 18:55:25 +02:00 committed by Jakub Jelinek
parent a08147527b
commit 2e2accf8ae
3 changed files with 119 additions and 0 deletions

View File

@ -1,3 +1,11 @@
2011-10-14 Jakub Jelinek <jakub@redhat.com>
* config/i386/sse.md (vec_interleave_high<mode>,
vec_interleave_low<mode>): Add AVX2 expanders for VI_256
modes.
* config/i386/i386.c (expand_vec_perm_interleave3): New function.
(ix86_expand_vec_perm_builtin_1): Call it.
2011-10-14 Georg-Johann Lay <avr@gjlay.de>
Fix thinko from r179765

View File

@ -35474,6 +35474,82 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
a two vector permutation using 2 intra-lane interleave insns
and cross-lane shuffle for 32-byte vectors. */
static bool
expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
{
unsigned i, nelt;
rtx (*gen) (rtx, rtx, rtx);
if (d->op0 == d->op1)
return false;
if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
;
else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
;
else
return false;
nelt = d->nelt;
if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
return false;
for (i = 0; i < nelt; i += 2)
if (d->perm[i] != d->perm[0] + i / 2
|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
return false;
if (d->testing_p)
return true;
switch (d->vmode)
{
case V32QImode:
if (d->perm[0])
gen = gen_vec_interleave_highv32qi;
else
gen = gen_vec_interleave_lowv32qi;
break;
case V16HImode:
if (d->perm[0])
gen = gen_vec_interleave_highv16hi;
else
gen = gen_vec_interleave_lowv16hi;
break;
case V8SImode:
if (d->perm[0])
gen = gen_vec_interleave_highv8si;
else
gen = gen_vec_interleave_lowv8si;
break;
case V4DImode:
if (d->perm[0])
gen = gen_vec_interleave_highv4di;
else
gen = gen_vec_interleave_lowv4di;
break;
case V8SFmode:
if (d->perm[0])
gen = gen_vec_interleave_highv8sf;
else
gen = gen_vec_interleave_lowv8sf;
break;
case V4DFmode:
if (d->perm[0])
gen = gen_vec_interleave_highv4df;
else
gen = gen_vec_interleave_lowv4df;
break;
default:
gcc_unreachable ();
}
emit_insn (gen (d->target, d->op0, d->op1));
return true;
}
/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
permutation with two pshufb insns and an ior. We should have already
failed all two instruction sequences. */
@ -35972,6 +36048,9 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_pshufb2 (d))
return true;
if (expand_vec_perm_interleave3 (d))
return true;
/* Try sequences of four instructions. */
if (expand_vec_perm_vpshufb2_vpermq (d))

View File

@ -6848,6 +6848,38 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "TI")])
;; Full-width interleave of the high halves of two 256-bit integer
;; vectors: run both intra-lane interleaves into temporaries, then
;; combine one 128-bit lane from each with a single permv2ti.
(define_expand "vec_interleave_high<mode>"
  [(match_operand:VI_256 0 "register_operand" "=x")
   (match_operand:VI_256 1 "register_operand" "x")
   (match_operand:VI_256 2 "nonimmediate_operand" "xm")]
  "TARGET_AVX2"
{
  rtx lo = gen_reg_rtx (<MODE>mode);
  rtx hi = gen_reg_rtx (<MODE>mode);
  /* Lane selector for the final shuffle.  NOTE(review): presumably
     follows the VPERM2I128 immediate encoding, where 1 picks the
     upper lane of the first source for the low result lane and
     3 << 4 picks the upper lane of the second source for the high
     result lane -- confirm against the avx2_permv2ti pattern.  */
  rtx sel = GEN_INT (1 + (3 << 4));

  emit_insn (gen_avx2_interleave_low<mode> (lo, operands[1], operands[2]));
  emit_insn (gen_avx2_interleave_high<mode> (hi, operands[1], operands[2]));
  /* The permute is done in V4DImode regardless of <MODE>.  */
  emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, operands[0]),
				gen_lowpart (V4DImode, lo),
				gen_lowpart (V4DImode, hi), sel));
  DONE;
})
;; Full-width interleave of the low halves of two 256-bit integer
;; vectors: run both intra-lane interleaves into temporaries, then
;; combine one 128-bit lane from each with a single permv2ti.
(define_expand "vec_interleave_low<mode>"
  [(match_operand:VI_256 0 "register_operand" "=x")
   (match_operand:VI_256 1 "register_operand" "x")
   (match_operand:VI_256 2 "nonimmediate_operand" "xm")]
  "TARGET_AVX2"
{
  rtx lo = gen_reg_rtx (<MODE>mode);
  rtx hi = gen_reg_rtx (<MODE>mode);
  /* Lane selector for the final shuffle.  NOTE(review): presumably
     follows the VPERM2I128 immediate encoding, where 0 picks the
     lower lane of the first source for the low result lane and
     2 << 4 picks the lower lane of the second source for the high
     result lane -- confirm against the avx2_permv2ti pattern.  */
  rtx sel = GEN_INT (0 + (2 << 4));

  emit_insn (gen_avx2_interleave_low<mode> (lo, operands[1], operands[2]));
  emit_insn (gen_avx2_interleave_high<mode> (hi, operands[1], operands[2]));
  /* The permute is done in V4DImode regardless of <MODE>.  */
  emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, operands[0]),
				gen_lowpart (V4DImode, lo),
				gen_lowpart (V4DImode, hi), sel));
  DONE;
})
;; Modes handled by pinsr patterns.
(define_mode_iterator PINSR_MODE
[(V16QI "TARGET_SSE4_1") V8HI