rs6000: Add vec_unpacku_{hi,lo}_v4si

The existing vec_unpacku_{hi,lo} supports emulated unsigned
unpacking for short and char but misses the support for int.
This patch adds the support of vec_unpacku_{hi,lo}_v4si.

Meanwhile, the current implementation uses vector permutation
way, which requires one extra customized constant vector as
the permutation control vector.  It's better to use vector
merge high/low with zero constant vector, to save the space
in constant area as well as the cost to initialize pcv in
prologue.  This patch updates it with vector merging and
simplify it with iterators.

gcc/ChangeLog:

	* config/rs6000/altivec.md (vec_unpacku_hi_v16qi): Remove.
	(vec_unpacku_hi_v8hi): Likewise.
	(vec_unpacku_lo_v16qi): Likewise.
	(vec_unpacku_lo_v8hi): Likewise.
	(vec_unpacku_hi_<VP_small_lc>): New define_expand.
	(vec_unpacku_lo_<VP_small_lc>): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/unpack-vectorize-1.c: New test.
	* gcc.target/powerpc/unpack-vectorize-1.h: New test.
	* gcc.target/powerpc/unpack-vectorize-2.c: New test.
	* gcc.target/powerpc/unpack-vectorize-2.h: New test.
	* gcc.target/powerpc/unpack-vectorize-3.c: New test.
	* gcc.target/powerpc/unpack-vectorize-3.h: New test.
	* gcc.target/powerpc/unpack-vectorize-run-1.c: New test.
	* gcc.target/powerpc/unpack-vectorize-run-2.c: New test.
	* gcc.target/powerpc/unpack-vectorize-run-3.c: New test.
	* gcc.target/powerpc/unpack-vectorize.h: New test.
This commit is contained in:
Kewen Lin 2021-08-24 21:58:14 -05:00
parent 4f5391dde1
commit a20be0cdc0
11 changed files with 199 additions and 132 deletions

View File

@ -134,10 +134,8 @@
UNSPEC_VMULWLUH
UNSPEC_VMULWHSH
UNSPEC_VMULWLSH
UNSPEC_VUPKHUB
UNSPEC_VUPKHUH
UNSPEC_VUPKLUB
UNSPEC_VUPKLUH
UNSPEC_VUPKHU
UNSPEC_VUPKLU
UNSPEC_VPERMSI
UNSPEC_VPERMHI
UNSPEC_INTERHI
@ -3688,143 +3686,45 @@
[(set_attr "type" "vecperm")
(set_attr "isa" "p9v,*")])
(define_expand "vec_unpacku_hi_v16qi"
[(set (match_operand:V8HI 0 "register_operand" "=v")
(unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")]
UNSPEC_VUPKHUB))]
"TARGET_ALTIVEC"
{
rtx vzero = gen_reg_rtx (V8HImode);
rtx mask = gen_reg_rtx (V16QImode);
rtvec v = rtvec_alloc (16);
bool be = BYTES_BIG_ENDIAN;
emit_insn (gen_altivec_vspltish (vzero, const0_rtx));
RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 7);
RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 0 : 16);
RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 16 : 6);
RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 1 : 16);
RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 5);
RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 2 : 16);
RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 16 : 4);
RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 3 : 16);
RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 3);
RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 4 : 16);
RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 16 : 2);
RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 5 : 16);
RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 1);
RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 6 : 16);
RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 16 : 0);
RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 7 : 16);
(define_expand "vec_unpacku_hi_<VP_small_lc>"
[(set (match_operand:VP 0 "register_operand" "=v")
(unspec:VP [(match_operand:<VP_small> 1 "register_operand" "v")]
UNSPEC_VUPKHU))]
"TARGET_ALTIVEC"
{
rtx vzero = gen_reg_rtx (<VP_small>mode);
emit_insn (gen_altivec_vspltis<VU_char> (vzero, const0_rtx));
emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
emit_insn (gen_vperm_v16qiv8hi (operands[0], operands[1], vzero, mask));
rtx res = gen_reg_rtx (<VP_small>mode);
rtx op1 = operands[1];
if (BYTES_BIG_ENDIAN)
emit_insn (gen_altivec_vmrgh<VU_char> (res, vzero, op1));
else
emit_insn (gen_altivec_vmrgl<VU_char> (res, op1, vzero));
emit_insn (gen_move_insn (operands[0], gen_lowpart (<MODE>mode, res)));
DONE;
})
(define_expand "vec_unpacku_hi_v8hi"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")]
UNSPEC_VUPKHUH))]
(define_expand "vec_unpacku_lo_<VP_small_lc>"
[(set (match_operand:VP 0 "register_operand" "=v")
(unspec:VP [(match_operand:<VP_small> 1 "register_operand" "v")]
UNSPEC_VUPKLU))]
"TARGET_ALTIVEC"
{
rtx vzero = gen_reg_rtx (V4SImode);
rtx mask = gen_reg_rtx (V16QImode);
rtvec v = rtvec_alloc (16);
bool be = BYTES_BIG_ENDIAN;
rtx vzero = gen_reg_rtx (<VP_small>mode);
emit_insn (gen_altivec_vspltis<VU_char> (vzero, const0_rtx));
emit_insn (gen_altivec_vspltisw (vzero, const0_rtx));
RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 7);
RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 17 : 6);
RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 0 : 17);
RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 1 : 16);
RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 5);
RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 17 : 4);
RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 2 : 17);
RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 3 : 16);
RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 3);
RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 17 : 2);
RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 4 : 17);
RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 5 : 16);
RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 1);
RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 17 : 0);
RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 6 : 17);
RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 7 : 16);
rtx res = gen_reg_rtx (<VP_small>mode);
rtx op1 = operands[1];
emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
emit_insn (gen_vperm_v8hiv4si (operands[0], operands[1], vzero, mask));
DONE;
})
if (BYTES_BIG_ENDIAN)
emit_insn (gen_altivec_vmrgl<VU_char> (res, vzero, op1));
else
emit_insn (gen_altivec_vmrgh<VU_char> (res, op1, vzero));
(define_expand "vec_unpacku_lo_v16qi"
[(set (match_operand:V8HI 0 "register_operand" "=v")
(unspec:V8HI [(match_operand:V16QI 1 "register_operand" "v")]
UNSPEC_VUPKLUB))]
"TARGET_ALTIVEC"
{
rtx vzero = gen_reg_rtx (V8HImode);
rtx mask = gen_reg_rtx (V16QImode);
rtvec v = rtvec_alloc (16);
bool be = BYTES_BIG_ENDIAN;
emit_insn (gen_altivec_vspltish (vzero, const0_rtx));
RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 15);
RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 8 : 16);
RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 16 : 14);
RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 9 : 16);
RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 13);
RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 10 : 16);
RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 16 : 12);
RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 11 : 16);
RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 11);
RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 12 : 16);
RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 16 : 10);
RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 13 : 16);
RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 9);
RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 14 : 16);
RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 16 : 8);
RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 15 : 16);
emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
emit_insn (gen_vperm_v16qiv8hi (operands[0], operands[1], vzero, mask));
DONE;
})
(define_expand "vec_unpacku_lo_v8hi"
[(set (match_operand:V4SI 0 "register_operand" "=v")
(unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")]
UNSPEC_VUPKLUH))]
"TARGET_ALTIVEC"
{
rtx vzero = gen_reg_rtx (V4SImode);
rtx mask = gen_reg_rtx (V16QImode);
rtvec v = rtvec_alloc (16);
bool be = BYTES_BIG_ENDIAN;
emit_insn (gen_altivec_vspltisw (vzero, const0_rtx));
RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (QImode, be ? 16 : 15);
RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (QImode, be ? 17 : 14);
RTVEC_ELT (v, 2) = gen_rtx_CONST_INT (QImode, be ? 8 : 17);
RTVEC_ELT (v, 3) = gen_rtx_CONST_INT (QImode, be ? 9 : 16);
RTVEC_ELT (v, 4) = gen_rtx_CONST_INT (QImode, be ? 16 : 13);
RTVEC_ELT (v, 5) = gen_rtx_CONST_INT (QImode, be ? 17 : 12);
RTVEC_ELT (v, 6) = gen_rtx_CONST_INT (QImode, be ? 10 : 17);
RTVEC_ELT (v, 7) = gen_rtx_CONST_INT (QImode, be ? 11 : 16);
RTVEC_ELT (v, 8) = gen_rtx_CONST_INT (QImode, be ? 16 : 11);
RTVEC_ELT (v, 9) = gen_rtx_CONST_INT (QImode, be ? 17 : 10);
RTVEC_ELT (v, 10) = gen_rtx_CONST_INT (QImode, be ? 12 : 17);
RTVEC_ELT (v, 11) = gen_rtx_CONST_INT (QImode, be ? 13 : 16);
RTVEC_ELT (v, 12) = gen_rtx_CONST_INT (QImode, be ? 16 : 9);
RTVEC_ELT (v, 13) = gen_rtx_CONST_INT (QImode, be ? 17 : 8);
RTVEC_ELT (v, 14) = gen_rtx_CONST_INT (QImode, be ? 14 : 17);
RTVEC_ELT (v, 15) = gen_rtx_CONST_INT (QImode, be ? 15 : 16);
emit_insn (gen_vec_initv16qiqi (mask, gen_rtx_PARALLEL (V16QImode, v)));
emit_insn (gen_vperm_v8hiv4si (operands[0], operands[1], vzero, mask));
emit_insn (gen_move_insn (operands[0], gen_lowpart (<MODE>mode, res)));
DONE;
})

View File

@ -0,0 +1,18 @@
/* { dg-do compile } */
/* { dg-require-effective-target powerpc_altivec_ok } */
/* { dg-options "-maltivec -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
/* Test if unpack vectorization succeeds for type signed/unsigned
short and char. */
#include "unpack-vectorize-1.h"
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
/* { dg-final { scan-assembler-times {\mvupkhsb\M} 2 } } */
/* { dg-final { scan-assembler-times {\mvupklsb\M} 2 } } */
/* { dg-final { scan-assembler-times {\mvupkhsh\M} 2 } } */
/* { dg-final { scan-assembler-times {\mvupklsh\M} 2 } } */
/* { dg-final { scan-assembler-times {\mvmrghb\M} 2 } } */
/* { dg-final { scan-assembler-times {\mvmrglb\M} 2 } } */
/* { dg-final { scan-assembler-times {\mvmrghh\M} 2 } } */
/* { dg-final { scan-assembler-times {\mvmrglh\M} 2 } } */

View File

@ -0,0 +1,14 @@
#include "unpack-vectorize.h"
DEF_ARR (si)
DEF_ARR (ui)
DEF_ARR (sh)
DEF_ARR (uh)
DEF_ARR (sc)
DEF_ARR (uc)
TEST1 (sh, si)
TEST1 (uh, ui)
TEST1 (sc, sh)
TEST1 (uc, uh)

View File

@ -0,0 +1,12 @@
/* { dg-do compile } */
/* { dg-require-effective-target powerpc_vsx_ok } */
/* { dg-options "-mdejagnu-cpu=power7 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
/* Test if unsigned int unpack vectorization succeeds. V2DImode is
supported since Power7 so guard it under Power7 and up. */
#include "unpack-vectorize-2.h"
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { scan-assembler-times {\mxxmrghw\M} 1 } } */
/* { dg-final { scan-assembler-times {\mxxmrglw\M} 1 } } */

View File

@ -0,0 +1,7 @@
#include "unpack-vectorize.h"
DEF_ARR (ui)
DEF_ARR (ull)
TEST1 (ui, ull)

View File

@ -0,0 +1,11 @@
/* { dg-do compile } */
/* { dg-require-effective-target powerpc_p8vector_ok } */
/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */
/* Test if signed int unpack vectorization succeeds. */
#include "unpack-vectorize-3.h"
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { scan-assembler-times {\mvupkhsw\M} 1 } } */
/* { dg-final { scan-assembler-times {\mvupklsw\M} 1 } } */

View File

@ -0,0 +1,7 @@
#include "unpack-vectorize.h"
DEF_ARR (si)
DEF_ARR (sll)
TEST1 (si, sll)

View File

@ -0,0 +1,24 @@
/* { dg-do run } */
/* { dg-require-effective-target vmx_hw } */
/* { dg-options "-maltivec -O2 -ftree-vectorize -fno-vect-cost-model" } */
#include "unpack-vectorize-1.h"
/* Test if unpack vectorization cases on signed/unsigned short and char
run successfully. */
CHECK1 (sh, si)
CHECK1 (uh, ui)
CHECK1 (sc, sh)
CHECK1 (uc, uh)
int
main ()
{
check1_sh_si ();
check1_uh_ui ();
check1_sc_sh ();
check1_uc_uh ();
return 0;
}

View File

@ -0,0 +1,16 @@
/* { dg-do run } */
/* { dg-require-effective-target vsx_hw } */
/* { dg-options "-mdejagnu-cpu=power7 -O2 -ftree-vectorize -fno-vect-cost-model" } */
#include "unpack-vectorize-2.h"
/* Test if unpack vectorization cases on unsigned int run successfully. */
CHECK1 (ui, ull)
int
main ()
{
check1_ui_ull ();
return 0;
}

View File

@ -0,0 +1,16 @@
/* { dg-do run } */
/* { dg-require-effective-target p8vector_hw } */
/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fno-vect-cost-model" } */
#include "unpack-vectorize-3.h"
/* Test if unpack vectorization cases on signed int run successfully. */
CHECK1 (si, sll)
int
main ()
{
check1_si_sll ();
return 0;
}

View File

@ -0,0 +1,42 @@
typedef signed long long sll;
typedef unsigned long long ull;
typedef signed int si;
typedef unsigned int ui;
typedef signed short sh;
typedef unsigned short uh;
typedef signed char sc;
typedef unsigned char uc;
#ifndef ALIGN
#define ALIGN 32
#endif
#define ALIGN_ATTR __attribute__((__aligned__(ALIGN)))
#define N 128
#define DEF_ARR(TYPE) \
TYPE TYPE##_a[N] ALIGN_ATTR; \
TYPE TYPE##_b[N] ALIGN_ATTR; \
TYPE TYPE##_c[N] ALIGN_ATTR;
#define TEST1(NTYPE, WTYPE) \
__attribute__((noipa)) void test1_##NTYPE##_##WTYPE() { \
for (int i = 0; i < N; i++) \
WTYPE##_c[i] = NTYPE##_a[i] + NTYPE##_b[i]; \
}
#define CHECK1(NTYPE, WTYPE) \
__attribute__((noipa, optimize(0))) void check1_##NTYPE##_##WTYPE() { \
for (int i = 0; i < N; i++) { \
NTYPE##_a[i] = 2 * i * sizeof(NTYPE) + 10; \
NTYPE##_b[i] = 7 * i * sizeof(NTYPE) / 5 - 10; \
} \
test1_##NTYPE##_##WTYPE(); \
for (int i = 0; i < N; i++) { \
WTYPE exp = NTYPE##_a[i] + NTYPE##_b[i]; \
if (WTYPE##_c[i] != exp) \
__builtin_abort(); \
} \
}