diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 72c02ef4a8a..c13d2435bd6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,35 @@ +2010-02-15 Sebastian Pop + + * config/i386/i386-builtin-types.def + (V2DF_FTYPE_V2DF_V2DF_V2DI_INT): Declared. + (V4DF_FTYPE_V4DF_V4DF_V4DI_INT): Declared. + (V4SF_FTYPE_V4SF_V4SF_V4SI_INT): Declared. + (V8SF_FTYPE_V8SF_V8SF_V8SI_INT): Declared. + * config/i386/i386.c (enum ix86_builtins): Add IX86_BUILTIN_VPERMIL2PD, + IX86_BUILTIN_VPERMIL2PS, IX86_BUILTIN_VPERMIL2PD256, and + IX86_BUILTIN_VPERMIL2PS256. + (MULTI_ARG_4_DF2_DI_I): Defined. + (MULTI_ARG_4_DF2_DI_I1): Defined. + (MULTI_ARG_4_SF2_SI_I): Defined. + (MULTI_ARG_4_SF2_SI_I1): Defined. + (bdesc_multi_arg): Add __builtin_ia32_vpermil2pd, + __builtin_ia32_vpermil2ps, __builtin_ia32_vpermil2pd256, and + __builtin_ia32_vpermil2ps256. + (ix86_expand_multi_arg_builtin): Handle MULTI_ARG_4_DF2_DI_I, + MULTI_ARG_4_DF2_DI_I1, MULTI_ARG_4_SF2_SI_I, and + MULTI_ARG_4_SF2_SI_I1. Handle builtins with 4 arguments. + (ix86_expand_args_builtin): Handle MULTI_ARG_4_DF2_DI_I, + MULTI_ARG_4_DF2_DI_I1, MULTI_ARG_4_SF2_SI_I, and + MULTI_ARG_4_SF2_SI_I1. Handle CODE_FOR_xop_vpermil2v2df3, + CODE_FOR_xop_vpermil2v4sf3, CODE_FOR_xop_vpermil2v4df3, and + CODE_FOR_xop_vpermil2v8sf3. + * config/i386/i386.md (UNSPEC_VPERMIL2): Declared. + * config/i386/sse.md (xop_vpermil23): New insn pattern. + * config/i386/xopintrin.h (_mm_permute2_pd): New. + (_mm256_permute2_pd): New. + (_mm_permute2_ps): New. + (_mm256_permute2_ps): New. + 2010-02-15 Nick Clifton * config/h8300/h8300.c: (h8300_push_pop): Use bool type for diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 5fec96443a6..10310e233cb 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -311,6 +311,7 @@ DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI, V16QI) DEF_FUNCTION_TYPE (V1DI, V1DI, V1DI, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF) +DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT) DEF_FUNCTION_TYPE (V2DI, V2DI, DI, INT) DEF_FUNCTION_TYPE (V2DI, V2DI, UINT, UINT) DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, INT) @@ -319,11 +320,13 @@ DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI, V32QI) DEF_FUNCTION_TYPE (V4DF, V4DF, V2DF, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DF) +DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI, INT) DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, V4DI) DEF_FUNCTION_TYPE (V4HI, V4HI, HI, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, FLOAT, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF) +DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, INT) DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, V2DI) @@ -335,6 +338,7 @@ DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, V8HI) DEF_FUNCTION_TYPE (V8SF, V8SF, V4SF, INT) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, INT) DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SF) +DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, INT) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, V8SI) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5bc4a64a29d..ac5ee3d812c 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20958,6 +20958,10 @@ enum ix86_builtins IX86_BUILTIN_VPERMILPS, IX86_BUILTIN_VPERMILPD256, IX86_BUILTIN_VPERMILPS256, + IX86_BUILTIN_VPERMIL2PD, + IX86_BUILTIN_VPERMIL2PS, + IX86_BUILTIN_VPERMIL2PD256, + IX86_BUILTIN_VPERMIL2PS256, IX86_BUILTIN_VPERM2F128PD256, IX86_BUILTIN_VPERM2F128PS256, IX86_BUILTIN_VPERM2F128SI256, @@ -22147,6 +22151,10 @@ static const struct builtin_description bdesc_args[] = }; /* FMA4 and XOP. */ +#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT +#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT +#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT +#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF @@ -22389,6 +22397,11 @@ static const struct builtin_description bdesc_multi_arg[] = { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF }, { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I }, + { OPTION_MASK_ISA_AVX, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I }, + { OPTION_MASK_ISA_AVX, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 }, + { OPTION_MASK_ISA_AVX, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 }, + }; /* Set up all the MMX/SSE builtins, even builtins for instructions that are not @@ -22769,6 +22782,14 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, switch (m_type) { + case MULTI_ARG_4_DF2_DI_I: + case MULTI_ARG_4_DF2_DI_I1: + case MULTI_ARG_4_SF2_SI_I: + case MULTI_ARG_4_SF2_SI_I1: + nargs = 4; + last_arg_constant = true; + break; + case MULTI_ARG_3_SF: case MULTI_ARG_3_DF: case MULTI_ARG_3_SF2: @@ -22912,6 +22933,10 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); break; + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); + break; + default: gcc_unreachable (); } @@ -23530,6 +23555,13 @@ ix86_expand_args_builtin (const struct builtin_description *d, nargs = 3; nargs_constant = 2; break; + case MULTI_ARG_4_DF2_DI_I: + case MULTI_ARG_4_DF2_DI_I1: + case MULTI_ARG_4_SF2_SI_I: + case MULTI_ARG_4_SF2_SI_I1: + nargs = 4; + nargs_constant = 1; + break; case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: nargs = 4; nargs_constant = 2; @@ -23599,6 +23631,10 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_sse4_1_blendpd: case CODE_FOR_avx_vpermilv2df: + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case CODE_FOR_xop_vpermil2v8sf3: error ("the last argument must be a 2-bit immediate"); return const0_rtx; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b4a8a83064b..924433f88a4 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -219,11 +219,12 @@ ; For AVX support (UNSPEC_PCMP 166) (UNSPEC_VPERMIL 167) - (UNSPEC_VPERMIL2F128 168) - (UNSPEC_MASKLOAD 169) - (UNSPEC_MASKSTORE 170) - (UNSPEC_CAST 171) - (UNSPEC_VTESTP 172) + (UNSPEC_VPERMIL2 168) + (UNSPEC_VPERMIL2F128 169) + (UNSPEC_MASKLOAD 170) + (UNSPEC_MASKSTORE 171) + (UNSPEC_CAST 172) + (UNSPEC_VTESTP 173) ]) (define_constants diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 50b1b144ffb..105671601c1 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -11539,6 +11539,20 @@ (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) +(define_insn "xop_vpermil23" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "%x") + (match_operand: 3 "nonimmediate_operand" "xm") + (match_operand:SI 4 "const_0_to_3_operand" "n")] + UNSPEC_VPERMIL2))] + "TARGET_XOP" + "vpermil2p\t{%4, %3, %2, %1, %0|%0, %1, %2, %3, %4}" + [(set_attr "type" "sse4arg") + (set_attr "length_immediate" "1") + (set_attr "mode" "")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "*avx_aesenc" [(set (match_operand:V2DI 0 "register_operand" "=x") diff --git a/gcc/config/i386/xopintrin.h b/gcc/config/i386/xopintrin.h index 803417a6a45..30ce72d1751 100644 --- a/gcc/config/i386/xopintrin.h +++ b/gcc/config/i386/xopintrin.h @@ -766,6 +766,70 @@ _mm256_frcz_pd (__m256d __A) return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A); } +/* PERMIL2 */ + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I) +{ + return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X, + (__v2df)__Y, + (__v2di)__C, + __I); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I) +{ + return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X, + (__v4df)__Y, + (__v4di)__C, + __I); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I) +{ + return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X, + (__v4sf)__Y, + (__v4si)__C, + __I); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I) +{ + return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8si)__C, + __I); +} +#else +#define _mm_permute2_pd(X, Y, C, I) \ + ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128d)(C), \ + (int)(I))) + +#define _mm256_permute2_pd(X, Y, C, I) \ + ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256d)(C), \ + (int)(I))) + +#define _mm_permute2_ps(X, Y, C, I) \ + ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (__v4si)(__m128)(C), \ + (int)(I))) + +#define _mm256_permute2_ps(X, Y, C, I) \ + ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256)(C), \ + (int)(I))) +#endif /* __OPTIMIZE__ */ + #endif /* __XOP__ */ #endif /* _XOPMMINTRIN_H_INCLUDED */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index a06de9a9f4a..c763de22451 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,12 @@ +2010-02-15 Sebastian Pop + + * gcc.target/i386/sse-14.c: Add tests for _mm_permute2_pd, + _mm256_permute2_pd, _mm_permute2_ps, and _mm256_permute2_ps. + * gcc.target/i386/xop-vpermil2pd-1.c: New. + * gcc.target/i386/xop-vpermil2pd-256-1.c: New. + * gcc.target/i386/xop-vpermil2ps-1.c: New. + * gcc.target/i386/xop-vpermil2ps-256-1.c: New. + 2010-02-15 Richard Guenther PR middle-end/43068 diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c index c3f72e419c7..96a3f210eb0 100644 --- a/gcc/testsuite/gcc.target/i386/sse-14.c +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -162,6 +162,10 @@ test_1 ( _mm_roti_epi8, __m128i, __m128i, 1) test_1 ( _mm_roti_epi16, __m128i, __m128i, 1) test_1 ( _mm_roti_epi32, __m128i, __m128i, 1) test_1 ( _mm_roti_epi64, __m128i, __m128i, 1) +test_3 (_mm_permute2_pd, __m128d, __m128d, __m128d, __m128d, 1) +test_3 (_mm256_permute2_pd, __m256d, __m256d, __m256d, __m256d, 1) +test_3 (_mm_permute2_ps, __m128, __m128, __m128, __m128, 1) +test_3 (_mm256_permute2_ps, __m256, __m256, __m256, __m256, 1) /* lwpintrin.h */ test_2 ( __lwpval32, void, unsigned int, unsigned int, 1) diff --git a/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-1.c b/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-1.c new file mode 100644 index 00000000000..c7f0594f53e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-1.c @@ -0,0 +1,55 @@ +/* { dg-do run } */ +/* { dg-require-effective-target xop } */ +/* { dg-options "-O2 -mxop" } */ + +#include "xop-check.h" + +#ifndef ZERO_MATCH +#define ZERO_MATCH 2 +#endif + +static double +select2dp(double *src1, double *src2, long long sel) +{ + double tmp = 0.0; + + if ((sel & 0x3) == 0) tmp = src1[0]; + if ((sel & 0x3) == 1) tmp = src1[1]; + if ((sel & 0x3) == 2) tmp = src2[0]; + if ((sel & 0x3) == 3) tmp = src2[1]; + + return tmp; +} + +static double +sel_and_condzerodp(double *src1, double *src2, long long sel, int imm8) +{ + double tmp; + + tmp = select2dp(src1, src2, sel & 0x3); + + if (((imm8 & 0x3) == 2) && ((sel & 0x4) == 0x4)) tmp = 0; + if (((imm8 & 0x3) == 3) && ((sel & 0x4) == 0x0)) tmp = 0; + + return tmp; +} + +void static +xop_test () +{ + union128d s1, s2, u; + union128i_q s3; + double e[2]; + + s1.x = _mm_set_pd (1, 2); + s2.x = _mm_set_pd (3, 4); + s3.x = _mm_set_epi64x (1, 2); + u.x = _mm_permute2_pd(s1.x, s2.x, s3.x, ZERO_MATCH); + + e[0] = sel_and_condzerodp (s1.a, s2.a, (s3.a[0] & 0xe)>>1, ZERO_MATCH); + e[1] = sel_and_condzerodp (s1.a, s2.a, (s3.a[1] & 0xe)>>1, ZERO_MATCH); + + if (check_union128d (u, e)) + abort (); +} + diff --git a/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-256-1.c b/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-256-1.c new file mode 100644 index 00000000000..90012dbf687 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/xop-vpermil2pd-256-1.c @@ -0,0 +1,56 @@ +/* { dg-do run } */ +/* { dg-require-effective-target xop } */ +/* { dg-options "-O2 -mxop" } */ + +#include "xop-check.h" + +#ifndef ZERO_MATCH +#define ZERO_MATCH 1 +#endif + +static double +select2dp(double *src1, double *src2, long long sel) +{ + double tmp = 3.414; + + if ((sel & 0x3) == 0) tmp = src1[0]; + if ((sel & 0x3) == 1) tmp = src1[1]; + if ((sel & 0x3) == 2) tmp = src2[0]; + if ((sel & 0x3) == 3) tmp = src2[1]; + + return tmp; +} + +static double +sel_and_condzerodp(double *src1, double *src2, long long sel, int imm8) +{ + double tmp; + + tmp = select2dp(src1, src2, sel); + + if (((imm8 & 0x3) == 2) && ((sel & 0x4) == 0x4)) tmp = 0; + if (((imm8 & 0x3) == 3) && ((sel & 0x4) == 0x0)) tmp = 0; + + return tmp; +} + +void static +xop_test () +{ + union256d u, s1, s2; + double e[4] = {0.0}; + union256i_q s3; + + s1.x = _mm256_set_pd (1, 2, 3, 4); + s2.x = _mm256_set_pd (5, 6, 7, 8); + s3.x = _mm256_set_epi64x (0, 1, 2, 3); + u.x = _mm256_permute2_pd(s1.x, s2.x, s3.x, ZERO_MATCH); + + e[0] = sel_and_condzerodp (s1.a, s2.a, (s3.a[0] & 0xe)>>1, ZERO_MATCH); + e[1] = sel_and_condzerodp (s1.a, s2.a, (s3.a[1] & 0xe)>>1, ZERO_MATCH); + e[2] = sel_and_condzerodp (s1.a + 2, s2.a + 2, (s3.a[2] & 0xe)>>1, ZERO_MATCH); + e[3] = sel_and_condzerodp (s1.a + 2, s2.a + 2, (s3.a[3] & 0xe)>>1, ZERO_MATCH); + + if (check_union256d (u, e)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-1.c b/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-1.c new file mode 100644 index 00000000000..be47564e509 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-1.c @@ -0,0 +1,62 @@ +/* { dg-do run } */ +/* { dg-require-effective-target xop } */ +/* { dg-options "-O2 -mxop" } */ + +#include "xop-check.h" + +#ifndef ZERO_MATCH +#define ZERO_MATCH 1 +#endif + +static float +select2sp(float *src1, float *src2, int sel) +{ + float tmp; + + if ((sel & 0x7) == 0) tmp = src1[0]; + if ((sel & 0x7) == 1) tmp = src1[1]; + if ((sel & 0x7) == 2) tmp = src1[2]; + if ((sel & 0x7) == 3) tmp = src1[3]; + if ((sel & 0x7) == 4) tmp = src2[0]; + if ((sel & 0x7) == 5) tmp = src2[1]; + if ((sel & 0x7) == 6) tmp = src2[2]; + if ((sel & 0x7) == 7) tmp = src2[3]; + + return tmp; +} +static float +sel_and_condzerosp(float *src1, float *src2, int sel, int imm8) +{ + float tmp; + + tmp = select2sp(src1, src2, sel & 0x7); + + if (((imm8 & 0x3) == 2) && ((sel & 0x8) == 0x8)) tmp = 0; + if (((imm8 & 0x3) == 3) && ((sel & 0x8) == 0x0)) tmp = 0; + + return tmp; +} + +void static +xop_test () +{ + int i; + union128 source1, source2, u; + union128i_d source3; + float s1[4] = {1, 2, 3, 4}; + float s2[4] = {5, 6, 7, 8}; + int s3[4] = {0, 1, 0, 1}; + float e[4]; + + source1.x = _mm_loadu_ps(s1); + source2.x = _mm_loadu_ps(s2); + source3.x = _mm_loadu_si128((__m128i*) s3); + u.x = _mm_permute2_ps(source1.x, source2.x, source3.x, ZERO_MATCH); + + for (i = 0; i < 4; ++i) { + e[i] = sel_and_condzerosp(&s1[i & 0x4], &s2[i & 0x4], s3[i] & 0xf, ZERO_MATCH & 0x3); + } + + if (check_union128 (u, e)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-256-1.c b/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-256-1.c new file mode 100644 index 00000000000..4a5fcc61b86 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/xop-vpermil2ps-256-1.c @@ -0,0 +1,62 @@ +/* { dg-do run } */ +/* { dg-require-effective-target xop } */ +/* { dg-options "-O2 -mxop" } */ + +#include "xop-check.h" + +#ifndef ZERO_MATCH +#define ZERO_MATCH 3 +#endif + +static float +select2sp(float *src1, float *src2, int sel) +{ + float tmp; + + if ((sel & 0x7) == 0) tmp = src1[0]; + if ((sel & 0x7) == 1) tmp = src1[1]; + if ((sel & 0x7) == 2) tmp = src1[2]; + if ((sel & 0x7) == 3) tmp = src1[3]; + if ((sel & 0x7) == 4) tmp = src2[0]; + if ((sel & 0x7) == 5) tmp = src2[1]; + if ((sel & 0x7) == 6) tmp = src2[2]; + if ((sel & 0x7) == 7) tmp = src2[3]; + + return tmp; +} +static float +sel_and_condzerosp(float *src1, float *src2, int sel, int imm8) +{ + float tmp; + + tmp = select2sp(src1, src2, sel & 0x7); + + if (((imm8 & 0x3) == 2) && ((sel & 0x8) == 0x8)) tmp = 0; + if (((imm8 & 0x3) == 3) && ((sel & 0x8) == 0x0)) tmp = 0; + + return tmp; +} + +void static +xop_test () +{ + int i; + union256 source1, source2, u; + union256i_d source3; + float s1[8]={1, 2, 3, 4, 5, 6, 7, 8}; + float s2[8]={9, 10, 11, 12, 13, 14, 15, 16}; + int s3[8]={11, 2, 3, 15, 5, 12, 7, 8}; + float e[8]; + + source1.x = _mm256_loadu_ps(s1); + source2.x = _mm256_loadu_ps(s2); + source3.x = _mm256_loadu_si256((__m256i*) s3); + u.x = _mm256_permute2_ps(source1.x, source2.x, source3.x, ZERO_MATCH); + + for (i = 0; i < 8; ++i) { + e[i] = sel_and_condzerosp(&s1[i & 0x4], &s2[i & 0x4], s3[i] & 0xf, ZERO_MATCH & 0x3); + } + + if (check_union256(u, e)) + abort (); +}