Optimize vec_extract for 256/512-bit vectors when the index exceeds the lower 128 bits.

-	vextracti32x8	$0x1, %zmm0, %ymm0
-	vmovd	%xmm0, %eax
+	valignd	$8, %zmm0, %zmm0, %zmm1
+	vmovd	%xmm1, %eax

-	vextracti32x8	$0x1, %zmm0, %ymm0
-	vextracti128	$0x1, %ymm0, %xmm0
-	vpextrd	$3, %xmm0, %eax
+	valignd	$15, %zmm0, %zmm0, %zmm1
+	vmovd	%xmm1, %eax

-	vextractf64x2	$0x1, %ymm0, %xmm0
+	valignq	$2, %ymm0, %ymm0, %ymm0

-	vextractf64x4	$0x1, %zmm0, %ymm0
-	vextractf64x2	$0x1, %ymm0, %xmm0
-	vunpckhpd	%xmm0, %xmm0, %xmm0
+	valignq	$7, %zmm0, %zmm0, %zmm0

gcc/ChangeLog:

	PR target/91103
	* config/i386/sse.md (*vec_extract<mode><ssescalarmodelower>_valign):
	New define_insn.

gcc/testsuite/ChangeLog:

	PR target/91103
	* gcc.target/i386/pr91103-1.c: New test.
	* gcc.target/i386/pr91103-2.c: New test.
This commit is contained in:
liuhongt 2021-09-08 16:19:37 +08:00
parent b6db7cd41c
commit 60eec23b5e
3 changed files with 150 additions and 0 deletions

View File

@ -233,6 +233,12 @@
V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
;; 256-bit and 512-bit vector modes with 4- or 8-byte elements (SI/DI/SF/DF).
;; The 512-bit modes carry no extra condition; each 256-bit mode is enabled
;; only when TARGET_AVX512VL holds.
(define_mode_iterator V48_256_512_AVX512VL
[V16SI (V8SI "TARGET_AVX512VL")
V8DI (V4DI "TARGET_AVX512VL")
V16SF (V8SF "TARGET_AVX512VL")
V8DF (V4DF "TARGET_AVX512VL")])
;; 1,2 byte AVX-512{BW,VL} vector modes. Supposed TARGET_AVX512BW baseline.
(define_mode_iterator VI12_AVX512VL
[V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
@ -828,6 +834,15 @@
(V8HF "TI") (V16HF "OI") (V32HF "XI")
(TI "TI")])
;; Map a vector mode (integer or floating-point) to the integer mode of the
;; same total width: XI for 512-bit, OI for 256-bit, TI for 128-bit vectors.
;; TI maps to itself.
(define_mode_attr sseintvecinsnmode
[(V64QI "XI") (V32HI "XI") (V16SI "XI") (V8DI "XI") (V4TI "XI")
(V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI") (V2TI "OI")
(V16QI "TI") (V8HI "TI") (V4SI "TI") (V2DI "TI") (V1TI "TI")
(V16SF "XI") (V8DF "XI")
(V8SF "OI") (V4DF "OI")
(V4SF "TI") (V2DF "TI")
(TI "TI")])
;; SSE constant -1 constraint
(define_mode_attr sseconstm1
[(V64QI "BC") (V32HI "BC") (V16SI "BC") (V8DI "BC") (V4TI "BC")
@ -10517,6 +10532,23 @@
[(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
;; Predicate for the element-index operand of a vec_extract on each mode:
;; the constant range is 0 .. (number of elements - 1) of the vector mode.
(define_mode_attr vec_extract_imm_predicate
[(V16SF "const_0_to_15_operand") (V8SF "const_0_to_7_operand")
(V16SI "const_0_to_15_operand") (V8SI "const_0_to_7_operand")
(V8DF "const_0_to_7_operand") (V4DF "const_0_to_3_operand")
(V8DI "const_0_to_7_operand") (V4DI "const_0_to_3_operand")])
;; Extract one scalar element that lies above the low 128 bits of a
;; 256/512-bit vector with a single valign{d,q}: both sources are operand 1
;; and the immediate is the element index, so the selected element ends up
;; in the low element of the destination register (PR target/91103).
;; The insn condition restricts the index to elements at byte offset >= 16;
;; indices inside the low 128 bits are left to other patterns.
;; NOTE(review): <ternlogsuffix> and <xtg_mode> are defined elsewhere in the
;; file; they are expected to yield the d/q suffix and the register-size
;; modifier for operand 0 -- confirm against their definitions.
(define_insn "*vec_extract<mode><ssescalarmodelower>_valign"
  [(set (match_operand:<ssescalarmode> 0 "register_operand" "=v")
        (vec_select:<ssescalarmode>
          (match_operand:V48_256_512_AVX512VL 1 "register_operand" "v")
          (parallel [(match_operand 2 "<vec_extract_imm_predicate>")])))]
  "TARGET_AVX512F
   && INTVAL (operands[2]) >= 16 / GET_MODE_SIZE (<ssescalarmode>mode)"
  "valign<ternlogsuffix>\t{%2, %1, %1, %<xtg_mode>0|%<xtg_mode>0, %1, %1, %2}"
  [(set_attr "prefix" "evex")
   (set_attr "mode" "<sseintvecinsnmode>")])
(define_expand "avx512f_shufps512_mask"
[(match_operand:V16SF 0 "register_operand")
(match_operand:V16SF 1 "register_operand")

View File

@ -0,0 +1,37 @@
/* { dg-do compile } */
/* { dg-options "-mavx512vl -O2" } */
/* { dg-final { scan-assembler-times "valign\[dq\]" 16 } } */

/* PR target/91103: extracting an element that lies above the low 128 bits
   of a 256/512-bit vector.  The dg-final directive above expects one
   valignd/valignq per extractor defined below (16 in total).  */

typedef float v8sf __attribute__((vector_size(32)));
typedef float v16sf __attribute__((vector_size(64)));
typedef int v8si __attribute__((vector_size(32)));
typedef int v16si __attribute__((vector_size(64)));
typedef double v4df __attribute__((vector_size(32)));
typedef double v8df __attribute__((vector_size(64)));
typedef long long v4di __attribute__((vector_size(32)));
typedef long long v8di __attribute__((vector_size(64)));

/* Define foo_<V>_<IDX>, a noipa function returning element IDX (of scalar
   type S) from a vector of type V.  The definition must end on the closing
   brace: a trailing backslash there would splice the next source line into
   the macro body.  */
#define EXTRACT(V,S,IDX)			\
  S						\
  __attribute__((noipa))			\
  foo_##V##_##IDX (V v)				\
  {						\
    return v[IDX];				\
  }

/* For every vector type: the first and the last element index above the
   low 128 bits.  */
EXTRACT (v8sf, float, 4);
EXTRACT (v8sf, float, 7);
EXTRACT (v8si, int, 4);
EXTRACT (v8si, int, 7);
EXTRACT (v16sf, float, 8);
EXTRACT (v16sf, float, 15);
EXTRACT (v16si, int, 8);
EXTRACT (v16si, int, 15);
EXTRACT (v4df, double, 2);
EXTRACT (v4df, double, 3);
EXTRACT (v4di, long long, 2);
EXTRACT (v4di, long long, 3);
EXTRACT (v8df, double, 4);
EXTRACT (v8df, double, 7);
EXTRACT (v8di, long long, 4);
EXTRACT (v8di, long long, 7);

View File

@ -0,0 +1,81 @@
/* { dg-do run } */
/* { dg-options "-O2 -mavx512vl" } */
/* { dg-require-effective-target avx512vl } */
#define AVX512VL
#ifndef CHECK
#define CHECK "avx512f-helper.h"
#endif
#include CHECK
#include "pr91103-1.c"
/* Call the extractor foo_<V>_<IDX> on the vector member U.x (cast to the
   vector type V) and abort unless the result equals the reference element
   U.a[IDX].  S is the scalar element type.  U is parenthesized and the
   temporary has a macro-private name, so an argument that is itself called
   `tmp' is not captured by the expansion.  */
#define RUNCHECK(U,V,S,IDX)				\
  do							\
    {							\
      S runcheck_val_ = foo_##V##_##IDX ((V) (U).x);	\
      if (runcheck_val_ != (U).a[IDX])			\
	abort ();					\
    }							\
  while (0)
/* Fill one union per tested vector type with sign-alternating values, then
   verify through RUNCHECK that every foo_<V>_<IDX> extractor returns the
   matching element of the union's scalar array.  */
void
test_256 (void)
{
  union512 f1;
  union256 f2;
  union512d d1;
  union256d d2;
  union512i_d di1;
  union256i_d di2;
  union512i_q q1;
  union256i_q q2;
  int sign = 1;
  int i;

  /* 16-element inputs (checked below as v16sf / v16si).  */
  for (i = 0; i < 16; ++i, sign = -sign)
    {
      di1.a[i] = 30 * (i - 30) * sign;
      f1.a[i] = 56.78 * (i - 30) * sign;
    }

  /* 8-element inputs (checked below as v8sf / v8si / v8df / v8di).  */
  for (i = 0; i < 8; ++i, sign = -sign)
    {
      f2.a[i] = 90.12 * (i + 40) * sign;
      di2.a[i] = 15 * (i + 40) * sign;
      d1.a[i] = 90.12 * (i + 40) * sign;
      q1.a[i] = 15 * (i + 40) * sign;
    }

  /* 4-element inputs (checked below as v4df / v4di).  */
  for (i = 0; i < 4; ++i, sign = -sign)
    {
      d2.a[i] = 90.12 * (i + 40) * sign;
      q2.a[i] = 15 * (i + 40) * sign;
    }

  /* Same (type, index) pairs, in the same order, as the EXTRACT
     instantiations in pr91103-1.c.  */
  RUNCHECK (f2, v8sf, float, 4);
  RUNCHECK (f2, v8sf, float, 7);
  RUNCHECK (di2, v8si, int, 4);
  RUNCHECK (di2, v8si, int, 7);

  RUNCHECK (f1, v16sf, float, 8);
  RUNCHECK (f1, v16sf, float, 15);
  RUNCHECK (di1, v16si, int, 8);
  RUNCHECK (di1, v16si, int, 15);

  RUNCHECK (d2, v4df, double, 2);
  RUNCHECK (d2, v4df, double, 3);
  RUNCHECK (q2, v4di, long long, 2);
  RUNCHECK (q2, v4di, long long, 3);

  RUNCHECK (d1, v8df, double, 4);
  RUNCHECK (d1, v8df, double, 7);
  RUNCHECK (q1, v8di, long long, 4);
  RUNCHECK (q1, v8di, long long, 7);
}
/* Intentionally empty: every check in this file is performed by test_256.
   NOTE(review): presumably the CHECK helper header expects a test_128
   entry point to exist when AVX512VL is defined -- confirm.  */
void
test_128 (void)
{
}