re PR target/88547 (missed optimization for vector comparisons)

PR target/88547
	* config/i386/i386.c (ix86_expand_sse_movcc): For maskcmp, try to
	emit vpmovm2? instruction perhaps after knot?.  Reorganize code
	so that it doesn't have to test !maskcmp in almost every conditional.

	* gcc.target/i386/pr88547-1.c: New test.

From-SVN: r267293
This commit is contained in:
Jakub Jelinek 2018-12-20 08:58:02 +01:00 committed by Jakub Jelinek
parent 59faa32efd
commit 5a5fe9175e
4 changed files with 333 additions and 121 deletions

View File

@ -1,3 +1,10 @@
2018-12-20 Jakub Jelinek <jakub@redhat.com>
PR target/88547
* config/i386/i386.c (ix86_expand_sse_movcc): For maskcmp, try to
emit vpmovm2? instruction perhaps after knot?. Reorganize code
so that it doesn't have to test !maskcmp in almost every conditional.
2018-12-20 Alan Modra <amodra@gmail.com>
* config/rs6000/sysv4.h (GNU_USER_DYNAMIC_LINKER): Define.

View File

@ -23593,33 +23593,117 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
cmp = gen_rtx_SUBREG (mode, cmp, 0);
}
if (vector_all_ones_operand (op_true, mode)
&& rtx_equal_p (op_false, CONST0_RTX (mode))
&& !maskcmp)
if (maskcmp)
{
rtx (*gen) (rtx, rtx) = NULL;
if ((op_true == CONST0_RTX (mode)
&& vector_all_ones_operand (op_false, mode))
|| (op_false == CONST0_RTX (mode)
&& vector_all_ones_operand (op_true, mode)))
switch (mode)
{
case E_V64QImode:
if (TARGET_AVX512BW)
gen = gen_avx512bw_cvtmask2bv64qi;
break;
case E_V32QImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_cvtmask2bv32qi;
break;
case E_V16QImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_cvtmask2bv16qi;
break;
case E_V32HImode:
if (TARGET_AVX512BW)
gen = gen_avx512bw_cvtmask2wv32hi;
break;
case E_V16HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_cvtmask2wv16hi;
break;
case E_V8HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_cvtmask2wv8hi;
break;
case E_V16SImode:
if (TARGET_AVX512DQ)
gen = gen_avx512f_cvtmask2dv16si;
break;
case E_V8SImode:
if (TARGET_AVX512VL && TARGET_AVX512DQ)
gen = gen_avx512vl_cvtmask2dv8si;
break;
case E_V4SImode:
if (TARGET_AVX512VL && TARGET_AVX512DQ)
gen = gen_avx512vl_cvtmask2dv4si;
break;
case E_V8DImode:
if (TARGET_AVX512DQ)
gen = gen_avx512f_cvtmask2qv8di;
break;
case E_V4DImode:
if (TARGET_AVX512VL && TARGET_AVX512DQ)
gen = gen_avx512vl_cvtmask2qv4di;
break;
case E_V2DImode:
if (TARGET_AVX512VL && TARGET_AVX512DQ)
gen = gen_avx512vl_cvtmask2qv2di;
break;
default:
break;
}
if (gen && SCALAR_INT_MODE_P (cmpmode))
{
cmp = force_reg (cmpmode, cmp);
if (op_true == CONST0_RTX (mode))
{
rtx (*gen_not) (rtx, rtx);
switch (cmpmode)
{
case E_QImode: gen_not = gen_knotqi; break;
case E_HImode: gen_not = gen_knothi; break;
case E_SImode: gen_not = gen_knotsi; break;
case E_DImode: gen_not = gen_knotdi; break;
default: gcc_unreachable ();
}
rtx n = gen_reg_rtx (cmpmode);
emit_insn (gen_not (n, cmp));
cmp = n;
}
emit_insn (gen (dest, cmp));
return;
}
}
else if (vector_all_ones_operand (op_true, mode)
&& op_false == CONST0_RTX (mode))
{
emit_insn (gen_rtx_SET (dest, cmp));
return;
}
else if (op_false == CONST0_RTX (mode) && !maskcmp)
else if (op_false == CONST0_RTX (mode))
{
op_true = force_reg (mode, op_true);
x = gen_rtx_AND (mode, cmp, op_true);
emit_insn (gen_rtx_SET (dest, x));
return;
}
else if (op_true == CONST0_RTX (mode) && !maskcmp)
else if (op_true == CONST0_RTX (mode))
{
op_false = force_reg (mode, op_false);
x = gen_rtx_NOT (mode, cmp);
x = gen_rtx_AND (mode, x, op_false);
emit_insn (gen_rtx_SET (dest, x));
return;
}
else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
&& !maskcmp)
else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
{
op_false = force_reg (mode, op_false);
x = gen_rtx_IOR (mode, cmp, op_false);
emit_insn (gen_rtx_SET (dest, x));
return;
}
else if (TARGET_XOP && !maskcmp)
else if (TARGET_XOP)
{
op_true = force_reg (mode, op_true);
@ -23629,127 +23713,126 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
op_true,
op_false)));
return;
}
rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
rtx d = dest;
if (!vector_operand (op_true, mode))
op_true = force_reg (mode, op_true);
op_false = force_reg (mode, op_false);
switch (mode)
{
case E_V4SFmode:
if (TARGET_SSE4_1)
gen = gen_sse4_1_blendvps;
break;
case E_V2DFmode:
if (TARGET_SSE4_1)
gen = gen_sse4_1_blendvpd;
break;
case E_SFmode:
if (TARGET_SSE4_1)
{
gen = gen_sse4_1_blendvss;
op_true = force_reg (mode, op_true);
}
break;
case E_DFmode:
if (TARGET_SSE4_1)
{
gen = gen_sse4_1_blendvsd;
op_true = force_reg (mode, op_true);
}
break;
case E_V16QImode:
case E_V8HImode:
case E_V4SImode:
case E_V2DImode:
if (TARGET_SSE4_1)
{
gen = gen_sse4_1_pblendvb;
if (mode != V16QImode)
d = gen_reg_rtx (V16QImode);
op_false = gen_lowpart (V16QImode, op_false);
op_true = gen_lowpart (V16QImode, op_true);
cmp = gen_lowpart (V16QImode, cmp);
}
break;
case E_V8SFmode:
if (TARGET_AVX)
gen = gen_avx_blendvps256;
break;
case E_V4DFmode:
if (TARGET_AVX)
gen = gen_avx_blendvpd256;
break;
case E_V32QImode:
case E_V16HImode:
case E_V8SImode:
case E_V4DImode:
if (TARGET_AVX2)
{
gen = gen_avx2_pblendvb;
if (mode != V32QImode)
d = gen_reg_rtx (V32QImode);
op_false = gen_lowpart (V32QImode, op_false);
op_true = gen_lowpart (V32QImode, op_true);
cmp = gen_lowpart (V32QImode, cmp);
}
break;
case E_V64QImode:
gen = gen_avx512bw_blendmv64qi;
break;
case E_V32HImode:
gen = gen_avx512bw_blendmv32hi;
break;
case E_V16SImode:
gen = gen_avx512f_blendmv16si;
break;
case E_V8DImode:
gen = gen_avx512f_blendmv8di;
break;
case E_V8DFmode:
gen = gen_avx512f_blendmv8df;
break;
case E_V16SFmode:
gen = gen_avx512f_blendmv16sf;
break;
default:
break;
}
if (gen != NULL)
{
emit_insn (gen (d, op_false, op_true, cmp));
if (d != dest)
emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
else
{
rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
rtx d = dest;
op_true = force_reg (mode, op_true);
if (!vector_operand (op_true, mode))
op_true = force_reg (mode, op_true);
op_false = force_reg (mode, op_false);
switch (mode)
{
case E_V4SFmode:
if (TARGET_SSE4_1)
gen = gen_sse4_1_blendvps;
break;
case E_V2DFmode:
if (TARGET_SSE4_1)
gen = gen_sse4_1_blendvpd;
break;
case E_SFmode:
if (TARGET_SSE4_1)
{
gen = gen_sse4_1_blendvss;
op_true = force_reg (mode, op_true);
}
break;
case E_DFmode:
if (TARGET_SSE4_1)
{
gen = gen_sse4_1_blendvsd;
op_true = force_reg (mode, op_true);
}
break;
case E_V16QImode:
case E_V8HImode:
case E_V4SImode:
case E_V2DImode:
if (TARGET_SSE4_1)
{
gen = gen_sse4_1_pblendvb;
if (mode != V16QImode)
d = gen_reg_rtx (V16QImode);
op_false = gen_lowpart (V16QImode, op_false);
op_true = gen_lowpart (V16QImode, op_true);
cmp = gen_lowpart (V16QImode, cmp);
}
break;
case E_V8SFmode:
if (TARGET_AVX)
gen = gen_avx_blendvps256;
break;
case E_V4DFmode:
if (TARGET_AVX)
gen = gen_avx_blendvpd256;
break;
case E_V32QImode:
case E_V16HImode:
case E_V8SImode:
case E_V4DImode:
if (TARGET_AVX2)
{
gen = gen_avx2_pblendvb;
if (mode != V32QImode)
d = gen_reg_rtx (V32QImode);
op_false = gen_lowpart (V32QImode, op_false);
op_true = gen_lowpart (V32QImode, op_true);
cmp = gen_lowpart (V32QImode, cmp);
}
break;
case E_V64QImode:
gen = gen_avx512bw_blendmv64qi;
break;
case E_V32HImode:
gen = gen_avx512bw_blendmv32hi;
break;
case E_V16SImode:
gen = gen_avx512f_blendmv16si;
break;
case E_V8DImode:
gen = gen_avx512f_blendmv8di;
break;
case E_V8DFmode:
gen = gen_avx512f_blendmv8df;
break;
case E_V16SFmode:
gen = gen_avx512f_blendmv16sf;
break;
default:
break;
}
if (gen != NULL)
{
emit_insn (gen (d, op_false, op_true, cmp));
if (d != dest)
emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
t2 = gen_reg_rtx (mode);
if (optimize)
t3 = gen_reg_rtx (mode);
else
{
op_true = force_reg (mode, op_true);
t3 = dest;
t2 = gen_reg_rtx (mode);
if (optimize)
t3 = gen_reg_rtx (mode);
else
t3 = dest;
x = gen_rtx_AND (mode, op_true, cmp);
emit_insn (gen_rtx_SET (t2, x));
x = gen_rtx_AND (mode, op_true, cmp);
emit_insn (gen_rtx_SET (t2, x));
x = gen_rtx_NOT (mode, cmp);
x = gen_rtx_AND (mode, x, op_false);
emit_insn (gen_rtx_SET (t3, x));
x = gen_rtx_NOT (mode, cmp);
x = gen_rtx_AND (mode, x, op_false);
emit_insn (gen_rtx_SET (t3, x));
x = gen_rtx_IOR (mode, t3, t2);
emit_insn (gen_rtx_SET (dest, x));
}
x = gen_rtx_IOR (mode, t3, t2);
emit_insn (gen_rtx_SET (dest, x));
}
}

View File

@ -1,3 +1,8 @@
2018-12-20 Jakub Jelinek <jakub@redhat.com>
PR target/88547
* gcc.target/i386/pr88547-1.c: New test.
2018-12-19 Steven G. Kargl <kargl@gcc.gnu.org>
PR fortran/87992

View File

@ -0,0 +1,117 @@
/* PR target/88547 */
/* { dg-do compile } */
/* { dg-options "-O2 -mavx512vl -mavx512bw -mavx512dq" } */
/* { dg-final { scan-assembler-not "vpternlog" } } */
/* { dg-final { scan-assembler-times "vpmovm2b\[\t ]" 4 } } */
/* { dg-final { scan-assembler-times "vpmovm2w\[\t ]" 4 } } */
/* { dg-final { scan-assembler-times "vpmovm2d\[\t ]" 4 } } */
/* { dg-final { scan-assembler-times "vpmovm2q\[\t ]" 4 } } */
/* { dg-final { scan-assembler-times "knotb\[\t ]" 4 } } */
/* { dg-final { scan-assembler-times "knotw\[\t ]" 4 } } */
/* { dg-final { scan-assembler-times "knotd\[\t ]" 2 } } */
/* { dg-final { scan-assembler-times "knotq\[\t ]" 2 } } */
/* Vector types used by the test functions below.  Every type is declared
   with vector_size(64), in a signed and an unsigned variant for each of
   the char, short, int and long long element types.
   NOTE(review): the element counts in the type names (64/32/16/8) assume
   the x86 sizes of those element types -- confirm.  */
typedef signed char v64qi __attribute__((vector_size(64)));
typedef unsigned char v64uqi __attribute__((vector_size(64)));
typedef short v32hi __attribute__((vector_size(64)));
typedef unsigned short v32uhi __attribute__((vector_size(64)));
typedef int v16si __attribute__((vector_size(64)));
typedef unsigned v16usi __attribute__((vector_size(64)));
typedef long long v8di __attribute__((vector_size(64)));
typedef unsigned long long v8udi __attribute__((vector_size(64)));
/* Returns the element-wise result of x <= y for two v64qi (signed char)
   vectors.
   NOTE(review): presumably one of the byte-element cases counted by the
   vpmovm2b dg-final directive at the top of this test -- confirm.  */
v64qi
f1 (v64qi x, v64qi y)
{
return x <= y;
}
/* Returns the element-wise result of x <= y for two v64uqi (unsigned char)
   vectors; the unsigned counterpart of the signed char <= case.
   NOTE(review): presumably one of the byte-element cases counted by the
   vpmovm2b dg-final directive at the top of this test -- confirm.  */
v64uqi
f2 (v64uqi x, v64uqi y)
{
return x <= y;
}
/* Returns the element-wise result of x >= y for two v64qi (signed char)
   vectors.
   NOTE(review): presumably one of the byte-element cases counted by the
   vpmovm2b dg-final directive at the top of this test -- confirm.  */
v64qi
f3 (v64qi x, v64qi y)
{
return x >= y;
}
/* Returns the element-wise result of x >= y for two v64uqi (unsigned char)
   vectors; the unsigned counterpart of the signed char >= case.
   NOTE(review): presumably one of the byte-element cases counted by the
   vpmovm2b dg-final directive at the top of this test -- confirm.  */
v64uqi
f4 (v64uqi x, v64uqi y)
{
return x >= y;
}
/* Returns the element-wise result of x <= y for two v32hi (short) vectors.
   NOTE(review): presumably one of the word-element cases counted by the
   vpmovm2w dg-final directive at the top of this test -- confirm.  */
v32hi
f5 (v32hi x, v32hi y)
{
return x <= y;
}
/* Returns the element-wise result of x <= y for two v32uhi (unsigned short)
   vectors; the unsigned counterpart of the short <= case.
   NOTE(review): presumably one of the word-element cases counted by the
   vpmovm2w dg-final directive at the top of this test -- confirm.  */
v32uhi
f6 (v32uhi x, v32uhi y)
{
return x <= y;
}
/* Returns the element-wise result of x >= y for two v32hi (short) vectors.
   NOTE(review): presumably one of the word-element cases counted by the
   vpmovm2w dg-final directive at the top of this test -- confirm.  */
v32hi
f7 (v32hi x, v32hi y)
{
return x >= y;
}
/* Returns the element-wise result of x >= y for two v32uhi (unsigned short)
   vectors; the unsigned counterpart of the short >= case.
   NOTE(review): presumably one of the word-element cases counted by the
   vpmovm2w dg-final directive at the top of this test -- confirm.  */
v32uhi
f8 (v32uhi x, v32uhi y)
{
return x >= y;
}
/* Returns the element-wise result of x <= y for two v16si (int) vectors.
   NOTE(review): presumably one of the doubleword-element cases counted by
   the vpmovm2d dg-final directive at the top of this test -- confirm.  */
v16si
f9 (v16si x, v16si y)
{
return x <= y;
}
/* Returns the element-wise result of x <= y for two v16usi (unsigned)
   vectors; the unsigned counterpart of the int <= case.
   NOTE(review): presumably one of the doubleword-element cases counted by
   the vpmovm2d dg-final directive at the top of this test -- confirm.  */
v16usi
f10 (v16usi x, v16usi y)
{
return x <= y;
}
/* Returns the element-wise result of x >= y for two v16si (int) vectors.
   NOTE(review): presumably one of the doubleword-element cases counted by
   the vpmovm2d dg-final directive at the top of this test -- confirm.  */
v16si
f11 (v16si x, v16si y)
{
return x >= y;
}
/* Returns the element-wise result of x >= y for two v16usi (unsigned)
   vectors; the unsigned counterpart of the int >= case.
   NOTE(review): presumably one of the doubleword-element cases counted by
   the vpmovm2d dg-final directive at the top of this test -- confirm.  */
v16usi
f12 (v16usi x, v16usi y)
{
return x >= y;
}
/* Returns the element-wise result of x <= y for two v8di (long long)
   vectors.
   NOTE(review): presumably one of the quadword-element cases counted by
   the vpmovm2q dg-final directive at the top of this test -- confirm.  */
v8di
f13 (v8di x, v8di y)
{
return x <= y;
}
/* Returns the element-wise result of x <= y for two v8udi (unsigned long
   long) vectors; the unsigned counterpart of the long long <= case.
   NOTE(review): presumably one of the quadword-element cases counted by
   the vpmovm2q dg-final directive at the top of this test -- confirm.  */
v8udi
f14 (v8udi x, v8udi y)
{
return x <= y;
}
/* Returns the element-wise result of x >= y for two v8di (long long)
   vectors.
   NOTE(review): presumably one of the quadword-element cases counted by
   the vpmovm2q dg-final directive at the top of this test -- confirm.  */
v8di
f15 (v8di x, v8di y)
{
return x >= y;
}
/* Returns the element-wise result of x >= y for two v8udi (unsigned long
   long) vectors; the unsigned counterpart of the long long >= case.
   NOTE(review): presumably one of the quadword-element cases counted by
   the vpmovm2q dg-final directive at the top of this test -- confirm.  */
v8udi
f16 (v8udi x, v8udi y)
{
return x >= y;
}