Remove pass_cpb which is related to enable avx512 embedded broadcast from constant pool.

By optimizing vector movement to broadcast in ix86_expand_vector_move
during pass_expand, pass_reload/LRA can automatically generate an avx512
embedded broadcast, pass_cpb is not needed.

Considering that in the absence of avx512f, broadcast from memory is
still slightly faster than loading the entire memory, so always enable
broadcast.

benchmark:
https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast

The performance diff

strategy    : cycles
memory      : 1046611188
memory      : 1255420817
memory      : 1044720793
memory      : 1253414145
average	    : 1097868397

broadcast   : 1044430688
broadcast   : 1044477630
broadcast   : 1253554603
broadcast   : 1044561934
average	    : 1096756213

But however broadcast has larger size.

the size diff

size broadcast.o
   text	   data	    bss	    dec	    hex	filename
    137	      0	      0	    137	     89	broadcast.o

size memory.o
   text	   data	    bss	    dec	    hex	filename
    115	      0	      0	    115	     73	memory.o

gcc/ChangeLog:

	* config/i386/i386-expand.c
	(ix86_broadcast_from_integer_constant): Rename to ..
	(ix86_broadcast_from_constant): .. this, and extend it to
	handle float mode.
	(ix86_expand_vector_move): Extend to float mode.
	* config/i386/i386-features.c
	(replace_constant_pool_with_broadcast): Remove.
	(remove_partial_avx_dependency_gate): Ditto.
	(constant_pool_broadcast): Ditto.
	(class pass_constant_pool_broadcast): Ditto.
	(make_pass_constant_pool_broadcast): Ditto.
	(remove_partial_avx_dependency): Adjust gate.
	* config/i386/i386-passes.def: Remove pass_constant_pool_broadcast.
	* config/i386/i386-protos.h
	(make_pass_constant_pool_broadcast): Remove.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase.
This commit is contained in:
liuhongt 2021-07-13 18:22:03 +08:00
parent a56c251898
commit a6291d88d5
5 changed files with 34 additions and 163 deletions

View File

@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[])
emit_insn (gen_rtx_SET (op0, op1));
}
/* OP is a memref of CONST_VECTOR, return scalar constant mem
if CONST_VECTOR is a vec_duplicate, else return NULL. */
static rtx
ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
int nunits = GET_MODE_NUNITS (mode);
if (nunits < 2)
@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
/* Don't use integer vector broadcast if we can't move from GPR to SSE
register directly. */
if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
if (!TARGET_INTER_UNIT_MOVES_TO_VEC
&& INTEGRAL_MODE_P (mode))
return nullptr;
/* Convert CONST_VECTOR to a non-standard SSE constant integer
@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
if (!(TARGET_AVX2
|| (TARGET_AVX
&& (GET_MODE_INNER (mode) == SImode
|| GET_MODE_INNER (mode) == DImode)))
|| GET_MODE_INNER (mode) == DImode))
|| FLOAT_MODE_P (mode))
|| standard_sse_constant_p (op, mode))
return nullptr;
/* Don't broadcast from a 64-bit integer constant in 32-bit mode. */
if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT)
/* Don't broadcast from a 64-bit integer constant in 32-bit mode.
We can still put 64-bit integer constant in memory when
avx512 embed broadcast is available. */
if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
&& (!TARGET_AVX512F
|| (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
return nullptr;
if (GET_MODE_INNER (mode) == TImode)
@ -561,17 +569,29 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
if (can_create_pseudo_p ()
&& GET_MODE_SIZE (mode) >= 16
&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
&& VECTOR_MODE_P (mode)
&& (MEM_P (op1)
&& SYMBOL_REF_P (XEXP (op1, 0))
&& CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
{
rtx first = ix86_broadcast_from_integer_constant (mode, op1);
rtx first = ix86_broadcast_from_constant (mode, op1);
if (first != nullptr)
{
/* Broadcast to XMM/YMM/ZMM register from an integer
constant. */
constant or scalar mem. */
/* Hard registers are used for 2 purposes:
1. Prevent stack realignment when the original code
doesn't use vector registers, which is the same for
memcpy and memset.
2. Prevent combine to convert constant broadcast to
load from constant pool. */
op1 = ix86_gen_scratch_sse_rtx (mode);
if (FLOAT_MODE_P (mode)
|| (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
{
first = force_const_mem (GET_MODE_INNER (mode), first);
op1 = gen_reg_rtx (mode);
}
bool ok = ix86_expand_vector_init_duplicate (false, mode,
op1, first);
gcc_assert (ok);

View File

@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
return new pass_insert_endbr_and_patchable_area (ctxt);
}
/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x
with embedded broadcast. i.e.transform
vpaddq .LC0(%rip), %zmm0, %zmm0
ret
.LC0:
.quad 3
.quad 3
.quad 3
.quad 3
.quad 3
.quad 3
.quad 3
.quad 3
to
vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0
ret
.LC0:
.quad 3 */
static void
replace_constant_pool_with_broadcast (rtx_insn *insn)
{
subrtx_ptr_iterator::array_type array;
FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL)
{
rtx *loc = *iter;
rtx x = *loc;
rtx broadcast_mem, vec_dup, constant, first;
machine_mode mode;
/* Constant pool. */
if (!MEM_P (x)
|| !SYMBOL_REF_P (XEXP (x, 0))
|| !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)))
continue;
/* Const vector. */
mode = GET_MODE (x);
if (!VECTOR_MODE_P (mode))
return;
constant = get_pool_constant (XEXP (x, 0));
if (GET_CODE (constant) != CONST_VECTOR)
return;
/* There could be some rtx like
(mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
but with "*.LC1" refer to V2DI constant vector. */
if (GET_MODE (constant) != mode)
{
constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR)
return;
}
first = XVECEXP (constant, 0, 0);
for (int i = 1; i < GET_MODE_NUNITS (mode); ++i)
{
rtx tmp = XVECEXP (constant, 0, i);
/* Vector duplicate value. */
if (!rtx_equal_p (tmp, first))
return;
}
/* Replace with embedded broadcast. */
broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first);
vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem);
validate_change (insn, loc, vec_dup, 0);
/* At most 1 memory_operand in an insn. */
return;
}
}
/* At entry of the nearest common dominator for basic blocks with
conversions, generate a single
vxorps %xmmN, %xmmN, %xmmN
@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void)
if (!NONDEBUG_INSN_P (insn))
continue;
/* Handle AVX512 embedded broadcast here to save compile time. */
if (TARGET_AVX512F)
replace_constant_pool_with_broadcast (insn);
set = single_set (insn);
if (!set)
continue;
@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void)
return 0;
}
static bool
remove_partial_avx_dependency_gate ()
{
return (TARGET_AVX
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& TARGET_SSE_MATH
&& optimize
&& optimize_function_for_speed_p (cfun));
}
namespace {
const pass_data pass_data_remove_partial_avx_dependency =
@ -2419,7 +2330,11 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
return remove_partial_avx_dependency_gate ();
return (TARGET_AVX
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& TARGET_SSE_MATH
&& optimize
&& optimize_function_for_speed_p (cfun));
}
virtual unsigned int execute (function *)
@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
return new pass_remove_partial_avx_dependency (ctxt);
}
/* For const vector having one duplicated value, there's no need to put
whole vector in the constant pool when target supports embedded broadcast. */
static unsigned int
constant_pool_broadcast (void)
{
timevar_push (TV_MACH_DEP);
rtx_insn *insn;
for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
{
if (INSN_P (insn))
replace_constant_pool_with_broadcast (insn);
}
timevar_pop (TV_MACH_DEP);
return 0;
}
namespace {
const pass_data pass_data_constant_pool_broadcast =
{
RTL_PASS, /* type */
"cpb", /* name */
OPTGROUP_NONE, /* optinfo_flags */
TV_MACH_DEP, /* tv_id */
0, /* properties_required */
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
TODO_df_finish, /* todo_flags_finish */
};
class pass_constant_pool_broadcast : public rtl_opt_pass
{
public:
pass_constant_pool_broadcast (gcc::context *ctxt)
: rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt)
{}
/* opt_pass methods: */
virtual bool gate (function *)
{
/* Return false if rpad pass gate is true.
replace_constant_pool_with_broadcast is called
from both this pass and rpad pass. */
return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ());
}
virtual unsigned int execute (function *)
{
return constant_pool_broadcast ();
}
}; // class pass_cpb
} // anon namespace
rtl_opt_pass *
make_pass_constant_pool_broadcast (gcc::context *ctxt)
{
return new pass_constant_pool_broadcast (ctxt);
}
/* This compares the priority of target features in function DECL1
and DECL2. It returns positive value if DECL1 is higher priority,
negative value if DECL2 is higher priority and 0 if they are the

View File

@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast);

View File

@ -395,4 +395,3 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
(gcc::context *);
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *);

View File

@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16)));
static v2df __attribute__((noinline))
bar (v2df a)
{
return a + (v2df){ 3.0, 3.0 };
return a + (v2df){ 3.0, 4.0 };
}
v2df __attribute__((noinline))