Remove pass_cpb which is related to enable avx512 embedded broadcast from constant pool.
By optimizing vector movement to broadcast in ix86_expand_vector_move during pass_expand, pass_reload/LRA can automatically generate an avx512 embedded broadcast, pass_cpb is not needed. Considering that in the absence of avx512f, broadcast from memory is still slightly faster than loading the entire memory, so always enable broadcast. benchmark: https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast The performance diff strategy : cycles memory : 1046611188 memory : 1255420817 memory : 1044720793 memory : 1253414145 average : 1097868397 broadcast : 1044430688 broadcast : 1044477630 broadcast : 1253554603 broadcast : 1044561934 average : 1096756213 But however broadcast has larger size. the size diff size broadcast.o text data bss dec hex filename 137 0 0 137 89 broadcast.o size memory.o text data bss dec hex filename 115 0 0 115 73 memory.o gcc/ChangeLog: * config/i386/i386-expand.c (ix86_broadcast_from_integer_constant): Rename to .. (ix86_broadcast_from_constant): .. this, and extend it to handle float mode. (ix86_expand_vector_move): Extend to float mode. * config/i386/i386-features.c (replace_constant_pool_with_broadcast): Remove. (remove_partial_avx_dependency_gate): Ditto. (constant_pool_broadcast): Ditto. (class pass_constant_pool_broadcast): Ditto. (make_pass_constant_pool_broadcast): Ditto. (remove_partial_avx_dependency): Adjust gate. * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast. * config/i386/i386-protos.h (make_pass_constant_pool_broadcast): Remove. gcc/testsuite/ChangeLog: * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase.
This commit is contained in:
parent
a56c251898
commit
a6291d88d5
@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[])
|
||||
emit_insn (gen_rtx_SET (op0, op1));
|
||||
}
|
||||
|
||||
/* OP is a memref of CONST_VECTOR, return scalar constant mem
|
||||
if CONST_VECTOR is a vec_duplicate, else return NULL. */
|
||||
static rtx
|
||||
ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
|
||||
ix86_broadcast_from_constant (machine_mode mode, rtx op)
|
||||
{
|
||||
int nunits = GET_MODE_NUNITS (mode);
|
||||
if (nunits < 2)
|
||||
@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
|
||||
|
||||
/* Don't use integer vector broadcast if we can't move from GPR to SSE
|
||||
register directly. */
|
||||
if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
|
||||
if (!TARGET_INTER_UNIT_MOVES_TO_VEC
|
||||
&& INTEGRAL_MODE_P (mode))
|
||||
return nullptr;
|
||||
|
||||
/* Convert CONST_VECTOR to a non-standard SSE constant integer
|
||||
@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
|
||||
if (!(TARGET_AVX2
|
||||
|| (TARGET_AVX
|
||||
&& (GET_MODE_INNER (mode) == SImode
|
||||
|| GET_MODE_INNER (mode) == DImode)))
|
||||
|| GET_MODE_INNER (mode) == DImode))
|
||||
|| FLOAT_MODE_P (mode))
|
||||
|| standard_sse_constant_p (op, mode))
|
||||
return nullptr;
|
||||
|
||||
/* Don't broadcast from a 64-bit integer constant in 32-bit mode. */
|
||||
if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT)
|
||||
/* Don't broadcast from a 64-bit integer constant in 32-bit mode.
|
||||
We can still put 64-bit integer constant in memory when
|
||||
avx512 embed broadcast is available. */
|
||||
if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
|
||||
&& (!TARGET_AVX512F
|
||||
|| (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
|
||||
return nullptr;
|
||||
|
||||
if (GET_MODE_INNER (mode) == TImode)
|
||||
@ -561,17 +569,29 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
|
||||
|
||||
if (can_create_pseudo_p ()
|
||||
&& GET_MODE_SIZE (mode) >= 16
|
||||
&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|
||||
&& VECTOR_MODE_P (mode)
|
||||
&& (MEM_P (op1)
|
||||
&& SYMBOL_REF_P (XEXP (op1, 0))
|
||||
&& CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
|
||||
{
|
||||
rtx first = ix86_broadcast_from_integer_constant (mode, op1);
|
||||
rtx first = ix86_broadcast_from_constant (mode, op1);
|
||||
if (first != nullptr)
|
||||
{
|
||||
/* Broadcast to XMM/YMM/ZMM register from an integer
|
||||
constant. */
|
||||
constant or scalar mem. */
|
||||
/* Hard registers are used for 2 purposes:
|
||||
1. Prevent stack realignment when the original code
|
||||
doesn't use vector registers, which is the same for
|
||||
memcpy and memset.
|
||||
2. Prevent combine to convert constant broadcast to
|
||||
load from constant pool. */
|
||||
op1 = ix86_gen_scratch_sse_rtx (mode);
|
||||
if (FLOAT_MODE_P (mode)
|
||||
|| (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
|
||||
{
|
||||
first = force_const_mem (GET_MODE_INNER (mode), first);
|
||||
op1 = gen_reg_rtx (mode);
|
||||
}
|
||||
bool ok = ix86_expand_vector_init_duplicate (false, mode,
|
||||
op1, first);
|
||||
gcc_assert (ok);
|
||||
|
@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
|
||||
return new pass_insert_endbr_and_patchable_area (ctxt);
|
||||
}
|
||||
|
||||
/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x
|
||||
with embedded broadcast. i.e.transform
|
||||
|
||||
vpaddq .LC0(%rip), %zmm0, %zmm0
|
||||
ret
|
||||
.LC0:
|
||||
.quad 3
|
||||
.quad 3
|
||||
.quad 3
|
||||
.quad 3
|
||||
.quad 3
|
||||
.quad 3
|
||||
.quad 3
|
||||
.quad 3
|
||||
|
||||
to
|
||||
|
||||
vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0
|
||||
ret
|
||||
.LC0:
|
||||
.quad 3 */
|
||||
static void
|
||||
replace_constant_pool_with_broadcast (rtx_insn *insn)
|
||||
{
|
||||
subrtx_ptr_iterator::array_type array;
|
||||
FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL)
|
||||
{
|
||||
rtx *loc = *iter;
|
||||
rtx x = *loc;
|
||||
rtx broadcast_mem, vec_dup, constant, first;
|
||||
machine_mode mode;
|
||||
|
||||
/* Constant pool. */
|
||||
if (!MEM_P (x)
|
||||
|| !SYMBOL_REF_P (XEXP (x, 0))
|
||||
|| !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)))
|
||||
continue;
|
||||
|
||||
/* Const vector. */
|
||||
mode = GET_MODE (x);
|
||||
if (!VECTOR_MODE_P (mode))
|
||||
return;
|
||||
constant = get_pool_constant (XEXP (x, 0));
|
||||
if (GET_CODE (constant) != CONST_VECTOR)
|
||||
return;
|
||||
|
||||
/* There could be some rtx like
|
||||
(mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
|
||||
but with "*.LC1" refer to V2DI constant vector. */
|
||||
if (GET_MODE (constant) != mode)
|
||||
{
|
||||
constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
|
||||
if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR)
|
||||
return;
|
||||
}
|
||||
first = XVECEXP (constant, 0, 0);
|
||||
|
||||
for (int i = 1; i < GET_MODE_NUNITS (mode); ++i)
|
||||
{
|
||||
rtx tmp = XVECEXP (constant, 0, i);
|
||||
/* Vector duplicate value. */
|
||||
if (!rtx_equal_p (tmp, first))
|
||||
return;
|
||||
}
|
||||
|
||||
/* Replace with embedded broadcast. */
|
||||
broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first);
|
||||
vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem);
|
||||
validate_change (insn, loc, vec_dup, 0);
|
||||
|
||||
/* At most 1 memory_operand in an insn. */
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* At entry of the nearest common dominator for basic blocks with
|
||||
conversions, generate a single
|
||||
vxorps %xmmN, %xmmN, %xmmN
|
||||
@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void)
|
||||
if (!NONDEBUG_INSN_P (insn))
|
||||
continue;
|
||||
|
||||
/* Handle AVX512 embedded broadcast here to save compile time. */
|
||||
if (TARGET_AVX512F)
|
||||
replace_constant_pool_with_broadcast (insn);
|
||||
|
||||
set = single_set (insn);
|
||||
if (!set)
|
||||
continue;
|
||||
@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool
|
||||
remove_partial_avx_dependency_gate ()
|
||||
{
|
||||
return (TARGET_AVX
|
||||
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
|
||||
&& TARGET_SSE_MATH
|
||||
&& optimize
|
||||
&& optimize_function_for_speed_p (cfun));
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
const pass_data pass_data_remove_partial_avx_dependency =
|
||||
@ -2419,7 +2330,11 @@ public:
|
||||
/* opt_pass methods: */
|
||||
virtual bool gate (function *)
|
||||
{
|
||||
return remove_partial_avx_dependency_gate ();
|
||||
return (TARGET_AVX
|
||||
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
|
||||
&& TARGET_SSE_MATH
|
||||
&& optimize
|
||||
&& optimize_function_for_speed_p (cfun));
|
||||
}
|
||||
|
||||
virtual unsigned int execute (function *)
|
||||
@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
|
||||
return new pass_remove_partial_avx_dependency (ctxt);
|
||||
}
|
||||
|
||||
/* For const vector having one duplicated value, there's no need to put
|
||||
whole vector in the constant pool when target supports embedded broadcast. */
|
||||
static unsigned int
|
||||
constant_pool_broadcast (void)
|
||||
{
|
||||
timevar_push (TV_MACH_DEP);
|
||||
rtx_insn *insn;
|
||||
|
||||
for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
|
||||
{
|
||||
if (INSN_P (insn))
|
||||
replace_constant_pool_with_broadcast (insn);
|
||||
}
|
||||
timevar_pop (TV_MACH_DEP);
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
const pass_data pass_data_constant_pool_broadcast =
|
||||
{
|
||||
RTL_PASS, /* type */
|
||||
"cpb", /* name */
|
||||
OPTGROUP_NONE, /* optinfo_flags */
|
||||
TV_MACH_DEP, /* tv_id */
|
||||
0, /* properties_required */
|
||||
0, /* properties_provided */
|
||||
0, /* properties_destroyed */
|
||||
0, /* todo_flags_start */
|
||||
TODO_df_finish, /* todo_flags_finish */
|
||||
};
|
||||
|
||||
class pass_constant_pool_broadcast : public rtl_opt_pass
|
||||
{
|
||||
public:
|
||||
pass_constant_pool_broadcast (gcc::context *ctxt)
|
||||
: rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt)
|
||||
{}
|
||||
|
||||
/* opt_pass methods: */
|
||||
virtual bool gate (function *)
|
||||
{
|
||||
/* Return false if rpad pass gate is true.
|
||||
replace_constant_pool_with_broadcast is called
|
||||
from both this pass and rpad pass. */
|
||||
return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ());
|
||||
}
|
||||
|
||||
virtual unsigned int execute (function *)
|
||||
{
|
||||
return constant_pool_broadcast ();
|
||||
}
|
||||
}; // class pass_cpb
|
||||
|
||||
} // anon namespace
|
||||
|
||||
rtl_opt_pass *
|
||||
make_pass_constant_pool_broadcast (gcc::context *ctxt)
|
||||
{
|
||||
return new pass_constant_pool_broadcast (ctxt);
|
||||
}
|
||||
|
||||
/* This compares the priority of target features in function DECL1
|
||||
and DECL2. It returns positive value if DECL1 is higher priority,
|
||||
negative value if DECL2 is higher priority and 0 if they are the
|
||||
|
@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see
|
||||
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
|
||||
|
||||
INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
|
||||
INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast);
|
||||
|
@ -395,4 +395,3 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
|
||||
(gcc::context *);
|
||||
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
|
||||
(gcc::context *);
|
||||
extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *);
|
||||
|
@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16)));
|
||||
static v2df __attribute__((noinline))
|
||||
bar (v2df a)
|
||||
{
|
||||
return a + (v2df){ 3.0, 3.0 };
|
||||
return a + (v2df){ 3.0, 4.0 };
|
||||
}
|
||||
|
||||
v2df __attribute__((noinline))
|
||||
|
Loading…
Reference in New Issue
Block a user