From a6291d88d5b6c17d41950e21d7d452f7f0f73020 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 13 Jul 2021 18:22:03 +0800 Subject: [PATCH] Remove pass_cpb which is related to enable avx512 embedded broadcast from constant pool. By optimizing vector movement to broadcast in ix86_expand_vector_move during pass_expand, pass_reload/LRA can automatically generate an avx512 embedded broadcast, pass_cpb is not needed. Considering that in the absence of avx512f, broadcast from memory is still slightly faster than loading the entire memory, so always enable broadcast. benchmark: https://gitlab.com/x86-benchmarks/microbenchmark/-/tree/vaddps/broadcast The performance diff strategy : cycles memory : 1046611188 memory : 1255420817 memory : 1044720793 memory : 1253414145 average : 1097868397 broadcast : 1044430688 broadcast : 1044477630 broadcast : 1253554603 broadcast : 1044561934 average : 1096756213 But however broadcast has larger size. the size diff size broadcast.o text data bss dec hex filename 137 0 0 137 89 broadcast.o size memory.o text data bss dec hex filename 115 0 0 115 73 memory.o gcc/ChangeLog: * config/i386/i386-expand.c (ix86_broadcast_from_integer_constant): Rename to .. (ix86_broadcast_from_constant): .. this, and extend it to handle float mode. (ix86_expand_vector_move): Extend to float mode. * config/i386/i386-features.c (replace_constant_pool_with_broadcast): Remove. (remove_partial_avx_dependency_gate): Ditto. (constant_pool_broadcast): Ditto. (class pass_constant_pool_broadcast): Ditto. (make_pass_constant_pool_broadcast): Ditto. (remove_partial_avx_dependency): Adjust gate. * config/i386/i386-passes.def: Remove pass_constant_pool_broadcast. * config/i386/i386-protos.h (make_pass_constant_pool_broadcast): Remove. gcc/testsuite/ChangeLog: * gcc.target/i386/fuse-caller-save-xmm.c: Adjust testcase. --- gcc/config/i386/i386-expand.c | 36 +++- gcc/config/i386/i386-features.c | 157 +----------------- gcc/config/i386/i386-passes.def | 1 - gcc/config/i386/i386-protos.h | 1 - .../gcc.target/i386/fuse-caller-save-xmm.c | 2 +- 5 files changed, 34 insertions(+), 163 deletions(-) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 69ea79e6123..896bd685b1f 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -453,8 +453,10 @@ ix86_expand_move (machine_mode mode, rtx operands[]) emit_insn (gen_rtx_SET (op0, op1)); } +/* OP is a memref of CONST_VECTOR, return scalar constant mem + if CONST_VECTOR is a vec_duplicate, else return NULL. */ static rtx -ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) +ix86_broadcast_from_constant (machine_mode mode, rtx op) { int nunits = GET_MODE_NUNITS (mode); if (nunits < 2) @@ -462,7 +464,8 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) /* Don't use integer vector broadcast if we can't move from GPR to SSE register directly. */ - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + if (!TARGET_INTER_UNIT_MOVES_TO_VEC + && INTEGRAL_MODE_P (mode)) return nullptr; /* Convert CONST_VECTOR to a non-standard SSE constant integer @@ -470,12 +473,17 @@ ix86_broadcast_from_integer_constant (machine_mode mode, rtx op) if (!(TARGET_AVX2 || (TARGET_AVX && (GET_MODE_INNER (mode) == SImode - || GET_MODE_INNER (mode) == DImode))) + || GET_MODE_INNER (mode) == DImode)) + || FLOAT_MODE_P (mode)) || standard_sse_constant_p (op, mode)) return nullptr; - /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */ - if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT) + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. + We can still put 64-bit integer constant in memory when + avx512 embed broadcast is available. */ + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT + && (!TARGET_AVX512F + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) return nullptr; if (GET_MODE_INNER (mode) == TImode) @@ -561,17 +569,29 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) if (can_create_pseudo_p () && GET_MODE_SIZE (mode) >= 16 - && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && VECTOR_MODE_P (mode) && (MEM_P (op1) && SYMBOL_REF_P (XEXP (op1, 0)) && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) { - rtx first = ix86_broadcast_from_integer_constant (mode, op1); + rtx first = ix86_broadcast_from_constant (mode, op1); if (first != nullptr) { /* Broadcast to XMM/YMM/ZMM register from an integer - constant. */ + constant or scalar mem. */ + /* Hard registers are used for 2 purposes: + 1. Prevent stack realignment when the original code + doesn't use vector registers, which is the same for + memcpy and memset. + 2. Prevent combine to convert constant broadcast to + load from constant pool. */ op1 = ix86_gen_scratch_sse_rtx (mode); + if (FLOAT_MODE_P (mode) + || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) + { + first = force_const_mem (GET_MODE_INNER (mode), first); + op1 = gen_reg_rtx (mode); + } bool ok = ix86_expand_vector_init_duplicate (false, mode, op1, first); gcc_assert (ok); diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index cbd430a2ecf..d9c6652d1c9 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -2136,81 +2136,6 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt) return new pass_insert_endbr_and_patchable_area (ctxt); } -/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x - with embedded broadcast. i.e.transform - - vpaddq .LC0(%rip), %zmm0, %zmm0 - ret - .LC0: - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - .quad 3 - - to - - vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0 - ret - .LC0: - .quad 3 */ -static void -replace_constant_pool_with_broadcast (rtx_insn *insn) -{ - subrtx_ptr_iterator::array_type array; - FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL) - { - rtx *loc = *iter; - rtx x = *loc; - rtx broadcast_mem, vec_dup, constant, first; - machine_mode mode; - - /* Constant pool. */ - if (!MEM_P (x) - || !SYMBOL_REF_P (XEXP (x, 0)) - || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))) - continue; - - /* Const vector. */ - mode = GET_MODE (x); - if (!VECTOR_MODE_P (mode)) - return; - constant = get_pool_constant (XEXP (x, 0)); - if (GET_CODE (constant) != CONST_VECTOR) - return; - - /* There could be some rtx like - (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) - but with "*.LC1" refer to V2DI constant vector. */ - if (GET_MODE (constant) != mode) - { - constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); - if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR) - return; - } - first = XVECEXP (constant, 0, 0); - - for (int i = 1; i < GET_MODE_NUNITS (mode); ++i) - { - rtx tmp = XVECEXP (constant, 0, i); - /* Vector duplicate value. */ - if (!rtx_equal_p (tmp, first)) - return; - } - - /* Replace with embedded broadcast. */ - broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first); - vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem); - validate_change (insn, loc, vec_dup, 0); - - /* At most 1 memory_operand in an insn. */ - return; - } -} - /* At entry of the nearest common dominator for basic blocks with conversions, generate a single vxorps %xmmN, %xmmN, %xmmN @@ -2249,10 +2174,6 @@ remove_partial_avx_dependency (void) if (!NONDEBUG_INSN_P (insn)) continue; - /* Handle AVX512 embedded broadcast here to save compile time. */ - if (TARGET_AVX512F) - replace_constant_pool_with_broadcast (insn); - set = single_set (insn); if (!set) continue; @@ -2384,16 +2305,6 @@ remove_partial_avx_dependency (void) return 0; } -static bool -remove_partial_avx_dependency_gate () -{ - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); -} - namespace { const pass_data pass_data_remove_partial_avx_dependency = @@ -2419,7 +2330,11 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { - return remove_partial_avx_dependency_gate (); + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); } virtual unsigned int execute (function *) @@ -2436,68 +2351,6 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) return new pass_remove_partial_avx_dependency (ctxt); } -/* For const vector having one duplicated value, there's no need to put - whole vector in the constant pool when target supports embedded broadcast. */ -static unsigned int -constant_pool_broadcast (void) -{ - timevar_push (TV_MACH_DEP); - rtx_insn *insn; - - for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) - { - if (INSN_P (insn)) - replace_constant_pool_with_broadcast (insn); - } - timevar_pop (TV_MACH_DEP); - return 0; -} - -namespace { - -const pass_data pass_data_constant_pool_broadcast = -{ - RTL_PASS, /* type */ - "cpb", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_constant_pool_broadcast : public rtl_opt_pass -{ -public: - pass_constant_pool_broadcast (gcc::context *ctxt) - : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - /* Return false if rpad pass gate is true. - replace_constant_pool_with_broadcast is called - from both this pass and rpad pass. */ - return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ()); - } - - virtual unsigned int execute (function *) - { - return constant_pool_broadcast (); - } -}; // class pass_cpb - -} // anon namespace - -rtl_opt_pass * -make_pass_constant_pool_broadcast (gcc::context *ctxt) -{ - return new pass_constant_pool_broadcast (ctxt); -} - /* This compares the priority of target features in function DECL1 and DECL2. It returns positive value if DECL1 is higher priority, negative value if DECL2 is higher priority and 0 if they are the diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 44df00e94ac..29baf8acd0b 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -33,4 +33,3 @@ along with GCC; see the file COPYING3. If not see INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); - INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 51376fcc454..07ac02aff69 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -395,4 +395,3 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); -extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *); diff --git a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c index 4deff93c1e8..b0d3dc38a0c 100644 --- a/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c +++ b/gcc/testsuite/gcc.target/i386/fuse-caller-save-xmm.c @@ -6,7 +6,7 @@ typedef double v2df __attribute__((vector_size (16))); static v2df __attribute__((noinline)) bar (v2df a) { - return a + (v2df){ 3.0, 3.0 }; + return a + (v2df){ 3.0, 4.0 }; } v2df __attribute__((noinline))