re PR tree-optimization/88709 (Improve store-merging)

PR tree-optimization/88709
	PR tree-optimization/90271
	* params.def (PARAM_STORE_MERGING_MAX_SIZE): New parameter.
	* gimple-ssa-store-merging.c (encode_tree_to_bitpos): Handle
	non-clobber CONSTRUCTORs with no elts.  Remove useless tmp_int
	variable.
	(imm_store_chain_info::coalesce_immediate_stores): Punt if the size
	of the store merging group is larger than
	PARAM_STORE_MERGING_MAX_SIZE parameter.
	(split_group): Add bzero_first argument.  If set, always emit first
	the first store which must be = {} of the whole area and then for the
	rest of the stores consider all zero bytes as paddings.
	(imm_store_chain_info::output_merged_store): Check if first store
	is = {} of the whole area and if yes, determine which setting of
	bzero_first for split_group gives smaller number of stores.  Adjust
	split_group callers.
	(lhs_valid_for_store_merging_p): Allow decls.
	(rhs_valid_for_store_merging_p): Allow non-clobber CONSTRUCTORs with
	no elts.
	(pass_store_merging::process_store): Likewise.

	* gcc.dg/store_merging_26.c: New test.
	* gcc.dg/store_merging_27.c: New test.
	* gcc.dg/store_merging_28.c: New test.
	* gcc.dg/store_merging_29.c: New test.

From-SVN: r270924
Jakub Jelinek, 2019-05-06 23:50:14 +02:00 (committed by Jakub Jelinek)
commit 3afd514bca, parent 839c480a14
8 changed files with 296 additions and 21 deletions
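Illustrative sketch (not part of the commit; the struct and function names below are made up, and the pattern simply mirrors the new store_merging_27.c test): after this change the pass treats an empty-CONSTRUCTOR store (= {}) of a whole decl as a mergeable store, so the clearing of s and the two byte overrides below can be emitted as one or two wider stores instead of three separate ones.

struct S { char buf[8]; };

void
foo (struct S *p)
{
  struct S s = {};   /* = {} of the whole area: an empty CONSTRUCTOR store.  */
  s.buf[1] = 1;      /* With bzero_first, the remaining zero bytes in the     */
  s.buf[3] = 2;      /* group are treated as padding when splitting stores.   */
  *p = s;
}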

gcc/ChangeLog

@@ -1,3 +1,26 @@
+2019-05-06  Jakub Jelinek  <jakub@redhat.com>
+
+	PR tree-optimization/88709
+	PR tree-optimization/90271
+	* params.def (PARAM_STORE_MERGING_MAX_SIZE): New parameter.
+	* gimple-ssa-store-merging.c (encode_tree_to_bitpos): Handle
+	non-clobber CONSTRUCTORs with no elts.  Remove useless tmp_int
+	variable.
+	(imm_store_chain_info::coalesce_immediate_stores): Punt if the size
+	of the store merging group is larger than
+	PARAM_STORE_MERGING_MAX_SIZE parameter.
+	(split_group): Add bzero_first argument.  If set, always emit first
+	the first store which must be = {} of the whole area and then for the
+	rest of the stores consider all zero bytes as paddings.
+	(imm_store_chain_info::output_merged_store): Check if first store
+	is = {} of the whole area and if yes, determine which setting of
+	bzero_first for split_group gives smaller number of stores.  Adjust
+	split_group callers.
+	(lhs_valid_for_store_merging_p): Allow decls.
+	(rhs_valid_for_store_merging_p): Allow non-clobber CONSTRUCTORs with
+	no elts.
+	(pass_store_merging::process_store): Likewise.
+
 2019-05-06  Kelvin Nilsen  <kelvin@gcc.gnu.org>
 
 	PR target/89424

gcc/gimple-ssa-store-merging.c

@@ -1615,13 +1615,31 @@ encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos,
 		       unsigned int total_bytes)
 {
   unsigned int first_byte = bitpos / BITS_PER_UNIT;
-  tree tmp_int = expr;
   bool sub_byte_op_p = ((bitlen % BITS_PER_UNIT)
 			|| (bitpos % BITS_PER_UNIT)
 			|| !int_mode_for_size (bitlen, 0).exists ());
+  bool empty_ctor_p
+    = (TREE_CODE (expr) == CONSTRUCTOR
+       && CONSTRUCTOR_NELTS (expr) == 0
+       && TYPE_SIZE_UNIT (TREE_TYPE (expr))
+       && tree_fits_uhwi_p (TYPE_SIZE_UNIT (TREE_TYPE (expr))));
 
   if (!sub_byte_op_p)
-    return native_encode_expr (tmp_int, ptr + first_byte, total_bytes) != 0;
+    {
+      if (first_byte >= total_bytes)
+	return false;
+      total_bytes -= first_byte;
+      if (empty_ctor_p)
+	{
+	  unsigned HOST_WIDE_INT rhs_bytes
+	    = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (expr)));
+	  if (rhs_bytes > total_bytes)
+	    return false;
+	  memset (ptr + first_byte, '\0', rhs_bytes);
+	  return true;
+	}
+      return native_encode_expr (expr, ptr + first_byte, total_bytes) != 0;
+    }
 
   /* LITTLE-ENDIAN
      We are writing a non byte-sized quantity or at a position that is not
@@ -1667,14 +1685,29 @@ encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos,
 
   /* We must be dealing with fixed-size data at this point, since the
      total size is also fixed.  */
-  fixed_size_mode mode = as_a <fixed_size_mode> (TYPE_MODE (TREE_TYPE (expr)));
+  unsigned int byte_size;
+  if (empty_ctor_p)
+    {
+      unsigned HOST_WIDE_INT rhs_bytes
+	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (expr)));
+      if (rhs_bytes > total_bytes)
+	return false;
+      byte_size = rhs_bytes;
+    }
+  else
+    {
+      fixed_size_mode mode
+	= as_a <fixed_size_mode> (TYPE_MODE (TREE_TYPE (expr)));
+      byte_size = GET_MODE_SIZE (mode);
+    }
   /* Allocate an extra byte so that we have space to shift into.  */
-  unsigned int byte_size = GET_MODE_SIZE (mode) + 1;
+  byte_size++;
   unsigned char *tmpbuf = XALLOCAVEC (unsigned char, byte_size);
   memset (tmpbuf, '\0', byte_size);
   /* The store detection code should only have allowed constants that are
-     accepted by native_encode_expr.  */
-  if (native_encode_expr (expr, tmpbuf, byte_size - 1) == 0)
+     accepted by native_encode_expr or empty ctors.  */
+  if (!empty_ctor_p
+      && native_encode_expr (expr, tmpbuf, byte_size - 1) == 0)
     gcc_unreachable ();
 
   /* The native_encode_expr machinery uses TYPE_MODE to determine how many
@@ -2671,6 +2704,8 @@ imm_store_chain_info::coalesce_immediate_stores ()
 
   FOR_EACH_VEC_ELT (m_store_info, i, info)
     {
+      unsigned HOST_WIDE_INT new_bitregion_start, new_bitregion_end;
+
       if (i <= ignore)
 	goto done;
@@ -2702,7 +2737,14 @@ imm_store_chain_info::coalesce_immediate_stores ()
 	    }
 	}
 
-      if (info->order >= merged_store->first_nonmergeable_order)
+      new_bitregion_start
+	= MIN (merged_store->bitregion_start, info->bitregion_start);
+      new_bitregion_end
+	= MAX (merged_store->bitregion_end, info->bitregion_end);
+
+      if (info->order >= merged_store->first_nonmergeable_order
+	  || (((new_bitregion_end - new_bitregion_start + 1) / BITS_PER_UNIT)
+	      > (unsigned) PARAM_VALUE (PARAM_STORE_MERGING_MAX_SIZE)))
 	;
 
       /* |---store 1---|
@@ -3184,12 +3226,15 @@ count_multiple_uses (store_immediate_info *info)
    Return number of new stores.
    If ALLOW_UNALIGNED_STORE is false, then all stores must be aligned.
    If ALLOW_UNALIGNED_LOAD is false, then all loads must be aligned.
+   BZERO_FIRST may be true only when the first store covers the whole group
+   and clears it; if BZERO_FIRST is true, keep that first store in the set
+   unmodified and emit further stores for the overrides only.
    If SPLIT_STORES is NULL, it is just a dry run to count number of
    new stores.  */
 
 static unsigned int
 split_group (merged_store_group *group, bool allow_unaligned_store,
-	     bool allow_unaligned_load,
+	     bool allow_unaligned_load, bool bzero_first,
 	     vec<struct split_store *> *split_stores,
 	     unsigned *total_orig,
 	     unsigned *total_new)
@@ -3207,6 +3252,7 @@ split_group (merged_store_group *group, bool allow_unaligned_store,
   if (group->stores[0]->rhs_code == LROTATE_EXPR
       || group->stores[0]->rhs_code == NOP_EXPR)
     {
+      gcc_assert (!bzero_first);
      /* For bswap framework using sets of stores, all the checking
	 has been done earlier in try_coalesce_bswap and needs to be
	 emitted as a single store.  */
@@ -3278,10 +3324,26 @@ split_group (merged_store_group *group, bool allow_unaligned_store,
 	if (group->load_align[i])
 	  group_load_align = MIN (group_load_align, group->load_align[i]);
 
+  if (bzero_first)
+    {
+      first = 1;
+      ret = 1;
+      if (split_stores)
+	{
+	  struct split_store *store
+	    = new split_store (bytepos, group->stores[0]->bitsize, align_base);
+	  store->orig_stores.safe_push (group->stores[0]);
+	  store->orig = true;
+	  any_orig = true;
+	  split_stores->safe_push (store);
+	}
+    }
+
   while (size > 0)
     {
       if ((allow_unaligned_store || group_align <= BITS_PER_UNIT)
-	  && group->mask[try_pos - bytepos] == (unsigned char) ~0U)
+	  && (group->mask[try_pos - bytepos] == (unsigned char) ~0U
+	      || (bzero_first && group->val[try_pos - bytepos] == 0)))
 	{
 	  /* Skip padding bytes.  */
 	  ++try_pos;
@@ -3348,7 +3410,9 @@ split_group (merged_store_group *group, bool allow_unaligned_store,
       /* Now look for whole padding bytes at the end of that bitsize.  */
       for (nonmasked = try_size / BITS_PER_UNIT; nonmasked > 0; --nonmasked)
 	if (group->mask[try_pos - bytepos + nonmasked - 1]
-	    != (unsigned char) ~0U)
+	    != (unsigned char) ~0U
+	    && (!bzero_first
+		|| group->val[try_pos - bytepos + nonmasked - 1] != 0))
 	  break;
       if (nonmasked == 0)
 	{
@@ -3367,7 +3431,9 @@ split_group (merged_store_group *group, bool allow_unaligned_store,
 	  /* Now look for whole padding bytes at the start of that bitsize.  */
 	  unsigned int try_bytesize = try_size / BITS_PER_UNIT, masked;
 	  for (masked = 0; masked < try_bytesize; ++masked)
-	    if (group->mask[try_pos - bytepos + masked] != (unsigned char) ~0U)
+	    if (group->mask[try_pos - bytepos + masked] != (unsigned char) ~0U
+		&& (!bzero_first
+		    || group->val[try_pos - bytepos + masked] != 0))
 	      break;
 	  masked *= BITS_PER_UNIT;
 	  gcc_assert (masked < try_size);
@@ -3583,20 +3649,44 @@ imm_store_chain_info::output_merged_store (merged_store_group *group)
   bool allow_unaligned_store
     = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED);
   bool allow_unaligned_load = allow_unaligned_store;
-  if (allow_unaligned_store)
+  bool bzero_first = false;
+  if (group->stores[0]->rhs_code == INTEGER_CST
+      && TREE_CODE (gimple_assign_rhs1 (group->stores[0]->stmt)) == CONSTRUCTOR
+      && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (group->stores[0]->stmt)) == 0
+      && group->start == group->stores[0]->bitpos
+      && group->width == group->stores[0]->bitsize
+      && (group->start % BITS_PER_UNIT) == 0
+      && (group->width % BITS_PER_UNIT) == 0)
+    bzero_first = true;
+  if (allow_unaligned_store || bzero_first)
     {
       /* If unaligned stores are allowed, see how many stores we'd emit
	 for unaligned and how many stores we'd emit for aligned stores.
-	 Only use unaligned stores if it allows fewer stores than aligned.  */
-      unsigned aligned_cnt
-	= split_group (group, false, allow_unaligned_load, NULL, NULL, NULL);
-      unsigned unaligned_cnt
-	= split_group (group, true, allow_unaligned_load, NULL, NULL, NULL);
-      if (aligned_cnt <= unaligned_cnt)
+	 Only use unaligned stores if it allows fewer stores than aligned.
+	 Similarly, if there is a whole region clear first, prefer expanding
+	 it together compared to expanding clear first followed by merged
+	 further stores.  */
+      unsigned cnt[4] = { ~0, ~0, ~0, ~0 };
+      int pass_min = 0;
+      for (int pass = 0; pass < 4; ++pass)
+	{
+	  if (!allow_unaligned_store && (pass & 1) != 0)
+	    continue;
+	  if (!bzero_first && (pass & 2) != 0)
+	    continue;
+	  cnt[pass] = split_group (group, (pass & 1) != 0,
+				   allow_unaligned_load, (pass & 2) != 0,
+				   NULL, NULL, NULL);
+	  if (cnt[pass] < cnt[pass_min])
+	    pass_min = pass;
+	}
+      if ((pass_min & 1) == 0)
 	allow_unaligned_store = false;
+      if ((pass_min & 2) == 0)
+	bzero_first = false;
     }
   unsigned total_orig, total_new;
-  split_group (group, allow_unaligned_store, allow_unaligned_load,
+  split_group (group, allow_unaligned_store, allow_unaligned_load, bzero_first,
 	       &split_stores, &total_orig, &total_new);
 
   if (split_stores.length () >= orig_num_stmts)
@@ -4164,7 +4254,8 @@ lhs_valid_for_store_merging_p (tree lhs)
   tree_code code = TREE_CODE (lhs);
 
   if (code == ARRAY_REF || code == ARRAY_RANGE_REF || code == MEM_REF
-      || code == COMPONENT_REF || code == BIT_FIELD_REF)
+      || code == COMPONENT_REF || code == BIT_FIELD_REF
+      || DECL_P (lhs))
     return true;
 
   return false;
@@ -4178,6 +4269,12 @@ static bool
 rhs_valid_for_store_merging_p (tree rhs)
 {
   unsigned HOST_WIDE_INT size;
+  if (TREE_CODE (rhs) == CONSTRUCTOR
+      && !TREE_CLOBBER_P (rhs)
+      && CONSTRUCTOR_NELTS (rhs) == 0
+      && TYPE_SIZE_UNIT (TREE_TYPE (rhs))
+      && tree_fits_uhwi_p (TYPE_SIZE_UNIT (TREE_TYPE (rhs))))
+    return true;
   return (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (rhs))).is_constant (&size)
 	  && native_encode_expr (rhs, NULL, size) != 0);
 }
@@ -4363,7 +4460,9 @@ pass_store_merging::process_store (gimple *stmt)
   bool invalid = (base_addr == NULL_TREE
 		  || (maybe_gt (bitsize,
 				(unsigned int) MAX_BITSIZE_MODE_ANY_INT)
-		      && (TREE_CODE (rhs) != INTEGER_CST)));
+		      && TREE_CODE (rhs) != INTEGER_CST
+		      && (TREE_CODE (rhs) != CONSTRUCTOR
+			  || CONSTRUCTOR_NELTS (rhs) != 0)));
   enum tree_code rhs_code = ERROR_MARK;
   bool bit_not_p = false;
   struct symbolic_number n;

gcc/params.def

@@ -1205,6 +1205,11 @@ DEFPARAM (PARAM_MAX_STORES_TO_MERGE,
 	  "store merging pass.",
 	  64, 2, 0)
 
+DEFPARAM (PARAM_STORE_MERGING_MAX_SIZE,
+	  "store-merging-max-size",
+	  "Maximum size of a single store merging region in bytes.",
+	  65536, 1, 1)
+
 DEFPARAM (PARAM_MAX_TAIL_MERGE_ITERATIONS,
 	  "max-tail-merge-iterations",
 	  "Maximum amount of iterations of the pass over a function.",

gcc/testsuite/ChangeLog

@@ -1,3 +1,12 @@
+2019-05-06  Jakub Jelinek  <jakub@redhat.com>
+
+	PR tree-optimization/88709
+	PR tree-optimization/90271
+	* gcc.dg/store_merging_26.c: New test.
+	* gcc.dg/store_merging_27.c: New test.
+	* gcc.dg/store_merging_28.c: New test.
+	* gcc.dg/store_merging_29.c: New test.
+
 2019-05-06  Kelvin Nilsen  <kelvin@gcc.gnu.org>
 
 	PR target/89424

gcc/testsuite/gcc.dg/store_merging_26.c

@@ -0,0 +1,36 @@
/* PR tree-optimization/90271 */
/* { dg-do run { target int32 } } */
/* { dg-require-effective-target store_merge } */
/* { dg-options "-O2 -fdump-tree-store-merging-details" } */
/* { dg-final { scan-tree-dump "New sequence of 1 stores to replace old one of 2 stores" "store-merging" } } */
__attribute__((noipa)) void
foo (int *x)
{
asm volatile ("" : : "r" (x) : "memory");
}
__attribute__((noipa)) int
bar ()
{
int x;
foo (&x);
x = 3;
((char *) &x)[1] = 1;
foo (&x);
return x;
}
int
main ()
{
int x;
foo (&x);
x = 3;
foo (&x);
((char *) &x)[1] = 1;
foo (&x);
if (x != bar ())
__builtin_abort ();
return 0;
}

gcc/testsuite/gcc.dg/store_merging_27.c

@@ -0,0 +1,26 @@
/* PR tree-optimization/88709 */
/* { dg-do run } */
/* { dg-require-effective-target store_merge } */
/* { dg-options "-O2 -fdump-tree-store-merging-details" } */
/* { dg-final { scan-tree-dump "New sequence of \[12] stores to replace old one of 3 stores" "store-merging" } } */
struct S { char buf[8]; };
__attribute__((noipa)) void
bar (struct S *x)
{
int i;
for (i = 0; i < 8; i++)
if (x->buf[i] != ((i == 1) + (i == 3) * 2))
__builtin_abort ();
}
int
main ()
{
struct S s = {};
s.buf[1] = 1;
s.buf[3] = 2;
bar (&s);
return 0;
}

gcc/testsuite/gcc.dg/store_merging_28.c

@@ -0,0 +1,44 @@
/* PR tree-optimization/88709 */
/* { dg-do compile } */
/* { dg-require-effective-target store_merge } */
/* { dg-options "-O2 -fno-ipa-icf -fdump-tree-store-merging-details" } */
/* { dg-final { scan-tree-dump-times "New sequence of \[24] stores to replace old one of 16 stores" 8 "store-merging" { target { i?86-*-* x86_64-*-* } } } } */
/* { dg-final { scan-tree-dump-times "New sequence of \[24] stores to replace old one of 6 stores" 1 "store-merging" } } */
typedef struct S { char data[16]; } S;
void optimize_me (S);
void optimize_me3 (S, S, S);
void
good ()
{
optimize_me ((S) { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 });
}
void
bad ()
{
optimize_me ((S) { 1, 2, 3, 4, 5 });
}
void
why ()
{
optimize_me ((S) { 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
}
void
srsly ()
{
optimize_me3 ((S) { 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
(S) { 11, 12, 13, 14, 15, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 },
(S) { 21, 22, 23, 24, 25, 20, 20, 20, 10, 20, 20, 20, 20, 20, 20 });
}
void
srsly_not_one_missing ()
{
optimize_me3 ((S) { 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
(S) { 11, 12, 13, 14, 15, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 },
(S) { 21, 22, 23, 24, 25, 20, 20, 20, 10, 20, 20, 20, 20, 20, 20, 11 });
}

gcc/testsuite/gcc.dg/store_merging_29.c

@@ -0,0 +1,33 @@
/* PR tree-optimization/88709 */
/* { dg-do run { target int32 } } */
/* { dg-require-effective-target store_merge } */
/* { dg-options "-O2 -fdump-tree-store-merging-details" } */
/* { dg-final { scan-tree-dump "New sequence of 3 stores to replace old one of 6 stores" "store-merging" { target le } } } */
/* { dg-final { scan-tree-dump "New sequence of \[34] stores to replace old one of 6 stores" "store-merging" { target be } } } */
struct T { char a[1024]; };
__attribute__((noipa)) void
bar (struct T *t)
{
int x = 0x506;
if (__builtin_memcmp (&t->a[97], &x, sizeof (x)))
__builtin_abort ();
__builtin_memset (&t->a[97], '\0', sizeof (x));
for (int i = 0; i < 8; ++i)
if (t->a[i] != ((i == 54) + 2 * (i == 52) + 3 * (i == 95) + 4 * (i == 96)))
__builtin_abort ();
}
int
main ()
{
struct T t = {};
t.a[54] = 1;
t.a[52] = 2;
t.a[95] = 3;
t.a[96] = 4;
int x = 0x506;
__builtin_memcpy (&t.a[97], &x, sizeof (x));
bar (&t);
}