Run pass_sink_code once more before store_merging
The gimple sink code pass runs quite early, so later gimple optimization passes may expose new sinking opportunities. This patch runs the sink code pass once more, before store_merging. For a detailed discussion, please refer to: https://gcc.gnu.org/pipermail/gcc-patches/2020-December/562352.html Tested SPEC2017 performance on P8LE: 544.nab_r improves by 2.43%, with no big changes to the other cases; the GEOMEAN improves only slightly, by 0.25%. gcc/ChangeLog: 2021-05-18 Xionghu Luo <luoxhu@linux.ibm.com> * passes.def: Add sink_code pass before store_merging. * tree-ssa-sink.c (pass_sink_code::clone): New. gcc/testsuite/ChangeLog: 2021-05-18 Xionghu Luo <luoxhu@linux.ibm.com> * gcc.dg/tree-ssa/ssa-sink-1.c: Adjust. * gcc.dg/tree-ssa/ssa-sink-2.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-3.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-4.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-5.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-6.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-7.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-8.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-9.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-10.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-13.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-14.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-16.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-17.c: Ditto. * gcc.dg/tree-ssa/ssa-sink-18.c: New.
This commit is contained in:
parent
39ed6a88c7
commit
de56f95afa
|
@ -348,6 +348,7 @@ along with GCC; see the file COPYING3. If not see
|
|||
NEXT_PASS (pass_phiopt, false /* early_p */);
|
||||
NEXT_PASS (pass_fold_builtins);
|
||||
NEXT_PASS (pass_optimize_widening_mul);
|
||||
NEXT_PASS (pass_sink_code);
|
||||
NEXT_PASS (pass_store_merging);
|
||||
NEXT_PASS (pass_tail_calls);
|
||||
/* If DCE is not run before checking for uninitialized uses,
|
||||
|
|
|
@ -7,4 +7,4 @@ foo (int a, int b, int c)
|
|||
return c ? x : a;
|
||||
}
|
||||
/* We should sink the x = a * b calculation into the branch that returns x. */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements: 1" 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements: 1" 1 "sink1" } } */
|
||||
|
|
|
@ -16,4 +16,4 @@ void foo (void)
|
|||
}
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "Sinking # VUSE" 4 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sinking # VUSE" 4 "sink1" } } */
|
||||
|
|
|
@ -21,5 +21,5 @@ void test ()
|
|||
|
||||
/* We should sink/merge all stores and end up with a single BB. */
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "MEM\[^\n\r\]* = 0;" 3 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "<bb " 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "MEM\[^\n\r\]* = 0;" 3 "sink1" } } */
|
||||
/* { dg-final { scan-tree-dump-times "<bb " 1 "sink1" } } */
|
||||
|
|
|
@ -13,5 +13,5 @@ void foo (int b)
|
|||
/* We should have sunk the store and inserted a PHI to merge the
|
||||
stored values. */
|
||||
|
||||
/* { dg-final { scan-tree-dump-times " = PHI" 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "x = " 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times " = PHI" 1 "sink1" } } */
|
||||
/* { dg-final { scan-tree-dump-times "x = " 1 "sink1" } } */
|
||||
|
|
|
@ -10,5 +10,5 @@ int f(int n)
|
|||
return j;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump "Sinking j_. = __builtin_ffs" "sink" } } */
|
||||
/* { dg-final { scan-tree-dump "Sinking j_. = __builtin_ffs" "sink1" } } */
|
||||
/* { dg-final { scan-tree-dump "return 2;" "optimized" } } */
|
||||
|
|
|
@ -12,4 +12,4 @@ int my_f(int a, int b)
|
|||
}
|
||||
|
||||
/* We should sink the call to pure_f to the if block. */
|
||||
/* { dg-final { scan-tree-dump "Sinking # VUSE" "sink" } } */
|
||||
/* { dg-final { scan-tree-dump "Sinking # VUSE" "sink1" } } */
|
||||
|
|
|
@ -0,0 +1,212 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -fdump-tree-sink-stats" } */
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define HLOG 16
|
||||
#define MAX_LIT (1 << 5)
|
||||
typedef const uint8_t *LZF_HSLOT;
|
||||
typedef LZF_HSLOT LZF_STATE[1 << (HLOG)];
|
||||
|
||||
/* Test input for the sink pass: a byte-oriented encoder that walks IN_DATA
   and writes either back-references (an offset/length pair) or runs of
   literal bytes to OUT_DATA.  It presumably derives from an LZF-style
   compressor (see the LZF_HSLOT / LZF_STATE typedefs) -- TODO confirm.

   IN_DATA/IN_LEN describe the input buffer and OUT_DATA/OUT_LEN the output
   buffer.  Returns op - out_data, i.e. the distance the output cursor has
   advanced, or 0 when either length is zero or one of the checks against
   out_end fails.

   NOTE(review): htab has no initializer and is read through *hslot in the
   loop below; the test is compile-only (dg-do compile), and the statement
   shape must stay as is because the dg-final directive counts the sunk
   statements.  */
int
|
||||
compute_on_bytes (uint8_t *in_data, int in_len, uint8_t *out_data, int out_len)
|
||||
{
|
||||
LZF_STATE htab;
|
||||
|
||||
uint8_t *ip = in_data;
|
||||
uint8_t *op = out_data;
|
||||
uint8_t *in_end = ip + in_len;
|
||||
uint8_t *out_end = op + out_len;
|
||||
uint8_t *ref;
|
||||
|
||||
unsigned long off;
|
||||
unsigned int hval;
|
||||
int lit;
|
||||
|
||||
if (!in_len || !out_len)
|
||||
return 0;
|
||||
|
||||
/* Start with an empty literal run.  The output byte skipped by op++ is
   filled in later through op[-lit - 1] with the run length minus one.  */
lit = 0;
|
||||
op++;
|
||||
hval = (((ip[0]) << 8) | ip[1]);
|
||||
|
||||
/* Main loop: runs while more than two input bytes remain.  */
while (ip < in_end - 2)
|
||||
{
|
||||
uint8_t *hslot;
|
||||
|
||||
hval = (((hval) << 8) | ip[2]);
|
||||
hslot = (uint8_t*)(htab + (((hval >> (3 * 8 - 16)) - hval * 5) & ((1 << (16)) - 1)));
|
||||
|
||||
ref = *hslot + in_data;
|
||||
*hslot = ip - in_data;
|
||||
|
||||
/* Take the back-reference path when the candidate REF is close enough
   (offset below 1 << 13), lies after the start of the input, and its first
   three bytes match those at IP.  */
if (1 && (off = ip - ref - 1) < (1 << 13) && ref > in_data
|
||||
&& ref[2] == ip[2]
|
||||
&& ((ref[1] << 8) | ref[0]) == ((ip[1] << 8) | ip[0]))
|
||||
{
|
||||
unsigned int len = 2;
|
||||
unsigned int maxlen = in_end - ip - len;
|
||||
maxlen
|
||||
= maxlen > ((1 << 8) + (1 << 3)) ? ((1 << 8) + (1 << 3)) : maxlen;
|
||||
|
||||
if ((op + 3 + 1 >= out_end) != 0)
|
||||
if (op - !lit + 3 + 1 >= out_end)
|
||||
return 0;
|
||||
|
||||
op[-lit - 1] = lit - 1;
|
||||
op -= !lit;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
/* When more than 16 bytes may be compared, check the next 16 positions
   with an unrolled sequence; the first mismatch leaves the for (;;).  */
if (maxlen > 16)
|
||||
{
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
len++;
|
||||
if (ref[len] != ip[len])
|
||||
break;
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
len++;
|
||||
}
|
||||
while (len < maxlen && ip[len] == ref[len]);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
len -= 2;
|
||||
ip++;
|
||||
|
||||
if (len < 7)
|
||||
{
|
||||
*op++ = (off >> 8) + (len << 5);
|
||||
}
|
||||
else
|
||||
{
|
||||
*op++ = (off >> 8) + (7 << 5);
|
||||
*op++ = len - 7;
|
||||
}
|
||||
*op++ = off;
|
||||
lit = 0;
|
||||
op++;
|
||||
ip += len + 1;
|
||||
|
||||
if (ip >= in_end - 2)
|
||||
break;
|
||||
|
||||
--ip;
|
||||
--ip;
|
||||
|
||||
hval = (((ip[0]) << 8) | ip[1]);
|
||||
hval = (((hval) << 8) | ip[2]);
|
||||
htab[(((hval >> (3 * 8 - 16)) - hval * 5) & ((1 << (16)) - 1))]
|
||||
= (LZF_HSLOT)(ip - in_data);
|
||||
ip++;
|
||||
|
||||
hval = (((hval) << 8) | ip[2]);
|
||||
htab[(((hval >> (3 * 8 - 16)) - hval * 5) & ((1 << (16)) - 1))]
|
||||
= (LZF_HSLOT)(ip - in_data);
|
||||
ip++;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* No back-reference taken: copy one literal byte, and close the run once
   it reaches 1 << 5 bytes.  */
if (op >= out_end)
|
||||
return 0;
|
||||
|
||||
lit++;
|
||||
*op++ = *ip++;
|
||||
|
||||
if (lit == (1 << 5))
|
||||
{
|
||||
op[-lit - 1] = lit - 1;
|
||||
lit = 0;
|
||||
op++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op + 3 > out_end) /* at most 3 bytes can be missing here */
|
||||
return 0;
|
||||
|
||||
/* Copy the remaining input bytes as literals.  */
while (ip < in_end)
|
||||
{
|
||||
lit++;
|
||||
*op++ = *ip++;
|
||||
if (lit == MAX_LIT)
|
||||
{
|
||||
op[-lit - 1] = lit - 1; /* stop run */
|
||||
lit = 0;
|
||||
op++; /* start run */
|
||||
}
|
||||
}
|
||||
|
||||
op[-lit - 1] = lit - 1; /* end run */
|
||||
op -= !lit; /* undo run if length is zero */
|
||||
|
||||
return op - out_data;
|
||||
}
|
||||
|
||||
/* For this case, pass sink2 sinks statements from the hot loop header to the loop
|
||||
exits after the gimple loop optimizations, which generate instructions that are
|
||||
executed on each loop iteration although their results are only used outside of the loop:
|
||||
With -m64,
|
||||
"Sinking _367 = (uint8_t *) _320;
|
||||
from bb 31 to bb 90
|
||||
Sinking _320 = _321 + ivtmp.25_326;
|
||||
from bb 31 to bb 90
|
||||
Sinking _321 = (unsigned long) ip_229;
|
||||
from bb 31 to bb 90
|
||||
Sinking len_158 = _322 + 4294967295;
|
||||
from bb 31 to bb 33"
|
||||
With -m32, Power and x86 sink 3 instructions, but arm ilp32 cannot
|
||||
sink them because ivopts chooses two IV candidates instead of one, which is
|
||||
expected, so this case is restricted to lp64 only so far. */
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements: 4" 1 "sink2" { target lp64 } } } */
|
|
@ -9,4 +9,4 @@ bar (int a, int b, int c)
|
|||
return y;
|
||||
}
|
||||
/* We should sink the x = a * b calculation into the else branch */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements: 1" 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements: 1" 1 "sink1" } } */
|
||||
|
|
|
@ -1,15 +0,0 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -fdump-tree-sink-stats" } */
|
||||
extern void foo(int a);
|
||||
/* Test input: A is assigned argc + 1 before the branch but is only used as
   the argument of the call to foo inside the branch.  The statement shape
   is what the dg-final directive checks, so it must stay as is.  */
int
|
||||
main (int argc)
|
||||
{
|
||||
int a;
|
||||
a = argc + 1;
|
||||
if (argc + 3)
|
||||
{
|
||||
foo (a);
|
||||
}
|
||||
}
|
||||
/* We should sink the a = argc + 1 calculation into the if branch */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements: 1" 1 "sink" } } */
|
|
@ -17,4 +17,4 @@ main (int argc)
|
|||
foo2 (a);
|
||||
}
|
||||
/* We should sink the first a = b + c calculation into the else branch */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements: 1" 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements: 1" 1 "sink1" } } */
|
||||
|
|
|
@ -44,4 +44,4 @@ void foo(int16_t runs[], uint8_t alpha[], int x, int count)
|
|||
}
|
||||
|
||||
/* We should not sink the next_runs = runs + x calculation after the loop. */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements:" 0 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sunk statements:" 0 "sink1" } } */
|
||||
|
|
|
@ -14,4 +14,4 @@ int foo(int *a, int r)
|
|||
|
||||
/* *a = 1 should be sunk to the else block. */
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "Sinking" 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sinking" 1 "sink1" } } */
|
||||
|
|
|
@ -15,4 +15,4 @@ int foo(int *a, int r, short *b)
|
|||
|
||||
/* *a = 1 should be sunk to the else block. */
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "Sinking" 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sinking" 1 "sink1" } } */
|
||||
|
|
|
@ -24,4 +24,4 @@ int foo(int *a, int r, short *b)
|
|||
|
||||
/* *a = 1 should be sunk into the default case. */
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "Sinking" 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sinking" 1 "sink1" } } */
|
||||
|
|
|
@ -15,4 +15,4 @@ int foo(int *a, int r, int *b)
|
|||
|
||||
/* *a = 1 should be sunk to the else block. */
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "Sinking" 1 "sink" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Sinking" 1 "sink1" } } */
|
||||
|
|
|
@ -819,6 +819,7 @@ public:
|
|||
/* opt_pass methods: */
|
||||
/* Gate: true only when flag_tree_sink is nonzero.  */
virtual bool
gate (function *)
{
  const bool sinking_enabled = flag_tree_sink != 0;
  return sinking_enabled;
}
|
||||
virtual unsigned int execute (function *);
|
||||
/* Return a newly allocated pass_sink_code built from the same context
   (m_ctxt) as this instance.  */
opt_pass *
clone (void)
{
  opt_pass *copy = new pass_sink_code (m_ctxt);
  return copy;
}
|
||||
|
||||
}; // class pass_sink_code
|
||||
|
||||
|
|
Loading…
Reference in New Issue