ldist: Recognize strlen and rawmemchr like loops

This patch adds support for recognizing loops which mimic the behaviour
of the functions strlen and rawmemchr, and replaces them with internal
function calls where the target provides them.  In contrast to the
standard strlen and rawmemchr functions, the transformation also handles
loops whose memory accesses are 8, 16, or 32 bits wide.
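
For illustration, here is a minimal sketch of the kind of loops the pass
recognizes (modelled on the new tests; the function names and the constant
are made up, and whether a call is actually emitted depends on the target
providing the corresponding rawmemchr<mode> pattern or strlen builtin):

  #include <stdint.h>

  uint32_t *
  scan_u32 (uint32_t *p)
  {
    /* rawmemchr-like: may be replaced by a call to the internal
       function .RAWMEMCHR, e.g. via a rawmemchrsi pattern.  */
    while (*p != 0xdeadbeef)
      ++p;
    return p;
  }

  int
  count_chars (char *s)
  {
    /* strlen-like: may be replaced by i = (int) strlen (s).  */
    int i;
    for (i = 0; s[i]; ++i)
      ;
    return i;
  }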

gcc/ChangeLog:

	* builtins.c (get_memory_rtx): Change to external linkage.
	* builtins.h (get_memory_rtx): Add function prototype.
	* doc/md.texi (rawmemchr<mode>): Document.
	* internal-fn.c (expand_RAWMEMCHR): Define.
	* internal-fn.def (RAWMEMCHR): Add.
	* optabs.def (rawmemchr_optab): Add.
	* tree-loop-distribution.c (find_single_drs): Change return code
	behaviour by also returning true if no single store was found
	but a single load.
	(loop_distribution::classify_partition): Respect the new return
	code behaviour of function find_single_drs.
	(loop_distribution::execute): Call new function
	transform_reduction_loop in order to replace rawmemchr or strlen
	like loops by calls into builtins.
	(generate_reduction_builtin_1): New function.
	(generate_rawmemchr_builtin): New function.
	(generate_strlen_builtin_1): New function.
	(generate_strlen_builtin): New function.
	(generate_strlen_builtin_using_rawmemchr): New function.
	(reduction_var_overflows_first): New function.
	(determine_reduction_stmt_1): New function.
	(determine_reduction_stmt): New function.
	(loop_distribution::transform_reduction_loop): New function.

gcc/testsuite/ChangeLog:

	* gcc.dg/tree-ssa/ldist-rawmemchr-1.c: New test.
	* gcc.dg/tree-ssa/ldist-rawmemchr-2.c: New test.
	* gcc.dg/tree-ssa/ldist-strlen-1.c: New test.
	* gcc.dg/tree-ssa/ldist-strlen-2.c: New test.
	* gcc.dg/tree-ssa/ldist-strlen-3.c: New test.
Author: Stefan Schulze Frielinghaus
Date:   2021-10-11 09:59:13 +02:00
Commit: 6f966f0614
Parent: 9abf8c9534
12 changed files with 855 additions and 31 deletions

gcc/builtins.c

@@ -105,7 +105,6 @@ builtin_info_type builtin_info[(int)END_BUILTINS];
bool force_folding_builtin_constant_p;
static int target_char_cast (tree, char *);
static rtx get_memory_rtx (tree, tree);
static int apply_args_size (void);
static int apply_result_size (void);
static rtx result_vector (int, rtx);
@@ -1355,7 +1354,7 @@ expand_builtin_prefetch (tree exp)
the maximum length of the block of memory that might be accessed or
NULL if unknown. */
static rtx
rtx
get_memory_rtx (tree exp, tree len)
{
tree orig_exp = exp;

gcc/builtins.h

@@ -146,6 +146,7 @@ extern char target_percent_s[3];
extern char target_percent_c[3];
extern char target_percent_s_newline[4];
extern bool target_char_cst_p (tree t, char *p);
extern rtx get_memory_rtx (tree exp, tree len);
extern internal_fn associated_internal_fn (tree);
extern internal_fn replacement_internal_fn (gcall *);

gcc/doc/md.texi

@@ -6697,6 +6697,13 @@ operand 2 is the character to search for (normally zero),
and operand 3 is a constant describing the known alignment
of the beginning of the string.
@cindex @code{rawmemchr@var{m}} instruction pattern
@item @samp{rawmemchr@var{m}}
Scan memory referred to by operand 1 for the first occurrence of operand 2.
Operand 1 is a @code{mem} and operand 2 a @code{const_int} of mode @var{m}.
Operand 0 is the result, i.e., a pointer to the first occurrence of operand 2
in the memory block given by operand 1.
@cindex @code{float@var{m}@var{n}2} instruction pattern
@item @samp{float@var{m}@var{n}2}
Convert signed integer operand 1 (valid for fixed point mode @var{m}) to

gcc/internal-fn.c

@@ -2934,6 +2934,36 @@ expand_VEC_CONVERT (internal_fn, gcall *)
gcc_unreachable ();
}
/* Expand the IFN_RAWMEMCHR internal function. */
void
expand_RAWMEMCHR (internal_fn, gcall *stmt)
{
expand_operand ops[3];
tree lhs = gimple_call_lhs (stmt);
if (!lhs)
return;
machine_mode lhs_mode = TYPE_MODE (TREE_TYPE (lhs));
rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
create_output_operand (&ops[0], lhs_rtx, lhs_mode);
tree mem = gimple_call_arg (stmt, 0);
rtx mem_rtx = get_memory_rtx (mem, NULL);
create_fixed_operand (&ops[1], mem_rtx);
tree pattern = gimple_call_arg (stmt, 1);
machine_mode mode = TYPE_MODE (TREE_TYPE (pattern));
rtx pattern_rtx = expand_normal (pattern);
create_input_operand (&ops[2], pattern_rtx, mode);
insn_code icode = direct_optab_handler (rawmemchr_optab, mode);
expand_insn (icode, 3, ops);
if (!rtx_equal_p (lhs_rtx, ops[0].value))
emit_move_insn (lhs_rtx, ops[0].value);
}
/* Expand the IFN_UNIQUE function according to its first argument. */
static void

gcc/internal-fn.def

@@ -352,6 +352,7 @@ DEF_INTERNAL_FN (MUL_OVERFLOW, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (TSAN_FUNC_EXIT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (VA_ARG, ECF_NOTHROW | ECF_LEAF, NULL)
DEF_INTERNAL_FN (VEC_CONVERT, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
DEF_INTERNAL_FN (RAWMEMCHR, ECF_PURE | ECF_LEAF | ECF_NOTHROW, NULL)
/* An unduplicable, uncombinable function. Generally used to preserve
a CFG property in the face of jump threading, tail merging or

gcc/optabs.def

@@ -267,6 +267,7 @@ OPTAB_D (cpymem_optab, "cpymem$a")
OPTAB_D (movmem_optab, "movmem$a")
OPTAB_D (setmem_optab, "setmem$a")
OPTAB_D (strlen_optab, "strlen$a")
OPTAB_D (rawmemchr_optab, "rawmemchr$a")
OPTAB_DC(fma_optab, "fma$a4", FMA)
OPTAB_D (fms_optab, "fms$a4")

gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c

@@ -0,0 +1,72 @@
/* { dg-do run { target s390x-*-* } } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */
/* Rawmemchr pattern: reduction stmt and no store */
#include <stdint.h>
#include <assert.h>
typedef __SIZE_TYPE__ size_t;
extern void* malloc (size_t);
extern void* memset (void*, int, size_t);
#define test(T, pattern) \
__attribute__((noinline)) \
T *test_##T (T *p) \
{ \
while (*p != (T)pattern) \
++p; \
return p; \
}
test (uint8_t, 0xab)
test (uint16_t, 0xabcd)
test (uint32_t, 0xabcdef15)
test (int8_t, 0xab)
test (int16_t, 0xabcd)
test (int32_t, 0xabcdef15)
#define run(T, pattern, i) \
{ \
T *q = p; \
q[i] = (T)pattern; \
assert (test_##T (p) == &q[i]); \
q[i] = 0; \
}
int main(void)
{
void *p = malloc (1024);
assert (p);
memset (p, 0, 1024);
run (uint8_t, 0xab, 0);
run (uint8_t, 0xab, 1);
run (uint8_t, 0xab, 13);
run (uint16_t, 0xabcd, 0);
run (uint16_t, 0xabcd, 1);
run (uint16_t, 0xabcd, 13);
run (uint32_t, 0xabcdef15, 0);
run (uint32_t, 0xabcdef15, 1);
run (uint32_t, 0xabcdef15, 13);
run (int8_t, 0xab, 0);
run (int8_t, 0xab, 1);
run (int8_t, 0xab, 13);
run (int16_t, 0xabcd, 0);
run (int16_t, 0xabcd, 1);
run (int16_t, 0xabcd, 13);
run (int32_t, 0xabcdef15, 0);
run (int32_t, 0xabcdef15, 1);
run (int32_t, 0xabcdef15, 13);
return 0;
}

gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c

@@ -0,0 +1,83 @@
/* { dg-do run { target s390x-*-* } } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */
/* Rawmemchr pattern: reduction stmt and store */
#include <stdint.h>
#include <assert.h>
typedef __SIZE_TYPE__ size_t;
extern void* malloc (size_t);
extern void* memset (void*, int, size_t);
uint8_t *p_uint8_t;
uint16_t *p_uint16_t;
uint32_t *p_uint32_t;
int8_t *p_int8_t;
int16_t *p_int16_t;
int32_t *p_int32_t;
#define test(T, pattern) \
__attribute__((noinline)) \
T *test_##T (void) \
{ \
while (*p_##T != pattern) \
++p_##T; \
return p_##T; \
}
test (uint8_t, 0xab)
test (uint16_t, 0xabcd)
test (uint32_t, 0xabcdef15)
test (int8_t, (int8_t)0xab)
test (int16_t, (int16_t)0xabcd)
test (int32_t, (int32_t)0xabcdef15)
#define run(T, pattern, i) \
{ \
T *q = p; \
q[i] = pattern; \
p_##T = p; \
T *r = test_##T (); \
assert (r == p_##T); \
assert (r == &q[i]); \
q[i] = 0; \
}
int main(void)
{
void *p = malloc (1024);
assert (p);
memset (p, '\0', 1024);
run (uint8_t, 0xab, 0);
run (uint8_t, 0xab, 1);
run (uint8_t, 0xab, 13);
run (uint16_t, 0xabcd, 0);
run (uint16_t, 0xabcd, 1);
run (uint16_t, 0xabcd, 13);
run (uint32_t, 0xabcdef15, 0);
run (uint32_t, 0xabcdef15, 1);
run (uint32_t, 0xabcdef15, 13);
run (int8_t, (int8_t)0xab, 0);
run (int8_t, (int8_t)0xab, 1);
run (int8_t, (int8_t)0xab, 13);
run (int16_t, (int16_t)0xabcd, 0);
run (int16_t, (int16_t)0xabcd, 1);
run (int16_t, (int16_t)0xabcd, 13);
run (int32_t, (int32_t)0xabcdef15, 0);
run (int32_t, (int32_t)0xabcdef15, 1);
run (int32_t, (int32_t)0xabcdef15, 13);
return 0;
}

gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-1.c

@@ -0,0 +1,100 @@
/* { dg-do run } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
/* { dg-final { scan-tree-dump-times "generated strlenQI\n" 4 "ldist" } } */
/* { dg-final { scan-tree-dump-times "generated strlenHI\n" 4 "ldist" { target s390x-*-* } } } */
/* { dg-final { scan-tree-dump-times "generated strlenSI\n" 4 "ldist" { target s390x-*-* } } } */
#include <stdint.h>
#include <assert.h>
typedef __SIZE_TYPE__ size_t;
extern void* malloc (size_t);
extern void* memset (void*, int, size_t);
#define test(T, U) \
__attribute__((noinline)) \
U test_##T##U (T *s) \
{ \
U i; \
for (i=0; s[i]; ++i); \
return i; \
}
test (uint8_t, size_t)
test (uint16_t, size_t)
test (uint32_t, size_t)
test (uint8_t, int)
test (uint16_t, int)
test (uint32_t, int)
test (int8_t, size_t)
test (int16_t, size_t)
test (int32_t, size_t)
test (int8_t, int)
test (int16_t, int)
test (int32_t, int)
#define run(T, U, i) \
{ \
T *q = p; \
q[i] = 0; \
assert (test_##T##U (p) == i); \
memset (&q[i], 0xf, sizeof (T)); \
}
int main(void)
{
void *p = malloc (1024);
assert (p);
memset (p, 0xf, 1024);
run (uint8_t, size_t, 0);
run (uint8_t, size_t, 1);
run (uint8_t, size_t, 13);
run (int8_t, size_t, 0);
run (int8_t, size_t, 1);
run (int8_t, size_t, 13);
run (uint8_t, int, 0);
run (uint8_t, int, 1);
run (uint8_t, int, 13);
run (int8_t, int, 0);
run (int8_t, int, 1);
run (int8_t, int, 13);
run (uint16_t, size_t, 0);
run (uint16_t, size_t, 1);
run (uint16_t, size_t, 13);
run (int16_t, size_t, 0);
run (int16_t, size_t, 1);
run (int16_t, size_t, 13);
run (uint16_t, int, 0);
run (uint16_t, int, 1);
run (uint16_t, int, 13);
run (int16_t, int, 0);
run (int16_t, int, 1);
run (int16_t, int, 13);
run (uint32_t, size_t, 0);
run (uint32_t, size_t, 1);
run (uint32_t, size_t, 13);
run (int32_t, size_t, 0);
run (int32_t, size_t, 1);
run (int32_t, size_t, 13);
run (uint32_t, int, 0);
run (uint32_t, int, 1);
run (uint32_t, int, 13);
run (int32_t, int, 0);
run (int32_t, int, 1);
run (int32_t, int, 13);
return 0;
}

gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-2.c

@@ -0,0 +1,58 @@
/* { dg-do run } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
/* { dg-final { scan-tree-dump-times "generated strlenQI\n" 3 "ldist" } } */
#include <assert.h>
typedef __SIZE_TYPE__ size_t;
extern void* malloc (size_t);
extern void* memset (void*, int, size_t);
__attribute__((noinline))
int test_pos (char *s)
{
int i;
for (i=42; s[i]; ++i);
return i;
}
__attribute__((noinline))
int test_neg (char *s)
{
int i;
for (i=-42; s[i]; ++i);
return i;
}
__attribute__((noinline))
int test_including_null_char (char *s)
{
int i;
for (i=1; s[i-1]; ++i);
return i;
}
int main(void)
{
void *p = malloc (1024);
assert (p);
memset (p, 0xf, 1024);
char *s = (char *)p + 100;
s[42+13] = 0;
assert (test_pos (s) == 42+13);
s[42+13] = 0xf;
s[13] = 0;
assert (test_neg (s) == 13);
s[13] = 0xf;
s[-13] = 0;
assert (test_neg (s) == -13);
s[-13] = 0xf;
s[13] = 0;
assert (test_including_null_char (s) == 13+1);
return 0;
}

gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-3.c

@@ -0,0 +1,12 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
/* { dg-final { scan-tree-dump-times "generated strlenSI\n" 1 "ldist" { target s390x-*-* } } } */
extern int s[];
int test ()
{
int i = 0;
for (; s[i]; ++i);
return i;
}

gcc/tree-loop-distribution.c

@@ -116,6 +116,10 @@ along with GCC; see the file COPYING3. If not see
#include "tree-eh.h"
#include "gimple-fold.h"
#include "tree-affine.h"
#include "intl.h"
#include "rtl.h"
#include "memmodel.h"
#include "optabs.h"
#define MAX_DATAREFS_NUM \
@@ -651,6 +655,10 @@ class loop_distribution
control_dependences *cd, int *nb_calls, bool *destroy_p,
bool only_patterns_p);
/* Transform loops which mimic the effects of builtins rawmemchr or strlen and
replace them accordingly. */
bool transform_reduction_loop (loop_p loop);
/* Compute topological order for basic blocks. Topological order is
needed because data dependence is computed for data references in
lexicographical order. */
@@ -1492,14 +1500,14 @@ loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
data references. */
static bool
find_single_drs (class loop *loop, struct graph *rdg, partition *partition,
find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts,
data_reference_p *dst_dr, data_reference_p *src_dr)
{
unsigned i;
data_reference_p single_ld = NULL, single_st = NULL;
bitmap_iterator bi;
EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi)
{
gimple *stmt = RDG_STMT (rdg, i);
data_reference_p dr;
@@ -1540,44 +1548,47 @@ find_single_drs (class loop *loop, struct graph *rdg, partition *partition,
}
}
if (!single_st)
if (!single_ld && !single_st)
return false;
/* Bail out if this is a bitfield memory reference. */
if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
&& DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
return false;
/* Data reference must be executed exactly once per iteration of each
loop in the loop nest. We only need to check dominance information
against the outermost one in a perfect loop nest because a bb can't
dominate outermost loop's latch without dominating inner loop's. */
basic_block bb_st = gimple_bb (DR_STMT (single_st));
if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
return false;
basic_block bb_ld = NULL;
basic_block bb_st = NULL;
if (single_ld)
{
gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
/* Direct aggregate copy or via an SSA name temporary. */
if (load != store
&& gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
return false;
/* Bail out if this is a bitfield memory reference. */
if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
&& DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
return false;
/* Load and store must be in the same loop nest. */
basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
if (bb_st->loop_father != bb_ld->loop_father)
/* Data reference must be executed exactly once per iteration of each
loop in the loop nest. We only need to check dominance information
against the outermost one in a perfect loop nest because a bb can't
dominate outermost loop's latch without dominating inner loop's. */
bb_ld = gimple_bb (DR_STMT (single_ld));
if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
return false;
}
if (single_st)
{
/* Bail out if this is a bitfield memory reference. */
if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
&& DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
return false;
/* Data reference must be executed exactly once per iteration.
Same as single_st, we only need to check against the outermost
Same as single_ld, we only need to check against the outermost
loop. */
if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
bb_st = gimple_bb (DR_STMT (single_st));
if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
return false;
}
if (single_ld && single_st)
{
/* Load and store must be in the same loop nest. */
if (bb_st->loop_father != bb_ld->loop_father)
return false;
edge e = single_exit (bb_st->loop_father);
@@ -1852,9 +1863,19 @@ loop_distribution::classify_partition (loop_p loop,
return has_reduction;
/* Find single load/store data references for builtin partition. */
if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld)
|| !single_st)
return has_reduction;
if (single_ld && single_st)
{
gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
/* Direct aggregate copy or via an SSA name temporary. */
if (load != store
&& gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
return has_reduction;
}
partition->loc = gimple_location (DR_STMT (single_st));
/* Classify the builtin kind. */
@@ -3260,6 +3281,428 @@ find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
return work_list->length () > 0;
}
/* A helper function for generate_{rawmemchr,strlen}_builtin functions in order
to place new statements SEQ before LOOP and replace the old reduction
variable with the new one. */
static void
generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq,
tree reduction_var_old, tree reduction_var_new,
const char *info, machine_mode load_mode)
{
/* Place new statements before LOOP. */
gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING);
/* Replace old reduction variable with new one. */
imm_use_iterator iter;
gimple *stmt;
use_operand_p use_p;
FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old)
{
FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
SET_USE (use_p, reduction_var_new);
update_stmt (stmt);
}
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, info, GET_MODE_NAME (load_mode));
}
/* Generate a call to rawmemchr and place it before LOOP. REDUCTION_VAR is
replaced with a fresh SSA name representing the result of the call. */
static void
generate_rawmemchr_builtin (loop_p loop, tree reduction_var,
data_reference_p store_dr, tree base, tree pattern,
location_t loc)
{
gimple_seq seq = NULL;
tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern);
tree reduction_var_new = copy_ssa_name (reduction_var);
gimple_call_set_lhs (fn_call, reduction_var_new);
gimple_set_location (fn_call, loc);
gimple_seq_add_stmt (&seq, fn_call);
if (store_dr)
{
gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new);
gimple_seq_add_stmt (&seq, g);
}
generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
"generated rawmemchr%s\n",
TYPE_MODE (TREE_TYPE (TREE_TYPE (base))));
}
/* Helper function for generate_strlen_builtin(,_using_rawmemchr) */
static void
generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq,
tree reduction_var_old, tree reduction_var_new,
machine_mode mode, tree start_len)
{
/* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be
converted if types of old and new reduction variable are not compatible. */
reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old),
reduction_var_new);
/* Loops of the form `for (i=42; s[i]; ++i);` have an additional start
length. */
if (!integer_zerop (start_len))
{
tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new));
gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new,
start_len);
gimple_seq_add_stmt (&seq, g);
reduction_var_new = lhs;
}
generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new,
"generated strlen%s\n", mode);
}
/* Generate a call to strlen and place it before LOOP. REDUCTION_VAR is
replaced with a fresh SSA name representing the result of the call. */
static void
generate_strlen_builtin (loop_p loop, tree reduction_var, tree base,
tree start_len, location_t loc)
{
gimple_seq seq = NULL;
tree reduction_var_new = make_ssa_name (size_type_node);
tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN));
gimple *fn_call = gimple_build_call (fn, 1, mem);
gimple_call_set_lhs (fn_call, reduction_var_new);
gimple_set_location (fn_call, loc);
gimple_seq_add_stmt (&seq, fn_call);
generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new,
QImode, start_len);
}
/* Generate code in order to mimic the behaviour of strlen but this time over
an array of elements with mode different than QI. REDUCTION_VAR is replaced
with a fresh SSA name representing the result, i.e., the length. */
static void
generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var,
tree base, tree load_type,
tree start_len, location_t loc)
{
gimple_seq seq = NULL;
tree start = force_gimple_operand (base, &seq, true, NULL_TREE);
tree zero = build_zero_cst (load_type);
gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero);
tree end = make_ssa_name (TREE_TYPE (base));
gimple_call_set_lhs (fn_call, end);
gimple_set_location (fn_call, loc);
gimple_seq_add_stmt (&seq, fn_call);
/* Determine the number of elements between START and END by
evaluating (END - START) / sizeof (*START). */
tree diff = make_ssa_name (ptrdiff_type_node);
gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start);
gimple_seq_add_stmt (&seq, diff_stmt);
/* Let SIZE be the size of each character. */
tree size = gimple_convert (&seq, ptrdiff_type_node,
TYPE_SIZE_UNIT (load_type));
tree count = make_ssa_name (ptrdiff_type_node);
gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size);
gimple_seq_add_stmt (&seq, count_stmt);
generate_strlen_builtin_1 (loop, seq, reduction_var, count,
TYPE_MODE (load_type),
start_len);
}
/* Return true if we can count at least as many characters by taking pointer
difference as we can count via reduction_var without an overflow. Thus
compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var),
m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character. */
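/* Worked example (illustrative numbers, not part of the patch): for a 32-bit
   reduction variable (n = 32), a 64-bit ptrdiff_type_node (m = 64), and
   4-byte characters (s = 4) this compares 2^32 against 2^63 / 4 = 2^61, so
   counting via the pointer difference overflows later than the reduction
   variable and the function below returns true.  */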
static bool
reduction_var_overflows_first (tree reduction_var, tree load_type)
{
widest_int n2 = wi::lshift (1, TYPE_PRECISION (TREE_TYPE (reduction_var)));
widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1);
widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type));
return wi::ltu_p (n2, wi::udiv_trunc (m2, s));
}
static gimple *
determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs)
{
gimple *reduction_stmt = NULL;
for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i)
{
basic_block bb = bbs[i];
for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
gsi_next_nondebug (&bsi))
{
gphi *phi = bsi.phi ();
if (virtual_operand_p (gimple_phi_result (phi)))
continue;
if (stmt_has_scalar_dependences_outside_loop (loop, phi))
{
if (reduction_stmt)
return NULL;
reduction_stmt = phi;
}
}
for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
gsi_next_nondebug (&bsi), ++ninsns)
{
/* Bail out early for loops which are unlikely to match. */
if (ninsns > 16)
return NULL;
gimple *stmt = gsi_stmt (bsi);
if (gimple_clobber_p (stmt))
continue;
if (gimple_code (stmt) == GIMPLE_LABEL)
continue;
if (gimple_has_volatile_ops (stmt))
return NULL;
if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
{
if (reduction_stmt)
return NULL;
reduction_stmt = stmt;
}
}
}
return reduction_stmt;
}
/* If LOOP has a single non-volatile reduction statement, then return a pointer
to it. Otherwise return NULL. */
static gimple *
determine_reduction_stmt (const loop_p loop)
{
basic_block *bbs = get_loop_body (loop);
gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs);
XDELETEVEC (bbs);
return reduction_stmt;
}
/* Transform loops which mimic the effects of builtins rawmemchr or strlen and
replace them accordingly. For example, a loop of the form
for (; *p != 42; ++p);
is replaced by
p = rawmemchr<MODE> (p, 42);
under the assumption that rawmemchr is available for a particular MODE.
Another example is
int i;
for (i = 42; s[i]; ++i);
which is replaced by
i = (int)strlen (&s[42]) + 42;
for some character array S. In case array S is not of type character array
we end up with
i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42;
assuming that rawmemchr is available for a particular MODE. */
bool
loop_distribution::transform_reduction_loop (loop_p loop)
{
gimple *reduction_stmt;
data_reference_p load_dr = NULL, store_dr = NULL;
edge e = single_exit (loop);
gcond *cond = safe_dyn_cast <gcond *> (last_stmt (e->src));
if (!cond)
return false;
/* Ensure loop condition is an (in)equality test and loop is exited either if
the inequality test fails or the equality test succeeds. */
if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR)
&& !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
return false;
/* A limitation of the current implementation is that we only support
constant patterns in (in)equality tests. */
tree pattern = gimple_cond_rhs (cond);
if (TREE_CODE (pattern) != INTEGER_CST)
return false;
reduction_stmt = determine_reduction_stmt (loop);
/* A limitation of the current implementation is that we require a reduction
statement. Therefore, loops without a reduction statement as in the
following are not recognized:
int *p;
void foo (void) { for (; *p; ++p); } */
if (reduction_stmt == NULL)
return false;
/* Reduction variables are guaranteed to be SSA names. */
tree reduction_var;
switch (gimple_code (reduction_stmt))
{
case GIMPLE_ASSIGN:
case GIMPLE_PHI:
reduction_var = gimple_get_lhs (reduction_stmt);
break;
default:
/* Bail out e.g. for GIMPLE_CALL. */
return false;
}
struct graph *rdg = build_rdg (loop, NULL);
if (rdg == NULL)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file,
"Loop %d not transformed: failed to build the RDG.\n",
loop->num);
return false;
}
auto_bitmap partition_stmts;
bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr);
free_rdg (rdg);
/* Bail out if there is no single load. */
if (load_dr == NULL)
return false;
/* Reaching this point we have a loop with a single reduction variable,
a single load, and an optional single store. */
tree load_ref = DR_REF (load_dr);
tree load_type = TREE_TYPE (load_ref);
tree load_access_base = build_fold_addr_expr (load_ref);
tree load_access_size = TYPE_SIZE_UNIT (load_type);
affine_iv load_iv, reduction_iv;
if (!INTEGRAL_TYPE_P (load_type)
|| !type_has_mode_precision_p (load_type))
return false;
/* We already ensured that the loop condition tests for (in)equality where the
rhs is a constant pattern. Now ensure that the lhs is the result of the
load. */
if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr)))
return false;
/* Bail out if no affine induction variable with constant step can be
determined. */
if (!simple_iv (loop, loop, load_access_base, &load_iv, false))
return false;
/* Bail out if memory accesses are not consecutive or not growing. */
if (!operand_equal_p (load_iv.step, load_access_size, 0))
return false;
if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false))
return false;
/* Handle rawmemchr like loops. */
if (operand_equal_p (load_iv.base, reduction_iv.base)
&& operand_equal_p (load_iv.step, reduction_iv.step))
{
if (store_dr)
{
/* Ensure that we store to X and load from X+I where I>0. */
if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR
|| !integer_onep (TREE_OPERAND (load_iv.base, 1)))
return false;
tree ptr_base = TREE_OPERAND (load_iv.base, 0);
if (TREE_CODE (ptr_base) != SSA_NAME)
return false;
gimple *def = SSA_NAME_DEF_STMT (ptr_base);
if (!gimple_assign_single_p (def)
|| gimple_assign_rhs1 (def) != DR_REF (store_dr))
return false;
/* Ensure that the reduction value is stored. */
if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var)
return false;
}
/* Bail out if target does not provide rawmemchr for a certain mode. */
machine_mode mode = TYPE_MODE (load_type);
if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
return false;
location_t loc = gimple_location (DR_STMT (load_dr));
generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base,
pattern, loc);
return true;
}
/* Handle strlen like loops. */
if (store_dr == NULL
&& integer_zerop (pattern)
&& TREE_CODE (reduction_iv.base) == INTEGER_CST
&& TREE_CODE (reduction_iv.step) == INTEGER_CST
&& integer_onep (reduction_iv.step))
{
location_t loc = gimple_location (DR_STMT (load_dr));
/* While determining the length of a string an overflow might occur.
If an overflow only occurs in the loop implementation and not in the
strlen implementation, then either the overflow is undefined or the
truncated result of strlen equals the one of the loop. Otherwise if
an overflow may also occur in the strlen implementation, then
replacing a loop by a call to strlen is sound whenever we ensure that
if an overflow occurs in the strlen implementation, then also an
overflow occurs in the loop implementation which is undefined. It
seems reasonable to relax this and assume that the strlen
implementation cannot overflow in case sizetype is big enough in the
sense that an overflow can only happen for string objects which are
bigger than half of the address space; at least for 32-bit targets and
up.
For strlen which makes use of rawmemchr the maximal length of a string
which can be determined without an overflow is PTRDIFF_MAX / S where
each character has size S. Since an overflow for ptrdiff type is
undefined we have to make sure that if an overflow occurs, then an
overflow occurs in the loop implementation, too, and this is
undefined, too. Similar as before we relax this and assume that no
string object is larger than half of the address space; at least for
32-bit targets and up. */
if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node)
&& TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node)
&& ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1
&& TYPE_PRECISION (ptr_type_node) >= 32)
|| (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (reduction_var))
&& TYPE_PRECISION (TREE_TYPE (reduction_var)) <= TYPE_PRECISION (sizetype)))
&& builtin_decl_implicit (BUILT_IN_STRLEN))
generate_strlen_builtin (loop, reduction_var, load_iv.base,
reduction_iv.base, loc);
else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type))
!= CODE_FOR_nothing
&& ((TYPE_PRECISION (ptrdiff_type_node) == TYPE_PRECISION (ptr_type_node)
&& TYPE_PRECISION (ptrdiff_type_node) >= 32)
|| (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (reduction_var))
&& reduction_var_overflows_first (reduction_var, load_type))))
generate_strlen_builtin_using_rawmemchr (loop, reduction_var,
load_iv.base,
load_type,
reduction_iv.base, loc);
else
return false;
return true;
}
return false;
}
/* Given innermost LOOP, return the outermost enclosing loop that forms a
perfect loop nest. */
@@ -3324,10 +3767,27 @@ loop_distribution::execute (function *fun)
&& !optimize_loop_for_speed_p (loop)))
continue;
/* Don't distribute loop if niters is unknown. */
/* If niters is unknown don't distribute loop but rather try to transform
it to a call to a builtin. */
tree niters = number_of_latch_executions (loop);
if (niters == NULL_TREE || niters == chrec_dont_know)
continue;
{
datarefs_vec.create (20);
if (transform_reduction_loop (loop))
{
changed = true;
loops_to_be_destroyed.safe_push (loop);
if (dump_enabled_p ())
{
dump_user_location_t loc = find_loop_location (loop);
dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
loc, "Loop %d transformed into a builtin.\n",
loop->num);
}
}
free_data_refs (datarefs_vec);
continue;
}
/* Get the perfect loop nest for distribution. */
loop = prepare_perfect_loop_nest (loop);