re PR target/55701 (Inline some instances of memset for ARM)

PR target/55701
	* config/arm/arm.md (setmemsi): New pattern.
	* config/arm/arm-protos.h (struct tune_params): New fields.
	(arm_gen_setmem): New prototype.
	* config/arm/arm.c (arm_slowmul_tune): Initialize new fields.
	(arm_fastmul_tune, arm_strongarm_tune, arm_xscale_tune): Ditto.
	(arm_9e_tune, arm_v6t2_tune, arm_cortex_tune): Ditto.
	(arm_cortex_a8_tune, arm_cortex_a7_tune): Ditto.
	(arm_cortex_a15_tune, arm_cortex_a53_tune): Ditto.
	(arm_cortex_a57_tune, arm_cortex_a5_tune): Ditto.
	(arm_cortex_a9_tune, arm_cortex_a12_tune): Ditto.
	(arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune): Ditto.
	(arm_const_inline_cost): New function.
	(arm_block_set_max_insns): New function.
	(arm_block_set_non_vect_profit_p): New function.
	(arm_block_set_vect_profit_p): New function.
	(arm_block_set_unaligned_vect): New function.
	(arm_block_set_aligned_vect): New function.
	(arm_block_set_unaligned_non_vect): New function.
	(arm_block_set_aligned_non_vect): New function.
	(arm_block_set_vect, arm_gen_setmem): New functions.

	* gcc.target/arm/memset-inline-1.c: New test.
	* gcc.target/arm/memset-inline-2.c: New test.
	* gcc.target/arm/memset-inline-3.c: New test.
	* gcc.target/arm/memset-inline-4.c: New test.
	* gcc.target/arm/memset-inline-5.c: New test.
	* gcc.target/arm/memset-inline-6.c: New test.
	* gcc.target/arm/memset-inline-7.c: New test.
	* gcc.target/arm/memset-inline-8.c: New test.
	* gcc.target/arm/memset-inline-9.c: New test.

From-SVN: r212893
This commit is contained in:
Bin Cheng 2014-07-21 12:24:06 +00:00 committed by Bin Cheng
parent 8cde4e713a
commit 62e79a48df
14 changed files with 1219 additions and 18 deletions

View File

@ -1,3 +1,27 @@
2014-07-21 Bin Cheng <bin.cheng@arm.com>
PR target/55701
* config/arm/arm.md (setmemsi): New pattern.
* config/arm/arm-protos.h (struct tune_params): New fields.
(arm_gen_setmem): New prototype.
* config/arm/arm.c (arm_slowmul_tune): Initialize new fields.
(arm_fastmul_tune, arm_strongarm_tune, arm_xscale_tune): Ditto.
(arm_9e_tune, arm_v6t2_tune, arm_cortex_tune): Ditto.
(arm_cortex_a8_tune, arm_cortex_a7_tune): Ditto.
(arm_cortex_a15_tune, arm_cortex_a53_tune): Ditto.
(arm_cortex_a57_tune, arm_cortex_a5_tune): Ditto.
(arm_cortex_a9_tune, arm_cortex_a12_tune): Ditto.
(arm_v7m_tune, arm_v6m_tune, arm_fa726te_tune): Ditto.
(arm_const_inline_cost): New function.
(arm_block_set_max_insns): New function.
(arm_block_set_non_vect_profit_p): New function.
(arm_block_set_vect_profit_p): New function.
(arm_block_set_unaligned_vect): New function.
(arm_block_set_aligned_vect): New function.
(arm_block_set_unaligned_non_vect): New function.
(arm_block_set_aligned_non_vect): New function.
(arm_block_set_vect, arm_gen_setmem): New functions.
2014-07-21 Bin Cheng <bin.cheng@arm.com>
* config/arm/arm.c (output_move_neon): Handle REG explicitly.

View File

@ -278,6 +278,10 @@ struct tune_params
/* Prefer 32-bit encoding instead of 16-bit encoding where subset of flags
would be set. */
bool disparage_partial_flag_setting_t16_encodings;
/* Prefer to inline string operations like memset by using Neon. */
bool string_ops_prefer_neon;
/* Maximum number of instructions to inline calls to memset. */
int max_insns_inline_memset;
};
extern const struct tune_params *current_tune;
@ -290,6 +294,7 @@ extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
extern bool arm_validize_comparison (rtx *, rtx *, rtx *);
#endif /* RTX_CODE */
extern bool arm_gen_setmem (rtx *);
extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
extern bool arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);

View File

@ -1698,7 +1698,9 @@ const struct tune_params arm_slowmul_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_fastmul_tune =
@ -1715,7 +1717,9 @@ const struct tune_params arm_fastmul_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
/* StrongARM has early execution of branches, so a sequence that is worth
@ -1735,7 +1739,9 @@ const struct tune_params arm_strongarm_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_xscale_tune =
@ -1752,7 +1758,9 @@ const struct tune_params arm_xscale_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_9e_tune =
@ -1769,7 +1777,9 @@ const struct tune_params arm_9e_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_v6t2_tune =
@ -1786,7 +1796,9 @@ const struct tune_params arm_v6t2_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
/* Generic Cortex tuning. Use more specific tunings if appropriate. */
@ -1804,7 +1816,9 @@ const struct tune_params arm_cortex_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a8_tune =
@ -1821,7 +1835,9 @@ const struct tune_params arm_cortex_a8_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a7_tune =
@ -1838,7 +1854,9 @@ const struct tune_params arm_cortex_a7_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a15_tune =
@ -1855,7 +1873,9 @@ const struct tune_params arm_cortex_a15_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
true, true /* Prefer 32-bit encodings. */
true, true, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a53_tune =
@ -1872,7 +1892,9 @@ const struct tune_params arm_cortex_a53_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a57_tune =
@ -1889,7 +1911,9 @@ const struct tune_params arm_cortex_a57_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
true, true /* Prefer 32-bit encodings. */
true, true, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
@ -1909,7 +1933,9 @@ const struct tune_params arm_cortex_a5_tune =
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a9_tune =
@ -1926,7 +1952,9 @@ const struct tune_params arm_cortex_a9_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_cortex_a12_tune =
@ -1943,7 +1971,9 @@ const struct tune_params arm_cortex_a12_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
true, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
/* armv7m tuning. On Cortex-M4 cores for example, MOVW/MOVT take a single
@ -1967,7 +1997,9 @@ const struct tune_params arm_v7m_tune =
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
/* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
@ -1986,7 +2018,9 @@ const struct tune_params arm_v6m_tune =
{false, false}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
const struct tune_params arm_fa726te_tune =
@ -2003,7 +2037,9 @@ const struct tune_params arm_fa726te_tune =
{true, true}, /* Prefer non short circuit. */
&arm_default_vec_cost, /* Vectorizer costs. */
false, /* Prefer Neon for 64-bits bitops. */
false, false /* Prefer 32-bit encodings. */
false, false, /* Prefer 32-bit encodings. */
false, /* Prefer Neon for stringops. */
8 /* Maximum insns to inline memset. */
};
@ -16899,6 +16935,14 @@ arm_const_double_inline_cost (rtx val)
NULL_RTX, NULL_RTX, 0, 0));
}
/* Return the cost of materializing the SImode constant VAL when it is
   used as an operand of CODE.  VAL is read with INTVAL, so it must be
   a CONST_INT.  The figure comes straight from arm_gen_constant.  */
static inline int
arm_const_inline_cost (enum rtx_code code, rtx val)
{
  HOST_WIDE_INT cst = INTVAL (val);

  return arm_gen_constant (code, SImode, NULL_RTX, cst,
			   NULL_RTX, NULL_RTX, 1, 0);
}
/* Return true if it is worthwhile to split a 64-bit constant into two
32-bit operations. This is the case if optimizing for size, or
if we have load delay slots, or if one 32-bit part can be done with
@ -31521,6 +31565,519 @@ arm_validize_comparison (rtx *comparison, rtx * op1, rtx * op2)
}
/* Return the largest number of instructions we are willing to emit
   when expanding a block set inline: a fixed budget of 4 when the
   current function is optimized for size, otherwise the limit given
   by the active tuning (max_insns_inline_memset).  */
static int
arm_block_set_max_insns (void)
{
  return (optimize_function_for_size_p (cfun)
	  ? 4
	  : current_tune->max_insns_inline_memset);
}
/* Decide whether setting a block of memory with core-register stores
   is worthwhile.  VAL is the constant that will be stored, LENGTH the
   number of bytes to set and ALIGN the alignment of the destination
   in bytes.  UNALIGNED_P is TRUE when only stores that respect ALIGN
   may be used; USE_STRD_P is TRUE when strd may be used.  Return TRUE
   if the estimated instruction count fits the budget reported by
   arm_block_set_max_insns.  */
static bool
arm_block_set_non_vect_profit_p (rtx val,
				 unsigned HOST_WIDE_INT length,
				 unsigned HOST_WIDE_INT align,
				 bool unaligned_p, bool use_strd_p)
{
  /* tail_insns[N] is the minimum number of strb/strh/str instructions
     needed to store N trailing bytes, for N in 0..7.  */
  static const int tail_insns[8] = {0, 1, 1, 2, 1, 2, 2, 3};
  int insns = 0;
  bool merge_tail_p;

  if (unaligned_p)
    {
      /* One store per ALIGN-sized unit plus one per remaining byte.  */
      insns = arm_const_inline_cost (SET, val);
      insns += length / align + length % align;
    }
  else if (use_strd_p)
    {
      /* Double-word stores followed by the cheapest tail sequence.  */
      insns = arm_const_double_inline_cost (val);
      insns += (length >> 3) + tail_insns[length & 7];
    }
  else
    {
      /* Word stores followed by the cheapest tail sequence.  */
      insns = arm_const_inline_cost (SET, val);
      insns += (length >> 2) + tail_insns[length & 3];
    }

  /* A trailing STRH/STRB pair can be replaced by one STR that starts
     one byte earlier, provided unaligned accesses are allowed.  */
  merge_tail_p = (unaligned_access && length > 3 && (length & 3) == 3);
  if (merge_tail_p)
    insns--;

  return (insns <= arm_block_set_max_insns ());
}
/* Decide whether setting a block of memory with vector stores is
   worthwhile.  LENGTH is the number of bytes to set, ALIGN the
   alignment of the destination in bytes and MODE the vector mode the
   stores will use.  Return TRUE if the estimated instruction count
   fits the budget reported by arm_block_set_max_insns.  */
static bool
arm_block_set_vect_profit_p (unsigned HOST_WIDE_INT length,
			     unsigned HOST_WIDE_INT align,
			     enum machine_mode mode)
{
  const bool aligned_p = ((align & 3) == 0);
  unsigned int nelt = GET_MODE_NUNITS (mode);
  /* Start with the instruction that loads the constant vector.  */
  int insns = 1;

  /* One store per vector, rounding a partial vector up.  */
  insns += (length + nelt - 1) / nelt;

  if (aligned_p)
    {
      /* The address only needs adjusting when the destination is 4-byte
	 aligned and the leftover bytes require a mis-aligned store.  */
      if ((length & 3) != 0)
	insns++;

      /* The first 16 bytes go out with a single vst1:v16qi.  */
      if (mode == V16QImode)
	insns--;
    }

  return (insns <= arm_block_set_max_insns ());
}
/* Set a block of memory using vectorization instructions for the
   unaligned case.  We fill the first LENGTH bytes of the memory
   area starting from DSTBASE with byte constant VALUE.  ALIGN is
   the alignment requirement of memory.  Return TRUE if succeeded,
   FALSE (before anything is emitted here) if the sequence is not
   considered profitable.

   The address register DST is advanced between stores.  Every time
   that happens the MEM is rebuilt with the byte offset DST now has
   from the start of DSTBASE, because adjust_automodify_address
   assumes the address it is handed is DSTBASE displaced by exactly
   that offset.  Reusing a MEM created for offset 0 would leave its
   recorded offset out of step with the address actually stored to.  */
static bool
arm_block_set_unaligned_vect (rtx dstbase,
			      unsigned HOST_WIDE_INT length,
			      unsigned HOST_WIDE_INT value,
			      unsigned HOST_WIDE_INT align)
{
  unsigned int i, j, nelt_v16, nelt_v8, nelt_mode;
  rtx dst, mem;
  rtx val_elt, val_vec, reg;
  rtx rval[MAX_VECT_LEN];
  rtx (*gen_func) (rtx, rtx);
  enum machine_mode mode;
  unsigned HOST_WIDE_INT v = value;
  /* Distance in bytes between DST and the start of the block.  */
  HOST_WIDE_INT offset = 0;

  gcc_assert ((align & 0x3) != 0);
  nelt_v8 = GET_MODE_NUNITS (V8QImode);
  nelt_v16 = GET_MODE_NUNITS (V16QImode);
  if (length >= nelt_v16)
    {
      mode = V16QImode;
      gen_func = gen_movmisalignv16qi;
    }
  else
    {
      mode = V8QImode;
      gen_func = gen_movmisalignv8qi;
    }
  nelt_mode = GET_MODE_NUNITS (mode);
  gcc_assert (length >= nelt_mode);
  /* Skip if it isn't profitable.  */
  if (!arm_block_set_vect_profit_p (length, align, mode))
    return false;

  dst = copy_addr_to_reg (XEXP (dstbase, 0));
  mem = adjust_automodify_address (dstbase, mode, dst, offset);

  /* Build a vector constant with VALUE replicated in every lane.  */
  v = sext_hwi (v, BITS_PER_WORD);
  val_elt = GEN_INT (v);
  for (j = 0; j < nelt_mode; j++)
    rval[j] = val_elt;

  reg = gen_reg_rtx (mode);
  val_vec = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt_mode, rval));
  /* Emit instruction loading the constant value.  */
  emit_move_insn (reg, val_vec);

  /* Handle nelt_mode bytes in a vector.  */
  for (i = 0; (i + nelt_mode <= length); i += nelt_mode)
    {
      emit_insn ((*gen_func) (mem, reg));
      if (i + 2 * nelt_mode <= length)
	{
	  /* Another full vector follows: step DST and refresh the MEM
	     so that it describes the new location.  */
	  emit_insn (gen_add2_insn (dst, GEN_INT (nelt_mode)));
	  offset += nelt_mode;
	  mem = adjust_automodify_address (dstbase, mode, dst, offset);
	}
    }

  /* If there are not less than nelt_v8 bytes leftover, we must be in
     V16QI mode.  */
  gcc_assert ((i + nelt_v8) > length || mode == V16QImode);

  /* Handle (8, 16) bytes leftover with one more V16QI store that ends
     exactly at LENGTH and so overlaps bytes already written.  */
  if (i + nelt_v8 < length)
    {
      emit_insn (gen_add2_insn (dst, GEN_INT (length - i)));
      offset += length - i;
      mem = adjust_automodify_address (dstbase, mode, dst, offset);

      /* We are shifting bytes back, set the alignment accordingly.  */
      if ((length & 1) != 0 && align >= 2)
	set_mem_align (mem, BITS_PER_UNIT);

      emit_insn (gen_movmisalignv16qi (mem, reg));
    }
  /* Handle (0, 8] bytes leftover with one V8QI store that ends exactly
     at LENGTH.  */
  else if (i < length && i + nelt_v8 >= length)
    {
      if (mode == V16QImode)
	reg = gen_lowpart (V8QImode, reg);

      emit_insn (gen_add2_insn (dst, GEN_INT ((length - i)
					      + (nelt_mode - nelt_v8))));
      offset += (length - i) + (nelt_mode - nelt_v8);
      mem = adjust_automodify_address (dstbase, V8QImode, dst, offset);

      /* We are shifting bytes back, set the alignment accordingly.  */
      if ((length & 1) != 0 && align >= 2)
	set_mem_align (mem, BITS_PER_UNIT);

      emit_insn (gen_movmisalignv8qi (mem, reg));
    }

  return true;
}
/* Set a block of memory using vectorization instructions for the
   aligned case.  We fill the first LENGTH bytes of the memory area
   starting from DSTBASE with byte constant VALUE.  ALIGN is the
   alignment requirement of memory.  Return TRUE if succeeded, FALSE
   (before anything is emitted here) if the sequence is not considered
   profitable.

   Whenever the address register DST itself is advanced, the MEM built
   on it is given the matching byte offset from DSTBASE, because
   adjust_automodify_address assumes the address it is handed is
   DSTBASE displaced by exactly that offset.  */
static bool
arm_block_set_aligned_vect (rtx dstbase,
			    unsigned HOST_WIDE_INT length,
			    unsigned HOST_WIDE_INT value,
			    unsigned HOST_WIDE_INT align)
{
  unsigned int i, j, nelt_v8, nelt_v16, nelt_mode;
  rtx dst, addr, mem;
  rtx val_elt, val_vec, reg;
  rtx rval[MAX_VECT_LEN];
  enum machine_mode mode;
  unsigned HOST_WIDE_INT v = value;

  gcc_assert ((align & 0x3) == 0);
  nelt_v8 = GET_MODE_NUNITS (V8QImode);
  nelt_v16 = GET_MODE_NUNITS (V16QImode);
  if (length >= nelt_v16 && unaligned_access && !BYTES_BIG_ENDIAN)
    mode = V16QImode;
  else
    mode = V8QImode;

  nelt_mode = GET_MODE_NUNITS (mode);
  gcc_assert (length >= nelt_mode);
  /* Skip if it isn't profitable.  */
  if (!arm_block_set_vect_profit_p (length, align, mode))
    return false;

  dst = copy_addr_to_reg (XEXP (dstbase, 0));

  /* Build a vector constant with VALUE replicated in every lane.  */
  v = sext_hwi (v, BITS_PER_WORD);
  val_elt = GEN_INT (v);
  for (j = 0; j < nelt_mode; j++)
    rval[j] = val_elt;

  reg = gen_reg_rtx (mode);
  val_vec = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt_mode, rval));
  /* Emit instruction loading the constant value.  */
  emit_move_insn (reg, val_vec);

  i = 0;
  /* Handle first 16 bytes specially using vst1:v16qi instruction.  */
  if (mode == V16QImode)
    {
      mem = adjust_automodify_address (dstbase, mode, dst, 0);
      emit_insn (gen_movmisalignv16qi (mem, reg));
      i += nelt_mode;
      /* Handle (8, 16) bytes leftover using vst1:v16qi again.  */
      if (i + nelt_v8 < length && i + nelt_v16 > length)
	{
	  /* DST now points LENGTH - NELT_MODE bytes into the block, so
	     the MEM must carry that same offset.  */
	  emit_insn (gen_add2_insn (dst, GEN_INT (length - nelt_mode)));
	  mem = adjust_automodify_address (dstbase, mode, dst,
					   length - nelt_mode);
	  /* We are shifting bytes back, set the alignment accordingly.  */
	  if ((length & 0x3) == 0)
	    set_mem_align (mem, BITS_PER_UNIT * 4);
	  else if ((length & 0x1) == 0)
	    set_mem_align (mem, BITS_PER_UNIT * 2);
	  else
	    set_mem_align (mem, BITS_PER_UNIT);

	  emit_insn (gen_movmisalignv16qi (mem, reg));
	  return true;
	}
      /* Fall through for bytes leftover.  */
      mode = V8QImode;
      nelt_mode = GET_MODE_NUNITS (mode);
      reg = gen_lowpart (V8QImode, reg);
    }

  /* Handle 8 bytes in a vector.  DST is not modified here; each store
     addresses DST + I and records offset I.  */
  for (; (i + nelt_mode <= length); i += nelt_mode)
    {
      addr = plus_constant (Pmode, dst, i);
      mem = adjust_automodify_address (dstbase, mode, addr, i);
      emit_move_insn (mem, reg);
    }

  /* Handle single word leftover by shifting 4 bytes back.  We can
     use aligned access for this case.  */
  if (i + UNITS_PER_WORD == length)
    {
      addr = plus_constant (Pmode, dst, i - UNITS_PER_WORD);
      mem = adjust_automodify_address (dstbase, mode,
				       addr, i - UNITS_PER_WORD);
      /* We are shifting 4 bytes back, set the alignment accordingly.  */
      if (align > UNITS_PER_WORD)
	set_mem_align (mem, BITS_PER_UNIT * UNITS_PER_WORD);

      emit_move_insn (mem, reg);
    }
  /* Handle (0, 4), (4, 8) bytes leftover by shifting bytes back.
     We have to use unaligned access for this case.  */
  else if (i < length)
    {
      /* DST now points LENGTH - NELT_MODE bytes into the block, so
	 the MEM must carry that same offset.  */
      emit_insn (gen_add2_insn (dst, GEN_INT (length - nelt_mode)));
      mem = adjust_automodify_address (dstbase, mode, dst,
				       length - nelt_mode);
      /* We are shifting bytes back, set the alignment accordingly.  */
      if ((length & 1) == 0)
	set_mem_align (mem, BITS_PER_UNIT * 2);
      else
	set_mem_align (mem, BITS_PER_UNIT);

      emit_insn (gen_movmisalignv8qi (mem, reg));
    }

  return true;
}
/* Set a block of memory with plain strh/strb stores, never using a
   store wider than ALIGN permits.  The first LENGTH bytes starting at
   DSTBASE are filled with the byte constant VALUE.  ALIGN is the known
   alignment of the destination in bytes and must be 1 or 2.  Return
   TRUE if the stores were emitted, FALSE if the sequence was judged
   unprofitable.  */
static bool
arm_block_set_unaligned_non_vect (rtx dstbase,
				  unsigned HOST_WIDE_INT length,
				  unsigned HOST_WIDE_INT value,
				  unsigned HOST_WIDE_INT align)
{
  unsigned int pos, step;
  rtx base_reg, addr, mem;
  rtx cst, cst_reg, src;
  enum machine_mode mode;
  HOST_WIDE_INT v = value;

  gcc_assert (align == 1 || align == 2);

  /* With half-word stores the fill byte has to be replicated once.  */
  if (align == 2)
    v |= (value << BITS_PER_UNIT);

  v = sext_hwi (v, BITS_PER_WORD);
  cst = GEN_INT (v);

  /* Give up early when the sequence would be too long.  */
  if (!arm_block_set_non_vect_profit_p (cst, length, align, true, false))
    return false;

  base_reg = copy_addr_to_reg (XEXP (dstbase, 0));
  mode = (align == 2 ? HImode : QImode);
  cst_reg = force_reg (SImode, cst);
  src = gen_lowpart (mode, cst_reg);
  step = GET_MODE_SIZE (mode);

  /* Emit as many MODE-sized stores as fit.  */
  pos = 0;
  while (pos + step <= length)
    {
      addr = plus_constant (Pmode, base_reg, pos);
      mem = adjust_automodify_address (dstbase, mode, addr, pos);
      emit_move_insn (mem, src);
      pos += step;
    }

  /* An odd LENGTH with half-word stores leaves one byte behind.  */
  if (pos + 1 == length)
    {
      src = gen_lowpart (QImode, cst_reg);
      addr = plus_constant (Pmode, base_reg, pos);
      mem = adjust_automodify_address (dstbase, QImode, addr, pos);
      emit_move_insn (mem, src);
      pos++;
    }

  gcc_assert (pos == length);
  return true;
}
/* Set a block of memory using plain strd/str/strh/strb instructions.
   arm_gen_setmem uses this variant when unaligned_access is enabled or
   the destination is known to be word aligned.  We fill the first
   LENGTH bytes of the memory area starting from DSTBASE with byte
   constant VALUE.  ALIGN is the known alignment of the destination in
   bytes.  Return TRUE once the stores have been emitted; return FALSE,
   before this function emits any address setup or store, when the
   sequence is judged unprofitable.  */
static bool
arm_block_set_aligned_non_vect (rtx dstbase,
unsigned HOST_WIDE_INT length,
unsigned HOST_WIDE_INT value,
unsigned HOST_WIDE_INT align)
{
unsigned int i;
rtx dst, addr, mem;
rtx val_exp, val_reg, reg;
unsigned HOST_WIDE_INT v;
bool use_strd_p;
/* strd is only considered for at least two words, a word-aligned
   destination, a target with LDRD and a tuning that prefers it.  */
use_strd_p = (length >= 2 * UNITS_PER_WORD && (align & 3) == 0
&& TARGET_LDRD && current_tune->prefer_ldrd_strd);
/* Replicate the fill byte into all four bytes of a word.  */
v = (value | (value << 8) | (value << 16) | (value << 24));
/* For blocks shorter than a word keep only the low LENGTH bytes.
   NOTE(review): a LENGTH of 0 would make the shift count equal to the
   full word width in bits -- presumably callers never pass 0; confirm.  */
if (length < UNITS_PER_WORD)
v &= (0xFFFFFFFF >> (UNITS_PER_WORD - length) * BITS_PER_UNIT);
/* For strd duplicate the word into the upper half of a double word;
   otherwise canonicalize the value as a sign-extended word.  */
if (use_strd_p)
v |= (v << BITS_PER_WORD);
else
v = sext_hwi (v, BITS_PER_WORD);
val_exp = GEN_INT (v);
/* Skip if it isn't profitable. */
if (!arm_block_set_non_vect_profit_p (val_exp, length,
align, false, use_strd_p))
{
if (!use_strd_p)
return false;
/* Try without strd.  Recover the single-word constant from the
   upper half of the double-word value and cost it again.  */
v = (v >> BITS_PER_WORD);
v = sext_hwi (v, BITS_PER_WORD);
val_exp = GEN_INT (v);
use_strd_p = false;
if (!arm_block_set_non_vect_profit_p (val_exp, length,
align, false, use_strd_p))
return false;
}
/* I counts the bytes already stored; every store below addresses
   DST + I and records offset I in its MEM.  */
i = 0;
dst = copy_addr_to_reg (XEXP (dstbase, 0));
/* Handle double words using strd if possible. */
if (use_strd_p)
{
val_reg = force_reg (DImode, val_exp);
reg = val_reg;
for (; (i + 8 <= length); i += 8)
{
addr = plus_constant (Pmode, dst, i);
mem = adjust_automodify_address (dstbase, DImode, addr, i);
emit_move_insn (mem, reg);
}
}
else
val_reg = force_reg (SImode, val_exp);
/* Handle words.  When the constant lives in a DImode register only
   its low SImode part is stored.  */
reg = (use_strd_p ? gen_lowpart (SImode, val_reg) : val_reg);
for (; (i + 4 <= length); i += 4)
{
addr = plus_constant (Pmode, dst, i);
mem = adjust_automodify_address (dstbase, SImode, addr, i);
/* A plain move is used only for a word-aligned destination.  */
if ((align & 3) == 0)
emit_move_insn (mem, reg);
else
emit_insn (gen_unaligned_storesi (mem, reg));
}
/* Merge last pair of STRH and STRB into a STR if possible.  With
   exactly three bytes left and at least one store already done, a
   single word store starting one byte earlier covers the tail.  */
if (unaligned_access && i > 0 && (i + 3) == length)
{
addr = plus_constant (Pmode, dst, i - 1);
mem = adjust_automodify_address (dstbase, SImode, addr, i - 1);
/* We are shifting one byte back, set the alignment accordingly. */
if ((align & 1) == 0)
set_mem_align (mem, BITS_PER_UNIT);
/* Most likely this is an unaligned access, and we can't tell at
compilation time. */
emit_insn (gen_unaligned_storesi (mem, reg));
return true;
}
/* Handle half word leftover. */
if (i + 2 <= length)
{
reg = gen_lowpart (HImode, val_reg);
addr = plus_constant (Pmode, dst, i);
mem = adjust_automodify_address (dstbase, HImode, addr, i);
/* A plain move is used only when ALIGN is even.  */
if ((align & 1) == 0)
emit_move_insn (mem, reg);
else
emit_insn (gen_unaligned_storehi (mem, reg));
i += 2;
}
/* Handle single byte leftover. */
if (i + 1 == length)
{
reg = gen_lowpart (QImode, val_reg);
addr = plus_constant (Pmode, dst, i);
mem = adjust_automodify_address (dstbase, QImode, addr, i);
emit_move_insn (mem, reg);
}
return true;
}
/* Set a block of memory with vector stores, dispatching to the aligned
   or unaligned worker.  The first LENGTH bytes starting at DSTBASE are
   filled with the byte constant VALUE; ALIGN is the known alignment of
   the destination in bytes.  Return FALSE when a mis-aligned store
   would be required but cannot be used, otherwise whatever the chosen
   worker returns.  */
static bool
arm_block_set_vect (rtx dstbase,
		    unsigned HOST_WIDE_INT length,
		    unsigned HOST_WIDE_INT value,
		    unsigned HOST_WIDE_INT align)
{
  const bool dst_aligned_p = ((align & 3) == 0);
  /* A mis-aligned store is needed if either the start or the end of
     the block is not on a 4-byte boundary.  */
  const bool need_misaligned_p = (!dst_aligned_p || (length & 3) != 0);

  /* Such stores are not used without unaligned access support, nor on
     big-endian targets.  */
  if (need_misaligned_p && (!unaligned_access || BYTES_BIG_ENDIAN))
    return false;

  return (dst_aligned_p
	  ? arm_block_set_aligned_vect (dstbase, length, value, align)
	  : arm_block_set_unaligned_vect (dstbase, length, value, align));
}
/* Expand a string store (memset) inline.  OPERANDS[0] is the
   destination, OPERANDS[1] the number of bytes, OPERANDS[2] the value
   to store and OPERANDS[3] the known alignment of the destination.
   Vector stores are tried first when Neon is available and the tuning
   prefers it; otherwise, or if that fails, core-register stores are
   used.  Return TRUE if the expansion was emitted, FALSE if the caller
   has to fall back to something else.  Only constant lengths of at
   most 64 bytes and constant values are handled.  */
bool
arm_gen_setmem (rtx *operands)
{
  rtx dstbase = operands[0];
  unsigned HOST_WIDE_INT length, value, align;

  if (!(CONST_INT_P (operands[1]) && CONST_INT_P (operands[2])))
    return false;

  length = UINTVAL (operands[1]);
  if (length > 64)
    return false;

  /* Only the low byte of the value is ever stored.  */
  value = (UINTVAL (operands[2]) & 0xFF);
  align = UINTVAL (operands[3]);

  if (TARGET_NEON
      && length >= 8
      && current_tune->string_ops_prefer_neon
      && arm_block_set_vect (dstbase, length, value, align))
    return true;

  /* Wide core-register stores are usable when unaligned accesses are
     allowed or the destination is word aligned.  */
  if (unaligned_access || (align & 3) == 0)
    return arm_block_set_aligned_non_vect (dstbase, length, value, align);

  return arm_block_set_unaligned_non_vect (dstbase, length, value, align);
}
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
static unsigned HOST_WIDE_INT

View File

@ -6716,6 +6716,20 @@
})
;; Inline expansion of a block set (memset).  Operand 0 is the
;; destination block, operand 1 the number of bytes, operand 2 the value
;; to store and operand 3 the known alignment of the destination; the
;; last three must be integer constants.  Only enabled for TARGET_32BIT.
;; All the work is done by arm_gen_setmem; when it declines, the
;; expansion FAILs.
;; NOTE(review): FAIL presumably lets the middle end fall back to its
;; default handling (a library call) -- confirm.
(define_expand "setmemsi"
[(match_operand:BLK 0 "general_operand" "")
(match_operand:SI 1 "const_int_operand" "")
(match_operand:SI 2 "const_int_operand" "")
(match_operand:SI 3 "const_int_operand" "")]
"TARGET_32BIT"
{
if (arm_gen_setmem (operands))
DONE;
FAIL;
})
;; Move a block of memory if it is word aligned and MORE than 2 words long.
;; We could let this apply for blocks of less than this, but it clobbers so
;; many registers that there is then probably a better way.

View File

@ -1,3 +1,16 @@
2014-07-21 Bin Cheng <bin.cheng@arm.com>
PR target/55701
* gcc.target/arm/memset-inline-1.c: New test.
* gcc.target/arm/memset-inline-2.c: New test.
* gcc.target/arm/memset-inline-3.c: New test.
* gcc.target/arm/memset-inline-4.c: New test.
* gcc.target/arm/memset-inline-5.c: New test.
* gcc.target/arm/memset-inline-6.c: New test.
* gcc.target/arm/memset-inline-7.c: New test.
* gcc.target/arm/memset-inline-8.c: New test.
* gcc.target/arm/memset-inline-9.c: New test.
2014-07-21 Tom de Vries <tom@codesourcery.com>
PR target/61827

View File

@ -0,0 +1,39 @@
/* { dg-do run } */
/* { dg-options "-save-temps -O2 -fno-inline" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in the test array.  */
#define LEN (100)
/* Buffer written by foo and inspected by check.  It is defined at file
   scope, so every byte starts out as zero.  */
short a[LEN];
/* Set the first 14 bytes of A to the byte value of -1 (all bits set).
   Both the length and the fill value are literal constants; keep the
   call in exactly this form, it is what the test exercises.  */
void
foo (void)
{
memset (a, -1, 14);
return;
}
/* Abort unless the first IDX bytes of ARR all equal V and the bytes
   from IDX up to (but not including) LEN are all zero.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* The filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
	abort ();
      pos++;
    }

  /* The untouched remainder.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
	abort ();
      pos++;
    }
}
/* Run foo, then verify that exactly the first 14 bytes of A were set
   to -1 and the rest of the array is still zero.  */
int
main (void)
{
  signed char *bytes = (signed char *) a;

  foo ();
  check (bytes, 14, sizeof (a), -1);

  return 0;
}
/* { dg-final { scan-assembler-not "bl?\[ \t\]*memset" { target { ! arm_thumb1_ok } } } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,38 @@
/* { dg-do run } */
/* { dg-options "-save-temps -Os -fno-inline" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in the test array.  */
#define LEN (100)
/* Buffer written by foo and inspected by check.  It is defined at file
   scope, so every byte starts out as zero.  */
short a[LEN];
/* Set the first 14 bytes of A to the byte value of -1 (all bits set).
   Both the length and the fill value are literal constants; keep the
   call in exactly this form, it is what the test exercises.  */
void
foo (void)
{
memset (a, -1, 14);
return;
}
/* Abort unless the first IDX bytes of ARR all equal V and the bytes
   from IDX up to (but not including) LEN are all zero.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* The filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
	abort ();
      pos++;
    }

  /* The untouched remainder.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
	abort ();
      pos++;
    }
}
/* Run foo, then verify that exactly the first 14 bytes of A were set
   to -1 and the rest of the array is still zero.  */
int
main (void)
{
  signed char *bytes = (signed char *) a;

  foo ();
  check (bytes, 14, sizeof (a), -1);

  return 0;
}
/* { dg-final { scan-assembler "bl?\[ \t\]*memset" { target { ! arm_neon } } } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,40 @@
/* { dg-do run } */
/* { dg-options "-save-temps -O2 -fno-inline" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in the test array.  */
#define LEN (100)
/* Buffer written by foo and inspected by check.  It is defined at file
   scope, so every byte starts out as zero.  */
short a[LEN];
/* Set the first 7 bytes of A to the byte value of -1 (all bits set).
   Both the length and the fill value are literal constants; keep the
   call in exactly this form, it is what the test exercises.  */
void
foo (void)
{
memset (a, -1, 7);
return;
}
/* Abort unless the first IDX bytes of ARR all equal V and the bytes
   from IDX up to (but not including) LEN are all zero.  This function
   only reads ARR; it never stores to it.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* The filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
	abort ();
      pos++;
    }

  /* The untouched remainder.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
	abort ();
      pos++;
    }
}
/* Run foo, then verify that exactly the first 7 bytes of A were set
   to -1 and the rest of the array is still zero.  */
int
main (void)
{
  signed char *bytes = (signed char *) a;

  foo ();
  check (bytes, 7, sizeof (a), -1);

  return 0;
}
/* { dg-final { scan-assembler-not "bl?\[ \t\]*memset" { target { ! arm_thumb1_ok } } } } */
/* { dg-final { scan-assembler-not "strh" { target { ! arm_thumb1 } } } } */
/* { dg-final { scan-assembler-not "strb" { target { ! arm_thumb1 } } } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,68 @@
/* { dg-do run } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */
/* { dg-options "-save-temps -O2 -fno-inline" } */
/* { dg-add-options "arm_neon" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in each test array.  */
#define LEN (100)
/* One destination buffer per fooN function.  They are defined at file
   scope, so every byte starts out as zero.  */
int a[LEN];
int b[LEN];
int c[LEN];
/* Set the first 8 bytes of A to the byte value of -1 (all bits set).
   Both the length and the fill value are literal constants; keep the
   call in exactly this form, it is what the test exercises.  */
void
foo1 (void)
{
memset (a, -1, 8);
return;
}
/* Set the first 12 bytes of B to the byte value 1.  Both the length
   and the fill value are literal constants; keep the call in exactly
   this form, it is what the test exercises.  */
void
foo2 (void)
{
memset (b, 1, 12);
return;
}
/* Set the first 13 bytes of C to the byte value 1.  The length is not
   a multiple of 4.  Keep the call in exactly this form, it is what the
   test exercises.  */
void
foo3 (void)
{
memset (c, 1, 13);
return;
}
/* Abort unless the first IDX bytes of ARR all equal V and the bytes
   from IDX up to (but not including) LEN are all zero.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* The filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
	abort ();
      pos++;
    }

  /* The untouched remainder.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
	abort ();
      pos++;
    }
}
/* Run each fooN in turn and verify that exactly the requested prefix
   of its buffer was filled with the requested byte, while the rest of
   the buffer is still zero.  */
int
main (void)
{
  foo1 ();
  check ((signed char *) a, 8, sizeof (a), -1);

  foo2 ();
  check ((signed char *) b, 12, sizeof (b), 1);

  foo3 ();
  check ((signed char *) c, 13, sizeof (c), 1);

  return 0;
}
/* { dg-final { scan-assembler-not "bl?\[ \t\]+memset" { target { ! arm_thumb1_ok } } } } */
/* { dg-final { scan-assembler-times "vst1\.8" 1 { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { scan-assembler "vstr" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,78 @@
/* { dg-do run } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */
/* { dg-options "-save-temps -O2 -fno-inline" } */
/* { dg-add-options "arm_neon" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in each test array.  */
#define LEN (100)
/* One destination buffer per fooN function.  They are defined at file
   scope, so every byte starts out as zero.  */
int a[LEN];
int b[LEN];
int c[LEN];
int d[LEN];
/* Set the first 16 bytes of A to the byte value of -1 (all bits set).
   Both the length and the fill value are literal constants; keep the
   call in exactly this form, it is what the test exercises.  */
void
foo1 (void)
{
memset (a, -1, 16);
return;
}
/* Set the first 25 bytes of B to the byte value 1.  Keep the call in
   exactly this form, it is what the test exercises.  */
void
foo2 (void)
{
memset (b, 1, 25);
return;
}
/* Set the first 19 bytes of C to the byte value of -1 (all bits set).
   Keep the call in exactly this form, it is what the test exercises.  */
void
foo3 (void)
{
memset (c, -1, 19);
return;
}
/* Set the first 23 bytes of D to the byte value 1.  Keep the call in
   exactly this form, it is what the test exercises.  */
void
foo4 (void)
{
memset (d, 1, 23);
return;
}
/* Abort unless the first IDX bytes of ARR all equal V and the bytes
   from IDX up to (but not including) LEN are all zero.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* The filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
	abort ();
      pos++;
    }

  /* The untouched remainder.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
	abort ();
      pos++;
    }
}
/* Run each fooN in turn and verify that exactly the requested prefix
   of its buffer was filled with the requested byte, while the rest of
   the buffer is still zero.  */
int
main (void)
{
  signed char *bytes_a = (signed char *) a;
  signed char *bytes_b = (signed char *) b;
  signed char *bytes_c = (signed char *) c;
  signed char *bytes_d = (signed char *) d;

  foo1 ();
  check (bytes_a, 16, sizeof (a), -1);

  foo2 ();
  check (bytes_b, 25, sizeof (b), 1);

  foo3 ();
  check (bytes_c, 19, sizeof (c), -1);

  foo4 ();
  check (bytes_d, 23, sizeof (d), 1);

  return 0;
}
/* { dg-final { scan-assembler-not "bl?\[ \t\]+memset" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { scan-assembler "vst1" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { scan-assembler-not "vstr" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,68 @@
/* { dg-do run } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */
/* { dg-options "-save-temps -O2 -fno-inline" } */
/* { dg-add-options "arm_neon" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in each destination array.  */
#define LEN (100)
/* Destination buffers, one per fooN routine.  They are file-scope objects
   without an initializer, so they start out all-zero; check relies on
   that for every byte past the filled prefix.  */
int a[LEN];
int b[LEN];
int c[LEN];
/* Set the first 20 bytes of the global array `a' to the byte value 0xff,
   which reads back as -1 through a signed char pointer.  All three memset
   arguments are literals.  */
void
foo1 (void)
{
memset (a, -1, 20);
return;
}
/* Set the first 24 bytes of the global array `b' to the byte value 1.
   All three memset arguments are literals.  */
void
foo2 (void)
{
memset (b, 1, 24);
return;
}
/* Set the first 32 bytes of the global array `c' to the byte value 0xff,
   which reads back as -1 through a signed char pointer.  All three memset
   arguments are literals.  */
void
foo3 (void)
{
memset (c, -1, 32);
return;
}
/* Validate the LEN-byte buffer ARR: bytes [0, IDX) must all equal V and
   bytes [IDX, LEN) must all be zero.  Calls abort on the first byte that
   does not match; returns normally otherwise.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* Filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
        abort ();
      pos++;
    }

  /* Remainder that must still be zero; always starts at IDX.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
        abort ();
      pos++;
    }
}
/* Test driver: call each fill routine and check its destination array
   immediately afterwards.  The second argument of every check call is the
   same byte count that the matching fooN passes to memset and the fourth
   is the same fill value; the third is the total size of the array in
   bytes.  Returns 0 when no check call aborts.  */
int
main(void)
{
foo1 ();
check ((signed char *)a, 20, sizeof (a), -1);
foo2 ();
check ((signed char *)b, 24, sizeof (b), 1);
foo3 ();
check ((signed char *)c, 32, sizeof (c), -1);
return 0;
}
/* { dg-final { scan-assembler-not "bl?\[ \t\]+memset" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { scan-assembler-times "vst1" 3 { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { scan-assembler-times "vstr" 4 { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,171 @@
/* { dg-do run } */
/* { dg-options "-O2" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in each destination array.  */
#define LEN (100)
/* Destination buffers used by the TEST invocations in main: one with
   short elements and one with int elements.  TEST clears the whole array
   with init before every memset.  */
short a[LEN];
int b[LEN];
/* Clear the first LEN bytes of ARR to zero, one byte at a time.  Does
   nothing when LEN is zero or negative.  */
void
init (signed char *arr, int len)
{
  signed char *cur = arr;
  int remaining = len;

  while (remaining > 0)
    {
      *cur++ = 0;
      remaining--;
    }
}
/* Validate the LEN-byte buffer ARR: bytes [0, IDX) must all equal V and
   bytes [IDX, LEN) must all be zero.  Calls abort on the first byte that
   does not match; returns normally otherwise.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* Filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
        abort ();
      pos++;
    }

  /* Remainder that must still be zero; always starts at IDX.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
        abort ();
      pos++;
    }
}
/* Run one memset test on array A: clear the whole array with init, set
   its first L bytes to V with memset, then use check to verify that those
   L bytes hold V and the rest of the array is still zero.  A must be an
   actual array, not a pointer, because sizeof (a) supplies its size.  L
   and V are each expanded twice, so they should have no side effects.
   The body is wrapped in do { } while (0) so that an invocation followed
   by a semicolon forms exactly one statement, even under an unbraced
   if/else.  */
#define TEST(a,l,v) \
  do \
    { \
      init ((signed char*)(a), sizeof (a)); \
      memset ((a), (v), (l)); \
      check ((signed char *)(a), (l), sizeof (a), (v)); \
    } \
  while (0)
/* Test driver.  For every length from 1 to 64, run TEST on `a' and then,
   in a second pass, on `b': each TEST clears the array, memsets that many
   leading bytes and verifies the result.  Lengths 1-8 and all even
   lengths use the fill value -1; odd lengths from 9 upward use 1.  Every
   length and value is written out as a literal rather than produced by a
   loop.  Returns 0 when no check aborts.
   NOTE(review): presumably the literals are deliberate so that the
   compiler sees constant memset arguments, and the two element types are
   meant to give different destination alignments -- confirm.  */
int
main(void)
{
/* Lengths 1..64 on the short array.  */
TEST (a, 1, -1);
TEST (a, 2, -1);
TEST (a, 3, -1);
TEST (a, 4, -1);
TEST (a, 5, -1);
TEST (a, 6, -1);
TEST (a, 7, -1);
TEST (a, 8, -1);
TEST (a, 9, 1);
TEST (a, 10, -1);
TEST (a, 11, 1);
TEST (a, 12, -1);
TEST (a, 13, 1);
TEST (a, 14, -1);
TEST (a, 15, 1);
TEST (a, 16, -1);
TEST (a, 17, 1);
TEST (a, 18, -1);
TEST (a, 19, 1);
TEST (a, 20, -1);
TEST (a, 21, 1);
TEST (a, 22, -1);
TEST (a, 23, 1);
TEST (a, 24, -1);
TEST (a, 25, 1);
TEST (a, 26, -1);
TEST (a, 27, 1);
TEST (a, 28, -1);
TEST (a, 29, 1);
TEST (a, 30, -1);
TEST (a, 31, 1);
TEST (a, 32, -1);
TEST (a, 33, 1);
TEST (a, 34, -1);
TEST (a, 35, 1);
TEST (a, 36, -1);
TEST (a, 37, 1);
TEST (a, 38, -1);
TEST (a, 39, 1);
TEST (a, 40, -1);
TEST (a, 41, 1);
TEST (a, 42, -1);
TEST (a, 43, 1);
TEST (a, 44, -1);
TEST (a, 45, 1);
TEST (a, 46, -1);
TEST (a, 47, 1);
TEST (a, 48, -1);
TEST (a, 49, 1);
TEST (a, 50, -1);
TEST (a, 51, 1);
TEST (a, 52, -1);
TEST (a, 53, 1);
TEST (a, 54, -1);
TEST (a, 55, 1);
TEST (a, 56, -1);
TEST (a, 57, 1);
TEST (a, 58, -1);
TEST (a, 59, 1);
TEST (a, 60, -1);
TEST (a, 61, 1);
TEST (a, 62, -1);
TEST (a, 63, 1);
TEST (a, 64, -1);
/* Lengths 1..64 on the int array.  */
TEST (b, 1, -1);
TEST (b, 2, -1);
TEST (b, 3, -1);
TEST (b, 4, -1);
TEST (b, 5, -1);
TEST (b, 6, -1);
TEST (b, 7, -1);
TEST (b, 8, -1);
TEST (b, 9, 1);
TEST (b, 10, -1);
TEST (b, 11, 1);
TEST (b, 12, -1);
TEST (b, 13, 1);
TEST (b, 14, -1);
TEST (b, 15, 1);
TEST (b, 16, -1);
TEST (b, 17, 1);
TEST (b, 18, -1);
TEST (b, 19, 1);
TEST (b, 20, -1);
TEST (b, 21, 1);
TEST (b, 22, -1);
TEST (b, 23, 1);
TEST (b, 24, -1);
TEST (b, 25, 1);
TEST (b, 26, -1);
TEST (b, 27, 1);
TEST (b, 28, -1);
TEST (b, 29, 1);
TEST (b, 30, -1);
TEST (b, 31, 1);
TEST (b, 32, -1);
TEST (b, 33, 1);
TEST (b, 34, -1);
TEST (b, 35, 1);
TEST (b, 36, -1);
TEST (b, 37, 1);
TEST (b, 38, -1);
TEST (b, 39, 1);
TEST (b, 40, -1);
TEST (b, 41, 1);
TEST (b, 42, -1);
TEST (b, 43, 1);
TEST (b, 44, -1);
TEST (b, 45, 1);
TEST (b, 46, -1);
TEST (b, 47, 1);
TEST (b, 48, -1);
TEST (b, 49, 1);
TEST (b, 50, -1);
TEST (b, 51, 1);
TEST (b, 52, -1);
TEST (b, 53, 1);
TEST (b, 54, -1);
TEST (b, 55, 1);
TEST (b, 56, -1);
TEST (b, 57, 1);
TEST (b, 58, -1);
TEST (b, 59, 1);
TEST (b, 60, -1);
TEST (b, 61, 1);
TEST (b, 62, -1);
TEST (b, 63, 1);
TEST (b, 64, -1);
return 0;
}

View File

@ -0,0 +1,44 @@
/* { dg-do run } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */
/* { dg-options "-save-temps -O2 -fno-inline" } */
/* { dg-add-options "arm_neon" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in the destination array.  */
#define LEN (100)
/* Destination buffer for foo.  It is a file-scope object without an
   initializer, so it starts out all-zero; check relies on that for every
   byte past the filled prefix.  */
short a[LEN];
/* Set the first 14 bytes of the global short array `a' to the byte value
   0xff, which reads back as -1 through a signed char pointer.  All three
   memset arguments are literals.  */
void
foo (void)
{
memset (a, -1, 14);
return;
}
/* Validate the LEN-byte buffer ARR: bytes [0, IDX) must all equal V and
   bytes [IDX, LEN) must all be zero.  Calls abort on the first byte that
   does not match; returns normally otherwise.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* Filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
        abort ();
      pos++;
    }

  /* Remainder that must still be zero; always starts at IDX.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
        abort ();
      pos++;
    }
}
/* Test driver: run foo, then check that the first 14 bytes of `a' read as
   -1 and that every remaining byte of the array is still zero.  The 14
   and -1 match the arguments foo passes to memset.  Returns 0 when check
   does not abort.  */
int
main(void)
{
foo ();
check ((signed char *)a, 14, sizeof (a), -1);
return 0;
}
/* { dg-final { scan-assembler-not "bl?\[ \t\]*memset" { target { ! arm_thumb1_ok } } } } */
/* { dg-final { scan-assembler "vst1" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { scan-assembler-not "vstr" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,42 @@
/* { dg-do run } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mcpu=cortex-a9" } { "" } } */
/* { dg-skip-if "Don't inline memset using neon instructions on cortex-a9" { *-*-* } { "-mtune=cortex-a9" } { "" } } */
/* { dg-options "-save-temps -Os -fno-inline" } */
/* { dg-add-options "arm_neon" } */
#include <string.h>
#include <stdlib.h>
/* Number of elements in the destination array.  */
#define LEN (100)
/* Destination buffer for foo.  It is a file-scope object without an
   initializer, so it starts out all-zero; check relies on that for every
   byte past the filled prefix.  */
short a[LEN];
/* Set the first 14 bytes of the global short array `a' to the byte value
   0xff, which reads back as -1 through a signed char pointer.  All three
   memset arguments are literals.  */
void
foo (void)
{
memset (a, -1, 14);
return;
}
/* Validate the LEN-byte buffer ARR: bytes [0, IDX) must all equal V and
   bytes [IDX, LEN) must all be zero.  Calls abort on the first byte that
   does not match; returns normally otherwise.  */
void
check (signed char *arr, int idx, int len, int v)
{
  int pos = 0;

  /* Filled prefix.  */
  while (pos < idx)
    {
      if (arr[pos] != v)
        abort ();
      pos++;
    }

  /* Remainder that must still be zero; always starts at IDX.  */
  pos = idx;
  while (pos < len)
    {
      if (arr[pos] != 0)
        abort ();
      pos++;
    }
}
/* Test driver: run foo, then check that the first 14 bytes of `a' read as
   -1 and that every remaining byte of the array is still zero.  The 14
   and -1 match the arguments foo passes to memset.  Returns 0 when check
   does not abort.  */
int
main(void)
{
foo ();
check ((signed char *)a, 14, sizeof (a), -1);
return 0;
}
/* { dg-final { scan-assembler-not "bl?\[ \t\]*memset" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { scan-assembler "vst1" { target { arm_little_endian && arm_neon } } } } */
/* { dg-final { cleanup-saved-temps } } */