i386.c (ix86_expand_clrmem): Move gen_cld down to the places where it is actually needed.
* config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to the places where it is actually needed. Don't use repz; stosb for -Os with sufficiently small constant sizes. For sufficiently small repz; stos{l,q} repeat counts use a sequence of stos{l,q} instructions instead. From-SVN: r85635
This commit is contained in:
parent
6797f908ee
commit
6b32b6286b
|
@ -1,3 +1,11 @@
|
||||||
|
2004-08-06 Jakub Jelinek <jakub@redhat.com>
|
||||||
|
|
||||||
|
* config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to
|
||||||
|
the places where it is actually needed. Don't use repz; stosb
|
||||||
|
for -Os with sufficiently small constant sizes.
|
||||||
|
For sufficiently small repz; stos{l,q} repeat counts use a sequence
|
||||||
|
of stos{l,q} instructions instead.
|
||||||
|
|
||||||
2004-08-06 Zdenek Dvorak <rakdver@atrey.karlin.mff.cuni.cz>
|
2004-08-06 Zdenek Dvorak <rakdver@atrey.karlin.mff.cuni.cz>
|
||||||
|
|
||||||
PR tree-optimization/16807
|
PR tree-optimization/16807
|
||||||
|
|
|
@ -11508,13 +11508,20 @@ ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
|
||||||
if (destreg != XEXP (dst, 0))
|
if (destreg != XEXP (dst, 0))
|
||||||
dst = replace_equiv_address_nv (dst, destreg);
|
dst = replace_equiv_address_nv (dst, destreg);
|
||||||
|
|
||||||
emit_insn (gen_cld ());
|
|
||||||
|
|
||||||
/* When optimizing for size emit simple rep ; stosb instruction for
|
/* When optimizing for size emit simple rep ; stosb instruction for
|
||||||
counts not divisible by 4. */
|
counts not divisible by 4. The movl $N, %ecx; rep; stosb
|
||||||
|
sequence is 7 bytes long, so if optimizing for size and count is
|
||||||
|
small enough that some stosl, stosw and stosb instructions without
|
||||||
|
rep are shorter, fall back into the next if. */
|
||||||
|
|
||||||
if ((!optimize || optimize_size) && (count == 0 || (count & 0x03)))
|
if ((!optimize || optimize_size)
|
||||||
|
&& (count == 0
|
||||||
|
|| ((count & 0x03)
|
||||||
|
&& (!optimize_size || (count & 0x03) + (count >> 2) > 7))))
|
||||||
{
|
{
|
||||||
|
emit_insn (gen_cld ());
|
||||||
|
|
||||||
countreg = ix86_zero_extend_to_Pmode (count_exp);
|
countreg = ix86_zero_extend_to_Pmode (count_exp);
|
||||||
zeroreg = copy_to_mode_reg (QImode, const0_rtx);
|
zeroreg = copy_to_mode_reg (QImode, const0_rtx);
|
||||||
destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
|
destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
|
||||||
|
@ -11528,17 +11535,54 @@ ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
|
||||||
int size = TARGET_64BIT && !optimize_size ? 8 : 4;
|
int size = TARGET_64BIT && !optimize_size ? 8 : 4;
|
||||||
unsigned HOST_WIDE_INT offset = 0;
|
unsigned HOST_WIDE_INT offset = 0;
|
||||||
|
|
||||||
|
emit_insn (gen_cld ());
|
||||||
|
|
||||||
zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
|
zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
|
||||||
if (count & ~(size - 1))
|
if (count & ~(size - 1))
|
||||||
{
|
{
|
||||||
countreg = copy_to_mode_reg (counter_mode,
|
unsigned HOST_WIDE_INT repcount;
|
||||||
GEN_INT ((count >> (size == 4 ? 2 : 3))
|
unsigned int max_nonrep;
|
||||||
& (TARGET_64BIT ? -1 : 0x3fffffff)));
|
|
||||||
countreg = ix86_zero_extend_to_Pmode (countreg);
|
repcount = count >> (size == 4 ? 2 : 3);
|
||||||
destexp = gen_rtx_ASHIFT (Pmode, countreg, GEN_INT (size == 4 ? 2 : 3));
|
if (!TARGET_64BIT)
|
||||||
destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
|
repcount &= 0x3fffffff;
|
||||||
emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp));
|
|
||||||
offset = count & ~(size - 1);
|
/* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes.
|
||||||
|
movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN
|
||||||
|
bytes. In both cases the latter seems to be faster for small
|
||||||
|
values of N. */
|
||||||
|
max_nonrep = size == 4 ? 7 : 4;
|
||||||
|
if (!optimize_size)
|
||||||
|
switch (ix86_tune)
|
||||||
|
{
|
||||||
|
case PROCESSOR_PENTIUM4:
|
||||||
|
case PROCESSOR_NOCONA:
|
||||||
|
max_nonrep = 3;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (repcount <= max_nonrep)
|
||||||
|
while (repcount-- > 0)
|
||||||
|
{
|
||||||
|
rtx mem = adjust_automodify_address_nv (dst,
|
||||||
|
GET_MODE (zeroreg),
|
||||||
|
destreg, offset);
|
||||||
|
emit_insn (gen_strset (destreg, mem, zeroreg));
|
||||||
|
offset += size;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount));
|
||||||
|
countreg = ix86_zero_extend_to_Pmode (countreg);
|
||||||
|
destexp = gen_rtx_ASHIFT (Pmode, countreg,
|
||||||
|
GEN_INT (size == 4 ? 2 : 3));
|
||||||
|
destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
|
||||||
|
emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg,
|
||||||
|
destexp));
|
||||||
|
offset = count & ~(size - 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (size == 8 && (count & 0x04))
|
if (size == 8 && (count & 0x04))
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue