i386.c (ix86_expand_clrmem): Move gen_cld down to the places where it is actually needed.

* config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to
	the places where it is actually needed.  Don't use repz; stosb
	for -Os with sufficiently small constant sizes.
	For sufficiently small repz; stos{l,q} repeat counts use a sequence
	of stos{l,q} instructions instead.

From-SVN: r85635
This commit is contained in:
Jakub Jelinek 2004-08-06 12:17:14 +02:00 committed by Jakub Jelinek
parent 6797f908ee
commit 6b32b6286b
2 changed files with 63 additions and 11 deletions

View File

@ -1,3 +1,11 @@
2004-08-06 Jakub Jelinek <jakub@redhat.com>
* config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to
the places where it is actually needed. Don't use repz; stosb
for -Os with sufficiently small constant sizes.
For sufficiently small repz; stos{l,q} repeat counts use a sequence
of stos{l,q} instructions instead.
2004-08-06 Zdenek Dvorak <rakdver@atrey.karlin.mff.cuni.cz>
PR tree-optimization/16807

View File

@ -11508,13 +11508,20 @@ ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
if (destreg != XEXP (dst, 0))
dst = replace_equiv_address_nv (dst, destreg);
emit_insn (gen_cld ());
/* When optimizing for size emit simple rep ; stosb instruction for
counts not divisible by 4. */
counts not divisible by 4. The movl $N, %ecx; rep; stosb
sequence is 7 bytes long, so if optimizing for size and count is
small enough that some stosl, stosw and stosb instructions without
rep are shorter, fall back into the next if. */
if ((!optimize || optimize_size) && (count == 0 || (count & 0x03)))
if ((!optimize || optimize_size)
&& (count == 0
|| ((count & 0x03)
&& (!optimize_size || (count & 0x03) + (count >> 2) > 7))))
{
emit_insn (gen_cld ());
countreg = ix86_zero_extend_to_Pmode (count_exp);
zeroreg = copy_to_mode_reg (QImode, const0_rtx);
destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
@ -11528,17 +11535,54 @@ ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
int size = TARGET_64BIT && !optimize_size ? 8 : 4;
unsigned HOST_WIDE_INT offset = 0;
emit_insn (gen_cld ());
zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
if (count & ~(size - 1))
{
countreg = copy_to_mode_reg (counter_mode,
GEN_INT ((count >> (size == 4 ? 2 : 3))
& (TARGET_64BIT ? -1 : 0x3fffffff)));
countreg = ix86_zero_extend_to_Pmode (countreg);
destexp = gen_rtx_ASHIFT (Pmode, countreg, GEN_INT (size == 4 ? 2 : 3));
destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp));
offset = count & ~(size - 1);
unsigned HOST_WIDE_INT repcount;
unsigned int max_nonrep;
repcount = count >> (size == 4 ? 2 : 3);
if (!TARGET_64BIT)
repcount &= 0x3fffffff;
/* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes.
movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN
bytes. In both cases the latter seems to be faster for small
values of N. */
max_nonrep = size == 4 ? 7 : 4;
if (!optimize_size)
switch (ix86_tune)
{
case PROCESSOR_PENTIUM4:
case PROCESSOR_NOCONA:
max_nonrep = 3;
break;
default:
break;
}
if (repcount <= max_nonrep)
while (repcount-- > 0)
{
rtx mem = adjust_automodify_address_nv (dst,
GET_MODE (zeroreg),
destreg, offset);
emit_insn (gen_strset (destreg, mem, zeroreg));
offset += size;
}
else
{
countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount));
countreg = ix86_zero_extend_to_Pmode (countreg);
destexp = gen_rtx_ASHIFT (Pmode, countreg,
GEN_INT (size == 4 ? 2 : 3));
destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg,
destexp));
offset = count & ~(size - 1);
}
}
if (size == 8 && (count & 0x04))
{