i386.md (memstr): Do not use rep stosb for counts divisible by 4 when optimize_size.

* i386.md (memstr): Do not use rep stosb for counts divisible by 4
	when optimize_size.
	(clrstrsi): Rewrite.
	(strsethi, strsetqi): New expanders.
	(strsethi_1, strsetqi_1, rep_stossi, rep_stosqi): New insn patterns.
	(cmpstrsi): Emit compare insn before cmpstrsi_1.
	(cmpstrsi_nz): Use flags, set type to str, prefix_length to 1.
	(strlensi_1): Likewise.
	(cmpstrsi_1): Likewise; do not output compare.
	(strlen expander): Do not unroll when optimizing for size.
	(*subsi3_carry): Rename to subsi3_carry.
	(addqi3_cc): New pattern.
	* i386.h (processor_costs): Add move_ratio field.
	(MOVE_RATIO): Use move_ratio field, set to 3 for OPTIMIZE_SIZE.
	* i386.c (*_cost): Set move_ratio.
	(x86_unroll_strlen): Enable for Athlon, PPro and K6 too.
	(x86_expand_strlensi_1): Rewrite the main loop.

From-SVN: r31488
Jan Hubicka 2000-01-18 16:25:05 +01:00 committed by Jan Hubicka
parent b9f243c201
commit e2e52e1be7
4 changed files with 238 additions and 121 deletions

gcc/ChangeLog

@@ -1,3 +1,23 @@
Tue Jan 18 16:19:55 MET 2000 Jan Hubicka <hubicka@freesoft.cz>
* i386.md (memstr): Do not use rep stosb for counts divisible by 4
when optimize_size.
(clrstrsi): Rewrite.
(strsethi, strsetqi): New expanders.
(strsethi_1, strsetqi_1, rep_stossi, rep_stosqi): New insn patterns.
(cmpstrsi): Emit compare insn before cmpstrsi_1.
(cmpstrsi_nz): Use flags, set type to str, prefix_length to 1.
(strlensi_1): Likewise.
(cmpstrsi_1): Likewise; do not output compare.
(strlen expander): Do not unroll when optimizing for size.
(*subsi3_carry): Rename to subsi3_carry.
(addqi3_cc): New pattern.
* i386.h (processor_costs): Add move_ratio field.
(MOVE_RATIO): Use move_ratio field, set to 3 for OPTIMIZE_SIZE.
* i386.c (*_cost): Set move_ratio.
(x86_unroll_strlen): Enable for Athlon, PPro and K6 too.
(x86_expand_strlensi_1): Rewrite the main loop.
2000-01-17 Richard Henderson <rth@cygnus.com>
* combine.c (combine_simplify_rtx): Give FLOAT_STORE_FLAG_VALUE a mode.

gcc/config/i386/i386.c

@@ -64,6 +64,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */
1, /* cost of multiply per each bit set */
23, /* cost of a divide/mod */
15, /* "large" insn */
3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
@@ -84,6 +85,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */
1, /* cost of multiply per each bit set */
40, /* cost of a divide/mod */
15, /* "large" insn */
3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
@@ -104,6 +106,7 @@ struct processor_costs pentium_cost = {
0, /* cost of multiply per each bit set */
25, /* cost of a divide/mod */
8, /* "large" insn */
6, /* MOVE_RATIO */
6, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
@@ -124,6 +127,7 @@ struct processor_costs pentiumpro_cost = {
0, /* cost of multiply per each bit set */
17, /* cost of a divide/mod */
8, /* "large" insn */
6, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
@@ -144,6 +148,7 @@ struct processor_costs k6_cost = {
0, /* cost of multiply per each bit set */
18, /* cost of a divide/mod */
8, /* "large" insn */
4, /* MOVE_RATIO */
3, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
@@ -164,6 +169,7 @@ struct processor_costs athlon_cost = {
0, /* cost of multiply per each bit set */
19, /* cost of a divide/mod */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
@@ -191,7 +197,7 @@ const int x86_zero_extend_with_and = m_486 | m_PENT;
const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
const int x86_double_with_add = ~m_386;
const int x86_use_bit_test = m_386;
const int x86_unroll_strlen = m_486 | m_PENT;
const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
const int x86_use_any_reg = m_486;
const int x86_cmove = m_PPRO | m_ATHLON;
@@ -5149,10 +5155,9 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
rtx align_3_label = NULL_RTX;
rtx align_4_label = gen_label_rtx ();
rtx end_0_label = gen_label_rtx ();
rtx end_2_label = gen_label_rtx ();
rtx end_3_label = gen_label_rtx ();
rtx mem;
rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
rtx tmpreg = gen_reg_rtx (SImode);
align = 0;
if (GET_CODE (align_rtx) == CONST_INT)
@@ -5269,48 +5274,69 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
mem = gen_rtx_MEM (SImode, out);
emit_move_insn (scratch, mem);
/* Check first byte. */
emit_insn (gen_cmpqi_0 (gen_lowpart (QImode, scratch), const0_rtx));
tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, end_0_label),
pc_rtx);
emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
/* Check second byte. */
emit_insn (gen_cmpqi_ext_3 (scratch, const0_rtx));
tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, end_3_label),
pc_rtx);
emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
/* Check third byte. */
emit_insn (gen_testsi_1 (scratch, GEN_INT (0x00ff0000)));
tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, end_2_label),
pc_rtx);
emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
/* Check fourth byte and increment address. */
emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
emit_insn (gen_testsi_1 (scratch, GEN_INT (0xff000000)));
tmp = gen_rtx_NE (VOIDmode, flags, const0_rtx);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, align_4_label),
pc_rtx);
emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
/* Now generate fixups when the compare stops within a 4-byte word. */
emit_insn (gen_subsi3 (out, out, GEN_INT (3)));
emit_label (end_2_label);
emit_insn (gen_addsi3 (out, out, const1_rtx));
/* This formula yields a nonzero result iff one of the bytes is zero.
This saves three branches inside the loop and many cycles. */
emit_label (end_3_label);
emit_insn (gen_addsi3 (out, out, const1_rtx));
emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
emit_insn (gen_one_cmplsi2 (scratch, scratch));
emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
emit_insn (gen_andsi3 (tmpreg, tmpreg, GEN_INT (0x80808080)));
emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, 0, align_4_label);
if (TARGET_CMOVE)
{
rtx reg = gen_reg_rtx (SImode);
emit_move_insn (reg, tmpreg);
emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
/* If zero is not in the first two bytes, move two bytes forward. */
emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
gen_rtx_IF_THEN_ELSE (SImode, tmp,
reg,
tmpreg)));
/* Emit lea manually to avoid clobbering of flags. */
emit_insn (gen_rtx_SET (SImode, reg,
gen_rtx_PLUS (SImode, out, GEN_INT (2))));
tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
emit_insn (gen_rtx_SET (VOIDmode, out,
gen_rtx_IF_THEN_ELSE (SImode, tmp,
reg,
out)));
}
else
{
rtx end_2_label = gen_label_rtx ();
/* Is zero in the first two bytes? */
emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, end_2_label),
pc_rtx);
tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
JUMP_LABEL (tmp) = end_2_label;
/* Not in the first two. Move two bytes forward. */
emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
emit_label (end_2_label);
}
/* Avoid branch in fixing the byte. */
tmpreg = gen_lowpart (QImode, tmpreg);
emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
emit_label (end_0_label);
}
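The net effect of the rewritten loop is easier to follow in C. Below is a minimal model of the emitted sequence, assuming little-endian SImode words as on x86; the function and variable names are illustrative, not from the patch:

#include <stdint.h>

/* Index (0-3) of the first zero byte in the little-endian word W,
   or -1 if W contains none.  Mirrors the RTL emitted above.  */
static int
first_zero_byte (uint32_t w)
{
  /* Nonzero iff some byte of W is zero.  The 0x80 bit of the first
     (lowest) zero byte is reliably set and all lower bits are clear;
     bits above it may be garbage, which is why the low half is
     tested first.  Example: w = 0x41420043 (bytes 43 00 42 41)
     gives t = 0x00008000.  */
  uint32_t t = (w - 0x01010101u) & ~w & 0x80808080u;
  int i = 0;

  if (t == 0)
    return -1;			/* No zero byte: the RTL loops again.  */
  if ((t & 0x8080u) == 0)	/* Zero is not in the low two bytes,  */
    {
      t >>= 16;			/* so step past them (cmove or jump).  */
      i = 2;
    }
  /* Branch-free last step, as in addqi3_cc + subsi3_carry: doubling
     the low byte of T shifts its 0x80 bit into the carry flag, and
     the sbb then selects the first or the second byte of the pair.  */
  return i + ((t & 0x80u) ? 0 : 1);
}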

gcc/config/i386/i386.h

@@ -62,6 +62,8 @@ struct processor_costs {
int mult_bit; /* cost of multiply per each bit set */
int divide; /* cost of a divide/mod */
int large_insn; /* insns larger than this cost more */
int move_ratio; /* Threshold number of scalar memory-to-memory
move insns; above it, a block move is used. */
int movzbl_load; /* cost of loading using movzbl */
int int_load[3]; /* cost of loading integer registers
in QImode, HImode and SImode relative
@@ -1709,13 +1711,9 @@ while (0)
Increasing the value will always make code faster, but eventually
incurs high cost in increased code size.
If you don't define this, a reasonable default is used.
If you don't define this, a reasonable default is used. */
Make this large on i386, since the block move is very inefficient with small
blocks, and the hard register needs of the block move require much reload
work. */
#define MOVE_RATIO 5
#define MOVE_RATIO (optimize_size ? 3 : ix86_cost->move_ratio)
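In concrete terms, move_by_pieces is chosen when the estimated number of scalar moves falls below the ratio, so with the -Os value of 3 (the struct names below are hypothetical, for illustration only):

/* A sketch of the -Os threshold, assuming 4-byte alignment so each
   SImode move covers 4 bytes.  */
struct two   { int a, b; };	/* 2 moves < 3: copied inline.  */
struct three { int a, b, c; };	/* 3 moves: block move or libcall.  */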
/* Define if shifts truncate the shift count
which implies one can omit a sign-extension or zero-extension

gcc/config/i386/i386.md

@@ -3235,6 +3235,15 @@
"add{l}\\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")])
(define_insn "addqi3_cc"
[(set (reg:CC 17) (plus:CC (match_operand:QI 1 "nonimmediate_operand" "%0,0")
(match_operand:QI 2 "general_operand" "ri,rm")))
(set (match_operand:QI 0 "nonimmediate_operand" "=rm,r")
(plus:QI (match_dup 1) (match_dup 2)))]
"ix86_binary_operator_ok (PLUS, QImode, operands)"
"add{b}\\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")])
(define_insn "*addsi3_carry"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
(plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0")
@@ -3736,7 +3745,7 @@
"sub{l}\\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")])
(define_insn "*subsi3_carry"
(define_insn "subsi3_carry"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
(minus:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
(plus:SI (match_operand:SI 2 "general_operand" "ri,rm")
@@ -7841,8 +7850,9 @@
srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
emit_insn (gen_cld());
/* When optimizing for size emit simple rep ; movsb instruction. */
if (!optimize || optimize_size)
/* When optimizing for size, emit a simple rep ; movsb instruction for
counts not divisible by 4. */
if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03))
{
countreg = copy_to_mode_reg (SImode, operands[2]);
emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
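The point of the new divisible-by-4 test: rep movsb and rep movsl are the same size (a one-byte REP prefix plus a one-byte opcode), so when the count is a multiple of 4 the word-sized form costs nothing extra under -Os yet runs a quarter of the iterations; for a hypothetical 64-byte copy that means %ecx = 16 instead of %ecx = 64.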
@@ -7983,84 +7993,143 @@
(set_attr "memory" "both")])
(define_expand "clrstrsi"
[(set (reg:SI 19) (const_int 0))
(set (match_dup 3) (const_int 0))
(parallel [(set (match_operand:BLK 0 "memory_operand" "")
(const_int 0))
(use (match_operand:SI 1 "const_int_operand" ""))
(use (match_operand:SI 2 "const_int_operand" ""))
(use (match_dup 3))
(use (reg:SI 19))
(clobber (match_scratch:SI 4 ""))
(clobber (match_dup 5))])]
[(use (match_operand:BLK 0 "memory_operand" ""))
(use (match_operand:SI 1 "const_int_operand" ""))
(use (match_operand:SI 2 "const_int_operand" ""))]
""
"
{
rtx addr0;
rtx destreg, zeroreg, countreg;
if (GET_CODE (operands[1]) != CONST_INT)
FAIL;
addr0 = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
operands[3] = gen_reg_rtx (SImode);
operands[5] = addr0;
emit_insn (gen_cld());
operands[0] = gen_rtx_MEM (BLKmode, addr0);
/* When optimizing for size, emit a simple rep ; stosb instruction for
counts not divisible by 4. */
if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03))
{
countreg = copy_to_mode_reg (SImode, operands[1]);
zeroreg = copy_to_mode_reg (QImode, const0_rtx);
emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
destreg, countreg));
}
else
{
zeroreg = copy_to_mode_reg (SImode, const0_rtx);
if (INTVAL (operands[1]) & ~0x03)
{
countreg = copy_to_mode_reg (SImode,
GEN_INT ((INTVAL (operands[1]) >> 2)
& 0x3fffffff));
emit_insn (gen_rep_stossi (destreg, countreg, zeroreg,
destreg, countreg));
}
if (INTVAL (operands[1]) & 0x02)
emit_insn (gen_strsethi (destreg,
gen_rtx_SUBREG (HImode, zeroreg, 0)));
if (INTVAL (operands[1]) & 0x01)
emit_insn (gen_strsetqi (destreg,
gen_rtx_SUBREG (QImode, zeroreg, 0)));
}
DONE;
}")
;; Most CPUs don't like single string operations
;; Handle this case here to simplify previous expander.
(define_expand "strsethi"
[(set (mem:HI (match_operand:SI 0 "register_operand" ""))
(match_operand:HI 1 "register_operand" ""))
(parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
(clobber (reg:CC 17))])]
""
"
{
if (TARGET_SINGLE_STRINGOP || optimize_size)
{
emit_insn (gen_strsethi_1 (operands[0], operands[0], operands[1]));
DONE;
}
}")
(define_expand "strsetqi"
[(set (mem:QI (match_operand:SI 0 "register_operand" ""))
(match_operand:QI 1 "register_operand" ""))
(parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
(clobber (reg:CC 17))])]
""
"
{
if (TARGET_SINGLE_STRINGOP || optimize_size)
{
emit_insn (gen_strsetqi_1 (operands[0], operands[0], operands[1]));
DONE;
}
}")
(define_insn "strsethi_1"
[(set (mem:HI (match_operand:SI 1 "register_operand" "0"))
(match_operand:HI 2 "register_operand" "a"))
(set (match_operand:SI 0 "register_operand" "=D")
(plus:SI (match_dup 0)
(const_int 2)))
(use (reg:SI 19))]
"TARGET_SINGLE_STRINGOP || optimize_size"
"stosw"
[(set_attr "type" "str")
(set_attr "memory" "store")
(set_attr "length_prefix" "1")])
(define_insn "strsetqi_1"
[(set (mem:QI (match_operand:SI 1 "register_operand" "0"))
(match_operand:QI 2 "register_operand" "a"))
(set (match_operand:SI 0 "register_operand" "=D")
(plus:SI (match_dup 0)
(const_int 1)))
(use (reg:SI 19))]
"TARGET_SINGLE_STRINGOP || optimize_size"
"stosb"
[(set_attr "type" "str")
(set_attr "memory" "store")])
;; It might seem that operand 0 could use predicate register_operand.
;; But strength reduction might offset the MEM expression. So we let
;; reload put the address into %edi.
(define_insn "*clrstrsi_1"
[(set (mem:BLK (match_operand:SI 0 "address_operand" "D"))
(define_insn "rep_stossi"
[(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
(use (match_operand:SI 2 "register_operand" "a"))
(use (match_operand:SI 4 "register_operand" "1"))
(set (match_operand:SI 0 "register_operand" "=D")
(plus:SI (match_operand:SI 3 "address_operand" "0")
(ashift:SI (match_dup 4) (const_int 2))))
(set (mem:BLK (match_dup 3))
(const_int 0))
(use (match_operand:SI 1 "const_int_operand" "n"))
(use (match_operand:SI 2 "immediate_operand" "i"))
(use (match_operand:SI 3 "register_operand" "a"))
(use (reg:SI 19))
(clobber (match_scratch:SI 4 "=&c"))
(clobber (match_dup 0))]
(use (reg:SI 19))]
""
"*
{
rtx xops[2];
"rep\;stosl|rep stosd"
[(set_attr "type" "str")
(set_attr "length_prefix" "1")
(set_attr "memory" "store")])
if (GET_CODE (operands[1]) == CONST_INT)
{
unsigned int count = INTVAL (operands[1]) & 0xffffffff;
if (count & ~0x03)
{
xops[0] = GEN_INT (count / 4);
xops[1] = operands[4];
/* K6: stos takes 1 cycle, rep stos takes 8 + %ecx cycles.
80386: 4/5+5n (+2 for set of ecx)
80486: 5/7+5n (+1 for set of ecx)
*/
if (count / 4 < ((int) ix86_cpu < (int)PROCESSOR_PENTIUM ? 4 : 6))
{
do
output_asm_insn (\"{stosl|stosd}\", xops);
while ((count -= 4) > 3);
}
else
{
output_asm_insn (\"mov{l}\\t{%0, %1|%1, %0}\", xops);
output_asm_insn (\"{rep\;stosl|rep stosd}\", xops);
}
}
if (INTVAL (operands[1]) & 0x02)
output_asm_insn (\"stosw\", operands);
if (INTVAL (operands[1]) & 0x01)
output_asm_insn (\"stosb\", operands);
}
else
abort ();
RET;
}"
[(set_attr "type" "multi")])
(define_insn "rep_stosqi"
[(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
(use (match_operand:QI 2 "register_operand" "a"))
(use (match_operand:SI 4 "register_operand" "1"))
(set (match_operand:SI 0 "register_operand" "=D")
(plus:SI (match_operand:SI 3 "address_operand" "0") (match_dup 4)))
(set (mem:BLK (match_dup 3))
(const_int 0))
(use (reg:SI 19))]
""
"rep\;stosb|rep stosb"
[(set_attr "type" "str")
(set_attr "length_prefix" "1")
(set_attr "memory" "store")])
(define_expand "cmpstrsi"
[(set (match_operand:SI 0 "register_operand" "")
@@ -8099,7 +8168,10 @@
emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align));
}
else
emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
{
emit_insn (gen_cmpsi_1 (countreg, countreg));
emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
}
outlow = gen_lowpart (QImode, out);
emit_insn (gen_cmpintqi (outlow));
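The compare now emitted ahead of cmpstrsi_1 is what makes a zero count safe: repz cmpsb with %ecx = 0 executes no comparison and leaves the flags untouched, while comparing countreg against itself always sets ZF, so a zero-length compare falls through as "equal". Previously cmpstrsi_1 hid this compare inside its own output template (see the removed cmp{l}\t%2, %2 below), keeping it invisible to the scheduler and to the flag-tracking machinery.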
@@ -8145,8 +8217,8 @@
(clobber (match_dup 2))]
""
"repz{\;| }cmpsb"
[(set_attr "type" "multi")
(set_attr "length" "3")])
[(set_attr "type" "str")
(set_attr "length_prefix" "1")])
;; The same, but the count is not known to not be zero.
@@ -8158,15 +8230,15 @@
(mem:BLK (match_operand:SI 1 "address_operand" "D")))
(const_int 0)))
(use (match_operand:SI 3 "immediate_operand" "i"))
(use (reg:CC 17))
(use (reg:SI 19))
(clobber (match_dup 0))
(clobber (match_dup 1))
(clobber (match_dup 2))]
""
;; The initial compare sets the zero flag.
"cmp{l}\\t%2, %2\;repz{\;| }cmpsb"
[(set_attr "type" "multi")
(set_attr "length" "5")])
"repz{\;| }cmpsb"
[(set_attr "type" "str")
(set_attr "length_prefix" "1")])
(define_expand "strlensi"
[(set (match_operand:SI 0 "register_operand" "")
@@ -8184,7 +8256,8 @@
align = operands[3];
scratch1 = gen_reg_rtx (SImode);
if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1)
if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
&& !optimize_size)
{
/* Well it seems that some optimizer does not combine a call like
foo(strlen(bar), strlen(bar));
@@ -8236,8 +8309,8 @@
(clobber (reg:CC 17))]
""
"repnz{\;| }scasb"
[(set_attr "type" "multi")
(set_attr "length" "3")])
[(set_attr "type" "str")
(set_attr "length_prefix" "1")])
;; Conditional move instructions.