i386: Improve expansion of __builtin_parity
GCC currently hides the shift and xor reduction inside a backend specific UNSPEC PARITY, making it invisible to the RTL optimizers until very late during compilation. It is normally reasonable for the middle-end to maintain wider mode representations for as long as possible and split them later, but this only helps if the semantics are visible at the RTL-level (to combine and other passes). UNSPECs are black boxes, so in this case splitting early (during RTL expansion) is a better strategy. It turns out that the popcount instruction on modern x86_64 processors has (almost) made the integer parity flag in the x86 ALU completely obsolete, especially as POPCOUNT's integer semantics are a much better fit to RTL. The one remaining case where these transistors are useful is where __builtin_parity is immediately tested by a conditional branch, and therefore the result is wanted in a flags register rather than as an integer. This case is captured by two peephole2 optimizations in the attached patch. 2020-06-07 Roger Sayle <roger@nextmovesoftware.com> gcc/ChangeLog: * config/i386/i386.md (paritydi2, paritysi2): Expand reduction via shift and xor to an UNSPEC PARITY matching a parityhi2_cmp. (paritydi2_cmp, paritysi2_cmp): Delete these define_insn_and_split. (parityhi2, parityqi2): New expanders. (parityhi2_cmp): Implement set parity flag with xorb insn. (parityqi2_cmp): Implement set parity flag with testb insn. New peephole2s to use these insns (UNSPEC PARITY) when appropriate. gcc/testsuite/ChangeLog: * gcc.target/i386/parity-3.c: New test. * gcc.target/i386/parity-4.c: Likewise. * gcc.target/i386/parity-5.c: Likewise. * gcc.target/i386/parity-6.c: Likewise. * gcc.target/i386/parity-7.c: Likewise. * gcc.target/i386/parity-8.c: Likewise. * gcc.target/i386/parity-9.c: Likewise.
This commit is contained in:
parent
fced594b31
commit
f08995eefb
|
@ -14866,9 +14866,32 @@
|
|||
"! TARGET_POPCNT"
|
||||
{
|
||||
rtx scratch = gen_reg_rtx (QImode);
|
||||
rtx hipart1 = gen_reg_rtx (SImode);
|
||||
rtx lopart1 = gen_reg_rtx (SImode);
|
||||
rtx xor1 = gen_reg_rtx (SImode);
|
||||
rtx shift2 = gen_reg_rtx (SImode);
|
||||
rtx hipart2 = gen_reg_rtx (HImode);
|
||||
rtx lopart2 = gen_reg_rtx (HImode);
|
||||
rtx xor2 = gen_reg_rtx (HImode);
|
||||
|
||||
emit_insn (gen_paritydi2_cmp (NULL_RTX, NULL_RTX,
|
||||
NULL_RTX, operands[1]));
|
||||
if (TARGET_64BIT)
|
||||
{
|
||||
rtx shift1 = gen_reg_rtx (DImode);
|
||||
emit_insn (gen_lshrdi3 (shift1, operands[1], GEN_INT (32)));
|
||||
emit_move_insn (hipart1, gen_lowpart (SImode, shift1));
|
||||
}
|
||||
else
|
||||
emit_move_insn (hipart1, gen_highpart (SImode, operands[1]));
|
||||
|
||||
emit_move_insn (lopart1, gen_lowpart (SImode, operands[1]));
|
||||
emit_insn (gen_xorsi3 (xor1, hipart1, lopart1));
|
||||
|
||||
emit_insn (gen_lshrsi3 (shift2, xor1, GEN_INT (16)));
|
||||
emit_move_insn (hipart2, gen_lowpart (HImode, shift2));
|
||||
emit_move_insn (lopart2, gen_lowpart (HImode, xor1));
|
||||
emit_insn (gen_xorhi3 (xor2, hipart2, lopart2));
|
||||
|
||||
emit_insn (gen_parityhi2_cmp (xor2));
|
||||
|
||||
ix86_expand_setcc (scratch, ORDERED,
|
||||
gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
|
||||
|
@ -14891,8 +14914,17 @@
|
|||
"! TARGET_POPCNT"
|
||||
{
|
||||
rtx scratch = gen_reg_rtx (QImode);
|
||||
rtx shift = gen_reg_rtx (SImode);
|
||||
rtx hipart = gen_reg_rtx (HImode);
|
||||
rtx lopart = gen_reg_rtx (HImode);
|
||||
rtx tmp = gen_reg_rtx (HImode);
|
||||
|
||||
emit_insn (gen_paritysi2_cmp (NULL_RTX, NULL_RTX, operands[1]));
|
||||
emit_insn (gen_lshrsi3 (shift, operands[1], GEN_INT (16)));
|
||||
emit_move_insn (hipart, gen_lowpart (HImode, shift));
|
||||
emit_move_insn (lopart, gen_lowpart (HImode, operands[1]));
|
||||
emit_insn (gen_xorhi3 (tmp, hipart, lopart));
|
||||
|
||||
emit_insn (gen_parityhi2_cmp (tmp));
|
||||
|
||||
ix86_expand_setcc (scratch, ORDERED,
|
||||
gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
|
||||
|
@ -14901,70 +14933,128 @@
|
|||
DONE;
|
||||
})
|
||||
|
||||
(define_insn_and_split "paritydi2_cmp"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_operand:DI 3 "register_operand" "0")]
|
||||
UNSPEC_PARITY))
|
||||
(clobber (match_scratch:DI 0 "=r"))
|
||||
(clobber (match_scratch:SI 1 "=&r"))
|
||||
(clobber (match_scratch:HI 2 "=Q"))]
|
||||
(define_expand "parityhi2"
|
||||
[(set (match_operand:HI 0 "register_operand")
|
||||
(parity:HI (match_operand:HI 1 "register_operand")))]
|
||||
"! TARGET_POPCNT"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(parallel
|
||||
[(set (match_dup 1)
|
||||
(xor:SI (match_dup 1) (match_dup 4)))
|
||||
(clobber (reg:CC FLAGS_REG))])
|
||||
(parallel
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_dup 1)] UNSPEC_PARITY))
|
||||
(clobber (match_dup 1))
|
||||
(clobber (match_dup 2))])]
|
||||
{
|
||||
operands[4] = gen_lowpart (SImode, operands[3]);
|
||||
rtx scratch = gen_reg_rtx (QImode);
|
||||
|
||||
if (TARGET_64BIT)
|
||||
{
|
||||
emit_move_insn (operands[1], gen_lowpart (SImode, operands[3]));
|
||||
emit_insn (gen_lshrdi3 (operands[3], operands[3], GEN_INT (32)));
|
||||
}
|
||||
else
|
||||
operands[1] = gen_highpart (SImode, operands[3]);
|
||||
emit_insn (gen_parityhi2_cmp (operands[1]));
|
||||
|
||||
ix86_expand_setcc (scratch, ORDERED,
|
||||
gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
|
||||
|
||||
emit_insn (gen_zero_extendqihi2 (operands[0], scratch));
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn_and_split "paritysi2_cmp"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_operand:SI 2 "register_operand" "0")]
|
||||
UNSPEC_PARITY))
|
||||
(clobber (match_scratch:SI 0 "=r"))
|
||||
(clobber (match_scratch:HI 1 "=&Q"))]
|
||||
(define_expand "parityqi2"
|
||||
[(set (match_operand:QI 0 "register_operand")
|
||||
(parity:QI (match_operand:QI 1 "register_operand")))]
|
||||
"! TARGET_POPCNT"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[(parallel
|
||||
[(set (match_dup 1)
|
||||
(xor:HI (match_dup 1) (match_dup 3)))
|
||||
(clobber (reg:CC FLAGS_REG))])
|
||||
(parallel
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_dup 1)] UNSPEC_PARITY))
|
||||
(clobber (match_dup 1))])]
|
||||
{
|
||||
operands[3] = gen_lowpart (HImode, operands[2]);
|
||||
emit_insn (gen_parityqi2_cmp (operands[1]));
|
||||
|
||||
emit_move_insn (operands[1], gen_lowpart (HImode, operands[2]));
|
||||
emit_insn (gen_lshrsi3 (operands[2], operands[2], GEN_INT (16)));
|
||||
ix86_expand_setcc (operands[0], ORDERED,
|
||||
gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn "*parityhi2_cmp"
|
||||
(define_insn "parityhi2_cmp"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_operand:HI 1 "register_operand" "0")]
|
||||
(unspec:CC [(match_operand:HI 0 "register_operand" "+Q")]
|
||||
UNSPEC_PARITY))
|
||||
(clobber (match_scratch:HI 0 "=Q"))]
|
||||
"! TARGET_POPCNT"
|
||||
(clobber (match_dup 0))]
|
||||
""
|
||||
"xor{b}\t{%h0, %b0|%b0, %h0}"
|
||||
[(set_attr "length" "2")
|
||||
(set_attr "mode" "HI")])
|
||||
(set_attr "mode" "QI")])
|
||||
|
||||
;; Set the flags register from UNSPEC_PARITY of a QImode register.
;; The pattern has no enabling condition (empty string) and emits a
;; byte-sized test of operand 0 against itself; the operand is only
;; read, so no clobber is needed.
(define_insn "parityqi2_cmp"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_operand:QI 0 "register_operand" "q")]
|
||||
UNSPEC_PARITY))]
|
||||
""
|
||||
"test{b}\t%0, %0"
|
||||
[(set_attr "mode" "QI")])
|
||||
|
||||
;; Replace zero_extend:HI followed by parityhi2_cmp with parityqi2_cmp
;; Matches: operand 0 (HI) = zero_extend of operand 1 (QI), immediately
;; followed by a flags set from UNSPEC_PARITY of operand 0 that also
;; clobbers operand 0.  The replacement computes the flags directly from
;; the QImode operand 1.  Presumably this is valid because the
;; zero-extended upper byte contributes no set bits to the parity, and
;; operand 0 is clobbered by the matched sequence anyway.
|
||||
(define_peephole2
|
||||
[(set (match_operand:HI 0 "register_operand")
|
||||
(zero_extend:HI (match_operand:QI 1 "register_operand")))
|
||||
(parallel [(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_dup 0)] UNSPEC_PARITY))
|
||||
(clobber (match_dup 0))])]
|
||||
""
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_dup 1)] UNSPEC_PARITY))])
|
||||
|
||||
;; Eliminate QImode popcount&1 using parity flag
;; Matched sequence (four insns):
;;   0: operand 0 (SI) = zero_extend of operand 1 (QI)
;;   1: operand 2 (SI) = popcount of operand 0, clobbering the flags
;;   2: flags (CCZ) = compare ((operand 3 (QI) & 1), 0)
;;   3: conditional jump to label operand 5 on comparison operand 4
;; The condition requires operands 2 and 3 to be the same hard register
;; (so the tested bit is bit 0 of the popcount result), requires the
;; peep2_reg_dead_p checks at index 3 to hold for operands 0 and 2, and
;; requires the flags register to pass peep2_regno_dead_p at index 4.
;; The replacement sets the flags from UNSPEC_PARITY of operand 1 and
;; re-emits the jump with the comparison code rewritten: EQ becomes
;; UNORDERED, anything else becomes ORDERED.  The rewrite is applied to a
;; shallow copy of operand 4 rather than to the matched rtx itself.
;; NOTE(review): UNORDERED/ORDERED presumably select the parity-flag
;; branch conditions in this backend -- confirm against the jump patterns.
|
||||
(define_peephole2
|
||||
[(set (match_operand:SI 0 "register_operand")
|
||||
(zero_extend:SI (match_operand:QI 1 "register_operand")))
|
||||
(parallel [(set (match_operand:SI 2 "register_operand")
|
||||
(popcount:SI (match_dup 0)))
|
||||
(clobber (reg:CC FLAGS_REG))])
|
||||
(set (reg:CCZ FLAGS_REG)
|
||||
(compare:CCZ (and:QI (match_operand:QI 3 "register_operand")
|
||||
(const_int 1))
|
||||
(const_int 0)))
|
||||
(set (pc) (if_then_else (match_operator 4 "bt_comparison_operator"
|
||||
[(reg:CCZ FLAGS_REG)
|
||||
(const_int 0)])
|
||||
(label_ref (match_operand 5))
|
||||
(pc)))]
|
||||
"REGNO (operands[2]) == REGNO (operands[3])
|
||||
&& peep2_reg_dead_p (3, operands[0])
|
||||
&& peep2_reg_dead_p (3, operands[2])
|
||||
&& peep2_regno_dead_p (4, FLAGS_REG)"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_dup 1)] UNSPEC_PARITY))
|
||||
(set (pc) (if_then_else (match_op_dup 4 [(reg:CC FLAGS_REG)
|
||||
(const_int 0)])
|
||||
(label_ref (match_dup 5))
|
||||
(pc)))]
|
||||
{
|
||||
operands[4] = shallow_copy_rtx (operands[4]);
|
||||
PUT_CODE (operands[4], GET_CODE (operands[4]) == EQ ? UNORDERED : ORDERED);
|
||||
})
|
||||
|
||||
;; Eliminate HImode popcount&1 using parity flag
;; Requests an HImode scratch (operand 0, constraint "Q") and matches:
;;   0: operand 1 (HI) = popcount of operand 2 (HI, register or memory),
;;      clobbering the flags
;;   1: operand 3 = zero_extend of operand 1
;;   2: flags (CCZ) = compare ((operand 4 (QI) & 1), 0)
;;   3: conditional jump to label operand 6 on comparison operand 5
;; The condition requires operands 3 and 4 to be the same hard register,
;; requires the peep2_reg_dead_p checks at index 3 to hold for operands 1
;; and 3, and requires the flags register to pass peep2_regno_dead_p at
;; index 4.
;; The replacement copies operand 2 into the scratch, sets the flags from
;; UNSPEC_PARITY of the scratch (which that insn also clobbers), and
;; re-emits the jump with the comparison code rewritten: EQ becomes
;; UNORDERED, anything else becomes ORDERED.  The rewrite is applied to a
;; shallow copy of operand 5 rather than to the matched rtx itself.
;; NOTE(review): UNORDERED/ORDERED presumably select the parity-flag
;; branch conditions in this backend -- confirm against the jump patterns.
|
||||
(define_peephole2
|
||||
[(match_scratch:HI 0 "Q")
|
||||
(parallel [(set (match_operand:HI 1 "register_operand")
|
||||
(popcount:HI
|
||||
(match_operand:HI 2 "nonimmediate_operand")))
|
||||
(clobber (reg:CC FLAGS_REG))])
|
||||
(set (match_operand 3 "register_operand")
|
||||
(zero_extend (match_dup 1)))
|
||||
(set (reg:CCZ FLAGS_REG)
|
||||
(compare:CCZ (and:QI (match_operand:QI 4 "register_operand")
|
||||
(const_int 1))
|
||||
(const_int 0)))
|
||||
(set (pc) (if_then_else (match_operator 5 "bt_comparison_operator"
|
||||
[(reg:CCZ FLAGS_REG)
|
||||
(const_int 0)])
|
||||
(label_ref (match_operand 6))
|
||||
(pc)))]
|
||||
"REGNO (operands[3]) == REGNO (operands[4])
|
||||
&& peep2_reg_dead_p (3, operands[1])
|
||||
&& peep2_reg_dead_p (3, operands[3])
|
||||
&& peep2_regno_dead_p (4, FLAGS_REG)"
|
||||
[(set (match_dup 0) (match_dup 2))
|
||||
(parallel [(set (reg:CC FLAGS_REG)
|
||||
(unspec:CC [(match_dup 0)] UNSPEC_PARITY))
|
||||
(clobber (match_dup 0))])
|
||||
(set (pc) (if_then_else (match_op_dup 5 [(reg:CC FLAGS_REG)
|
||||
(const_int 0)])
|
||||
(label_ref (match_dup 6))
|
||||
(pc)))]
|
||||
{
|
||||
operands[5] = shallow_copy_rtx (operands[5]);
|
||||
PUT_CODE (operands[5], GET_CODE (operands[5]) == EQ ? UNORDERED : ORDERED);
|
||||
})
|
||||
|
||||
|
||||
;; Thread-local storage patterns for ELF.
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */
|
||||
/* { dg-final { scan-assembler "setp" } } */
|
||||
/* { dg-final { scan-assembler "jnp" } } */
|
||||
/* { dg-final { scan-assembler "jp" } } */
|
||||
|
||||
void dummy(void);
|
||||
|
||||
/* Return the logical negation of __builtin_parity (x), i.e. the value
   is consumed as an integer rather than by a branch.  This file is
   compiled with -mno-popcnt and its dg-final directives scan for
   setp, jnp and jp.  */
int foo(unsigned int x)
|
||||
{
|
||||
return !__builtin_parity(x);
|
||||
}
|
||||
|
||||
/* Call dummy () when __builtin_parity (x) is nonzero, so the parity
   result feeds a conditional branch directly.  */
void bar(unsigned int x)
|
||||
{
|
||||
if (__builtin_parity(x))
|
||||
dummy();
|
||||
}
|
||||
|
||||
/* Call dummy () when __builtin_parity (x) is zero; the inverted-sense
   counterpart of the branch test in bar.  */
void baz(unsigned int x)
|
||||
{
|
||||
if (!__builtin_parity(x))
|
||||
dummy();
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */
|
||||
/* { dg-final { scan-assembler "setp" } } */
|
||||
/* { dg-final { scan-assembler "jnp" } } */
|
||||
/* { dg-final { scan-assembler "jp" } } */
|
||||
|
||||
void dummy(void);
|
||||
|
||||
/* Return the logical negation of __builtin_parityll (x) for a
   64-bit-wide argument type.  This file is compiled with -mno-popcnt
   and its dg-final directives scan for setp, jnp and jp.  */
int foo(unsigned long long x)
|
||||
{
|
||||
return !__builtin_parityll(x);
|
||||
}
|
||||
|
||||
/* Call dummy () when __builtin_parityll (x) is nonzero, so the parity
   result feeds a conditional branch directly.  */
void bar(unsigned long long x)
|
||||
{
|
||||
if (__builtin_parityll(x))
|
||||
dummy();
|
||||
}
|
||||
|
||||
/* Call dummy () when __builtin_parityll (x) is zero; the inverted-sense
   counterpart of the branch test in bar.  */
void baz(unsigned long long x)
|
||||
{
|
||||
if (!__builtin_parityll(x))
|
||||
dummy();
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=core-avx2" } */
|
||||
/* { dg-final { scan-assembler "popcnt" } } */
|
||||
/* { dg-final { scan-assembler "and" } } */
|
||||
|
||||
/* Return __builtin_parity (x) as an integer.  This file does not pass
   -mno-popcnt, and its dg-final directives expect both "popcnt" and
   "and" to appear in the generated assembly.  */
int foo(unsigned int x)
|
||||
{
|
||||
return __builtin_parity(x);
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=core-avx2" } */
|
||||
/* { dg-final { scan-assembler "popcnt" } } */
|
||||
/* { dg-final { scan-assembler "and" } } */
|
||||
|
||||
/* Return __builtin_parityll (x) as an integer.  This file does not
   pass -mno-popcnt, and its dg-final directives expect both "popcnt"
   and "and" to appear in the generated assembly.  */
int foo(unsigned long long x)
|
||||
{
|
||||
return __builtin_parityll(x);
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */
|
||||
/* { dg-additional-options "-mregparm=1" { target ia32 } } */
|
||||
/* { dg-final { scan-assembler-times "test" 2 } } */
|
||||
/* { dg-final { scan-assembler-not "shr" } } */
|
||||
|
||||
/* Return __builtin_parity of an unsigned char argument (implicitly
   converted to the builtin's parameter type).  This file is compiled
   with -mno-popcnt; its dg-final directives expect "test" exactly
   twice across the file and no "shr".  */
int foo(unsigned char x)
|
||||
{
|
||||
return __builtin_parity(x);
|
||||
}
|
||||
|
||||
/* Same as foo but through __builtin_parityll, so the unsigned char
   argument is implicitly converted to the wider builtin parameter
   type.  The file's dg-final directives still expect no "shr".  */
int bar(unsigned char x)
|
||||
{
|
||||
return __builtin_parityll(x);
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */
|
||||
/* { dg-final { scan-assembler-not "shr" } } */
|
||||
|
||||
/* Return __builtin_parity of an unsigned short argument (implicitly
   converted to the builtin's parameter type).  This file is compiled
   with -mno-popcnt and its dg-final directive expects no "shr".  */
int foo(unsigned short x)
|
||||
{
|
||||
return __builtin_parity(x);
|
||||
}
|
||||
|
||||
/* Same as foo but through __builtin_parityll, so the unsigned short
   argument is implicitly converted to the wider builtin parameter
   type.  The file's dg-final directive still expects no "shr".  */
int bar(unsigned short x)
|
||||
{
|
||||
return __builtin_parityll(x);
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -march=core-avx2" } */
|
||||
/* { dg-additional-options "-mregparm=1" { target ia32 } } */
|
||||
/* { dg-final { scan-assembler-not "popcnt" } } */
|
||||
/* { dg-final { scan-assembler-not "shr" } } */
|
||||
/* { dg-final { scan-assembler-times "jp" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "jnp" 2 } } */
|
||||
|
||||
void dummy(void);
|
||||
|
||||
/* Call dummy () when the parity of an unsigned char is nonzero.  This
   file is compiled without -mno-popcnt, yet its dg-final directives
   expect no "popcnt" and no "shr", with "jp" and "jnp" each appearing
   twice across the file.  Presumably this exercises the QImode
   popcount&1 peephole -- confirm against the machine description.  */
void pos8(unsigned char x)
|
||||
{
|
||||
if (__builtin_parity(x))
|
||||
dummy();
|
||||
}
|
||||
|
||||
/* Call dummy () when the parity of an unsigned char is zero; the
   inverted-sense counterpart of pos8.  */
void neg8(unsigned char x)
|
||||
{
|
||||
if (!__builtin_parity(x))
|
||||
dummy();
|
||||
}
|
||||
|
||||
/* Call dummy () when the parity of an unsigned short is nonzero.  The
   file's dg-final directives expect no "popcnt" and no "shr" in the
   output.  Presumably this exercises the HImode popcount&1 peephole --
   confirm against the machine description.  */
void pos16(unsigned short x)
|
||||
{
|
||||
if (__builtin_parity(x))
|
||||
dummy();
|
||||
}
|
||||
|
||||
/* Call dummy () when the parity of an unsigned short is zero; the
   inverted-sense counterpart of pos16.  */
void neg16(unsigned short x)
|
||||
{
|
||||
if (!__builtin_parity(x))
|
||||
dummy();
|
||||
}
|
Loading…
Reference in New Issue