i386: Add define_insn_and_split patterns for btrl [PR96938]

In the following testcase we only optimize f2 and f7 to btrl, although we
should optimize all of the functions that way.  The problem is the type
demotion/narrowing (which is performed solely during the generic folding and
not later): without it we see the AND performed in SImode and match it as
btrl, but with it, while the shifts are still performed in SImode, the
AND is already done in the QImode or HImode low part of the shift.

2021-01-13  Jakub Jelinek  <jakub@redhat.com>

	PR target/96938
	* config/i386/i386.md (*btr<mode>_1, *btr<mode>_2): New
	define_insn_and_split patterns.
	(splitter after *btr<mode>_2): New splitter.

	* gcc.target/i386/pr96938.c: New test.
This commit is contained in:
Jakub Jelinek 2021-01-13 10:15:13 +01:00
parent 6b70fa678b
commit 5d057bfeff
2 changed files with 131 additions and 0 deletions

View File

@ -12419,6 +12419,71 @@
(match_dup 3)))
(clobber (reg:CC FLAGS_REG))])])
;; Match a narrow-mode AND (mode iterator SWI12, defined outside this
;; excerpt) whose first input is the byte-0 subreg of an SImode rotate of
;; the constant -2 by a QImode register count, and whose second input is a
;; nonimmediate operand.  A rotate of -2 yields a mask with exactly one
;; zero bit, so the AND clears a single bit of operand 1.
;;
;; The pattern is only valid when TARGET_USE_BT holds and
;; ix86_pre_reload_split () is true.  It is emitted as "#" and is always
;; split ("&& 1") into the same AND performed in SImode, with clobber of
;; FLAGS_REG preserved.  The preparation code replaces operands 0 and 1 by
;; their SImode lowpart subregs; a memory operand 1 is first forced into a
;; register of the narrow mode.
(define_insn_and_split "*btr<mode>_1"
[(set (match_operand:SWI12 0 "register_operand")
(and:SWI12
(subreg:SWI12
(rotate:SI (const_int -2)
(match_operand:QI 2 "register_operand")) 0)
(match_operand:SWI12 1 "nonimmediate_operand")))
(clobber (reg:CC FLAGS_REG))]
"TARGET_USE_BT && ix86_pre_reload_split ()"
"#"
"&& 1"
;; Replacement: the single-bit-clear AND done entirely in SImode.
[(parallel
[(set (match_dup 0)
(and:SI (rotate:SI (const_int -2) (match_dup 2))
(match_dup 1)))
(clobber (reg:CC FLAGS_REG))])]
{
operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
if (MEM_P (operands[1]))
operands[1] = force_reg (<MODE>mode, operands[1]);
operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
})
;; Match a store of constant 0 into a one-bit zero_extract of operand 0
;; (SWI12 mode, register or memory) at the bit position given by the
;; zero-extended QImode register operand 1, i.e. clearing a single bit.
;;
;; The pattern is only valid when TARGET_USE_BT holds and
;; ix86_pre_reload_split () is true, and is emitted as "#".  The split
;; attached here fires only when operand 0 is a MEM; the register_operand
;; form appears to be covered by the define_split that follows.
;;
;; The memory case becomes three insns:
;;   1. load operand 0 into a fresh pseudo (operands[2]);
;;   2. AND, in SImode, the rotated -2 mask with the SImode lowpart of that
;;      pseudo (operands[4]), writing the SImode lowpart (operands[3]) of a
;;      second fresh pseudo (operands[5]);
;;   3. store operands[5] back to operand 0.
(define_insn_and_split "*btr<mode>_2"
[(set (zero_extract:HI
(match_operand:SWI12 0 "nonimmediate_operand")
(const_int 1)
(zero_extend:SI (match_operand:QI 1 "register_operand")))
(const_int 0))
(clobber (reg:CC FLAGS_REG))]
"TARGET_USE_BT && ix86_pre_reload_split ()"
"#"
"&& MEM_P (operands[0])"
;; Load, SImode single-bit-clear AND, store back.
[(set (match_dup 2) (match_dup 0))
(parallel
[(set (match_dup 3)
(and:SI (rotate:SI (const_int -2) (match_dup 1))
(match_dup 4)))
(clobber (reg:CC FLAGS_REG))])
(set (match_dup 0) (match_dup 5))]
{
operands[2] = gen_reg_rtx (<MODE>mode);
operands[5] = gen_reg_rtx (<MODE>mode);
operands[3] = lowpart_subreg (SImode, operands[5], <MODE>mode);
operands[4] = lowpart_subreg (SImode, operands[2], <MODE>mode);
})
;; Splitter for the single-bit-clear zero_extract shape when operand 0 is
;; a register: a store of constant 0 into a one-bit zero_extract of the
;; SWI12 register operand 0 at the position given by the zero-extended
;; QImode register operand 1.
;;
;; Guarded by TARGET_USE_BT && ix86_pre_reload_split ().  The replacement
;; is an SImode AND of the rotated -2 mask with operand 0, performed in
;; place: both the destination and the second AND input are the SImode
;; lowpart subreg of the original operand 0.  In the preparation code
;; operands[2] is derived from the original operands[0] before operands[0]
;; itself is overwritten, so the order of those two statements matters.
(define_split
[(set (zero_extract:HI
(match_operand:SWI12 0 "register_operand")
(const_int 1)
(zero_extend:SI (match_operand:QI 1 "register_operand")))
(const_int 0))
(clobber (reg:CC FLAGS_REG))]
"TARGET_USE_BT && ix86_pre_reload_split ()"
[(parallel
[(set (match_dup 0)
(and:SI (rotate:SI (const_int -2) (match_dup 1))
(match_dup 2)))
(clobber (reg:CC FLAGS_REG))])]
{
operands[2] = lowpart_subreg (SImode, operands[0], <MODE>mode);
operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
})
;; These instructions are never faster than the corresponding
;; and/ior/xor operations when using immediate operand, so with
;; 32-bit there's no point. But in 64-bit, we can't hold the

View File

@ -0,0 +1,66 @@
/* PR target/96938 */
/* { dg-do compile } */
/* { dg-options "-O2 -masm=att" } */
/* { dg-final { scan-assembler-times "\tbtrl\t" 10 } } */
/* unsigned char case: clear bit O of *F and OR in V shifted left by O,
   written as a single expression assigned directly back to *F.  */
void
f1 (unsigned char *f, int o, unsigned char v)
{
*f = (*f & ~(1 << o)) | (v << o);
}
/* unsigned char case: same operation as a clear-bit-then-OR, but the
   masked value is first stored in an int temporary T before V << O is
   ORed in and the result is written back to *F.  */
void
f2 (unsigned char *f, int o, unsigned char v)
{
int t = *f & ~(1 << o);
*f = t | (v << o);
}
/* unsigned char case: clear bit O of *F with a compound &= assignment.
   V is not used by the body.  */
void
f3 (unsigned char *f, int o, unsigned char v)
{
*f &= ~(1 << o);
}
/* unsigned char case: clear the bit selected by O & 31 in *F, then OR in
   V unshifted.  The shift count is explicitly masked with 31.  */
void
f4 (unsigned char *f, int o, unsigned char v)
{
*f = (*f & ~(1 << (o & 31))) | v;
}
/* unsigned char case: clear the bit selected by O & 31 in *F, then OR in
   V shifted left by the same masked count O & 31.  */
void
f5 (unsigned char *f, int o, unsigned char v)
{
*f = (*f & ~(1 << (o & 31))) | (v << (o & 31));
}
/* unsigned short case: clear bit O of *F and OR in V shifted left by O,
   written as a single expression assigned directly back to *F.  */
void
f6 (unsigned short *f, int o, unsigned short v)
{
*f = (*f & ~(1 << o)) | (v << o);
}
/* unsigned short case: the masked value is first stored in an int
   temporary T before V << O is ORed in and the result is written back
   to *F.  */
void
f7 (unsigned short *f, int o, unsigned short v)
{
int t = *f & ~(1 << o);
*f = t | (v << o);
}
/* unsigned short case: clear bit O of *F with a compound &= assignment.
   V is not used by the body.  */
void
f8 (unsigned short *f, int o, unsigned short v)
{
*f &= ~(1 << o);
}
/* unsigned short case: clear the bit selected by O & 31 in *F, then OR
   in V unshifted.  The shift count is explicitly masked with 31.  */
void
f9 (unsigned short *f, int o, unsigned short v)
{
*f = (*f & ~(1 << (o & 31))) | v;
}
/* unsigned short case: clear the bit selected by O & 31 in *F, then OR
   in V shifted left by the same masked count O & 31.  */
void
f10 (unsigned short *f, int o, unsigned short v)
{
*f = (*f & ~(1 << (o & 31))) | (v << (o & 31));
}