[Patch AArch64] Stop generating BSL for simple integer code
Turn aarch64_simd_bsldi_internal into an insn_and_split that knows to
split back to integer operations if the register allocation falls that
way.  Do this to avoid having to move between integer and Advanced SIMD
register files just for a single BSL.

---

gcc/

	* config/aarch64/aarch64-simd.md
	(aarch64_simd_bsl<mode>_internal): Remove DImode.
	(*aarch64_simd_bsl<mode>_alt): Likewise.
	(aarch64_simd_bsldi_internal): New.
	(aarch64_simd_bsldi_alt): Likewise.

gcc/testsuite/

	* gcc.target/aarch64/bsl-idiom.c: New.
	* gcc.target/aarch64/copysign-bsl.c: New.

From-SVN: r254727
parent fee234f10f
commit aea4b54ac5
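For context (not part of the commit): every BSL pattern touched below matches the same bitwise-select idiom.  A bit select (mask & x) | (~mask & y) can be rewritten as ((x ^ y) & mask) ^ y, using one AND and two EORs, which is the shape combine recognizes.  A minimal C check of that identity; bit_select is a hypothetical helper, not part of GCC:

#include <assert.h>

/* Illustrative only: for each bit, take X where the mask bit is set and
   Y where it is clear.  */
static unsigned long long
bit_select (unsigned long long mask, unsigned long long x, unsigned long long y)
{
  return (mask & x) | (~mask & y);
}

int
main (void)
{
  unsigned long long x = 0x0123456789abcdefULL;
  unsigned long long y = 0xfedcba9876543210ULL;
  unsigned long long m = 0xaabbccddeeff7777ULL;
  /* The xor/and/xor form matched by the patterns computes the same select.  */
  assert ((((x ^ y) & m) ^ y) == bit_select (m, x, y));
  return 0;
}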
gcc/ChangeLog:

@@ -1,3 +1,11 @@
+2017-11-14  James Greenhalgh  <james.greenhalgh@arm.com>
+
+	* config/aarch64/aarch64-simd.md
+	(aarch64_simd_bsl<mode>_internal): Remove DImode.
+	(*aarch64_simd_bsl<mode>_alt): Likewise.
+	(aarch64_simd_bsldi_internal): New.
+	(aarch64_simd_bsldi_alt): Likewise.
+
 2017-11-13  Jan Hubicka  <hubicka@ucw.cz>
 
 	* tracer.c (better_p): Do not compare frequencies.
gcc/config/aarch64/aarch64-simd.md:

@@ -2419,13 +2419,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_INT_EQUIV> 3)
 	))]
   "TARGET_SIMD"
@@ -2443,14 +2443,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w"))
+	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:<V_INT_EQUIV> 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2459,6 +2459,100 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,w,0,r")
+	       (match_operand:DI 2 "register_operand" "w,0,w,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 2)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %3.8b, %2.8b
+  bit\\t%0.8b, %3.8b, %1.8b
+  bif\\t%0.8b, %2.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
   (match_operand:<V_INT_EQUIV> 1 "register_operand")
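For illustration only (not part of the patch): when the destination of the DImode pattern ends up in a general purpose register, the splitter above replaces the single BSL with an EOR/AND/EOR sequence through a scratch register.  A C sketch of that sequence, with a hypothetical helper name:

#include <assert.h>

/* Mirrors the three insns the splitter emits:
   gen_xordi3 (scratch, operands[2], operands[3]),
   gen_anddi3 (scratch, scratch, operands[1]),
   gen_xordi3 (operands[0], scratch, operands[3]).  */
static unsigned long long
bsldi_split_sequence (unsigned long long op1, unsigned long long op2,
		      unsigned long long op3)
{
  unsigned long long scratch = op2 ^ op3;  /* gen_xordi3 (scratch, operands[2], operands[3]) */
  scratch &= op1;                          /* gen_anddi3 (scratch, scratch, operands[1]) */
  return scratch ^ op3;                    /* gen_xordi3 (operands[0], scratch, operands[3]) */
}

int
main (void)
{
  unsigned long long a = 0x0123456789abcdefULL;
  unsigned long long b = 0xfedcba9876543210ULL;
  unsigned long long c = 0xaabbccddeeff7777ULL;
  /* Same result as the BSL form: bits of a where c is set, bits of b elsewhere.  */
  assert (bsldi_split_sequence (c, a, b) == ((c & a) | (~c & b)));
  return 0;
}

The aarch64_simd_bsldi_alt splitter emits the same sequence, except that its final EOR uses operands[2] rather than operands[3], matching the alternative operand order it accepts.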
gcc/testsuite/ChangeLog:

@@ -1,3 +1,8 @@
+2017-11-14  James Greenhalgh  <james.greenhalgh@arm.com>
+
+	* gcc.target/aarch64/bsl-idiom.c: New.
+	* gcc.target/aarch64/copysign-bsl.c: New.
+
 2017-11-14  Tom de Vries  <tom@codesourcery.com>
 
 	* c-c++-common/Wstringop-truncation.c: Require effective target alloca.
gcc/testsuite/gcc.target/aarch64/bsl-idiom.c (new file, 88 lines):

/* { dg-do run } */
/* { dg-options "-O2 -fdump-rtl-combine --save-temps" } */

/* Test that we don't generate BSL when in DImode with values in integer
   registers, and do generate it where we have values in floating-point
   registers.  This is useful, as it allows us to avoid register moves
   in the general case.

   We want:
	eor	x0, x0, x1
	and	x0, x0, x2
	eor	x0, x0, x1
	ret

   Rather than:
	fmov	d2, x0
	fmov	d0, x2
	fmov	d1, x1
	bsl	v0.8b, v2.8b, v1.8b
	fmov	x0, d0
	ret  */

extern void abort (void);

unsigned long long __attribute__ ((noinline))
foo (unsigned long long a, unsigned long long b, unsigned long long c)
{
  return ((a ^ b) & c) ^ b;
}

unsigned long long __attribute__ ((noinline))
foo2 (unsigned long long a, unsigned long long b, unsigned long long c)
{
  return ((a ^ b) & c) ^ a;
}

#define force_simd(V1) asm volatile ("mov %d0, %1.d[0]"	\
	   : "=w"(V1)					\
	   : "w"(V1)					\
	   : /* No clobbers */);

unsigned long long __attribute__ ((noinline))
bar (unsigned long long a, unsigned long long b, unsigned long long c)
{
  force_simd (a);
  force_simd (b);
  force_simd (c);
  c = ((a ^ b) & c) ^ b;
  force_simd (c);
  return c;
}

unsigned long long __attribute__ ((noinline))
bar2 (unsigned long long a, unsigned long long b, unsigned long long c)
{
  force_simd (a);
  force_simd (b);
  force_simd (c);
  c = ((a ^ b) & c) ^ a;
  force_simd (c);
  return c;
}

int
main (int argc, char** argv)
{
  unsigned long long a = 0x0123456789abcdefULL;
  unsigned long long b = 0xfedcba9876543210ULL;
  unsigned long long c = 0xaabbccddeeff7777ULL;
  if (foo (a, b, c) != bar (a, b, c))
    abort ();
  if (foo2 (a, b, c) != bar2 (a, b, c))
    abort ();
  return 0;
}

/* 2 BSL, 6 FMOV (to floating-point registers), and 2 FMOV (to general
   purpose registers) for the "bar" tests, which should still use BSL.  */
/* { dg-final { scan-assembler-times "bsl\tv\[0-9\]" 2 } } */
/* { dg-final { scan-assembler-times "fmov\td\[0-9\]" 6 } } */
/* { dg-final { scan-assembler-times "fmov\tx\[0-9\]" 2 } } */

/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */

/* We always match the idiom during combine.  */
/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_internal" 2 "combine" } } */
/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_alt" 2 "combine" } } */
gcc/testsuite/gcc.target/aarch64/copysign-bsl.c (new file, 13 lines):

/* { dg-do compile } */
/* { dg-options "-O2" } */

/* Test that we can generate DImode BSL when we are using
   copysign.  */

double
foo (double a, double b)
{
  return __builtin_copysign (a, b);
}

/* { dg-final { scan-assembler "b\(sl|it|if\)\tv\[0-9\]" } } */
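For context (not part of the test): copysign is itself a bit select with the sign-bit mask, taking the sign bit from the second operand and everything else from the first, which is why it can be implemented by a DImode BSL.  A C-level sketch of that bit manipulation, using a hypothetical copysign_bits helper:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: select the sign bit from B and the remaining bits
   from A, exactly a bit select with mask 0x8000000000000000.  */
static double
copysign_bits (double a, double b)
{
  uint64_t ia, ib, mask = 0x8000000000000000ULL;
  memcpy (&ia, &a, sizeof ia);
  memcpy (&ib, &b, sizeof ib);
  ia = (ib & mask) | (ia & ~mask);
  memcpy (&a, &ia, sizeof a);
  return a;
}

int
main (void)
{
  assert (copysign_bits (1.5, -2.0) == __builtin_copysign (1.5, -2.0));
  assert (copysign_bits (-3.25, 4.0) == __builtin_copysign (-3.25, 4.0));
  return 0;
}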