diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3aa83fec31a..af60aedf97a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2017-11-14  James Greenhalgh
+
+	* config/aarch64/aarch64-simd.md
+	(aarch64_simd_bsl<mode>_internal): Remove DImode.
+	(*aarch64_simd_bsl<mode>_alt): Likewise.
+	(aarch64_simd_bsldi_internal): New.
+	(aarch64_simd_bsldi_alt): Likewise.
+
 2017-11-13  Jan Hubicka
 
 	* tracer.c (better_p): Do not compare frequencies.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2ee09539981..9e7e557ee9e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2419,13 +2419,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_INT_EQUIV> 3)
 	))]
   "TARGET_SIMD"
@@ -2443,14 +2443,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:<V_INT_EQUIV> 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2459,6 +2459,100 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,w,0,r")
+	       (match_operand:DI 2 "register_operand" "w,0,w,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 2)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %3.8b, %2.8b
+  bit\\t%0.8b, %3.8b, %1.8b
+  bif\\t%0.8b, %2.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_INT_EQUIV> 1 "register_operand")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index e973905049e..825623e9892 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2017-11-14  James Greenhalgh
+
+	* gcc.target/aarch64/bsl-idiom.c: New.
+	* gcc.target/aarch64/copysign-bsl.c: New.
+
 2017-11-14  Tom de Vries
 
 	* c-c++-common/Wstringop-truncation.c: Require effective target alloca.
diff --git a/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
new file mode 100644
index 00000000000..8151387600f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
@@ -0,0 +1,88 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-rtl-combine --save-temps" } */
+
+/* Test that we don't generate BSL when in DImode with values in integer
+   registers, and do generate it where we have values in floating-point
+   registers.  This is useful, as it allows us to avoid register moves
+   in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+extern void abort (void);
+
+unsigned long long __attribute__ ((noinline))
+foo (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+unsigned long long __attribute__ ((noinline))
+foo2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ a;
+}
+
+#define force_simd(V1) asm volatile ("mov %d0, %1.d[0]"	\
+	   : "=w"(V1)						\
+	   : "w"(V1)						\
+	   : /* No clobbers */);
+
+unsigned long long __attribute__ ((noinline))
+bar (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ b;
+  force_simd (c);
+  return c;
+}
+
+unsigned long long __attribute__ ((noinline))
+bar2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ a;
+  force_simd (c);
+  return c;
+}
+
+int
+main (int argc, char** argv)
+{
+  unsigned long long a = 0x0123456789abcdefULL;
+  unsigned long long b = 0xfedcba9876543210ULL;
+  unsigned long long c = 0xaabbccddeeff7777ULL;
+  if (foo (a, b, c) != bar (a, b, c))
+    abort ();
+  if (foo2 (a, b, c) != bar2 (a, b, c))
+    abort ();
+  return 0;
+}
+
+/* 2 BSL, 6 FMOV (to floating-point registers), and 2 FMOV (to general
+   purpose registers) for the "bar" tests, which should still use BSL.  */
+/* { dg-final { scan-assembler-times "bsl\tv\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]" 6 } } */
+/* { dg-final { scan-assembler-times "fmov\tx\[0-9\]" 2 } } */
+
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+
+/* We always match the idiom during combine.  */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_internal" 2 "combine" } } */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_alt" 2 "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
new file mode 100644
index 00000000000..0ec7109c738
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "b\(sl|it|if\)\tv\[0-9\]" } } */
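
As background on the idiom the patterns and tests above target: ((a ^ b) & mask) ^ b selects bits from A where MASK is set and bits from B where it is clear, which is exactly the per-bit selection that the AArch64 BSL instruction computes, and copysign reduces to the same selection with a sign-bit mask. The plain C sketch below illustrates this; it is not part of the patch, and the helper names bit_select and copysign_via_select are invented purely for illustration.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bits of A where MASK is 1, bits of B where MASK is 0.  This is the
   xor/and/xor form matched by the bsl patterns above.  */
static uint64_t
bit_select (uint64_t a, uint64_t b, uint64_t mask)
{
  return ((a ^ b) & mask) ^ b;
}

/* copysign as a bit-select: take only the sign bit from SGN, everything
   else from MAG.  Assumes double and uint64_t have the same size.  */
static double
copysign_via_select (double mag, double sgn)
{
  uint64_t m, s, r;
  double out;
  memcpy (&m, &mag, sizeof m);
  memcpy (&s, &sgn, sizeof s);
  r = bit_select (s, m, UINT64_C (0x8000000000000000));
  memcpy (&out, &r, sizeof out);
  return out;
}

int
main (void)
{
  printf ("%016llx\n",
	  (unsigned long long) bit_select (0x0123456789abcdefULL,
					   0xfedcba9876543210ULL,
					   0xaabbccddeeff7777ULL));
  printf ("%f\n", copysign_via_select (2.5, -1.0));   /* Prints -2.500000.  */
  return 0;
}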