AArch64: Lower shift intrinsics to GIMPLE when possible.

This lowers the shift intrinsics to GIMPLE when the C interpretation of the
shift operation matches that of AArch64.

In C, shifting right by the bit size of the type is undefined, but the
behavior is well defined on AArch64.  Additionally, negative left-shift
amounts are undefined in C, but the register variants of the instruction
(SSHL, USHL) define them as right shifts.
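
As an illustration (this snippet is not part of the patch, and the function
names are made up):

#include <arm_neon.h>
#include <stdint.h>

/* Undefined behavior in C: the shift count equals the bit size.  */
int64_t c_shift (int64_t a)
{
  return a >> 64;
}

/* Well defined on AArch64: SSHR accepts a shift count of up to 64.  */
int64x2_t neon_shift (int64x2_t a)
{
  return vshrq_n_s64 (a, 64);
}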

Since we have a right shift by immediate, I rewrite those cases into right shifts.

So:

int64x1_t foo3 (int64x1_t a)
{
  return vshl_s64 (a, vdup_n_s64(-6));
}

produces:

foo3:
        sshr    d0, d0, 6
        ret

instead of:

foo3:
        mov     x0, -6
        fmov    d1, x0
        sshl    d0, d0, d1
        ret

This behavior isn't specifically mentioned for a left shift by immediate, but
I believe that is only the case because we have a right shift by immediate
but no right shift by register.  As such, I do the same for the left shift by
immediate.
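
For example (illustrative only, not taken from the patch):

#include <arm_neon.h>

int32x2_t foo (int32x2_t a)
{
  return vshl_n_s32 (a, 3);
}

This can now be folded to a plain GIMPLE LSHIFT_EXPR, because the shift
amount is a constant that is non-negative and smaller than the element
precision.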

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.c
	(aarch64_general_gimple_fold_builtin): Add ashl, sshl, ushl, ashr,
	ashr_simd, lshr, lshr_simd.
	* config/aarch64/aarch64-simd-builtins.def (lshr): Use USHIFTIMM.
	* config/aarch64/arm_neon.h (vshr_n_u8, vshr_n_u16, vshr_n_u32,
	vshrq_n_u8, vshrq_n_u16, vshrq_n_u32, vshrq_n_u64): Fix type hack.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/advsimd-intrinsics/vshl-opt-1.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/vshl-opt-2.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/vshl-opt-3.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/vshl-opt-4.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/vshl-opt-5.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/vshl-opt-6.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/vshl-opt-7.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/vshl-opt-8.c: New test.
	* gcc.target/aarch64/signbit-2.c: New test.

Tamar Christina, 2021-11-04 17:36:08 +00:00
commit 1b4a63593b, parent d70720c238
12 changed files with 180 additions and 8 deletions

gcc/config/aarch64/aarch64-builtins.c

@@ -2719,6 +2719,54 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
                                                1, args[0]);
         gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
         break;
+      BUILTIN_VSDQ_I_DI (BINOP, ashl, 3, NONE)
+        if (TREE_CODE (args[1]) == INTEGER_CST
+            && wi::ltu_p (wi::to_wide (args[1]), element_precision (args[0])))
+          new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
+                                          LSHIFT_EXPR, args[0], args[1]);
+        break;
+      BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, NONE)
+      BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, NONE)
+        {
+          tree cst = args[1];
+          tree ctype = TREE_TYPE (cst);
+          /* Left shifts can be both scalar or vector, e.g. uint64x1_t is
+             treated as a scalar type not a vector one.  */
+          if ((cst = uniform_integer_cst_p (cst)) != NULL_TREE)
+            {
+              wide_int wcst = wi::to_wide (cst);
+              tree unit_ty = TREE_TYPE (cst);
+              wide_int abs_cst = wi::abs (wcst);
+              if (wi::geu_p (abs_cst, element_precision (args[0])))
+                break;
+              if (wi::neg_p (wcst, TYPE_SIGN (ctype)))
+                {
+                  tree final_cst;
+                  final_cst = wide_int_to_tree (unit_ty, abs_cst);
+                  if (TREE_CODE (cst) != INTEGER_CST)
+                    final_cst = build_uniform_cst (ctype, final_cst);
+                  new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
+                                                  RSHIFT_EXPR, args[0],
+                                                  final_cst);
+                }
+              else
+                new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
+                                                LSHIFT_EXPR, args[0], args[1]);
+            }
+        }
+        break;
+      BUILTIN_VDQ_I (SHIFTIMM, ashr, 3, NONE)
+      VAR1 (SHIFTIMM, ashr_simd, 0, NONE, di)
+      BUILTIN_VDQ_I (USHIFTIMM, lshr, 3, NONE)
+      VAR1 (USHIFTIMM, lshr_simd, 0, NONE, di)
+        if (TREE_CODE (args[1]) == INTEGER_CST
+            && wi::ltu_p (wi::to_wide (args[1]), element_precision (args[0])))
+          new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
+                                          RSHIFT_EXPR, args[0], args[1]);
+        break;
       BUILTIN_GPF (BINOP, fmulx, 0, ALL)
         {
           gcc_assert (nargs == 2);

gcc/config/aarch64/aarch64-simd-builtins.def

@@ -436,7 +436,7 @@
   BUILTIN_VDQ_I (SHIFTIMM, ashr, 3, NONE)
   VAR1 (SHIFTIMM, ashr_simd, 0, NONE, di)
-  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3, NONE)
+  BUILTIN_VDQ_I (USHIFTIMM, lshr, 3, NONE)
   VAR1 (USHIFTIMM, lshr_simd, 0, NONE, di)
   /* Implemented by aarch64_<sur>shr_n<mode>.  */
   BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0, NONE)

gcc/config/aarch64/arm_neon.h

@@ -24128,21 +24128,21 @@ __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_u8 (uint8x8_t __a, const int __b)
 {
-  return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b);
+  return __builtin_aarch64_lshrv8qi_uus (__a, __b);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_u16 (uint16x4_t __a, const int __b)
 {
-  return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b);
+  return __builtin_aarch64_lshrv4hi_uus (__a, __b);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshr_n_u32 (uint32x2_t __a, const int __b)
 {
-  return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b);
+  return __builtin_aarch64_lshrv2si_uus (__a, __b);
 }
 
 __extension__ extern __inline uint64x1_t
@@ -24184,28 +24184,28 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_u8 (uint8x16_t __a, const int __b)
 {
-  return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b);
+  return __builtin_aarch64_lshrv16qi_uus (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_u16 (uint16x8_t __a, const int __b)
 {
-  return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b);
+  return __builtin_aarch64_lshrv8hi_uus (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_u32 (uint32x4_t __a, const int __b)
 {
-  return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b);
+  return __builtin_aarch64_lshrv4si_uus (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vshrq_n_u64 (uint64x2_t __a, const int __b)
 {
-  return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b);
+  return __builtin_aarch64_lshrv2di_uus (__a, __b);
 }
 
 __extension__ extern __inline int64_t

gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl-opt-1.c

@@ -0,0 +1,11 @@
/* { dg-do assemble { target aarch64*-*-* } } */
/* { dg-additional-options "--save-temps" } */
#include <arm_neon.h>
uint8x8_t foo (uint8x8_t a)
{
return vshr_n_u8 (a, 2);
}
/* { dg-final { scan-assembler-times {\tushr\t.+, 2} 1 } } */

gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl-opt-2.c

@@ -0,0 +1,11 @@
/* { dg-do assemble { target aarch64*-*-* } } */
/* { dg-additional-options "--save-temps" } */
#include <arm_neon.h>
int16x8_t foo (int16x8_t a)
{
return vshrq_n_s16 (a, 8);
}
/* { dg-final { scan-assembler-times {\tsshr\t.+, 8} 1 } } */

gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl-opt-3.c

@@ -0,0 +1,11 @@
/* { dg-do assemble { target aarch64*-*-* } } */
/* { dg-additional-options "--save-temps" } */
#include <arm_neon.h>
int16x8_t foo (int16x8_t a)
{
return vshrq_n_s16 (a, 16);
}
/* { dg-final { scan-assembler-times {\tsshr\t.+, 16} 1 } } */

gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl-opt-4.c

@@ -0,0 +1,11 @@
/* { dg-do assemble { target aarch64*-*-* } } */
/* { dg-additional-options "--save-temps" } */
#include <arm_neon.h>
int64x1_t foo (int64x1_t a)
{
return vshl_s64 (a, vdup_n_s64(80));
}
/* { dg-final { scan-assembler-times {\tsshl\t.+, d[0-9]+} 1 } } */

gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl-opt-5.c

@@ -0,0 +1,12 @@
/* { dg-do assemble { target aarch64*-*-* } } */
/* { dg-additional-options "--save-temps" } */
/* { dg-skip-if "no optimizations" { *-*-* } { "-O0" } { "" } } */
#include <arm_neon.h>
int64x1_t foo (int64x1_t a)
{
return vshl_s64 (a, vdup_n_s64(-6));
}
/* { dg-final { scan-assembler-times {\tsshr\t.+, 6} 1 } } */

gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl-opt-6.c

@@ -0,0 +1,10 @@
/* { dg-do assemble { target aarch64*-*-* } } */
/* { dg-additional-options "--save-temps" } */
#include <arm_neon.h>
int32x4_t foo (int32x4_t x) {
return vshlq_s32(x, vdupq_n_s32(256));
}
/* { dg-final { scan-assembler-times {\tsshl\t.+, v[0-9].4s} 1 } } */

gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl-opt-7.c

@@ -0,0 +1,12 @@
/* { dg-do assemble { target aarch64*-*-* } } */
/* { dg-additional-options "--save-temps" } */
/* { dg-skip-if "no optimizations" { *-*-* } { "-O0" } { "" } } */
#include <arm_neon.h>
int32x4_t foo (int32x4_t x) {
return vshlq_s32(vdupq_n_s32(1), vdupq_n_s32(10));
}
/* { dg-final { scan-assembler-not {\tsshl\t} } } */
/* { dg-final { scan-assembler-times {\tmovi\t} 1 } } */

gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vshl-opt-8.c

@@ -0,0 +1,10 @@
/* { dg-do assemble { target aarch64*-*-* } } */
/* { dg-additional-options "--save-temps" } */
#include <arm_neon.h>
int32x4_t foo (int32x4_t x) {
return vshlq_s32(x, vdupq_n_s32(-64));
}
/* { dg-final { scan-assembler-times {\tsshl\t.+, v[0-9]+.4s} 1 } } */

gcc/testsuite/gcc.target/aarch64/signbit-2.c

@@ -0,0 +1,36 @@
/* { dg-do assemble } */
/* { dg-options "-O1 --save-temps" } */
#include <arm_neon.h>
int32x2_t foo1 (int32x2_t a)
{
return vshr_n_s32 (vneg_s32 (a), 31);
}
int32x4_t foo2 (int32x4_t a)
{
return vshrq_n_s32 (vnegq_s32 (a), 31);
}
int16x8_t foo3 (int16x8_t a)
{
return vshrq_n_s16 (vnegq_s16 (a), 15);
}
int16x4_t foo4 (int16x4_t a)
{
return vshr_n_s16 (vneg_s16 (a), 15);
}
int8x16_t foo5 (int8x16_t a)
{
return vshrq_n_s8 (vnegq_s8 (a), 7);
}
int8x8_t foo6 (int8x8_t a)
{
return vshr_n_s8 (vneg_s8 (a), 7);
}
/* { dg-final { scan-assembler-times {\tcmgt\t} 6 } } */