diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 9b6d59963a3..fdfea33cf73 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18243,6 +18243,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index) return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index); } +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const int __index) +{ + return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b, + const int __index) +{ + return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index) +{ + return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) +{ + return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index); +} + #pragma GCC pop_options #endif diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 865de6556fd..c29ae3a265b 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -342,6 +342,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi) VAR2 (UTERNOP, udot, v8qi, v16qi) VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi) VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi) +VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi) +VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi) VAR1 (USTERNOP, usdot, v8qi) VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) diff 
--git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index e06c8245672..4a8987b49d5 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2866,20 +2866,49 @@ }) -;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "neon_<sup>dot<vsi2qi>" +;; These map to the auto-vectorizer Dot Product optab. +;; The auto-vectorizer expects a dot product builtin that also does an +;; accumulation into the provided register. +;; Given the following pattern +;; +;; for (i=0; i<len; i++) { +;; c = a[i] * b[i]; +;; r += c; +;; } +;; return result; +;; +;; This can be auto-vectorized to +;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; +;; +;; given enough iterations. However the vectorizer can keep unrolling the loop +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; +;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; +;; ... +;; +;; and so the vectorizer provides r, in which the result has to be accumulated. +(define_insn "<sup>dot_prod<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") - (unspec:VCVTI [(match_operand:<VSI2QI> 2 - "register_operand" "w") - (match_operand:<VSI2QI> 3 - "register_operand" "w")] - DOTPROD)))] + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand" "w") + (match_operand:<VSI2QI> 2 "register_operand" "w")] + DOTPROD) + (match_operand:VCVTI 3 "register_operand" "0")))] "TARGET_DOTPROD" - "vdot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3" + "vdot.<opsuffix>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" [(set_attr "type" "neon_dot<q>")] ) +;; These instructions map to the __builtins for the Dot Product operations +(define_expand "neon_<sup>dot<vsi2qi>" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand") + (match_operand:<VSI2QI> 3 "register_operand")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand")))] + "TARGET_DOTPROD" +) + ;; These instructions map to the __builtins for the Dot Product operations. (define_insn "neon_usdot<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") @@ -2898,17 +2927,40 @@ ;; indexed operations. 
(define_insn "neon_<sup>dot_lane<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") - (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0") - (unspec:VCVTI [(match_operand:<VSI2QI> 2 - "register_operand" "w") - (match_operand:V8QI 3 "register_operand" "t") - (match_operand:SI 4 "immediate_operand" "i")] - DOTPROD)))] + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w") + (match_operand:V8QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand" "0")))] + "TARGET_DOTPROD" + "vdot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; + [(set_attr "type" "neon_dot<q>")] +) + +;; These instructions map to the __builtins for the Dot Product +;; indexed operations. +(define_insn "neon_<sup>dot_laneq<vsi2qi>" + [(set (match_operand:VCVTI 0 "register_operand" "=w") + (plus:VCVTI + (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "t") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD) + (match_operand:VCVTI 1 "register_operand" "0")))] "TARGET_DOTPROD" { - operands[4] - = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4]))); - return "vdot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"; + int lane = INTVAL (operands[4]); + if (lane > GET_MODE_NUNITS (V2SImode) - 1) + { + operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode)); + return "vdot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]"; + } + else + { + operands[4] = GEN_INT (lane); + return "vdot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]"; + } } [(set_attr "type" "neon_dot<q>")] ) @@ -2932,43 +2984,6 @@ [(set_attr "type" "neon_dot<q>")] ) -;; These expands map to the Dot Product optab the vectorizer checks for. -;; The auto-vectorizer expects a dot product builtin that also does an -;; accumulation into the provided register. 
-;; Given the following pattern -;; -;; for (i=0; i<len; i++) { -;; c = a[i] * b[i]; -;; r += c; -;; } -;; return result; -;; -;; This can be auto-vectorized to -;; r = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; -;; -;; given enough iterations. However the vectorizer can keep unrolling the loop -;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; -;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; -;; ... -;; -;; and so the vectorizer provides r, in which the result has to be accumulated. -(define_expand "<sup>dot_prod<vsi2qi>" - [(set (match_operand:VCVTI 0 "register_operand") - (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1 - "register_operand") - (match_operand:<VSI2QI> 2 - "register_operand")] - DOTPROD) - (match_operand:VCVTI 3 "register_operand")))] - "TARGET_DOTPROD" -{ - emit_insn ( - gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1], - operands[2])); - emit_insn (gen_rtx_SET (operands[0], operands[3])); - DONE; -}) - ;; Auto-vectorizer pattern for usdot (define_expand "usdot_prod<vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand") diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c index b3bd3bf00e3..d3541e829a4 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c @@ -49,8 +49,28 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, int8x8_t y) return vdotq_lane_s32 (r, x, y, 0); } -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */ -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */ -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */ +int32x2_t sfoo_laneq1 (int32x2_t r, int8x8_t x, int8x16_t y) +{ + return vdot_laneq_s32 (r, x, y, 0); +} + +int32x4_t sfooq_lane1 (int32x4_t r, int8x16_t x, int8x16_t y) +{ + return vdotq_laneq_s32 (r, x, y, 0); +} + +int32x2_t sfoo_laneq2 (int32x2_t r, int8x8_t x, int8x16_t y) +{ + return vdot_laneq_s32 (r, x, y, 2); +} + +int32x4_t sfooq_lane2 (int32x4_t r, int8x16_t x, int8x16_t y) +{ + return vdotq_laneq_s32 (r, x, y, 2); +} + +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 6 } } */ +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ +/* { dg-final { scan-assembler-times 
{v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */ +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c index 054f4703394..89a196ea17c 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c @@ -10,7 +10,7 @@ extern void abort(); #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ # define ORDER(x, y) y #else -# define ORDER(x, y) x - y +# define ORDER(x, y) (x - y) #endif #define P(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2 @@ -33,7 +33,20 @@ extern void abort(); t3 f##_##rx1 = {0}; \ f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1)); \ if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \ - abort (); \ + abort (); + +#define P2(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2,n1,n1,n1,n1,n2,n2,n2,n2 +#define TEST_LANEQ(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \ + ARR(f, x, t1, r1); \ + ARR(f, y, t2, r2); \ + t3 f##_##rx = {0}; \ + f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (3, 2)); \ + if (f##_##rx[0] != n1 || f##_##rx[1] != n2) \ + abort (); \ + t3 f##_##rx1 = {0}; \ + f##_##rx1 = f (f##_##rx1, f##_##x, f##_##y, ORDER (3, 3)); \ + if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4) \ + abort (); int main() @@ -45,11 +58,16 @@ main() TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, -24); TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24); - TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24); TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24); TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24); + TEST_LANEQ (uint8x8_t, uint8x16_t, uint32x2_t, vdot_laneq_u32, P(1,2), P2(2,3), 8, 16, 12, 24); + TEST_LANEQ (int8x8_t, int8x16_t, int32x2_t, vdot_laneq_s32, P(1,2), P2(-2,-3), -8, -16, -12, -24); 
+ + TEST_LANEQ (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_laneq_u32, P2(1,2), P2(2,3), 8, 16, 12, 24); + TEST_LANEQ (int8x16_t, int8x16_t, int32x4_t, vdotq_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24); + return 0; }