From 91bd4114a73ca69e5ce3d904282b402b9f2128d3 Mon Sep 17 00:00:00 2001 From: James Greenhalgh Date: Mon, 2 Sep 2013 16:22:10 +0000 Subject: [PATCH] [AArch64] Rewrite the vdup_lane intrinsics in C gcc/ * config/aarch64/aarch64-simd-builtins.def (dup_lane_scalar): Remove. * config/aarch64/aarch64-simd.md (aarch64_simd_dup): Add 'w->w' alternative. (aarch64_dup_lane): Allow for VALL. (aarch64_dup_lane_scalar): Remove. (aarch64_dup_lane_): New. (aarch64_get_lane_signed): Add w->w altenative. (aarch64_get_lane_unsigned): Likewise. (aarch64_get_lane): Likewise. * config/aarch64/aarch64.c (aarch64_evpc_dup): New. (aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup. * config/aarch64/iterators.md (VSWAP_WIDTH): New. (VCON): Change container of V2SF. (vswap_width_name): Likewise. * config/aarch64/arm_neon.h (__aarch64_vdup_lane_any): New. (__aarch64_vdup_lane_<8,16,32,64>): Likewise. (vdup_n_<8,16,32,64>): Convert to C implementation. (vdup_lane_<8,16,32,64>): Likewise. gcc/testsuite/ * gcc.target/aarch64/scalar_intrinsics.c (vdup_lane<8,16,32,64>): Force values to SIMD registers. From-SVN: r202180 --- gcc/ChangeLog | 23 + gcc/config/aarch64/aarch64-simd.md | 49 +- gcc/config/aarch64/aarch64.c | 51 + gcc/config/aarch64/arm_neon.h | 1297 +++++++++-------- gcc/config/aarch64/iterators.md | 16 +- gcc/testsuite/ChangeLog | 5 + .../gcc.target/aarch64/scalar_intrinsics.c | 48 +- 7 files changed, 887 insertions(+), 602 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 918e667c02e..90096122489 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,26 @@ +2013-09-02 James Greenhalgh + + * config/aarch64/aarch64-simd-builtins.def + (dup_lane_scalar): Remove. + * config/aarch64/aarch64-simd.md + (aarch64_simd_dup): Add 'w->w' alternative. + (aarch64_dup_lane): Allow for VALL. + (aarch64_dup_lane_scalar): Remove. + (aarch64_dup_lane_): New. + (aarch64_get_lane_signed): Add w->w altenative. + (aarch64_get_lane_unsigned): Likewise. + (aarch64_get_lane): Likewise. + * config/aarch64/aarch64.c (aarch64_evpc_dup): New. + (aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup. + * config/aarch64/iterators.md (VSWAP_WIDTH): New. + (VCON): Change container of V2SF. + (vswap_width_name): Likewise. + * config/aarch64/arm_neon.h + (__aarch64_vdup_lane_any): New. + (__aarch64_vdup_lane_<8,16,32,64>): Likewise. + (vdup_n_<8,16,32,64>): Convert to C implementation. + (vdup_lane_<8,16,32,64>): Likewise. 
+ 2013-09-02 Eric Botcazou PR middle-end/56382 diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 982373099f7..f4b929edf44 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -336,24 +336,13 @@ }) (define_insn "aarch64_simd_dup" - [(set (match_operand:VDQ 0 "register_operand" "=w") - (vec_duplicate:VDQ (match_operand: 1 "register_operand" "r")))] + [(set (match_operand:VDQ 0 "register_operand" "=w, w") + (vec_duplicate:VDQ (match_operand: 1 "register_operand" "r, w")))] "TARGET_SIMD" - "dup\\t%0., %1" - [(set_attr "simd_type" "simd_dupgp") - (set_attr "simd_mode" "")] -) - -(define_insn "aarch64_dup_lane" - [(set (match_operand:VDQ_I 0 "register_operand" "=w") - (vec_duplicate:VDQ_I - (vec_select: - (match_operand: 1 "register_operand" "w") - (parallel [(match_operand:SI 2 "immediate_operand" "i")]) - )))] - "TARGET_SIMD" - "dup\\t%0, %1.[%2]" - [(set_attr "simd_type" "simd_dup") + "@ + dup\\t%0., %1 + dup\\t%0., %1.[0]" + [(set_attr "simd_type" "simd_dupgp, simd_dup") (set_attr "simd_mode" "")] ) @@ -366,6 +355,32 @@ (set_attr "simd_mode" "")] ) +(define_insn "aarch64_dup_lane" + [(set (match_operand:VALL 0 "register_operand" "=w") + (vec_duplicate:VALL + (vec_select: + (match_operand:VALL 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]) + )))] + "TARGET_SIMD" + "dup\\t%0., %1.[%2]" + [(set_attr "simd_type" "simd_dup") + (set_attr "simd_mode" "")] +) + +(define_insn "aarch64_dup_lane_" + [(set (match_operand:VALL 0 "register_operand" "=w") + (vec_duplicate:VALL + (vec_select: + (match_operand: 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]) + )))] + "TARGET_SIMD" + "dup\\t%0., %1.[%2]" + [(set_attr "simd_type" "simd_dup") + (set_attr "simd_mode" "")] +) + (define_insn "*aarch64_simd_mov" [(set (match_operand:VD 0 "aarch64_simd_nonimmediate_operand" "=w, Utv, w, ?r, ?w, ?r, w") diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index aed035a434e..7635e1e2679 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -7931,6 +7931,55 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d) return true; } +static bool +aarch64_evpc_dup (struct expand_vec_perm_d *d) +{ + rtx (*gen) (rtx, rtx, rtx); + rtx out = d->target; + rtx in0; + enum machine_mode vmode = d->vmode; + unsigned int i, elt, nelt = d->nelt; + rtx lane; + + /* TODO: This may not be big-endian safe. */ + if (BYTES_BIG_ENDIAN) + return false; + + elt = d->perm[0]; + for (i = 1; i < nelt; i++) + { + if (elt != d->perm[i]) + return false; + } + + /* The generic preparation in aarch64_expand_vec_perm_const_1 + swaps the operand order and the permute indices if it finds + d->perm[0] to be in the second operand. Thus, we can always + use d->op0 and need not do any extra arithmetic to get the + correct lane number. 
*/ + in0 = d->op0; + lane = GEN_INT (elt); + + switch (vmode) + { + case V16QImode: gen = gen_aarch64_dup_lanev16qi; break; + case V8QImode: gen = gen_aarch64_dup_lanev8qi; break; + case V8HImode: gen = gen_aarch64_dup_lanev8hi; break; + case V4HImode: gen = gen_aarch64_dup_lanev4hi; break; + case V4SImode: gen = gen_aarch64_dup_lanev4si; break; + case V2SImode: gen = gen_aarch64_dup_lanev2si; break; + case V2DImode: gen = gen_aarch64_dup_lanev2di; break; + case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break; + case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break; + case V2DFmode: gen = gen_aarch64_dup_lanev2df; break; + default: + return false; + } + + emit_insn (gen (out, in0, lane)); + return true; +} + static bool aarch64_evpc_tbl (struct expand_vec_perm_d *d) { @@ -7988,6 +8037,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; else if (aarch64_evpc_trn (d)) return true; + else if (aarch64_evpc_dup (d)) + return true; return aarch64_evpc_tbl (d); } return false; diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index af097ad9a0d..e289a0dd459 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -508,6 +508,107 @@ typedef struct poly16x8x4_t #define __aarch64_vgetq_lane_u64(__a, __b) \ __aarch64_vget_lane_any (v2di, (uint64_t), (int64x2_t), __a, __b) +/* __aarch64_vdup_lane internal macros. */ +#define __aarch64_vdup_lane_any(__size, __q1, __q2, __a, __b) \ + vdup##__q1##_n_##__size (__aarch64_vget##__q2##_lane_##__size (__a, __b)) + +#define __aarch64_vdup_lane_f32(__a, __b) \ + __aarch64_vdup_lane_any (f32, , , __a, __b) +#define __aarch64_vdup_lane_f64(__a, __b) (__a) +#define __aarch64_vdup_lane_p8(__a, __b) \ + __aarch64_vdup_lane_any (p8, , , __a, __b) +#define __aarch64_vdup_lane_p16(__a, __b) \ + __aarch64_vdup_lane_any (p16, , , __a, __b) +#define __aarch64_vdup_lane_s8(__a, __b) \ + __aarch64_vdup_lane_any (s8, , , __a, __b) +#define __aarch64_vdup_lane_s16(__a, __b) \ + __aarch64_vdup_lane_any (s16, , , __a, __b) +#define __aarch64_vdup_lane_s32(__a, __b) \ + __aarch64_vdup_lane_any (s32, , , __a, __b) +#define __aarch64_vdup_lane_s64(__a, __b) (__a) +#define __aarch64_vdup_lane_u8(__a, __b) \ + __aarch64_vdup_lane_any (u8, , , __a, __b) +#define __aarch64_vdup_lane_u16(__a, __b) \ + __aarch64_vdup_lane_any (u16, , , __a, __b) +#define __aarch64_vdup_lane_u32(__a, __b) \ + __aarch64_vdup_lane_any (u32, , , __a, __b) +#define __aarch64_vdup_lane_u64(__a, __b) (__a) + +/* __aarch64_vdup_laneq internal macros. 
*/ +#define __aarch64_vdup_laneq_f32(__a, __b) \ + __aarch64_vdup_lane_any (f32, , q, __a, __b) +#define __aarch64_vdup_laneq_f64(__a, __b) \ + __aarch64_vdup_lane_any (f64, , q, __a, __b) +#define __aarch64_vdup_laneq_p8(__a, __b) \ + __aarch64_vdup_lane_any (p8, , q, __a, __b) +#define __aarch64_vdup_laneq_p16(__a, __b) \ + __aarch64_vdup_lane_any (p16, , q, __a, __b) +#define __aarch64_vdup_laneq_s8(__a, __b) \ + __aarch64_vdup_lane_any (s8, , q, __a, __b) +#define __aarch64_vdup_laneq_s16(__a, __b) \ + __aarch64_vdup_lane_any (s16, , q, __a, __b) +#define __aarch64_vdup_laneq_s32(__a, __b) \ + __aarch64_vdup_lane_any (s32, , q, __a, __b) +#define __aarch64_vdup_laneq_s64(__a, __b) \ + __aarch64_vdup_lane_any (s64, , q, __a, __b) +#define __aarch64_vdup_laneq_u8(__a, __b) \ + __aarch64_vdup_lane_any (u8, , q, __a, __b) +#define __aarch64_vdup_laneq_u16(__a, __b) \ + __aarch64_vdup_lane_any (u16, , q, __a, __b) +#define __aarch64_vdup_laneq_u32(__a, __b) \ + __aarch64_vdup_lane_any (u32, , q, __a, __b) +#define __aarch64_vdup_laneq_u64(__a, __b) \ + __aarch64_vdup_lane_any (u64, , q, __a, __b) + +/* __aarch64_vdupq_lane internal macros. */ +#define __aarch64_vdupq_lane_f32(__a, __b) \ + __aarch64_vdup_lane_any (f32, q, , __a, __b) +#define __aarch64_vdupq_lane_f64(__a, __b) (vdupq_n_f64 (__a)) +#define __aarch64_vdupq_lane_p8(__a, __b) \ + __aarch64_vdup_lane_any (p8, q, , __a, __b) +#define __aarch64_vdupq_lane_p16(__a, __b) \ + __aarch64_vdup_lane_any (p16, q, , __a, __b) +#define __aarch64_vdupq_lane_s8(__a, __b) \ + __aarch64_vdup_lane_any (s8, q, , __a, __b) +#define __aarch64_vdupq_lane_s16(__a, __b) \ + __aarch64_vdup_lane_any (s16, q, , __a, __b) +#define __aarch64_vdupq_lane_s32(__a, __b) \ + __aarch64_vdup_lane_any (s32, q, , __a, __b) +#define __aarch64_vdupq_lane_s64(__a, __b) (vdupq_n_s64 (__a)) +#define __aarch64_vdupq_lane_u8(__a, __b) \ + __aarch64_vdup_lane_any (u8, q, , __a, __b) +#define __aarch64_vdupq_lane_u16(__a, __b) \ + __aarch64_vdup_lane_any (u16, q, , __a, __b) +#define __aarch64_vdupq_lane_u32(__a, __b) \ + __aarch64_vdup_lane_any (u32, q, , __a, __b) +#define __aarch64_vdupq_lane_u64(__a, __b) (vdupq_n_u64 (__a)) + +/* __aarch64_vdupq_laneq internal macros. 
*/ +#define __aarch64_vdupq_laneq_f32(__a, __b) \ + __aarch64_vdup_lane_any (f32, q, q, __a, __b) +#define __aarch64_vdupq_laneq_f64(__a, __b) \ + __aarch64_vdup_lane_any (f64, q, q, __a, __b) +#define __aarch64_vdupq_laneq_p8(__a, __b) \ + __aarch64_vdup_lane_any (p8, q, q, __a, __b) +#define __aarch64_vdupq_laneq_p16(__a, __b) \ + __aarch64_vdup_lane_any (p16, q, q, __a, __b) +#define __aarch64_vdupq_laneq_s8(__a, __b) \ + __aarch64_vdup_lane_any (s8, q, q, __a, __b) +#define __aarch64_vdupq_laneq_s16(__a, __b) \ + __aarch64_vdup_lane_any (s16, q, q, __a, __b) +#define __aarch64_vdupq_laneq_s32(__a, __b) \ + __aarch64_vdup_lane_any (s32, q, q, __a, __b) +#define __aarch64_vdupq_laneq_s64(__a, __b) \ + __aarch64_vdup_lane_any (s64, q, q, __a, __b) +#define __aarch64_vdupq_laneq_u8(__a, __b) \ + __aarch64_vdup_lane_any (u8, q, q, __a, __b) +#define __aarch64_vdupq_laneq_u16(__a, __b) \ + __aarch64_vdup_lane_any (u16, q, q, __a, __b) +#define __aarch64_vdupq_laneq_u32(__a, __b) \ + __aarch64_vdup_lane_any (u32, q, q, __a, __b) +#define __aarch64_vdupq_laneq_u64(__a, __b) \ + __aarch64_vdup_lane_any (u64, q, q, __a, __b) + /* vadd */ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vadd_s8 (int8x8_t __a, int8x8_t __b) @@ -5676,559 +5777,6 @@ vcvtxd_f32_f64 (float64_t a) return result; } -#define vdup_lane_f32(a, b) \ - __extension__ \ - ({ \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_p8(a, b) \ - __extension__ \ - ({ \ - poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_p16(a, b) \ - __extension__ \ - ({ \ - poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_s8(a, b) \ - __extension__ \ - ({ \ - int8x8_t a_ = (a); \ - int8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_s16(a, b) \ - __extension__ \ - ({ \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_s32(a, b) \ - __extension__ \ - ({ \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_s64(a, b) \ - __extension__ \ - ({ \ - int64x1_t a_ = (a); \ - int64x1_t result; \ - __asm__ ("ins %0.d[0],%1.d[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_u8(a, b) \ - __extension__ \ - ({ \ - uint8x8_t a_ = (a); \ - uint8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_u16(a, b) \ - __extension__ \ - ({ \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdup_lane_u32(a, b) \ - __extension__ \ - ({ \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - 
}) - -#define vdup_lane_u64(a, b) \ - __extension__ \ - ({ \ - uint64x1_t a_ = (a); \ - uint64x1_t result; \ - __asm__ ("ins %0.d[0],%1.d[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vdup_n_f32 (float32_t a) -{ - float32x2_t result; - __asm__ ("dup %0.2s, %w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vdup_n_p8 (uint32_t a) -{ - poly8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vdup_n_p16 (uint32_t a) -{ - poly16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vdup_n_s8 (int32_t a) -{ - int8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vdup_n_s16 (int32_t a) -{ - int16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vdup_n_s32 (int32_t a) -{ - int32x2_t result; - __asm__ ("dup %0.2s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vdup_n_s64 (int64_t a) -{ - int64x1_t result; - __asm__ ("ins %0.d[0],%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vdup_n_u8 (uint32_t a) -{ - uint8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vdup_n_u16 (uint32_t a) -{ - uint16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vdup_n_u32 (uint32_t a) -{ - uint32x2_t result; - __asm__ ("dup %0.2s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vdup_n_u64 (uint64_t a) -{ - uint64x1_t result; - __asm__ ("ins %0.d[0],%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -#define vdupd_lane_f64(a, b) \ - __extension__ \ - ({ \ - float64x2_t a_ = (a); \ - float64_t result; \ - __asm__ ("dup %d0, %1.d[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_f32(a, b) \ - __extension__ \ - ({ \ - float32x2_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_f64(a, b) \ - __extension__ \ - ({ \ - float64x1_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_p8(a, b) \ - __extension__ \ - ({ \ - poly8x8_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ - : 
"=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_p16(a, b) \ - __extension__ \ - ({ \ - poly16x4_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_s8(a, b) \ - __extension__ \ - ({ \ - int8x8_t a_ = (a); \ - int8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_s16(a, b) \ - __extension__ \ - ({ \ - int16x4_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_s32(a, b) \ - __extension__ \ - ({ \ - int32x2_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_s64(a, b) \ - __extension__ \ - ({ \ - int64x1_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_u8(a, b) \ - __extension__ \ - ({ \ - uint8x8_t a_ = (a); \ - uint8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_u16(a, b) \ - __extension__ \ - ({ \ - uint16x4_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_u32(a, b) \ - __extension__ \ - ({ \ - uint32x2_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vdupq_lane_u64(a, b) \ - __extension__ \ - ({ \ - uint64x1_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vdupq_n_f32 (float32_t a) -{ - float32x4_t result; - __asm__ ("dup %0.4s, %w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vdupq_n_f64 (float64_t a) -{ - float64x2_t result; - __asm__ ("dup %0.2d, %x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vdupq_n_p8 (uint32_t a) -{ - poly8x16_t result; - __asm__ ("dup %0.16b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vdupq_n_p16 (uint32_t a) -{ - poly16x8_t result; - __asm__ ("dup %0.8h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vdupq_n_s8 (int32_t a) -{ - int8x16_t result; - __asm__ ("dup %0.16b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vdupq_n_s16 (int32_t a) -{ - int16x8_t result; - __asm__ ("dup %0.8h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) 
-vdupq_n_s32 (int32_t a) -{ - int32x4_t result; - __asm__ ("dup %0.4s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vdupq_n_s64 (int64_t a) -{ - int64x2_t result; - __asm__ ("dup %0.2d,%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vdupq_n_u8 (uint32_t a) -{ - uint8x16_t result; - __asm__ ("dup %0.16b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vdupq_n_u16 (uint32_t a) -{ - uint16x8_t result; - __asm__ ("dup %0.8h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vdupq_n_u32 (uint32_t a) -{ - uint32x4_t result; - __asm__ ("dup %0.4s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vdupq_n_u64 (uint64_t a) -{ - uint64x2_t result; - __asm__ ("dup %0.2d,%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -#define vdups_lane_f32(a, b) \ - __extension__ \ - ({ \ - float32x4_t a_ = (a); \ - float32_t result; \ - __asm__ ("dup %s0, %1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - #define vext_f32(a, b, c) \ __extension__ \ ({ \ @@ -19755,54 +19303,601 @@ vcvtpq_u64_f64 (float64x2_t __a) return (uint64x2_t) __builtin_aarch64_lceiluv2dfv2di (__a); } -/* vdup */ +/* vdup_n */ -__extension__ static __inline int8x1_t __attribute__ ((__always_inline__)) -vdupb_lane_s8 (int8x16_t a, int const b) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vdup_n_f32 (float32_t __a) { - return __aarch64_vgetq_lane_s8 (a, b); + return (float32x2_t) {__a, __a}; } -__extension__ static __inline uint8x1_t __attribute__ ((__always_inline__)) -vdupb_lane_u8 (uint8x16_t a, int const b) +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vdup_n_f64 (float64_t __a) { - return __aarch64_vgetq_lane_u8 (a, b); + return __a; } -__extension__ static __inline int16x1_t __attribute__ ((__always_inline__)) -vduph_lane_s16 (int16x8_t a, int const b) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vdup_n_p8 (poly8_t __a) { - return __aarch64_vgetq_lane_s16 (a, b); + return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint16x1_t __attribute__ ((__always_inline__)) -vduph_lane_u16 (uint16x8_t a, int const b) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vdup_n_p16 (poly16_t __a) { - return __aarch64_vgetq_lane_u16 (a, b); + return (poly16x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline int32x1_t __attribute__ ((__always_inline__)) -vdups_lane_s32 (int32x4_t a, int const b) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vdup_n_s8 (int8_t __a) { - return __aarch64_vgetq_lane_s32 (a, b); + return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint32x1_t __attribute__ ((__always_inline__)) -vdups_lane_u32 (uint32x4_t a, int const b) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vdup_n_s16 (int16_t __a) { - return __aarch64_vgetq_lane_u32 (a, b); + 
return (int16x4_t) {__a, __a, __a, __a}; +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vdup_n_s32 (int32_t __a) +{ + return (int32x2_t) {__a, __a}; } __extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vdupd_lane_s64 (int64x2_t a, int const b) +vdup_n_s64 (int64_t __a) { - return __aarch64_vgetq_lane_s64 (a, b); + return __a; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vdup_n_u8 (uint8_t __a) +{ + return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vdup_n_u16 (uint16_t __a) +{ + return (uint16x4_t) {__a, __a, __a, __a}; +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vdup_n_u32 (uint32_t __a) +{ + return (uint32x2_t) {__a, __a}; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vdupd_lane_u64 (uint64x2_t a, int const b) +vdup_n_u64 (uint64_t __a) { - return __aarch64_vgetq_lane_u64 (a, b); + return __a; +} + +/* vdupq_n */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_n_f32 (float32_t __a) +{ + return (float32x4_t) {__a, __a, __a, __a}; +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vdupq_n_f64 (float64_t __a) +{ + return (float64x2_t) {__a, __a}; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_n_p8 (uint32_t __a) +{ + return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_n_p16 (uint32_t __a) +{ + return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_n_s8 (int32_t __a) +{ + return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vdupq_n_s16 (int32_t __a) +{ + return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_n_s32 (int32_t __a) +{ + return (int32x4_t) {__a, __a, __a, __a}; +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vdupq_n_s64 (int64_t __a) +{ + return (int64x2_t) {__a, __a}; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_n_u8 (uint32_t __a) +{ + return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_n_u16 (uint32_t __a) +{ + return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_n_u32 (uint32_t __a) +{ + return (uint32x4_t) {__a, __a, __a, __a}; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_n_u64 (uint64_t __a) +{ + return (uint64x2_t) {__a, __a}; +} + +/* vdup_lane */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vdup_lane_f32 (float32x2_t __a, const int __b) +{ + return __aarch64_vdup_lane_f32 (__a, __b); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vdup_lane_f64 (float64x1_t __a, const int __b) +{ + return 
__aarch64_vdup_lane_f64 (__a, __b); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vdup_lane_p8 (poly8x8_t __a, const int __b) +{ + return __aarch64_vdup_lane_p8 (__a, __b); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vdup_lane_p16 (poly16x4_t __a, const int __b) +{ + return __aarch64_vdup_lane_p16 (__a, __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vdup_lane_s8 (int8x8_t __a, const int __b) +{ + return __aarch64_vdup_lane_s8 (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vdup_lane_s16 (int16x4_t __a, const int __b) +{ + return __aarch64_vdup_lane_s16 (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vdup_lane_s32 (int32x2_t __a, const int __b) +{ + return __aarch64_vdup_lane_s32 (__a, __b); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vdup_lane_s64 (int64x1_t __a, const int __b) +{ + return __aarch64_vdup_lane_s64 (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vdup_lane_u8 (uint8x8_t __a, const int __b) +{ + return __aarch64_vdup_lane_u8 (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vdup_lane_u16 (uint16x4_t __a, const int __b) +{ + return __aarch64_vdup_lane_u16 (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vdup_lane_u32 (uint32x2_t __a, const int __b) +{ + return __aarch64_vdup_lane_u32 (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vdup_lane_u64 (uint64x1_t __a, const int __b) +{ + return __aarch64_vdup_lane_u64 (__a, __b); +} + +/* vdup_laneq */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vdup_laneq_f32 (float32x4_t __a, const int __b) +{ + return __aarch64_vdup_laneq_f32 (__a, __b); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vdup_laneq_f64 (float64x2_t __a, const int __b) +{ + return __aarch64_vdup_laneq_f64 (__a, __b); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vdup_laneq_p8 (poly8x16_t __a, const int __b) +{ + return __aarch64_vdup_laneq_p8 (__a, __b); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vdup_laneq_p16 (poly16x8_t __a, const int __b) +{ + return __aarch64_vdup_laneq_p16 (__a, __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vdup_laneq_s8 (int8x16_t __a, const int __b) +{ + return __aarch64_vdup_laneq_s8 (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vdup_laneq_s16 (int16x8_t __a, const int __b) +{ + return __aarch64_vdup_laneq_s16 (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vdup_laneq_s32 (int32x4_t __a, const int __b) +{ + return __aarch64_vdup_laneq_s32 (__a, __b); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vdup_laneq_s64 (int64x2_t __a, const int __b) +{ + return __aarch64_vdup_laneq_s64 (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vdup_laneq_u8 (uint8x16_t __a, const int __b) +{ + return __aarch64_vdup_laneq_u8 (__a, __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vdup_laneq_u16 (uint16x8_t __a, const int __b) 
+{ + return __aarch64_vdup_laneq_u16 (__a, __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vdup_laneq_u32 (uint32x4_t __a, const int __b) +{ + return __aarch64_vdup_laneq_u32 (__a, __b); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vdup_laneq_u64 (uint64x2_t __a, const int __b) +{ + return __aarch64_vdup_laneq_u64 (__a, __b); +} + +/* vdupq_lane */ +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_f32 (float32x2_t __a, const int __b) +{ + return __aarch64_vdupq_lane_f32 (__a, __b); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vdupq_lane_f64 (float64x1_t __a, const int __b) +{ + return __aarch64_vdupq_lane_f64 (__a, __b); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_p8 (poly8x8_t __a, const int __b) +{ + return __aarch64_vdupq_lane_p8 (__a, __b); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_p16 (poly16x4_t __a, const int __b) +{ + return __aarch64_vdupq_lane_p16 (__a, __b); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_s8 (int8x8_t __a, const int __b) +{ + return __aarch64_vdupq_lane_s8 (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_s16 (int16x4_t __a, const int __b) +{ + return __aarch64_vdupq_lane_s16 (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_s32 (int32x2_t __a, const int __b) +{ + return __aarch64_vdupq_lane_s32 (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vdupq_lane_s64 (int64x1_t __a, const int __b) +{ + return __aarch64_vdupq_lane_s64 (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_u8 (uint8x8_t __a, const int __b) +{ + return __aarch64_vdupq_lane_u8 (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_u16 (uint16x4_t __a, const int __b) +{ + return __aarch64_vdupq_lane_u16 (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_u32 (uint32x2_t __a, const int __b) +{ + return __aarch64_vdupq_lane_u32 (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_lane_u64 (uint64x1_t __a, const int __b) +{ + return __aarch64_vdupq_lane_u64 (__a, __b); +} + +/* vdupq_laneq */ +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_laneq_f32 (float32x4_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_f32 (__a, __b); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vdupq_laneq_f64 (float64x2_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_f64 (__a, __b); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_laneq_p8 (poly8x16_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_p8 (__a, __b); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_laneq_p16 (poly16x8_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_p16 (__a, __b); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_laneq_s8 (int8x16_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_s8 (__a, __b); +} + +__extension__ static __inline int16x8_t 
__attribute__ ((__always_inline__)) +vdupq_laneq_s16 (int16x8_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_s16 (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_laneq_s32 (int32x4_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_s32 (__a, __b); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vdupq_laneq_s64 (int64x2_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_s64 (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_laneq_u8 (uint8x16_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_u8 (__a, __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_laneq_u16 (uint16x8_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_u16 (__a, __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_laneq_u32 (uint32x4_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_u32 (__a, __b); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_laneq_u64 (uint64x2_t __a, const int __b) +{ + return __aarch64_vdupq_laneq_u64 (__a, __b); +} + +/* vdupb_lane */ +__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vdupb_lane_p8 (poly8x8_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_p8 (__a, 0); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vdupb_lane_s8 (int8x8_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_s8 (__a, 0); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vdupb_lane_u8 (uint8x8_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_u8 (__a, 0); +} + +/* vduph_lane */ +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vduph_lane_p16 (poly16x4_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_p16 (__a, 0); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vduph_lane_s16 (int16x4_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_s16 (__a, 0); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vduph_lane_u16 (uint16x4_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_u16 (__a, 0); +} + +/* vdups_lane */ +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vdups_lane_f32 (float32x2_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_f32 (__a, 0); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vdups_lane_s32 (int32x2_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_s32 (__a, 0); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vdups_lane_u32 (uint32x2_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vget_lane_u32 (__a, 0); +} + +/* vdupd_lane */ +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vdupd_lane_f64 (float64x1_t __a, const int __attribute__ ((unused)) __b) +{ + return __a; +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vdupd_lane_s64 (int64x1_t __a, const int __attribute__ ((unused)) __b) +{ + return __a; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vdupd_lane_u64 (uint64x1_t __a, const int 
__attribute__ ((unused)) __b) +{ + return __a; +} + +/* vdupb_laneq */ +__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vdupb_laneq_p8 (poly8x16_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_p8 (__a, 0); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_s8 (__a, 0); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vdupb_laneq_u8 (uint8x16_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_u8 (__a, 0); +} + +/* vduph_laneq */ +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vduph_laneq_p16 (poly16x8_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_p16 (__a, 0); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vduph_laneq_s16 (int16x8_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_s16 (__a, 0); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vduph_laneq_u16 (uint16x8_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_u16 (__a, 0); +} + +/* vdups_laneq */ +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vdups_laneq_f32 (float32x4_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_f32 (__a, 0); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vdups_laneq_s32 (int32x4_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_s32 (__a, 0); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vdups_laneq_u32 (uint32x4_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_u32 (__a, 0); +} + +/* vdupd_laneq */ +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vdupd_laneq_f64 (float64x2_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_f64 (__a, 0); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vdupd_laneq_s64 (int64x2_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_s64 (__a, 0); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vdupd_laneq_u64 (uint64x2_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_u64 (__a, 0); } /* vld1 */ @@ -25636,4 +25731,54 @@ __INTERLEAVE_LIST (zip) #undef __aarch64_vgetq_lane_u32 #undef __aarch64_vgetq_lane_u64 +#undef __aarch64_vdup_lane_any +#undef __aarch64_vdup_lane_f32 +#undef __aarch64_vdup_lane_f64 +#undef __aarch64_vdup_lane_p8 +#undef __aarch64_vdup_lane_p16 +#undef __aarch64_vdup_lane_s8 +#undef __aarch64_vdup_lane_s16 +#undef __aarch64_vdup_lane_s32 +#undef __aarch64_vdup_lane_s64 +#undef __aarch64_vdup_lane_u8 +#undef __aarch64_vdup_lane_u16 +#undef __aarch64_vdup_lane_u32 +#undef __aarch64_vdup_lane_u64 +#undef __aarch64_vdup_laneq_f32 +#undef __aarch64_vdup_laneq_f64 +#undef __aarch64_vdup_laneq_p8 +#undef __aarch64_vdup_laneq_p16 +#undef __aarch64_vdup_laneq_s8 +#undef __aarch64_vdup_laneq_s16 +#undef __aarch64_vdup_laneq_s32 +#undef __aarch64_vdup_laneq_s64 +#undef __aarch64_vdup_laneq_u8 +#undef __aarch64_vdup_laneq_u16 +#undef __aarch64_vdup_laneq_u32 +#undef __aarch64_vdup_laneq_u64 +#undef __aarch64_vdupq_lane_f32 +#undef __aarch64_vdupq_lane_f64 +#undef 
__aarch64_vdupq_lane_p8 +#undef __aarch64_vdupq_lane_p16 +#undef __aarch64_vdupq_lane_s8 +#undef __aarch64_vdupq_lane_s16 +#undef __aarch64_vdupq_lane_s32 +#undef __aarch64_vdupq_lane_s64 +#undef __aarch64_vdupq_lane_u8 +#undef __aarch64_vdupq_lane_u16 +#undef __aarch64_vdupq_lane_u32 +#undef __aarch64_vdupq_lane_u64 +#undef __aarch64_vdupq_laneq_f32 +#undef __aarch64_vdupq_laneq_f64 +#undef __aarch64_vdupq_laneq_p8 +#undef __aarch64_vdupq_laneq_p16 +#undef __aarch64_vdupq_laneq_s8 +#undef __aarch64_vdupq_laneq_s16 +#undef __aarch64_vdupq_laneq_s32 +#undef __aarch64_vdupq_laneq_s64 +#undef __aarch64_vdupq_laneq_u8 +#undef __aarch64_vdupq_laneq_u16 +#undef __aarch64_vdupq_laneq_u32 +#undef __aarch64_vdupq_laneq_u64 + #endif diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 37b6cbc8dc8..ffe125b5583 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -383,7 +383,7 @@ (V4HI "V8HI") (V8HI "V8HI") (V2SI "V4SI") (V4SI "V4SI") (DI "V2DI") (V2DI "V2DI") - (V2SF "V2SF") (V4SF "V4SF") + (V2SF "V4SF") (V4SF "V4SF") (V2DF "V2DF") (SI "V4SI") (HI "V8HI") (QI "V16QI")]) @@ -527,6 +527,20 @@ (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")]) (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")]) +(define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI") + (V4HI "V8HI") (V8HI "V4HI") + (V2SI "V4SI") (V4SI "V2SI") + (DI "V2DI") (V2DI "DI") + (V2SF "V4SF") (V4SF "V2SF") + (DF "V2DF") (V2DF "DF")]) + +(define_mode_attr vswap_width_name [(V8QI "to_128") (V16QI "to_64") + (V4HI "to_128") (V8HI "to_64") + (V2SI "to_128") (V4SI "to_64") + (DI "to_128") (V2DI "to_64") + (V2SF "to_128") (V4SF "to_64") + (DF "to_128") (V2DF "to_64")]) + ;; ------------------------------------------------------------------- ;; Code Iterators ;; ------------------------------------------------------------------- diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index b190c4d382a..d3892f696ea 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-09-02 James Greenhalgh + + * gcc.target/aarch64/scalar_intrinsics.c + (vdup_lane<8,16,32,64>): Force values to SIMD registers. 
+ 2013-09-02 Richard Biener PR middle-end/57511 diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c index d84bfeb55e9..aa041cc2c20 100644 --- a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c @@ -198,13 +198,21 @@ test_vcltzd_s64 (int64x1_t a) int8x1_t test_vdupb_lane_s8 (int8x16_t a) { - return vdupb_lane_s8 (a, 2); + int8x1_t res; + force_simd (a); + res = vdupb_laneq_s8 (a, 2); + force_simd (res); + return res; } uint8x1_t test_vdupb_lane_u8 (uint8x16_t a) { - return vdupb_lane_u8 (a, 2); + uint8x1_t res; + force_simd (a); + res = vdupb_laneq_u8 (a, 2); + force_simd (res); + return res; } /* { dg-final { scan-assembler-times "aarch64_get_lanev8hi" 2 } } */ @@ -212,13 +220,21 @@ test_vdupb_lane_u8 (uint8x16_t a) int16x1_t test_vduph_lane_s16 (int16x8_t a) { - return vduph_lane_s16 (a, 2); + int16x1_t res; + force_simd (a); + res = vduph_laneq_s16 (a, 2); + force_simd (res); + return res; } uint16x1_t test_vduph_lane_u16 (uint16x8_t a) { - return vduph_lane_u16 (a, 2); + uint16x1_t res; + force_simd (a); + res = vduph_laneq_u16 (a, 2); + force_simd (res); + return res; } /* { dg-final { scan-assembler-times "aarch64_get_lanev4si" 2 } } */ @@ -226,13 +242,21 @@ test_vduph_lane_u16 (uint16x8_t a) int32x1_t test_vdups_lane_s32 (int32x4_t a) { - return vdups_lane_s32 (a, 2); + int32x1_t res; + force_simd (a); + res = vdups_laneq_s32 (a, 2); + force_simd (res); + return res; } uint32x1_t test_vdups_lane_u32 (uint32x4_t a) { - return vdups_lane_u32 (a, 2); + uint32x1_t res; + force_simd (a); + res = vdups_laneq_u32 (a, 2); + force_simd (res); + return res; } /* { dg-final { scan-assembler-times "aarch64_get_lanev2di" 2 } } */ @@ -240,13 +264,21 @@ test_vdups_lane_u32 (uint32x4_t a) int64x1_t test_vdupd_lane_s64 (int64x2_t a) { - return vdupd_lane_s64 (a, 1); + int64x1_t res; + force_simd (a); + res = vdupd_laneq_s64 (a, 1); + force_simd (res); + return res; } uint64x1_t test_vdupd_lane_u64 (uint64x2_t a) { - return vdupd_lane_u64 (a, 1); + uint64x1_t res; + force_simd (a); + res = vdupd_laneq_u64 (a, 1); + force_simd (res); + return res; } /* { dg-final { scan-assembler-times "\\tcmtst\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 2 } } */
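
Usage sketch (illustrative only, not part of the patch). With the rewrite above, vdup_lane_s16 (a, b) is simply vdup_n_s16 (vget_lane_s16 (a, b)) at the C level, and the new aarch64_dup_lane pattern together with aarch64_evpc_dup is expected to fold the resulting uniform permute back into a single DUP instruction. The function names below are invented for illustration and assume an AArch64 compiler with this patch applied:

#include <arm_neon.h>

/* Broadcast lane 2 of a 4x16-bit vector to all four lanes.
   Expected to compile to a single "dup v0.4h, v0.h[2]".  */
int16x4_t
broadcast_lane2 (int16x4_t v)
{
  return vdup_lane_s16 (v, 2);
}

/* Widening form: duplicate lane 2 of a 64-bit vector into all eight
   lanes of a 128-bit vector ("dup v0.8h, v0.h[2]").  */
int16x8_t
broadcast_lane2_q (int16x4_t v)
{
  return vdupq_lane_s16 (v, 2);
}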