[AArch64] Improve arm_neon.h vml<as>_lane handling.

gcc/
	* config/aarch64/aarch64-simd-builtins.def (fma): New.
	* config/aarch64/aarch64-simd.md
	(aarch64_mla_elt<mode>): New.
	(aarch64_mla_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_mls_elt<mode>): Likewise.
	(aarch64_mls_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fma4_elt<mode>): Likewise.
	(aarch64_fma4_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fma4_elt_to_128df): Likewise.
	(aarch64_fma4_elt_to_64v2df): Likewise.
	(fnma<mode>4): Likewise.
	(aarch64_fnma4_elt<mode>): Likewise.
	(aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fnma4_elt_to_128df): Likewise.
	(aarch64_fnma4_elt_to_64v2df): Likewise.
	* config/aarch64/iterators.md (VDQSF): New.
	* config/aarch64/arm_neon.h
	(vfm<as><sdq>_lane<q>_f<32, 64>): Convert to C implementation.
	(vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/fmla-intrinsic.c: New.
	* gcc.target/aarch64/mla-intrinsic.c: Likewise.
	* gcc.target/aarch64/fmls-intrinsic.c: Likewise.
	* gcc.target/aarch64/mls-intrinsic.c: Likewise.

From-SVN: r202625
This commit is contained in:
James Greenhalgh 2013-09-16 09:53:11 +00:00 committed by James Greenhalgh
parent 779aea46cc
commit 828e70c1d7
10 changed files with 1137 additions and 561 deletions

View File

@ -1,3 +1,25 @@
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd-builtins.def (fma): New.
* config/aarch64/aarch64-simd.md
(aarch64_mla_elt<mode>): New.
(aarch64_mla_elt_<vswap_width_name><mode>): Likewise.
(aarch64_mls_elt<mode>): Likewise.
(aarch64_mls_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fma4_elt<mode>): Likewise.
(aarch64_fma4_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fma4_elt_to_128df): Likewise.
(aarch64_fma4_elt_to_64v2df): Likewise.
(fnma<mode>4): Likewise.
(aarch64_fnma4_elt<mode>): Likewise.
(aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fnma4_elt_to_128df): Likewise.
(aarch64_fnma4_elt_to_64v2df): Likewise.
* config/aarch64/iterators.md (VDQSF): New.
* config/aarch64/arm_neon.h
(vfm<as><sdq>_lane<q>_f<32, 64>): Convert to C implementation.
(vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise.
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.

View File

@ -359,3 +359,6 @@
/* Implemented by aarch64_st1<VALL:mode>. */
BUILTIN_VALL (STORE1, st1, 0)
/* Implemented by fma<mode>4.
   NOTE(review): presumably registers a three-operand (TERNOP) fma builtin
   for each VDQF mode -- confirm against the BUILTIN_VDQF definition, which
   is not visible in this hunk.  */
BUILTIN_VDQF (TERNOP, fma, 4)

View File

@ -1070,6 +1070,38 @@
(set_attr "simd_mode" "<MODE>")]
)
;; Integer multiply-accumulate by element:
;;   operand 0 = operand 4 + operand 3 * dup (operand 1 [operand 2]).
;; The lane source (operand 1) has the same mode as the result, and the
;; accumulator (operand 4) is tied to the destination by the "0" constraint.
(define_insn "*aarch64_mla_elt<mode>"
[(set (match_operand:VDQHS 0 "register_operand" "=w")
(plus:VDQHS
(mult:VDQHS
(vec_duplicate:VDQHS
(vec_select:<VEL>
(match_operand:VDQHS 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQHS 3 "register_operand" "w"))
(match_operand:VDQHS 4 "register_operand" "0")))]
"TARGET_SIMD"
"mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_mla")
(set_attr "simd_mode" "<MODE>")]
)
;; As the same-width mla-by-element pattern, except that the lane source
;; (operand 1) has mode <VSWAP_WIDTH> instead of the result mode:
;;   operand 0 = operand 4 + operand 3 * dup (operand 1 [operand 2]).
(define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
[(set (match_operand:VDQHS 0 "register_operand" "=w")
(plus:VDQHS
(mult:VDQHS
(vec_duplicate:VDQHS
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQHS 3 "register_operand" "w"))
(match_operand:VDQHS 4 "register_operand" "0")))]
"TARGET_SIMD"
"mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_mla")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "aarch64_mls<mode>"
[(set (match_operand:VQ_S 0 "register_operand" "=w")
(minus:VQ_S (match_operand:VQ_S 1 "register_operand" "0")
@ -1081,6 +1113,38 @@
(set_attr "simd_mode" "<MODE>")]
)
;; Integer multiply-subtract by element:
;;   operand 0 = operand 4 - operand 3 * dup (operand 1 [operand 2]).
;; The lane source (operand 1) has the same mode as the result, and the
;; minuend (operand 4) is tied to the destination by the "0" constraint.
(define_insn "*aarch64_mls_elt<mode>"
[(set (match_operand:VDQHS 0 "register_operand" "=w")
(minus:VDQHS
(match_operand:VDQHS 4 "register_operand" "0")
(mult:VDQHS
(vec_duplicate:VDQHS
(vec_select:<VEL>
(match_operand:VDQHS 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQHS 3 "register_operand" "w"))))]
"TARGET_SIMD"
"mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_mla")
(set_attr "simd_mode" "<MODE>")]
)
;; As the same-width mls-by-element pattern, except that the lane source
;; (operand 1) has mode <VSWAP_WIDTH> instead of the result mode:
;;   operand 0 = operand 4 - operand 3 * dup (operand 1 [operand 2]).
(define_insn "*aarch64_mls_elt_<vswap_width_name><mode>"
[(set (match_operand:VDQHS 0 "register_operand" "=w")
(minus:VDQHS
(match_operand:VDQHS 4 "register_operand" "0")
(mult:VDQHS
(vec_duplicate:VDQHS
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQHS 3 "register_operand" "w"))))]
"TARGET_SIMD"
"mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_mla")
(set_attr "simd_mode" "<MODE>")]
)
;; Max/Min operations.
(define_insn "<su><maxmin><mode>3"
[(set (match_operand:VQ_S 0 "register_operand" "=w")
@ -1483,6 +1547,137 @@
(set_attr "simd_mode" "<MODE>")]
)
;; Floating-point fused multiply-add by element:
;;   operand 0 = fma (dup (operand 1 [operand 2]), operand 3, operand 4).
;; The lane source (operand 1) has the same mode as the result, and the
;; addend (operand 4) is tied to the destination by the "0" constraint.
(define_insn "*aarch64_fma4_elt<mode>"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(fma:VDQF
(vec_duplicate:VDQF
(vec_select:<VEL>
(match_operand:VDQF 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQF 3 "register_operand" "w")
(match_operand:VDQF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "<MODE>")]
)
;; As the same-width fma-by-element pattern, but restricted to the VDQSF
;; modes and with the lane source (operand 1) in mode <VSWAP_WIDTH>:
;;   operand 0 = fma (dup (operand 1 [operand 2]), operand 3, operand 4).
(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
[(set (match_operand:VDQSF 0 "register_operand" "=w")
(fma:VDQSF
(vec_duplicate:VDQSF
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQSF 3 "register_operand" "w")
(match_operand:VDQSF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "<MODE>")]
)
;; V2DF fused multiply-add where the multiplier is a DF scalar duplicated
;; into both lanes:
;;   operand 0 = fma (dup (operand 1), operand 2, operand 3).
;; There is no lane-index operand; the template always names lane 0 of
;; operand 1.  Note the operand numbering differs from the patterns that
;; take an explicit lane: the addend is operand 3 here.
(define_insn "*aarch64_fma4_elt_to_128df"
[(set (match_operand:V2DF 0 "register_operand" "=w")
(fma:V2DF
(vec_duplicate:V2DF
(match_operand:DF 1 "register_operand" "w"))
(match_operand:V2DF 2 "register_operand" "w")
(match_operand:V2DF 3 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\\t%0.2d, %2.2d, %1.2d[0]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "V2DF")]
)
;; DF fused multiply-add where the multiplier is one lane of a V2DF vector:
;;   operand 0 = fma (operand 1 [operand 2], operand 3, operand 4).
;; NOTE(review): the result and operands 3/4 are DF, yet the template prints
;; them with the .2d arrangement -- confirm that operating on the full
;; vector register is intended here.
(define_insn "*aarch64_fma4_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w")
(fma:DF
(vec_select:DF
(match_operand:V2DF 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand")]))
(match_operand:DF 3 "register_operand" "w")
(match_operand:DF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\\t%0.2d, %3.2d, %1.2d[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "V2DF")]
)
;; Vector fused negate-multiply-add:
;;   operand 0 = fma (operand 1, -operand 2, operand 3),
;; emitted as a single fmls.  Unlike the by-element variants this pattern
;; has no leading '*' in its name.  The addend (operand 3) is tied to the
;; destination by the "0" constraint.
(define_insn "fnma<mode>4"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(fma:VDQF
(match_operand:VDQF 1 "register_operand" "w")
(neg:VDQF
(match_operand:VDQF 2 "register_operand" "w"))
(match_operand:VDQF 3 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
[(set_attr "simd_type" "simd_fmla")
(set_attr "simd_mode" "<MODE>")]
)
;; Floating-point fused multiply-subtract by element:
;;   operand 0 = fma (-operand 3, dup (operand 1 [operand 2]), operand 4).
;; The lane source (operand 1) has the same mode as the result, and the
;; addend (operand 4) is tied to the destination by the "0" constraint.
(define_insn "*aarch64_fnma4_elt<mode>"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(fma:VDQF
(neg:VDQF
(match_operand:VDQF 3 "register_operand" "w"))
(vec_duplicate:VDQF
(vec_select:<VEL>
(match_operand:VDQF 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "<MODE>")]
)
;; As the same-width fmls-by-element pattern, but restricted to the VDQSF
;; modes and with the lane source (operand 1) in mode <VSWAP_WIDTH>:
;;   operand 0 = fma (-operand 3, dup (operand 1 [operand 2]), operand 4).
(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>"
[(set (match_operand:VDQSF 0 "register_operand" "=w")
(fma:VDQSF
(neg:VDQSF
(match_operand:VDQSF 3 "register_operand" "w"))
(vec_duplicate:VDQSF
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQSF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "<MODE>")]
)
;; V2DF fused multiply-subtract where the multiplier is a DF scalar
;; duplicated into both lanes:
;;   operand 0 = fma (-operand 2, dup (operand 1), operand 3).
;; There is no lane-index operand; the template always names lane 0 of
;; operand 1.
(define_insn "*aarch64_fnma4_elt_to_128df"
[(set (match_operand:V2DF 0 "register_operand" "=w")
(fma:V2DF
(neg:V2DF
(match_operand:V2DF 2 "register_operand" "w"))
(vec_duplicate:V2DF
(match_operand:DF 1 "register_operand" "w"))
(match_operand:V2DF 3 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.2d, %2.2d, %1.2d[0]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "V2DF")]
)
;; DF fused multiply-subtract where the multiplier is one lane of a V2DF
;; vector:
;;   operand 0 = fma (operand 1 [operand 2], -operand 3, operand 4).
;; NOTE(review): the result and operands 3/4 are DF, yet the template prints
;; them with the .2d arrangement -- confirm that operating on the full
;; vector register is intended here.
(define_insn "*aarch64_fnma4_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w")
(fma:DF
(vec_select:DF
(match_operand:V2DF 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand")]))
(neg:DF
(match_operand:DF 3 "register_operand" "w"))
(match_operand:DF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.2d, %3.2d, %1.2d[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "V2DF")]
)
;; Vector versions of the floating-point frint patterns.
;; Expands to btrunc, ceil, floor, nearbyint, rint, round.
(define_insn "<frint_pattern><mode>2"

File diff suppressed because it is too large Load Diff

View File

@ -89,6 +89,9 @@
;; Vector Float modes.
(define_mode_iterator VDQF [V2SF V4SF V2DF])
;; Vector single-precision float modes: V2SF and V4SF only, i.e. the
;; VDQF set without V2DF.
(define_mode_iterator VDQSF [V2SF V4SF])
;; Modes suitable to use as the return type of a vcond expression.
(define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])

View File

@ -1,3 +1,10 @@
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/fmla-intrinsic.c: New.
* gcc.target/aarch64/mla-intrinsic.c: Likewise.
* gcc.target/aarch64/fmls-intrinsic.c: Likewise.
* gcc.target/aarch64/mls-intrinsic.c: Likewise.
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/mul_intrinsic_1.c: New.

View File

@ -0,0 +1,116 @@
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
#define DELTA 0.0001
extern double fabs (double);
extern void abort (void);
/* Define test_vfma<q1>_lane<q2>_f<size> (res, in1, in2).  The generated
   function loads the accumulator from RES and the multiplicand from IN1,
   applies the matching vfma<q1>_lane<q2>_f<size> intrinsic with a lane
   taken from IN2, and stores the result back to RES.

   When the lane source has more than one lane it is loaded from IN2 and
   lane 1 is selected; otherwise it is loaded from IN2 + 1 and lane 0 is
   selected, so both paths multiply by in2[1].  */
#define TEST_VMLA(q1, q2, size, in1_lanes, in2_lanes) \
static void \
test_vfma##q1##_lane##q2##_f##size (float##size##_t * res, \
const float##size##_t *in1, \
const float##size##_t *in2) \
{ \
float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \
float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \
float##size##x##in2_lanes##_t c; \
if (in2_lanes > 1) \
{ \
c = vld1##q2##_f##size (in2); \
a = vfma##q1##_lane##q2##_f##size (a, b, c, 1); \
} \
else \
{ \
c = vld1##q2##_f##size (in2 + 1); \
a = vfma##q1##_lane##q2##_f##size (a, b, c, 0); \
} \
vst1##q1##_f##size (res, a); \
}
/* Instantiate the four vfma lane test functions for one element width:
   the plain and q forms of the accumulator, each paired with the plain and
   q forms of the lane source.  The last line deliberately carries no
   trailing backslash, so the definition cannot absorb the line after it.  */
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMLA ( , , width, n_half_lanes, n_half_lanes) \
TEST_VMLA (q, , width, n_lanes, n_half_lanes) \
TEST_VMLA ( , q, width, n_half_lanes, n_lanes) \
TEST_VMLA (q, q, width, n_lanes, n_lanes)
BUILD_VARS (32, 4, 2)
BUILD_VARS (64, 2, 1)
/* Initializers for the input pool and for the zeroed accumulators, keyed by
   the number of lanes.  */
#define POOL2 {0.0, 1.0}
#define POOL4 {0.0, 1.0, 2.0, 3.0}
#define EMPTY2 {0.0, 0.0}
#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
/* Define test_f<size> ().  The generated function runs all four vfma lane
   variants with a zeroed accumulator and POOL as both multiplicand and lane
   source, then aborts unless every checked result element is within DELTA
   of the matching POOL element.  The non-q accumulator variants only check
   the first lanes / 2 elements.  */
#define BUILD_TEST(size, lanes) \
static void \
test_f##size (void) \
{ \
int i; \
float##size##_t pool[lanes] = POOL##lanes; \
float##size##_t res[lanes] = EMPTY##lanes; \
float##size##_t res2[lanes] = EMPTY##lanes; \
float##size##_t res3[lanes] = EMPTY##lanes; \
float##size##_t res4[lanes] = EMPTY##lanes; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfma_lane_f##size (res, pool, pool); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res[i] - pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfmaq_lane_f##size (res2, pool, pool); \
for (i = 0; i < lanes; i++) \
if (fabs (res2[i] - pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfma_laneq_f##size (res3, pool, pool); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res3[i] - pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfmaq_laneq_f##size (res4, pool, pool); \
for (i = 0; i < lanes; i++) \
if (fabs (res4[i] - pool[i]) > DELTA) \
abort (); \
}
BUILD_TEST (32, 4)
BUILD_TEST (64, 2)
/* Run the single- and double-precision checks.  Each check aborts on a
   mismatch, so reaching the return statement means every variant passed.  */
int
main (int argc, char **argv)
{
  /* The command line is not consulted.  */
  (void) argc;
  (void) argv;

  test_f32 ();
  test_f64 ();

  return 0;
}
/* vfma_laneq_f32.
vfma_lane_f32. */
/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */
/* vfmaq_lane_f32.
vfmaq_laneq_f32. */
/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */
/* vfma_lane_f64. */
/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
/* vfmaq_lane_f64.
vfma_laneq_f64.
vfmaq_laneq_f64. */
/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,117 @@
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
#define DELTA 0.0001
extern double fabs (double);
extern void abort (void);
/* Define test_vfms<q1>_lane<q2>_f<size> (res, in1, in2).  The generated
   function loads the accumulator from RES and the multiplicand from IN1,
   applies the matching vfms<q1>_lane<q2>_f<size> intrinsic with a lane
   taken from IN2, and stores the result back to RES.

   When the lane source has more than one lane it is loaded from IN2 and
   lane 1 is selected; otherwise it is loaded from IN2 + 1 and lane 0 is
   selected, so both paths multiply by in2[1].  */
#define TEST_VMLS(q1, q2, size, in1_lanes, in2_lanes) \
static void \
test_vfms##q1##_lane##q2##_f##size (float##size##_t * res, \
const float##size##_t *in1, \
const float##size##_t *in2) \
{ \
float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \
float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \
float##size##x##in2_lanes##_t c; \
if (in2_lanes > 1) \
{ \
c = vld1##q2##_f##size (in2); \
a = vfms##q1##_lane##q2##_f##size (a, b, c, 1); \
} \
else \
{ \
c = vld1##q2##_f##size (in2 + 1); \
a = vfms##q1##_lane##q2##_f##size (a, b, c, 0); \
} \
vst1##q1##_f##size (res, a); \
}
/* Instantiate the four vfms lane test functions for one element width:
   the plain and q forms of the accumulator, each paired with the plain and
   q forms of the lane source.  The last line deliberately carries no
   trailing backslash, so the definition cannot absorb the line after it.  */
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMLS ( , , width, n_half_lanes, n_half_lanes) \
TEST_VMLS (q, , width, n_lanes, n_half_lanes) \
TEST_VMLS ( , q, width, n_half_lanes, n_lanes) \
TEST_VMLS (q, q, width, n_lanes, n_lanes)
BUILD_VARS (32, 4, 2)
BUILD_VARS (64, 2, 1)
/* Initializers for the input pool and for the zeroed accumulators, keyed by
   the number of lanes.  */
#define POOL2 {0.0, 1.0}
#define POOL4 {0.0, 1.0, 2.0, 3.0}
#define EMPTY2 {0.0, 0.0}
#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
/* Define test_f<size> ().  The generated function runs all four vfms lane
   variants with a zeroed accumulator and POOL as both multiplicand and lane
   source, then aborts unless every checked result element plus the matching
   POOL element is within DELTA of zero (i.e. the result is expected to be
   the negated pool value).  The non-q accumulator variants only check the
   first lanes / 2 elements.  */
#define BUILD_TEST(size, lanes) \
static void \
test_f##size (void) \
{ \
int i; \
float##size##_t pool[lanes] = POOL##lanes; \
float##size##_t res[lanes] = EMPTY##lanes; \
float##size##_t res2[lanes] = EMPTY##lanes; \
float##size##_t res3[lanes] = EMPTY##lanes; \
float##size##_t res4[lanes] = EMPTY##lanes; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfms_lane_f##size (res, pool, pool); \
asm volatile ("" : :"Q" (res) : "memory"); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res[i] + pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
test_vfmsq_lane_f##size (res2, pool, pool); \
asm volatile ("" : :"Q" (res2) : "memory"); \
for (i = 0; i < lanes; i++) \
if (fabs (res2[i] + pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
test_vfms_laneq_f##size (res3, pool, pool); \
asm volatile ("" : :"Q" (res3) : "memory"); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res3[i] + pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
test_vfmsq_laneq_f##size (res4, pool, pool); \
asm volatile ("" : :"Q" (res4) : "memory"); \
for (i = 0; i < lanes; i++) \
if (fabs (res4[i] + pool[i]) > DELTA) \
abort (); \
}
BUILD_TEST (32, 4)
BUILD_TEST (64, 2)
/* Run the single- and double-precision checks.  Each check aborts on a
   mismatch, so reaching the return statement means every variant passed.  */
int
main (int argc, char **argv)
{
  /* The command line is not consulted.  */
  (void) argc;
  (void) argv;

  test_f32 ();
  test_f64 ();

  return 0;
}
/* vfms_laneq_f32.
vfms_lane_f32. */
/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */
/* vfmsq_lane_f32.
vfmsq_laneq_f32. */
/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */
/* vfms_lane_f64. */
/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
/* vfmsq_lane_f64.
vfms_laneq_f64.
vfmsq_laneq_f64. */
/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,84 @@
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
extern void abort (void);
/* Build a signed (MAPs) or unsigned (MAPu) type name from an element size
   and an optional suffix such as x4; an empty suffix yields the scalar
   int<size>_t / uint<size>_t type.  */
#define MAPs(size, xx) int##size##xx##_t
#define MAPu(size, xx) uint##size##xx##_t
/* Define test_vmlaq_lane<q>_<su><size> (res, in1, in2).  The generated
   function loads the accumulator from RES and the multiplicand from IN1
   (both with the q-form load), loads the lane source from IN2 with the
   plain or q-form load selected by Q, applies the matching
   vmlaq_lane<q>_<su><size> intrinsic using lane 1, and stores the result
   back to RES.  */
#define TEST_VMLA(q, su, size, in1_lanes, in2_lanes) \
static void \
test_vmlaq_lane##q##_##su##size (MAP##su (size, ) * res, \
const MAP##su(size, ) *in1, \
const MAP##su(size, ) *in2) \
{ \
MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res); \
MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1); \
MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2); \
a = vmlaq_lane##q##_##su##size (a, b, c, 1); \
vst1q_##su##size (res, a); \
}
/* Instantiate the vmlaq lane test functions for one element width: the
   lane and laneq forms, each in a signed and an unsigned flavour.  The last
   line deliberately carries no trailing backslash, so the definition cannot
   absorb the line after it.  */
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMLA (, s, width, n_lanes, n_half_lanes) \
TEST_VMLA (q, s, width, n_lanes, n_lanes) \
TEST_VMLA (, u, width, n_lanes, n_half_lanes) \
TEST_VMLA (q, u, width, n_lanes, n_lanes)
BUILD_VARS (32, 4, 2)
BUILD_VARS (16, 8, 4)
/* Initializers for the input pool and for the zeroed accumulators, keyed by
   the number of lanes.  */
#define POOL4 {0, 1, 2, 3}
#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
#define EMPTY4 {0, 0, 0, 0}
#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0}
/* Define test_<su><size> ().  The generated function runs the lane and
   laneq vmlaq variants with a zeroed accumulator and POOL as both
   multiplicand and lane source, then aborts unless every result element
   equals the matching POOL element.  */
#define BUILD_TEST(su, size, lanes) \
static void \
test_##su##size (void) \
{ \
int i; \
MAP##su (size,) pool[lanes] = POOL##lanes; \
MAP##su (size,) res[lanes] = EMPTY##lanes; \
MAP##su (size,) res2[lanes] = EMPTY##lanes; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmlaq_lane_##su##size (res, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res[i] != pool[i]) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmlaq_laneq_##su##size (res2, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res2[i] != pool[i]) \
abort (); \
}
#undef BUILD_VARS
/* From here on BUILD_VARS emits the checking functions instead: one signed
   and one unsigned BUILD_TEST instance per element size.  */
#define BUILD_VARS(size, lanes)		\
  BUILD_TEST (s, size, lanes)		\
  BUILD_TEST (u, size, lanes)
BUILD_VARS (32, 4)
BUILD_VARS (16, 8)
/* Run the signed and unsigned checks for both element sizes.  Each check
   aborts on a mismatch, so reaching the return statement means every
   variant passed.  */
int
main (int argc, char **argv)
{
  /* The command line is not consulted.  */
  (void) argc;
  (void) argv;

  /* 32-bit elements.  */
  test_s32 ();
  test_u32 ();

  /* 16-bit elements.  */
  test_s16 ();
  test_u16 ();

  return 0;
}
/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { cleanup-saved-temps } } */

View File

@ -0,0 +1,89 @@
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
extern void abort (void);
/* Build a signed (MAPs) or unsigned (MAPu) type name from an element size
   and an optional suffix such as x4; an empty suffix yields the scalar
   int<size>_t / uint<size>_t type.  */
#define MAPs(size, xx) int##size##xx##_t
#define MAPu(size, xx) uint##size##xx##_t
/* Define test_vmlsq_lane<q>_<su><size> (res, in1, in2).  The generated
   function loads the accumulator from RES and the multiplicand from IN1
   (both with the q-form load), loads the lane source from IN2 with the
   plain or q-form load selected by Q, applies the matching
   vmlsq_lane<q>_<su><size> intrinsic using lane 1, and stores the result
   back to RES.  */
#define TEST_VMLS(q, su, size, in1_lanes, in2_lanes) \
static void \
test_vmlsq_lane##q##_##su##size (MAP##su (size, ) * res, \
const MAP##su(size, ) *in1, \
const MAP##su(size, ) *in2) \
{ \
MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res); \
MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1); \
MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2); \
a = vmlsq_lane##q##_##su##size (a, b, c, 1); \
vst1q_##su##size (res, a); \
}
/* Instantiate the vmlsq lane test functions for one element width: the
   lane and laneq forms, each in a signed and an unsigned flavour.  The last
   line deliberately carries no trailing backslash, so the definition cannot
   absorb the line after it.  */
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMLS (, s, width, n_lanes, n_half_lanes) \
TEST_VMLS (q, s, width, n_lanes, n_lanes) \
TEST_VMLS (, u, width, n_lanes, n_half_lanes) \
TEST_VMLS (q, u, width, n_lanes, n_lanes)
BUILD_VARS (32, 4, 2)
BUILD_VARS (16, 8, 4)
/* Operator used by the result check: signed results are added to the pool
   value, unsigned results have the pool value subtracted; either way the
   outcome must be zero.  */
#define MAP_OPs +
#define MAP_OPu -
/* Input pool, keyed by the number of lanes.  */
#define POOL4 {0, 1, 2, 3}
#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
/* Initial accumulator contents, keyed by lane count and signedness.  The
   signed accumulators start at zero; the unsigned ones start at twice the
   pool value, which pairs with the MAP_OPu subtraction in the check.  */
#define EMPTY4s {0, 0, 0, 0}
#define EMPTY8s {0, 0, 0, 0, 0, 0, 0, 0}
#define EMPTY4u {0, 2, 4, 6}
#define EMPTY8u {0, 2, 4, 6, 8, 10, 12, 14}
/* Define test_<su><size> ().  The generated function runs the lane and
   laneq vmlsq variants with POOL as both multiplicand and lane source, then
   aborts unless `result MAP_OP<su> pool' is zero for every element.  */
#define BUILD_TEST(su, size, lanes) \
static void \
test_##su##size (void) \
{ \
int i; \
MAP##su (size,) pool[lanes] = POOL##lanes; \
MAP##su (size,) res[lanes] = EMPTY##lanes##su; \
MAP##su (size,) res2[lanes] = EMPTY##lanes##su; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmlsq_lane_##su##size (res, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res[i] MAP_OP##su pool[i] != 0) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmlsq_laneq_##su##size (res2, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res2[i] MAP_OP##su pool[i] != 0) \
abort (); \
}
#undef BUILD_VARS
/* From here on BUILD_VARS emits the checking functions instead: one signed
   and one unsigned BUILD_TEST instance per element size.  */
#define BUILD_VARS(size, lanes)		\
  BUILD_TEST (s, size, lanes)		\
  BUILD_TEST (u, size, lanes)
BUILD_VARS (32, 4)
BUILD_VARS (16, 8)
/* Run the signed and unsigned checks for both element sizes.  Each check
   aborts on a mismatch, so reaching the return statement means every
   variant passed.  */
int
main (int argc, char **argv)
{
  /* The command line is not consulted.  */
  (void) argc;
  (void) argv;

  /* 32-bit elements.  */
  test_s32 ();
  test_u32 ();

  /* 16-bit elements.  */
  test_s16 ();
  test_u16 ();

  return 0;
}
/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { cleanup-saved-temps } } */