[AARCH64] Use STLUR for atomic_store
Use the STLUR instruction introduced in Armv8.4-a. This instruction has the store-release semantic like STLR but can take a 9-bit unscaled signed immediate offset. Example test case: ``` void foo () { int32_t *atomic_vals = calloc (4, sizeof (int32_t)); atomic_store_explicit (atomic_vals + 1, 2, memory_order_release); } ``` Before patch generates ``` foo: stp x29, x30, [sp, -16]! mov x1, 4 mov x0, x1 mov x29, sp bl calloc mov w1, 2 add x0, x0, 4 stlr w1, [x0] ldp x29, x30, [sp], 16 ret ``` After patch generates ``` foo: stp x29, x30, [sp, -16]! mov x1, 4 mov x0, x1 mov x29, sp bl calloc mov w1, 2 stlur w1, [x0, 4] ldp x29, x30, [sp], 16 ret ``` We introduce a new feature flag to indicate the presence of this instruction. The feature flag is called AARCH64_ISA_RCPC8_4 and is included when targeting armv8.4 architecture. We also introduce an "arch" attribute to be checked called "rcpc8_4" after this feature flag. gcc/ 2018-09-19 Matthew Malcomson <matthew.malcomson@arm.com> * config/aarch64/aarch64-protos.h (aarch64_offset_9bit_signed_unscaled_p): New declaration. * config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value. (arch_enabled): Add check for "rcpc8_4" attribute value of "arch". * config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield. (AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4. (AARCH64_FL_PROFILE): Move index so flags are ordered. (AARCH64_ISA_RCPC8_4): New flag. * config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed to aarch64_offset_9bit_signed_unscaled_p. * config/aarch64/atomics.md (atomic_store<mode>): Allow offset and use stlur. * config/aarch64/constraints.md (Ust): New constraint. * config/aarch64/predicates.md. (aarch64_9bit_offset_memory_operand): New predicate. (aarch64_rcpc_memory_operand): New predicate. gcc/testsuite/ 2018-09-19 Matthew Malcomson <matthew.malcomson@arm.com> * gcc.target/aarch64/atomic-store.c: New. From-SVN: r264421
This commit is contained in:
parent
574f5885f7
commit
3c5af60836
|
@ -1,3 +1,22 @@
|
|||
2018-09-19 Matthew Malcomson <matthew.malcomson@arm.com>
|
||||
|
||||
* config/aarch64/aarch64-protos.h
|
||||
(aarch64_offset_9bit_signed_unscaled_p): New declaration.
|
||||
* config/aarch64/aarch64.md (arches): New "rcpc8_4" attribute value.
|
||||
(arch_enabled): Add check for "rcpc8_4" attribute value of "arch".
|
||||
* config/aarch64/aarch64.h (AARCH64_FL_RCPC8_4): New bitfield.
|
||||
(AARCH64_FL_FOR_ARCH8_4): Include AARCH64_FL_RCPC8_4.
|
||||
(AARCH64_FL_PROFILE): Move index so flags are ordered.
|
||||
(AARCH64_ISA_RCPC8_4): New flag.
|
||||
* config/aarch64/aarch64.c (offset_9bit_signed_unscaled_p): Renamed
|
||||
to aarch64_offset_9bit_signed_unscaled_p.
|
||||
* config/aarch64/atomics.md (atomic_store<mode>): Allow offset
|
||||
and use stlur.
|
||||
* config/aarch64/constraints.md (Ust): New constraint.
|
||||
* config/aarch64/predicates.md
|
||||
(aarch64_9bit_offset_memory_operand): New predicate.
|
||||
(aarch64_rcpc_memory_operand): New predicate.
|
||||
|
||||
2018-09-19 Eric Botcazou <ebotcazou@adacore.com>
|
||||
|
||||
PR rtl-optimization/87361
|
||||
|
|
|
@ -436,6 +436,7 @@ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
|
|||
bool aarch64_mov_operand_p (rtx, machine_mode);
|
||||
rtx aarch64_reverse_mask (machine_mode, unsigned int);
|
||||
bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
|
||||
bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64);
|
||||
char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
|
||||
char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx);
|
||||
char *aarch64_output_sve_inc_dec_immediate (const char *, rtx);
|
||||
|
|
|
@ -4452,9 +4452,9 @@ aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
|
|||
|
||||
/* Return true if OFFSET is a signed 9-bit value. */
|
||||
|
||||
static inline bool
|
||||
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
|
||||
poly_int64 offset)
|
||||
bool
|
||||
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
|
||||
poly_int64 offset)
|
||||
{
|
||||
HOST_WIDE_INT const_offset;
|
||||
return (offset.is_constant (&const_offset)
|
||||
|
@ -5721,7 +5721,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
|
|||
instruction memory accesses. */
|
||||
if (mode == TImode || mode == TFmode)
|
||||
return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
|
||||
&& (offset_9bit_signed_unscaled_p (mode, offset)
|
||||
&& (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
|
||||
|| offset_12bit_unsigned_scaled_p (mode, offset)));
|
||||
|
||||
/* A 7bit offset check because OImode will emit a ldp/stp
|
||||
|
@ -5735,7 +5735,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
|
|||
ldr/str instructions (only big endian will get here). */
|
||||
if (mode == CImode)
|
||||
return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
|
||||
&& (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
|
||||
&& (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
|
||||
offset + 32)
|
||||
|| offset_12bit_unsigned_scaled_p (V16QImode,
|
||||
offset + 32)));
|
||||
|
||||
|
@ -5775,7 +5776,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
|
|||
|| known_eq (GET_MODE_SIZE (mode), 16))
|
||||
&& aarch64_offset_7bit_signed_scaled_p (mode, offset));
|
||||
else
|
||||
return (offset_9bit_signed_unscaled_p (mode, offset)
|
||||
return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
|
||||
|| offset_12bit_unsigned_scaled_p (mode, offset));
|
||||
}
|
||||
|
||||
|
@ -5828,7 +5829,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
|
|||
*/
|
||||
if (mode == TImode || mode == TFmode)
|
||||
return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
|
||||
&& offset_9bit_signed_unscaled_p (mode, offset));
|
||||
&& aarch64_offset_9bit_signed_unscaled_p (mode, offset));
|
||||
|
||||
if (load_store_pair_p)
|
||||
return ((known_eq (GET_MODE_SIZE (mode), 4)
|
||||
|
@ -5836,7 +5837,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
|
|||
|| known_eq (GET_MODE_SIZE (mode), 16))
|
||||
&& aarch64_offset_7bit_signed_scaled_p (mode, offset));
|
||||
else
|
||||
return offset_9bit_signed_unscaled_p (mode, offset);
|
||||
return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
|
||||
}
|
||||
return false;
|
||||
|
||||
|
|
|
@ -157,9 +157,10 @@ extern unsigned aarch64_architecture_version;
|
|||
#define AARCH64_FL_SM4 (1 << 17) /* Has ARMv8.4-A SM3 and SM4. */
|
||||
#define AARCH64_FL_SHA3 (1 << 18) /* Has ARMv8.4-a SHA3 and SHA512. */
|
||||
#define AARCH64_FL_F16FML (1 << 19) /* Has ARMv8.4-a FP16 extensions. */
|
||||
#define AARCH64_FL_RCPC8_4 (1 << 20) /* Has ARMv8.4-a RCPC extensions. */
|
||||
|
||||
/* Statistical Profiling extensions. */
|
||||
#define AARCH64_FL_PROFILE (1 << 20)
|
||||
#define AARCH64_FL_PROFILE (1 << 21)
|
||||
|
||||
/* Has FP and SIMD. */
|
||||
#define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD)
|
||||
|
@ -178,7 +179,7 @@ extern unsigned aarch64_architecture_version;
|
|||
(AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
|
||||
#define AARCH64_FL_FOR_ARCH8_4 \
|
||||
(AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML \
|
||||
| AARCH64_FL_DOTPROD)
|
||||
| AARCH64_FL_DOTPROD | AARCH64_FL_RCPC8_4)
|
||||
|
||||
/* Macros to test ISA flags. */
|
||||
|
||||
|
@ -199,6 +200,7 @@ extern unsigned aarch64_architecture_version;
|
|||
#define AARCH64_ISA_SM4 (aarch64_isa_flags & AARCH64_FL_SM4)
|
||||
#define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3)
|
||||
#define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML)
|
||||
#define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4)
|
||||
|
||||
/* Crypto is an optional extension to AdvSIMD. */
|
||||
#define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
|
||||
|
|
|
@ -263,7 +263,7 @@
|
|||
;; alternative). This attribute is used to compute attribute "enabled", use type
|
||||
;; "any" to enable an alternative in all cases.
|
||||
|
||||
(define_enum "arches" [ any fp simd sve fp16])
|
||||
(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
|
||||
|
||||
(define_enum_attr "arch" "arches" (const_string "any"))
|
||||
|
||||
|
@ -285,6 +285,9 @@
|
|||
(ior
|
||||
(eq_attr "arch" "any")
|
||||
|
||||
(and (eq_attr "arch" "rcpc8_4")
|
||||
(match_test "AARCH64_ISA_RCPC8_4"))
|
||||
|
||||
(and (eq_attr "arch" "fp")
|
||||
(match_test "TARGET_FLOAT"))
|
||||
|
||||
|
|
|
@ -481,9 +481,9 @@
|
|||
)
|
||||
|
||||
(define_insn "atomic_store<mode>"
|
||||
[(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q")
|
||||
[(set (match_operand:ALLI 0 "aarch64_rcpc_memory_operand" "=Q,Ust")
|
||||
(unspec_volatile:ALLI
|
||||
[(match_operand:ALLI 1 "general_operand" "rZ")
|
||||
[(match_operand:ALLI 1 "general_operand" "rZ,rZ")
|
||||
(match_operand:SI 2 "const_int_operand")] ;; model
|
||||
UNSPECV_STL))]
|
||||
""
|
||||
|
@ -491,9 +491,12 @@
|
|||
enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
|
||||
if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
|
||||
return "str<atomic_sfx>\t%<w>1, %0";
|
||||
else
|
||||
else if (which_alternative == 0)
|
||||
return "stlr<atomic_sfx>\t%<w>1, %0";
|
||||
else
|
||||
return "stlur<atomic_sfx>\t%<w>1, %0";
|
||||
}
|
||||
[(set_attr "arch" "*,rcpc8_4")]
|
||||
)
|
||||
|
||||
(define_insn "@aarch64_load_exclusive<mode>"
|
||||
|
|
|
@ -225,6 +225,11 @@
|
|||
(and (match_code "mem")
|
||||
(match_test "REG_P (XEXP (op, 0))")))
|
||||
|
||||
(define_memory_constraint "Ust"
|
||||
"@internal
|
||||
A memory address with 9bit unscaled offset."
|
||||
(match_operand 0 "aarch64_9bit_offset_memory_operand"))
|
||||
|
||||
(define_memory_constraint "Ump"
|
||||
"@internal
|
||||
A memory address suitable for a load/store pair operation."
|
||||
|
|
|
@ -359,6 +359,36 @@
|
|||
(and (match_operand 0 "memory_operand")
|
||||
(match_code "reg" "0")))
|
||||
|
||||
(define_predicate "aarch64_9bit_offset_memory_operand"
|
||||
(and (match_operand 0 "memory_operand")
|
||||
(ior (match_code "reg" "0")
|
||||
(and (match_code "plus" "0")
|
||||
(match_code "reg" "00")
|
||||
(match_code "const_int" "01"))))
|
||||
{
|
||||
rtx mem_op = XEXP (op, 0);
|
||||
|
||||
if (REG_P (mem_op))
|
||||
return GET_MODE (mem_op) == DImode;
|
||||
|
||||
rtx plus_op0 = XEXP (mem_op, 0);
|
||||
rtx plus_op1 = XEXP (mem_op, 1);
|
||||
|
||||
if (GET_MODE (plus_op0) != DImode)
|
||||
return false;
|
||||
|
||||
poly_int64 offset;
|
||||
if (!poly_int_rtx_p (plus_op1, &offset))
|
||||
gcc_unreachable ();
|
||||
|
||||
return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
|
||||
})
|
||||
|
||||
(define_predicate "aarch64_rcpc_memory_operand"
|
||||
(if_then_else (match_test "AARCH64_ISA_RCPC8_4")
|
||||
(match_operand 0 "aarch64_9bit_offset_memory_operand")
|
||||
(match_operand 0 "aarch64_sync_memory_operand")))
|
||||
|
||||
;; Predicates for parallel expanders based on mode.
|
||||
(define_special_predicate "vect_par_cnst_hi_half"
|
||||
(match_code "parallel")
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
2018-09-19 Matthew Malcomson <matthew.malcomson@arm.com>
|
||||
|
||||
* gcc.target/aarch64/atomic-store.c: New.
|
||||
|
||||
2018-09-19 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR tree-optimization/87349
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-march=armv8.4-a -O2" } */
|
||||
|
||||
#include <stdatomic.h>
|
||||
|
||||
typedef __INT8_TYPE__ int8_t;
|
||||
typedef __INT16_TYPE__ int16_t;
|
||||
typedef __INT32_TYPE__ int32_t;
|
||||
typedef __INT64_TYPE__ int64_t;
|
||||
|
||||
#define STORE_TESTS(size) \
|
||||
void \
|
||||
foo##size (int##size##_t *atomic_vals) \
|
||||
{ \
|
||||
atomic_store_explicit (atomic_vals, 2, memory_order_relaxed); \
|
||||
atomic_store_explicit (atomic_vals, 2, memory_order_release); \
|
||||
atomic_store_explicit ((atomic_vals + 1), 2, memory_order_release); \
|
||||
atomic_store ((atomic_vals + 2), 2); \
|
||||
atomic_store_explicit ((atomic_vals + 3), 2, memory_order_relaxed); \
|
||||
}
|
||||
|
||||
STORE_TESTS (8);
|
||||
/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 { target { ! ilp32 } } } } */
|
||||
/* { dg-final { scan-assembler-times "stlrb\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 2 { target { ilp32 } } } } */
|
||||
/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 1\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "strb\tw\[0-9\]+, \\\[x\[0-9\]+, 3\\\]" 1 } } */
|
||||
|
||||
STORE_TESTS (16);
|
||||
/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlrh\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 2\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlurh\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "strh\tw\[0-9\]+, \\\[x\[0-9\]+, 6\\\]" 1 } } */
|
||||
|
||||
STORE_TESTS (32);
|
||||
/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlr\tw\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 4\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlur\tw\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "str\tw\[0-9\]+, \\\[x\[0-9\]+, 12\\\]" 1 } } */
|
||||
|
||||
STORE_TESTS (64);
|
||||
/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 8\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "stlur\tx\[0-9\]+, \\\[x\[0-9\]+, 16\\\]" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "str\tx\[0-9\]+, \\\[x\[0-9\]+, 24\\\]" 1 } } */
|
||||
|
||||
void
|
||||
foo_toolarge_offset (int64_t *atomic_vals)
|
||||
{
|
||||
/* 9bit signed unscaled immediate =>
|
||||
largest representable value +255.
|
||||
smallest representable value -256. */
|
||||
atomic_store_explicit (atomic_vals + 32, 2, memory_order_release);
|
||||
atomic_store_explicit (atomic_vals - 33, 2, memory_order_release);
|
||||
}
|
||||
|
||||
void
|
||||
foo_negative (int8_t *atomic_vals)
|
||||
{
|
||||
atomic_store_explicit (atomic_vals - 2, 2, memory_order_release);
|
||||
}
|
||||
/* { dg-final { scan-assembler-times "stlurb\tw\[0-9\]+, \\\[x\[0-9\]+, -2\\\]" 1 { target { ! ilp32 } } } } */
|
||||
|
||||
#pragma GCC target ("arch=armv8.3-a")
|
||||
void
|
||||
foo_older_arch (int64_t *atomic_vals)
|
||||
{
|
||||
atomic_store_explicit (atomic_vals + 2, 2, memory_order_release);
|
||||
}
|
||||
|
||||
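/* Four times: once in foo64 (release store at offset 0), twice in foo_toolarge_offset (both offsets are outside the 9-bit range) and once in foo_older_arch (no STLUR before Armv8.4-a). */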
|
||||
/* { dg-final { scan-assembler-times "stlr\tx\[0-9\]+, \\\[x\[0-9\]+\\\]" 4 } } */
|
Loading…
Reference in New Issue