[arm]: redefine aes patterns
This patch removes the arch-common aese/aesmc and aesd/aesimc fusions (i.e. aes fusion) implemented in the scheduling phase through the aarch_crypto_can_dual function. The reason is due to observing undesired behaviour in cases such as: - when register allocation goes bad (e.g. extra movs) - aes operations with xor and zeroed keys among interleaved operations A more stable version should be provided by instead doing the aes fusion during the combine pass. As such, new combine patterns have been added to enable this. The second change is the aese and aesd patterns have been rewritten as encapsulating a xor operation. The purpose is to simplify the need of having additional combine patterns for cases like the ones below: For AESE (though it also applies to AESD as both have a xor operation): data = data ^ key; data = vaeseq_u8(data, zero); --- veor q1, q0, q1 aese.8 q1, q9 Should mean and generate the same as: data = vaeseq_u8(data, key); --- aese.8 q1, q0 2019-07-09 Sylvia Taylor <sylvia.taylor@arm.com> * config/arm/crypto.md: (crypto_<crypto_pattern>): Redefine aese/aesd pattern with xor. (crypto_<crypto_pattern>): Remove attribute enabled for aesmc. (crypto_<crypto_pattern>): Split CRYPTO_BINARY into 2 patterns. (*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New. * config/arm/arm.c (aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check. * config/arm/aarch-common-protos.h (aarch_crypto_can_dual_issue): Remove. * config/arm/aarch-common.c (aarch_crypto_can_dual_issue): Likewise. * config/arm/exynos-m1.md: Remove aese/aesmc fusion. * config/arm/cortex-a53.md: Likewise. * config/arm/cortex-a57.md: Likewise. * config/arm/iterators.md: (CRYPTO_BINARY): Redefine. (CRYPTO_UNARY): Removed. (CRYPTO_AES, CRYPTO_AESMC): New. * gcc.target/arm/aes-fuse-1.c: New. * gcc.target/arm/aes-fuse-2.c: New. * gcc.target/arm/aes_xor_combine.c: New. From-SVN: r273296
This commit is contained in:
parent
b52db15861
commit
4c12dc0556
@ -1,3 +1,24 @@
|
||||
2019-07-09 Sylvia Taylor <sylvia.taylor@arm.com>
|
||||
|
||||
* config/arm/crypto.md:
|
||||
(crypto_<crypto_pattern>): Redefine aese/aesd pattern with xor.
|
||||
(crypto_<crypto_pattern>): Remove attribute enabled for aesmc.
|
||||
(crypto_<crypto_pattern>): Split CRYPTO_BINARY into 2 patterns.
|
||||
(*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New.
|
||||
* config/arm/arm.c
|
||||
(aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check.
|
||||
* config/arm/aarch-common-protos.h
|
||||
(aarch_crypto_can_dual_issue): Remove.
|
||||
* config/arm/aarch-common.c
|
||||
(aarch_crypto_can_dual_issue): Likewise.
|
||||
* config/arm/exynos-m1.md: Remove aese/aesmc fusion.
|
||||
* config/arm/cortex-a53.md: Likewise.
|
||||
* config/arm/cortex-a57.md: Likewise.
|
||||
* config/arm/iterators.md:
|
||||
(CRYPTO_BINARY): Redefine.
|
||||
(CRYPTO_UNARY): Removed.
|
||||
(CRYPTO_AES, CRYPTO_AESMC): New.
|
||||
|
||||
2019-07-09 Richard Biener <rguenther@suse.de>
|
||||
|
||||
* tree-ssa-sccvn.c (struct vn_walk_cb_data): Add orig_ref member.
|
||||
|
@ -24,7 +24,6 @@
|
||||
#define GCC_AARCH_COMMON_PROTOS_H
|
||||
|
||||
extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *);
|
||||
extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *);
|
||||
extern bool aarch_rev16_p (rtx);
|
||||
extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
|
||||
extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
|
||||
|
@ -31,46 +31,6 @@
|
||||
#include "rtl-iter.h"
|
||||
#include "memmodel.h"
|
||||
|
||||
/* In ARMv8-A there's a general expectation that AESE/AESMC
|
||||
and AESD/AESIMC sequences of the form:
|
||||
|
||||
AESE Vn, _
|
||||
AESMC Vn, Vn
|
||||
|
||||
will issue both instructions in a single cycle on super-scalar
|
||||
implementations. This function identifies such pairs. */
|
||||
|
||||
int
|
||||
aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn)
|
||||
{
|
||||
rtx producer_set, consumer_set;
|
||||
rtx producer_src, consumer_src;
|
||||
|
||||
producer_set = single_set (producer_insn);
|
||||
consumer_set = single_set (consumer_insn);
|
||||
|
||||
producer_src = producer_set ? SET_SRC (producer_set) : NULL;
|
||||
consumer_src = consumer_set ? SET_SRC (consumer_set) : NULL;
|
||||
|
||||
if (producer_src && consumer_src
|
||||
&& GET_CODE (producer_src) == UNSPEC && GET_CODE (consumer_src) == UNSPEC
|
||||
&& ((XINT (producer_src, 1) == UNSPEC_AESE
|
||||
&& XINT (consumer_src, 1) == UNSPEC_AESMC)
|
||||
|| (XINT (producer_src, 1) == UNSPEC_AESD
|
||||
&& XINT (consumer_src, 1) == UNSPEC_AESIMC)))
|
||||
{
|
||||
unsigned int regno = REGNO (SET_DEST (producer_set));
|
||||
|
||||
/* Before reload the registers are virtual, so the destination of
|
||||
consumer_set doesn't need to match. */
|
||||
|
||||
return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed)
|
||||
&& REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Return TRUE if X is either an arithmetic shift left, or
|
||||
is a multiplication by a power of two. */
|
||||
bool
|
||||
|
@ -30606,10 +30606,6 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
|
||||
if (!arm_macro_fusion_p ())
|
||||
return false;
|
||||
|
||||
if (current_tune->fusible_ops & tune_params::FUSE_AES_AESMC
|
||||
&& aarch_crypto_can_dual_issue (prev, curr))
|
||||
return true;
|
||||
|
||||
if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT
|
||||
&& arm_sets_movw_movt_fusible_p (prev_set, curr_set))
|
||||
return true;
|
||||
|
@ -801,9 +801,3 @@
|
||||
;; help.
|
||||
(define_bypass 1 "cortex_a57_*"
|
||||
"cortex_a57_call,cortex_a57_branch")
|
||||
|
||||
;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency
|
||||
(define_bypass 0 "cortex_a57_crypto_simple"
|
||||
"cortex_a57_crypto_simple"
|
||||
"aarch_crypto_can_dual_issue")
|
||||
|
||||
|
@ -19,33 +19,76 @@
|
||||
;; <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
;; When AES/AESMC fusion is enabled we want the register allocation to
|
||||
;; look like:
|
||||
;; AESE Vn, _
|
||||
;; AESMC Vn, Vn
|
||||
;; So prefer to tie operand 1 to operand 0 when fusing.
|
||||
|
||||
(define_insn "crypto_<crypto_pattern>"
|
||||
[(set (match_operand:<crypto_mode> 0 "register_operand" "=w,w")
|
||||
(unspec:<crypto_mode> [(match_operand:<crypto_mode> 1
|
||||
"register_operand" "0,w")]
|
||||
CRYPTO_UNARY))]
|
||||
[(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
|
||||
(unspec:<crypto_mode>
|
||||
[(match_operand:<crypto_mode> 1 "register_operand" "w")]
|
||||
CRYPTO_AESMC))]
|
||||
"TARGET_CRYPTO"
|
||||
"<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q1"
|
||||
[(set_attr "type" "<crypto_type>")
|
||||
(set_attr_alternative "enabled"
|
||||
[(if_then_else (match_test
|
||||
"arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)")
|
||||
(const_string "yes" )
|
||||
(const_string "no"))
|
||||
(const_string "yes")])]
|
||||
[(set_attr "type" "<crypto_type>")]
|
||||
)
|
||||
|
||||
(define_insn "crypto_<crypto_pattern>"
|
||||
[(set (match_operand:V16QI 0 "register_operand" "=w")
|
||||
(unspec:V16QI
|
||||
[(xor:V16QI
|
||||
(match_operand:V16QI 1 "register_operand" "%0")
|
||||
(match_operand:V16QI 2 "register_operand" "w"))]
|
||||
CRYPTO_AES))]
|
||||
"TARGET_CRYPTO"
|
||||
"<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2"
|
||||
[(set_attr "type" "<crypto_type>")]
|
||||
)
|
||||
|
||||
;; When AESE/AESMC fusion is enabled we really want to keep the two together
|
||||
;; and enforce the register dependency without scheduling or register
|
||||
;; allocation messing up the order or introducing moves inbetween.
|
||||
;; Mash the two together during combine.
|
||||
|
||||
(define_insn "*aarch32_crypto_aese_fused"
|
||||
[(set (match_operand:V16QI 0 "register_operand" "=w")
|
||||
(unspec:V16QI
|
||||
[(unspec:V16QI
|
||||
[(xor:V16QI
|
||||
(match_operand:V16QI 1 "register_operand" "%0")
|
||||
(match_operand:V16QI 2 "register_operand" "w"))]
|
||||
UNSPEC_AESE)]
|
||||
UNSPEC_AESMC))]
|
||||
"TARGET_CRYPTO
|
||||
&& arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)"
|
||||
"aese.8\\t%q0, %q2\;aesmc.8\\t%q0, %q0"
|
||||
[(set_attr "type" "crypto_aese")
|
||||
(set_attr "length" "8")]
|
||||
)
|
||||
|
||||
;; When AESD/AESIMC fusion is enabled we really want to keep the two together
|
||||
;; and enforce the register dependency without scheduling or register
|
||||
;; allocation messing up the order or introducing moves inbetween.
|
||||
;; Mash the two together during combine.
|
||||
|
||||
(define_insn "*aarch32_crypto_aesd_fused"
|
||||
[(set (match_operand:V16QI 0 "register_operand" "=w")
|
||||
(unspec:V16QI
|
||||
[(unspec:V16QI
|
||||
[(xor:V16QI
|
||||
(match_operand:V16QI 1 "register_operand" "%0")
|
||||
(match_operand:V16QI 2 "register_operand" "w"))]
|
||||
UNSPEC_AESD)]
|
||||
UNSPEC_AESIMC))]
|
||||
"TARGET_CRYPTO
|
||||
&& arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)"
|
||||
"aesd.8\\t%q0, %q2\;aesimc.8\\t%q0, %q0"
|
||||
[(set_attr "type" "crypto_aese")
|
||||
(set_attr "length" "8")]
|
||||
)
|
||||
|
||||
(define_insn "crypto_<crypto_pattern>"
|
||||
[(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
|
||||
(unspec:<crypto_mode> [(match_operand:<crypto_mode> 1 "register_operand" "0")
|
||||
(match_operand:<crypto_mode> 2 "register_operand" "w")]
|
||||
CRYPTO_BINARY))]
|
||||
(unspec:<crypto_mode>
|
||||
[(match_operand:<crypto_mode> 1 "register_operand" "0")
|
||||
(match_operand:<crypto_mode> 2 "register_operand" "w")]
|
||||
CRYPTO_BINARY))]
|
||||
"TARGET_CRYPTO"
|
||||
"<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2"
|
||||
[(set_attr "type" "<crypto_type>")]
|
||||
|
@ -950,11 +950,6 @@
|
||||
"exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
|
||||
exynos_m1_crypto_poly*")
|
||||
|
||||
;; AES{D,E}/AESMC pairs can feed each other instantly.
|
||||
(define_bypass 0 "exynos_m1_crypto_simple"
|
||||
"exynos_m1_crypto_simple"
|
||||
"aarch_crypto_can_dual_issue")
|
||||
|
||||
;; Predicted branches take no time, but mispredicted ones take forever anyway.
|
||||
(define_bypass 1 "exynos_m1_*"
|
||||
"exynos_m1_call, exynos_m1_branch")
|
||||
|
@ -413,10 +413,11 @@
|
||||
(define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W
|
||||
UNSPEC_CRC32CB UNSPEC_CRC32CH UNSPEC_CRC32CW])
|
||||
|
||||
(define_int_iterator CRYPTO_UNARY [UNSPEC_AESMC UNSPEC_AESIMC])
|
||||
(define_int_iterator CRYPTO_AESMC [UNSPEC_AESMC UNSPEC_AESIMC])
|
||||
|
||||
(define_int_iterator CRYPTO_BINARY [UNSPEC_AESD UNSPEC_AESE
|
||||
UNSPEC_SHA1SU1 UNSPEC_SHA256SU0])
|
||||
(define_int_iterator CRYPTO_AES [UNSPEC_AESD UNSPEC_AESE])
|
||||
|
||||
(define_int_iterator CRYPTO_BINARY [UNSPEC_SHA1SU1 UNSPEC_SHA256SU0])
|
||||
|
||||
(define_int_iterator CRYPTO_TERNARY [UNSPEC_SHA1SU0 UNSPEC_SHA256H
|
||||
UNSPEC_SHA256H2 UNSPEC_SHA256SU1])
|
||||
|
@ -1,3 +1,9 @@
|
||||
2019-07-09 Sylvia Taylor <sylvia.taylor@arm.com>
|
||||
|
||||
* gcc.target/arm/aes-fuse-1.c: New.
|
||||
* gcc.target/arm/aes-fuse-2.c: New.
|
||||
* gcc.target/arm/aes_xor_combine.c: New.
|
||||
|
||||
2019-07-09 Martin Liska <mliska@suse.cz>
|
||||
|
||||
* gcc.dg/predict-17.c: Test loop optimizer assumption
|
||||
|
66
gcc/testsuite/gcc.target/arm/aes-fuse-1.c
Normal file
66
gcc/testsuite/gcc.target/arm/aes-fuse-1.c
Normal file
@ -0,0 +1,66 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target arm_crypto_ok } */
|
||||
/* { dg-add-options arm_crypto } */
|
||||
/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
|
||||
#define AESMC(r, i) (r = vaesmcq_u8 (i))
|
||||
|
||||
const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
uint8x16_t dummy;
|
||||
uint8x16_t a;
|
||||
uint8x16_t b;
|
||||
uint8x16_t c;
|
||||
uint8x16_t d;
|
||||
uint8x16_t x;
|
||||
uint8x16_t y;
|
||||
uint8x16_t k;
|
||||
|
||||
void foo (void)
|
||||
{
|
||||
AESE (a, a, k);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESE (b, b, k);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESE (c, c, k);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESE (d, d, k);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
|
||||
x = x ^ k;
|
||||
AESE (x, x, zero);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
y = y ^ k;
|
||||
AESE (y, y, zero);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
|
||||
AESMC (d, d);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESMC (c, c);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESMC (b, b);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESMC (a, a);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
|
||||
AESMC (y, y);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESMC (x, x);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "crypto_aese_fused" 6 } } */
|
||||
/* { dg-final { scan-assembler-not "veor" } } */
|
66
gcc/testsuite/gcc.target/arm/aes-fuse-2.c
Normal file
66
gcc/testsuite/gcc.target/arm/aes-fuse-2.c
Normal file
@ -0,0 +1,66 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target arm_crypto_ok } */
|
||||
/* { dg-add-options arm_crypto } */
|
||||
/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key)));
|
||||
#define AESIMC(r, i) (r = vaesimcq_u8 (i))
|
||||
|
||||
const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
uint8x16_t dummy;
|
||||
uint8x16_t a;
|
||||
uint8x16_t b;
|
||||
uint8x16_t c;
|
||||
uint8x16_t d;
|
||||
uint8x16_t x;
|
||||
uint8x16_t y;
|
||||
uint8x16_t k;
|
||||
|
||||
void foo (void)
|
||||
{
|
||||
AESD (a, a, k);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESD (b, b, k);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESD (c, c, k);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESD (d, d, k);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
|
||||
x = x ^ k;
|
||||
AESD (x, x, zero);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
y = y ^ k;
|
||||
AESD (y, y, zero);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
|
||||
AESIMC (d, d);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESIMC (c, c);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESIMC (b, b);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESIMC (a, a);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
|
||||
AESIMC (y, y);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
dummy = vaddq_u8 (dummy, dummy);
|
||||
AESIMC (x, x);
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "crypto_aesd_fused" 6 } } */
|
||||
/* { dg-final { scan-assembler-not "veor" } } */
|
43
gcc/testsuite/gcc.target/arm/aes_xor_combine.c
Normal file
43
gcc/testsuite/gcc.target/arm/aes_xor_combine.c
Normal file
@ -0,0 +1,43 @@
|
||||
/* { dg-do compile } */
|
||||
/* { dg-require-effective-target arm_crypto_ok } */
|
||||
/* { dg-add-options arm_crypto } */
|
||||
/* { dg-additional-options "-O3" } */
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
|
||||
#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key)));
|
||||
|
||||
const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
uint8x16_t foo_key_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar)
|
||||
{
|
||||
dummy = dummy ^ foo;
|
||||
AESE(dummy, dummy, zero);
|
||||
dummy = dummy ^ bar;
|
||||
AESE(dummy, dummy, zero);
|
||||
|
||||
dummy = dummy ^ foo;
|
||||
AESD(dummy, dummy, zero);
|
||||
dummy = dummy ^ bar;
|
||||
AESD(dummy, dummy, zero);
|
||||
|
||||
return dummy;
|
||||
}
|
||||
|
||||
uint8x16_t foo_data_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar)
|
||||
{
|
||||
dummy = dummy ^ foo;
|
||||
AESE(dummy, zero, dummy);
|
||||
dummy = dummy ^ bar;
|
||||
AESE(dummy, zero, dummy);
|
||||
|
||||
dummy = dummy ^ foo;
|
||||
AESD(dummy, zero, dummy);
|
||||
dummy = dummy ^ bar;
|
||||
AESD(dummy, zero, dummy);
|
||||
|
||||
return dummy;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-not "veor" } } */
|
Loading…
Reference in New Issue
Block a user