[arm]: redefine aes patterns

This patch removes the arch-common aese/aesmc and aesd/aesimc fusions
(i.e. aes fusion) implemented in the scheduling phase through the
aarch_crypto_can_dual_issue function. This is because of undesired
behaviour observed in cases such as:
- register allocation going bad and introducing extra moves between the
  fused instructions
- aes operations with xor and zeroed keys among interleaved operations

A more stable version is achieved by instead performing the aes fusion
during the combine pass. New combine patterns have been added to enable
this.
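
For example (a sketch; the register numbers are illustrative), an
aese/aesmc pair written with intrinsics:

    data = vaeseq_u8 (data, key);
    data = vaesmcq_u8 (data);
    ---
    aese.8      q0, q1
    aesmc.8     q0, q0

is now matched as a single fused insn during combine, which ties the
aesmc input to the aese output and keeps the two instructions together,
without the scheduler or register allocator introducing moves in
between.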

The second change is that the aese and aesd patterns have been rewritten
to encapsulate a xor operation, which removes the need for additional
combine patterns in cases like the ones below. This works because the
first step of both AESE and AESD is an AddRoundKey, i.e. a xor of the
data with the key, so xoring the data up front and passing a zeroed key
is equivalent to passing the key directly.

For AESE (the same applies to AESD, as both include the xor operation):

    data = data ^ key;
    data = vaeseq_u8(data, zero);
    ---
    veor        q1, q0, q1
    aese.8      q1, q9

should be equivalent to, and generate the same code as:

    data = vaeseq_u8(data, key);
    ---
    aese.8       q1, q0
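
The two changes compose: a xor of the data followed by aese/aesmc with a
zeroed key now collapses into a single fused pair (again a sketch;
register numbers are illustrative):

    data = data ^ key;
    data = vaeseq_u8 (data, zero);
    data = vaesmcq_u8 (data);
    ---
    aese.8      q0, q1
    aesmc.8     q0, q0

The new tests check exactly this: aes-fuse-1.c and aes-fuse-2.c scan for
the fused patterns, and all three new tests verify that no stray veor
instructions remain.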

2019-07-09  Sylvia Taylor  <sylvia.taylor@arm.com>

	* config/arm/crypto.md:
	(crypto_<crypto_pattern>): Redefine aese/aesd pattern with xor.
	(crypto_<crypto_pattern>): Remove attribute enabled for aesmc.
	(crypto_<crypto_pattern>): Split CRYPTO_BINARY into 2 patterns.
	(*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New.
	* config/arm/arm.c
	(aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check.
	* config/arm/aarch-common-protos.h
	(aarch_crypto_can_dual_issue): Remove.
	* config/arm/aarch-common.c
	(aarch_crypto_can_dual_issue): Likewise.
	* config/arm/exynos-m1.md: Remove aese/aesmc fusion.
	* config/arm/cortex-a53.md: Likewise.
	* config/arm/cortex-a57.md: Likewise.
	* config/arm/iterators.md:
	(CRYPTO_BINARY): Redefine.
	(CRYPTO_UNARY): Removed.
	(CRYPTO_AES, CRYPTO_AESMC): New.

	* gcc.target/arm/aes-fuse-1.c: New.
	* gcc.target/arm/aes-fuse-2.c: New.
	* gcc.target/arm/aes_xor_combine.c: New.

From-SVN: r273296

gcc/ChangeLog
@@ -1,3 +1,24 @@
+2019-07-09  Sylvia Taylor  <sylvia.taylor@arm.com>
+
+	* config/arm/crypto.md:
+	(crypto_<crypto_pattern>): Redefine aese/aesd pattern with xor.
+	(crypto_<crypto_pattern>): Remove attribute enabled for aesmc.
+	(crypto_<crypto_pattern>): Split CRYPTO_BINARY into 2 patterns.
+	(*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New.
+	* config/arm/arm.c
+	(aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check.
+	* config/arm/aarch-common-protos.h
+	(aarch_crypto_can_dual_issue): Remove.
+	* config/arm/aarch-common.c
+	(aarch_crypto_can_dual_issue): Likewise.
+	* config/arm/exynos-m1.md: Remove aese/aesmc fusion.
+	* config/arm/cortex-a53.md: Likewise.
+	* config/arm/cortex-a57.md: Likewise.
+	* config/arm/iterators.md:
+	(CRYPTO_BINARY): Redefine.
+	(CRYPTO_UNARY): Removed.
+	(CRYPTO_AES, CRYPTO_AESMC): New.
+
 2019-07-09  Richard Biener  <rguenther@suse.de>
 
 	* tree-ssa-sccvn.c (struct vn_walk_cb_data): Add orig_ref member.

gcc/config/arm/aarch-common-protos.h
@@ -24,7 +24,6 @@
 #define GCC_AARCH_COMMON_PROTOS_H
 
 extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *);
-extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *);
 extern bool aarch_rev16_p (rtx);
 extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
 extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);

gcc/config/arm/aarch-common.c
@@ -31,46 +31,6 @@
 #include "rtl-iter.h"
 #include "memmodel.h"
 
-/* In ARMv8-A there's a general expectation that AESE/AESMC
-   and AESD/AESIMC sequences of the form:
-
-   AESE Vn, _
-   AESMC Vn, Vn
-
-   will issue both instructions in a single cycle on super-scalar
-   implementations.  This function identifies such pairs.  */
-
-int
-aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn)
-{
-  rtx producer_set, consumer_set;
-  rtx producer_src, consumer_src;
-
-  producer_set = single_set (producer_insn);
-  consumer_set = single_set (consumer_insn);
-
-  producer_src = producer_set ? SET_SRC (producer_set) : NULL;
-  consumer_src = consumer_set ? SET_SRC (consumer_set) : NULL;
-
-  if (producer_src && consumer_src
-      && GET_CODE (producer_src) == UNSPEC && GET_CODE (consumer_src) == UNSPEC
-      && ((XINT (producer_src, 1) == UNSPEC_AESE
-           && XINT (consumer_src, 1) == UNSPEC_AESMC)
-          || (XINT (producer_src, 1) == UNSPEC_AESD
-              && XINT (consumer_src, 1) == UNSPEC_AESIMC)))
-    {
-      unsigned int regno = REGNO (SET_DEST (producer_set));
-
-      /* Before reload the registers are virtual, so the destination of
-         consumer_set doesn't need to match.  */
-      return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed)
-             && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
-    }
-
-  return 0;
-}
-
 /* Return TRUE if X is either an arithmetic shift left, or
    is a multiplication by a power of two.  */
 bool

gcc/config/arm/arm.c
@@ -30606,10 +30606,6 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
   if (!arm_macro_fusion_p ())
     return false;
 
-  if (current_tune->fusible_ops & tune_params::FUSE_AES_AESMC
-      && aarch_crypto_can_dual_issue (prev, curr))
-    return true;
-
   if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT
       && arm_sets_movw_movt_fusible_p (prev_set, curr_set))
     return true;

gcc/config/arm/cortex-a57.md
@@ -801,9 +801,3 @@
 ;; help.
 (define_bypass 1 "cortex_a57_*"
                "cortex_a57_call,cortex_a57_branch")
-
-;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency
-(define_bypass 0 "cortex_a57_crypto_simple"
-               "cortex_a57_crypto_simple"
-               "aarch_crypto_can_dual_issue")
-

gcc/config/arm/crypto.md
@@ -19,33 +19,76 @@
 ;; <http://www.gnu.org/licenses/>.
 
 
-;; When AES/AESMC fusion is enabled we want the register allocation to
-;; look like:
-;;    AESE Vn, _
-;;    AESMC Vn, Vn
-;; So prefer to tie operand 1 to operand 0 when fusing.
-
 (define_insn "crypto_<crypto_pattern>"
-  [(set (match_operand:<crypto_mode> 0 "register_operand" "=w,w")
-        (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1
-                       "register_operand" "0,w")]
-         CRYPTO_UNARY))]
+  [(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
+        (unspec:<crypto_mode>
+                [(match_operand:<crypto_mode> 1 "register_operand" "w")]
+         CRYPTO_AESMC))]
   "TARGET_CRYPTO"
   "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q1"
-  [(set_attr "type" "<crypto_type>")
-   (set_attr_alternative "enabled"
-     [(if_then_else (match_test
-                   "arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)")
-                     (const_string "yes" )
-                     (const_string "no"))
-      (const_string "yes")])]
+  [(set_attr "type" "<crypto_type>")]
 )
+
+(define_insn "crypto_<crypto_pattern>"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+        (unspec:V16QI
+                [(xor:V16QI
+                     (match_operand:V16QI 1 "register_operand" "%0")
+                     (match_operand:V16QI 2 "register_operand" "w"))]
+         CRYPTO_AES))]
+  "TARGET_CRYPTO"
+  "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2"
+  [(set_attr "type" "<crypto_type>")]
+)
+
+;; When AESE/AESMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves inbetween.
+;; Mash the two together during combine.
+
+(define_insn "*aarch32_crypto_aese_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+        (unspec:V16QI
+                [(unspec:V16QI
+                    [(xor:V16QI
+                        (match_operand:V16QI 1 "register_operand" "%0")
+                        (match_operand:V16QI 2 "register_operand" "w"))]
+                UNSPEC_AESE)]
+        UNSPEC_AESMC))]
+  "TARGET_CRYPTO
+   && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)"
+  "aese.8\\t%q0, %q2\;aesmc.8\\t%q0, %q0"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
+)
+
+;; When AESD/AESIMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves inbetween.
+;; Mash the two together during combine.
+
+(define_insn "*aarch32_crypto_aesd_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+        (unspec:V16QI
+                [(unspec:V16QI
+                    [(xor:V16QI
+                        (match_operand:V16QI 1 "register_operand" "%0")
+                        (match_operand:V16QI 2 "register_operand" "w"))]
+                UNSPEC_AESD)]
+        UNSPEC_AESIMC))]
+  "TARGET_CRYPTO
+   && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)"
+  "aesd.8\\t%q0, %q2\;aesimc.8\\t%q0, %q0"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
+)
 
 (define_insn "crypto_<crypto_pattern>"
   [(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
-        (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1 "register_operand" "0")
-                      (match_operand:<crypto_mode> 2 "register_operand" "w")]
-         CRYPTO_BINARY))]
+        (unspec:<crypto_mode>
+                [(match_operand:<crypto_mode> 1 "register_operand" "0")
+                 (match_operand:<crypto_mode> 2 "register_operand" "w")]
+         CRYPTO_BINARY))]
   "TARGET_CRYPTO"
   "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2"
   [(set_attr "type" "<crypto_type>")]

gcc/config/arm/exynos-m1.md
@@ -950,11 +950,6 @@
                       "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
                        exynos_m1_crypto_poly*")
 
-;; AES{D,E}/AESMC pairs can feed each other instantly.
-(define_bypass 0 "exynos_m1_crypto_simple"
-                 "exynos_m1_crypto_simple"
-                 "aarch_crypto_can_dual_issue")
-
 ;; Predicted branches take no time, but mispredicted ones take forever anyway.
 (define_bypass 1 "exynos_m1_*"
                  "exynos_m1_call, exynos_m1_branch")

gcc/config/arm/iterators.md
@@ -413,10 +413,11 @@
 (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W
                           UNSPEC_CRC32CB UNSPEC_CRC32CH UNSPEC_CRC32CW])
 
-(define_int_iterator CRYPTO_UNARY [UNSPEC_AESMC UNSPEC_AESIMC])
+(define_int_iterator CRYPTO_AESMC [UNSPEC_AESMC UNSPEC_AESIMC])
 
-(define_int_iterator CRYPTO_BINARY [UNSPEC_AESD UNSPEC_AESE
-                                    UNSPEC_SHA1SU1 UNSPEC_SHA256SU0])
+(define_int_iterator CRYPTO_AES [UNSPEC_AESD UNSPEC_AESE])
+
+(define_int_iterator CRYPTO_BINARY [UNSPEC_SHA1SU1 UNSPEC_SHA256SU0])
 
 (define_int_iterator CRYPTO_TERNARY [UNSPEC_SHA1SU0 UNSPEC_SHA256H
                                      UNSPEC_SHA256H2 UNSPEC_SHA256SU1])

gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2019-07-09  Sylvia Taylor  <sylvia.taylor@arm.com>
+
+	* gcc.target/arm/aes-fuse-1.c: New.
+	* gcc.target/arm/aes-fuse-2.c: New.
+	* gcc.target/arm/aes_xor_combine.c: New.
+
 2019-07-09  Martin Liska  <mliska@suse.cz>
 
 	* gcc.dg/predict-17.c: Test loop optimizer assumption

gcc/testsuite/gcc.target/arm/aes-fuse-1.c
@@ -0,0 +1,66 @@
/* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */
/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */

#include <arm_neon.h>

#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
#define AESMC(r, i) (r = vaesmcq_u8 (i))

const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

uint8x16_t dummy;
uint8x16_t a;
uint8x16_t b;
uint8x16_t c;
uint8x16_t d;
uint8x16_t x;
uint8x16_t y;
uint8x16_t k;

void foo (void)
{
  AESE (a, a, k);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESE (b, b, k);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESE (c, c, k);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESE (d, d, k);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);

  x = x ^ k;
  AESE (x, x, zero);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  y = y ^ k;
  AESE (y, y, zero);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);

  AESMC (d, d);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESMC (c, c);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESMC (b, b);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESMC (a, a);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESMC (y, y);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESMC (x, x);
}

/* { dg-final { scan-assembler-times "crypto_aese_fused" 6 } } */
/* { dg-final { scan-assembler-not "veor" } } */

gcc/testsuite/gcc.target/arm/aes-fuse-2.c
@@ -0,0 +1,66 @@
/* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */
/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */

#include <arm_neon.h>

#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key)));
#define AESIMC(r, i) (r = vaesimcq_u8 (i))

const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

uint8x16_t dummy;
uint8x16_t a;
uint8x16_t b;
uint8x16_t c;
uint8x16_t d;
uint8x16_t x;
uint8x16_t y;
uint8x16_t k;

void foo (void)
{
  AESD (a, a, k);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESD (b, b, k);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESD (c, c, k);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESD (d, d, k);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);

  x = x ^ k;
  AESD (x, x, zero);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  y = y ^ k;
  AESD (y, y, zero);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);

  AESIMC (d, d);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESIMC (c, c);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESIMC (b, b);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESIMC (a, a);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESIMC (y, y);
  dummy = vaddq_u8 (dummy, dummy);
  dummy = vaddq_u8 (dummy, dummy);
  AESIMC (x, x);
}

/* { dg-final { scan-assembler-times "crypto_aesd_fused" 6 } } */
/* { dg-final { scan-assembler-not "veor" } } */

gcc/testsuite/gcc.target/arm/aes_xor_combine.c
@@ -0,0 +1,43 @@
/* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */
/* { dg-additional-options "-O3" } */

#include <arm_neon.h>

#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key)));

const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

uint8x16_t foo_key_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar)
{
  dummy = dummy ^ foo;
  AESE(dummy, dummy, zero);
  dummy = dummy ^ bar;
  AESE(dummy, dummy, zero);

  dummy = dummy ^ foo;
  AESD(dummy, dummy, zero);
  dummy = dummy ^ bar;
  AESD(dummy, dummy, zero);

  return dummy;
}

uint8x16_t foo_data_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar)
{
  dummy = dummy ^ foo;
  AESE(dummy, zero, dummy);
  dummy = dummy ^ bar;
  AESE(dummy, zero, dummy);

  dummy = dummy ^ foo;
  AESD(dummy, zero, dummy);
  dummy = dummy ^ bar;
  AESD(dummy, zero, dummy);

  return dummy;
}

/* { dg-final { scan-assembler-not "veor" } } */