crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR

This implements 5-way interleaving for ECB, CBC decryption and CTR,
resulting in a speedup of ~11% on Marvell ThunderX2, which has a
very deep pipeline and therefore a high issue latency for NEON
instructions operating on the same registers.
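
The gain comes from issuing the per-round AES instructions of several
independent blocks back to back, so the result latency of one block's
aese/aesmc pair is hidden by the same pair operating on the next block
instead of by a stall. As a rough sketch, one interleaved round over
five blocks looks like the following (the register numbers are
illustrative only; the actual kernel routines generate these sequences
from assembler macros):

	/* one AES round applied to five independent blocks in v0-v4;
	   v20 is assumed to hold the current round key */
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b
	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b
	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b
	aese	v4.16b, v20.16b
	aesmc	v4.16b, v4.16b

Each aese/aesmc pair is kept adjacent so cores that fuse that sequence
still can, while the independence between the five blocks gives the
issue logic useful work during each pair's latency.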

Note that XTS is left alone: implementing 5-way interleave there
would either involve spilling of the calculated tweaks to the
stack, or recalculating them after the encryption operation, and
doing either of those would most likely penalize low end cores.

For ECB, this is not a concern at all, given that we have plenty
of spare registers. For CTR and CBC decryption, we take advantage
of the fact that v16 is not used by the CE version of the code
(which is the only one targeted by the optimization), and so we
can reshuffle the code a bit and avoid having to spill to memory
(with the exception of one extra reload in the CBC routine).
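
Concretely, the shared mode routines now refer to the CBC IV and the
CTR block through symbolic register aliases instead of hard-coded
registers, and each flavour of the code binds those aliases to whatever
register it can spare. A condensed sketch, pieced together from the
hunks below:

	/* CE flavour: v16 is otherwise unused, so both aliases can
	   live there (the CBC and CTR routines never overlap) */
	cbciv	.req	v16
	vctr	.req	v16

	/* NEON flavour: keep the registers that code already used */
	cbciv	.req	v7
	vctr	.req	v4

	/* the shared routines then only ever name the alias */
	eor	v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */

The NEON fallback therefore keeps its existing register assignment and
stays on the 4-way path selected by the ST4()/ST5() macros; only the CE
flavour grows to a 5-way stride.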

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 7367bfeb2c (parent e217413964)
Author: Ard Biesheuvel, 2019-06-24 19:38:31 +02:00 (committed by Herbert Xu)
3 changed files with 75 additions and 31 deletions

arch/arm64/crypto/aes-ce.S

@@ -18,6 +18,8 @@
 	.arch		armv8-a+crypto

 	xtsmask		.req	v16
+	cbciv		.req	v16
+	vctr		.req	v16

 	.macro		xts_reload_mask, tmp
 	.endm

arch/arm64/crypto/aes-modes.S

@@ -17,6 +17,14 @@
 #define MAX_STRIDE	4
 #endif

+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
+
 aes_encrypt_block4x:
 	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
 	ret
@@ -53,14 +61,17 @@ AES_ENTRY(aes_ecb_encrypt)
 	enc_prepare	w3, x2, x5

 .LecbencloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lecbenc1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
-	bl		aes_encrypt_block4x
+ST4(	bl		aes_encrypt_block4x		)
+ST5(	ld1		{v4.16b}, [x1], #16		)
+ST5(	bl		aes_encrypt_block5x		)
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LecbencloopNx
 .Lecbenc1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lecbencout
 .Lecbencloop:
 	ld1		{v0.16b}, [x1], #16		/* get next pt block */
@@ -81,14 +92,17 @@ AES_ENTRY(aes_ecb_decrypt)
 	dec_prepare	w3, x2, x5

 .LecbdecloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lecbdec1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
-	bl		aes_decrypt_block4x
+ST4(	bl		aes_decrypt_block4x		)
+ST5(	ld1		{v4.16b}, [x1], #16		)
+ST5(	bl		aes_decrypt_block5x		)
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LecbdecloopNx
 .Lecbdec1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lecbdecout
 .Lecbdecloop:
 	ld1		{v0.16b}, [x1], #16		/* get next ct block */
@@ -148,39 +162,56 @@ AES_ENTRY(aes_cbc_decrypt)
 	stp		x29, x30, [sp, #-16]!
 	mov		x29, sp

-	ld1		{v7.16b}, [x5]			/* get iv */
+	ld1		{cbciv.16b}, [x5]		/* get iv */
 	dec_prepare	w3, x2, x6

 .LcbcdecloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lcbcdec1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+#if MAX_STRIDE == 5
+	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
+	mov		v5.16b, v0.16b
+	mov		v6.16b, v1.16b
+	mov		v7.16b, v2.16b
+	bl		aes_decrypt_block5x
+	sub		x1, x1, #32
+	eor		v0.16b, v0.16b, cbciv.16b
+	eor		v1.16b, v1.16b, v5.16b
+	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
+	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
+	eor		v2.16b, v2.16b, v6.16b
+	eor		v3.16b, v3.16b, v7.16b
+	eor		v4.16b, v4.16b, v5.16b
+#else
 	mov		v4.16b, v0.16b
 	mov		v5.16b, v1.16b
 	mov		v6.16b, v2.16b
 	bl		aes_decrypt_block4x
 	sub		x1, x1, #16
-	eor		v0.16b, v0.16b, v7.16b
+	eor		v0.16b, v0.16b, cbciv.16b
 	eor		v1.16b, v1.16b, v4.16b
-	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
+	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
 	eor		v2.16b, v2.16b, v5.16b
 	eor		v3.16b, v3.16b, v6.16b
+#endif
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LcbcdecloopNx
 .Lcbcdec1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lcbcdecout
 .Lcbcdecloop:
 	ld1		{v1.16b}, [x1], #16		/* get next ct block */
 	mov		v0.16b, v1.16b			/* ...and copy to v0 */
 	decrypt_block	v0, w3, x2, x6, w7
-	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
-	mov		v7.16b, v1.16b			/* ct is next iv */
+	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
+	mov		cbciv.16b, v1.16b		/* ct is next iv */
 	st1		{v0.16b}, [x0], #16
 	subs		w4, w4, #1
 	bne		.Lcbcdecloop
 .Lcbcdecout:
-	st1		{v7.16b}, [x5]			/* return iv */
+	st1		{cbciv.16b}, [x5]		/* return iv */
 	ldp		x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_cbc_decrypt)
@@ -274,51 +305,60 @@ AES_ENTRY(aes_ctr_encrypt)
 	mov		x29, sp

 	enc_prepare	w3, x2, x6
-	ld1		{v4.16b}, [x5]
+	ld1		{vctr.16b}, [x5]

-	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
+	umov		x6, vctr.d[1]		/* keep swabbed ctr in reg */
 	rev		x6, x6
 	cmn		w6, w4			/* 32 bit overflow? */
 	bcs		.Lctrloop
 .LctrloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lctr1x
 	add		w7, w6, #1
-	mov		v0.16b, v4.16b
+	mov		v0.16b, vctr.16b
 	add		w8, w6, #2
-	mov		v1.16b, v4.16b
+	mov		v1.16b, vctr.16b
+	add		w9, w6, #3
+	mov		v2.16b, vctr.16b
 	add		w9, w6, #3
-	mov		v2.16b, v4.16b
 	rev		w7, w7
-	mov		v3.16b, v4.16b
+	mov		v3.16b, vctr.16b
 	rev		w8, w8
+ST5(	mov		v4.16b, vctr.16b	)
 	mov		v1.s[3], w7
 	rev		w9, w9
+ST5(	add		w10, w6, #4		)
 	mov		v2.s[3], w8
+ST5(	rev		w10, w10		)
 	mov		v3.s[3], w9
+ST5(	mov		v4.s[3], w10		)
 	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
-	bl		aes_encrypt_block4x
+ST4(	bl		aes_encrypt_block4x		)
+ST5(	bl		aes_encrypt_block5x		)
 	eor		v0.16b, v5.16b, v0.16b
-	ld1		{v5.16b}, [x1], #16		/* get 1 input block */
+ST4(	ld1		{v5.16b}, [x1], #16		)
 	eor		v1.16b, v6.16b, v1.16b
+ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
 	eor		v2.16b, v7.16b, v2.16b
 	eor		v3.16b, v5.16b, v3.16b
+ST5(	eor		v4.16b, v6.16b, v4.16b	)
 	st1		{v0.16b-v3.16b}, [x0], #64
-	add		x6, x6, #4
+ST5(	st1		{v4.16b}, [x0], #16	)
+	add		x6, x6, #MAX_STRIDE
 	rev		x7, x6
-	ins		v4.d[1], x7
+	ins		vctr.d[1], x7
 	cbz		w4, .Lctrout
 	b		.LctrloopNx
 .Lctr1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lctrout
 .Lctrloop:
-	mov		v0.16b, v4.16b
+	mov		v0.16b, vctr.16b
 	encrypt_block	v0, w3, x2, x8, w7
 	adds		x6, x6, #1		/* increment BE ctr */
 	rev		x7, x6
-	ins		v4.d[1], x7
+	ins		vctr.d[1], x7
 	bcs		.Lctrcarry		/* overflow? */
 .Lctrcarrydone:
@@ -330,7 +370,7 @@ AES_ENTRY(aes_ctr_encrypt)
 	bne		.Lctrloop
 .Lctrout:
-	st1		{v4.16b}, [x5]		/* return next CTR value */
+	st1		{vctr.16b}, [x5]	/* return next CTR value */
 	ldp		x29, x30, [sp], #16
 	ret
@@ -339,11 +379,11 @@ AES_ENTRY(aes_ctr_encrypt)
 	b		.Lctrout
 .Lctrcarry:
-	umov		x7, v4.d[0]		/* load upper word of ctr */
+	umov		x7, vctr.d[0]		/* load upper word of ctr */
 	rev		x7, x7			/* ... to handle the carry */
 	add		x7, x7, #1
 	rev		x7, x7
-	ins		v4.d[0], x7
+	ins		vctr.d[0], x7
 	b		.Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)

arch/arm64/crypto/aes-neon.S

@@ -15,6 +15,8 @@
 #define AES_ENDPROC(func)	ENDPROC(neon_ ## func)

 	xtsmask		.req	v7
+	cbciv		.req	v7
+	vctr		.req	v4

 	.macro		xts_reload_mask, tmp
 	xts_load_mask	\tmp