crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR
This implements 5-way interleaving for ECB, CBC decryption and CTR, resulting in a speedup of ~11% on Marvell ThunderX2, which has a very deep pipeline and therefore a high issue latency for NEON instructions operating on the same registers.

Note that XTS is left alone: implementing 5-way interleave there would either involve spilling of the calculated tweaks to the stack, or recalculating them after the encryption operation, and doing either of those would most likely penalize low end cores. For ECB, this is not a concern at all, given that we have plenty of spare registers. For CTR and CBC decryption, we take advantage of the fact that v16 is not used by the CE version of the code (which is the only one targeted by the optimization), and so we can reshuffle the code a bit and avoid having to spill to memory (with the exception of one extra reload in the CBC routine).

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 7367bfeb2c
parent e217413964
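For readers skimming the hunks below: the 4-way and 5-way paths are generated from the same source by letting the preprocessor keep or drop individual statements. A minimal sketch of that pattern, assembled from the ST4/ST5 macros and the ECB encrypt loop introduced by this patch (the comments, and the note on which build uses which stride, are explanatory additions rather than part of the patch itself):

/* MAX_STRIDE is expected to be 4 for the NEON fallback and 5 for the
 * Crypto Extensions build, which is the only one targeted here. */
#if MAX_STRIDE == 4
#define ST4(x...) x	/* keep statements used only by the 4-way path */
#define ST5(x...)	/* drop statements used only by the 5-way path */
#else
#define ST4(x...)	/* ...and vice versa for a 5-way build */
#define ST5(x...) x
#endif

	/* The shared loop body then reads as straight-line code, and each
	 * build keeps exactly one of the two subroutine calls: */
	subs		w4, w4, #MAX_STRIDE		/* enough blocks left? */
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* load 4 blocks of input */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)	/* load the 5th block */
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64	/* store 4 blocks of output */
ST5(	st1		{v4.16b}, [x0], #16		)	/* store the 5th block */

The 5-way helper routines (aes_encrypt_block5x and friends) are assumed to exist already; this patch only switches the mode loops over to the MAX_STRIDE/ST4/ST5 dispatch and aliases cbciv/vctr to v16 in the CE build so the extra block never has to be spilled.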
@@ -18,6 +18,8 @@
 	.arch		armv8-a+crypto

 	xtsmask		.req	v16
+	cbciv		.req	v16
+	vctr		.req	v16

 	.macro		xts_reload_mask, tmp
 	.endm
@@ -17,6 +17,14 @@
 #define MAX_STRIDE	4
 #endif

+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
+
 aes_encrypt_block4x:
 	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
 	ret
@@ -53,14 +61,17 @@ AES_ENTRY(aes_ecb_encrypt)
 	enc_prepare	w3, x2, x5

 .LecbencloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lecbenc1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
-	bl		aes_encrypt_block4x
+ST4(	bl		aes_encrypt_block4x		)
+ST5(	ld1		{v4.16b}, [x1], #16		)
+ST5(	bl		aes_encrypt_block5x		)
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LecbencloopNx
 .Lecbenc1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lecbencout
 .Lecbencloop:
 	ld1		{v0.16b}, [x1], #16		/* get next pt block */
@@ -81,14 +92,17 @@ AES_ENTRY(aes_ecb_decrypt)
 	dec_prepare	w3, x2, x5

 .LecbdecloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lecbdec1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
-	bl		aes_decrypt_block4x
+ST4(	bl		aes_decrypt_block4x		)
+ST5(	ld1		{v4.16b}, [x1], #16		)
+ST5(	bl		aes_decrypt_block5x		)
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LecbdecloopNx
 .Lecbdec1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lecbdecout
 .Lecbdecloop:
 	ld1		{v0.16b}, [x1], #16		/* get next ct block */
@@ -148,39 +162,56 @@ AES_ENTRY(aes_cbc_decrypt)
 	stp		x29, x30, [sp, #-16]!
 	mov		x29, sp

-	ld1		{v7.16b}, [x5]			/* get iv */
+	ld1		{cbciv.16b}, [x5]		/* get iv */
 	dec_prepare	w3, x2, x6

 .LcbcdecloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lcbcdec1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+#if MAX_STRIDE == 5
+	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
+	mov		v5.16b, v0.16b
+	mov		v6.16b, v1.16b
+	mov		v7.16b, v2.16b
+	bl		aes_decrypt_block5x
+	sub		x1, x1, #32
+	eor		v0.16b, v0.16b, cbciv.16b
+	eor		v1.16b, v1.16b, v5.16b
+	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
+	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
+	eor		v2.16b, v2.16b, v6.16b
+	eor		v3.16b, v3.16b, v7.16b
+	eor		v4.16b, v4.16b, v5.16b
+#else
 	mov		v4.16b, v0.16b
 	mov		v5.16b, v1.16b
 	mov		v6.16b, v2.16b
 	bl		aes_decrypt_block4x
 	sub		x1, x1, #16
-	eor		v0.16b, v0.16b, v7.16b
+	eor		v0.16b, v0.16b, cbciv.16b
 	eor		v1.16b, v1.16b, v4.16b
-	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
+	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
 	eor		v2.16b, v2.16b, v5.16b
 	eor		v3.16b, v3.16b, v6.16b
+#endif
 	st1		{v0.16b-v3.16b}, [x0], #64
+ST5(	st1		{v4.16b}, [x0], #16		)
 	b		.LcbcdecloopNx
 .Lcbcdec1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lcbcdecout
 .Lcbcdecloop:
 	ld1		{v1.16b}, [x1], #16		/* get next ct block */
 	mov		v0.16b, v1.16b			/* ...and copy to v0 */
 	decrypt_block	v0, w3, x2, x6, w7
-	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
-	mov		v7.16b, v1.16b			/* ct is next iv */
+	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
+	mov		cbciv.16b, v1.16b		/* ct is next iv */
 	st1		{v0.16b}, [x0], #16
 	subs		w4, w4, #1
 	bne		.Lcbcdecloop
 .Lcbcdecout:
-	st1		{v7.16b}, [x5]			/* return iv */
+	st1		{cbciv.16b}, [x5]		/* return iv */
 	ldp		x29, x30, [sp], #16
 	ret
 AES_ENDPROC(aes_cbc_decrypt)
@@ -274,51 +305,60 @@ AES_ENTRY(aes_ctr_encrypt)
 	mov		x29, sp

 	enc_prepare	w3, x2, x6
-	ld1		{v4.16b}, [x5]
+	ld1		{vctr.16b}, [x5]

-	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
+	umov		x6, vctr.d[1]		/* keep swabbed ctr in reg */
 	rev		x6, x6
 	cmn		w6, w4			/* 32 bit overflow? */
 	bcs		.Lctrloop
 .LctrloopNx:
-	subs		w4, w4, #4
+	subs		w4, w4, #MAX_STRIDE
 	bmi		.Lctr1x
 	add		w7, w6, #1
-	mov		v0.16b, v4.16b
+	mov		v0.16b, vctr.16b
 	add		w8, w6, #2
-	mov		v1.16b, v4.16b
+	mov		v1.16b, vctr.16b
+	add		w9, w6, #3
+	mov		v2.16b, vctr.16b
 	add		w9, w6, #3
-	mov		v2.16b, v4.16b
 	rev		w7, w7
-	mov		v3.16b, v4.16b
+	mov		v3.16b, vctr.16b
 	rev		w8, w8
+ST5(	mov		v4.16b, vctr.16b		)
 	mov		v1.s[3], w7
 	rev		w9, w9
+ST5(	add		w10, w6, #4			)
 	mov		v2.s[3], w8
+ST5(	rev		w10, w10			)
 	mov		v3.s[3], w9
+ST5(	mov		v4.s[3], w10			)
 	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
-	bl		aes_encrypt_block4x
+ST4(	bl		aes_encrypt_block4x		)
+ST5(	bl		aes_encrypt_block5x		)
 	eor		v0.16b, v5.16b, v0.16b
-	ld1		{v5.16b}, [x1], #16		/* get 1 input block */
+ST4(	ld1		{v5.16b}, [x1], #16		)
 	eor		v1.16b, v6.16b, v1.16b
+ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
 	eor		v2.16b, v7.16b, v2.16b
 	eor		v3.16b, v5.16b, v3.16b
+ST5(	eor		v4.16b, v6.16b, v4.16b		)
 	st1		{v0.16b-v3.16b}, [x0], #64
-	add		x6, x6, #4
+ST5(	st1		{v4.16b}, [x0], #16		)
+	add		x6, x6, #MAX_STRIDE
 	rev		x7, x6
-	ins		v4.d[1], x7
+	ins		vctr.d[1], x7
 	cbz		w4, .Lctrout
 	b		.LctrloopNx
 .Lctr1x:
-	adds		w4, w4, #4
+	adds		w4, w4, #MAX_STRIDE
 	beq		.Lctrout
 .Lctrloop:
-	mov		v0.16b, v4.16b
+	mov		v0.16b, vctr.16b
 	encrypt_block	v0, w3, x2, x8, w7

 	adds		x6, x6, #1		/* increment BE ctr */
 	rev		x7, x6
-	ins		v4.d[1], x7
+	ins		vctr.d[1], x7
 	bcs		.Lctrcarry		/* overflow? */

 .Lctrcarrydone:
@@ -330,7 +370,7 @@ AES_ENTRY(aes_ctr_encrypt)
 	bne		.Lctrloop

 .Lctrout:
-	st1		{v4.16b}, [x5]		/* return next CTR value */
+	st1		{vctr.16b}, [x5]	/* return next CTR value */
 	ldp		x29, x30, [sp], #16
 	ret

@@ -339,11 +379,11 @@ AES_ENTRY(aes_ctr_encrypt)
 	b		.Lctrout

 .Lctrcarry:
-	umov		x7, v4.d[0]		/* load upper word of ctr */
+	umov		x7, vctr.d[0]		/* load upper word of ctr */
 	rev		x7, x7			/* ... to handle the carry */
 	add		x7, x7, #1
 	rev		x7, x7
-	ins		v4.d[0], x7
+	ins		vctr.d[0], x7
 	b		.Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)

@@ -15,6 +15,8 @@
 #define AES_ENDPROC(func)	ENDPROC(neon_ ## func)

 	xtsmask		.req	v7
+	cbciv		.req	v7
+	vctr		.req	v4

 	.macro		xts_reload_mask, tmp
 	xts_load_mask	\tmp