crypto: aesni - AVX and AVX2 version of AESNI-GCM encode and decode

We have added AVX and AVX2 routines that optimize AESNI-GCM encode/decode.
These routines are optimized for encrypt and decrypt of large buffers.
In tests we have seen up to 6% speedup for 1K, 11% speedup for 2K and
18% speedup for 8K buffer over the existing SSE version.  These routines
should provide even better speedup for future Intel x86_64 cpus.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Tim Chen 2013-12-11 14:28:41 -08:00 committed by Herbert Xu
parent fed286110f
commit d764593af9
3 changed files with 2953 additions and 3 deletions

View File

@ -75,7 +75,7 @@ ifeq ($(avx2_supported),yes)
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
endif endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-y := aesni-intel_asm.o aesni-intel_avx.o aesni-intel_glue.o fpu.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-y := crc32c-intel_glue.o

File diff suppressed because it is too large Load Diff

View File

@ -101,6 +101,9 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
int crypto_fpu_init(void); int crypto_fpu_init(void);
void crypto_fpu_exit(void); void crypto_fpu_exit(void);
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv); const u8 *in, unsigned int len, u8 *iv);
@ -150,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len); u8 *auth_tag, unsigned long auth_tag_len);
#ifdef CONFIG_AS_AVX
/*
* asmlinkage void aesni_gcm_precomp_avx_gen2()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
static void aesni_gcm_enc_avx(void *ctx, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
if (plaintext_len < AVX_GEN2_OPTSIZE) {
aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
}
}
static void aesni_gcm_dec_avx(void *ctx, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
if (ciphertext_len < AVX_GEN2_OPTSIZE) {
aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
}
}
#endif
#ifdef CONFIG_AS_AVX2
/*
* asmlinkage void aesni_gcm_precomp_avx_gen4()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
if (plaintext_len < AVX_GEN2_OPTSIZE) {
aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
} else {
aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
}
}
static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
if (ciphertext_len < AVX_GEN2_OPTSIZE) {
aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
aad, aad_len, auth_tag, auth_tag_len);
} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
} else {
aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
}
}
#endif
static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
static inline struct static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
{ {
@ -915,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
dst = src; dst = src;
} }
aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+ ((unsigned long)req->cryptlen), auth_tag_len); + ((unsigned long)req->cryptlen), auth_tag_len);
@ -996,7 +1116,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
dst = src; dst = src;
} }
aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
ctx->hash_subkey, assoc, (unsigned long)req->assoclen, ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
authTag, auth_tag_len); authTag, auth_tag_len);
@ -1353,6 +1473,25 @@ static int __init aesni_init(void)
if (!x86_match_cpu(aesni_cpu_id)) if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV; return -ENODEV;
#ifdef CONFIG_AS_AVX2
if (boot_cpu_has(X86_FEATURE_AVX2)) {
pr_info("AVX2 version of gcm_enc/dec engaged.\n");
aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
} else
#endif
#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
pr_info("AVX version of gcm_enc/dec engaged.\n");
aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
} else
#endif
{
pr_info("SSE version of gcm_enc/dec engaged.\n");
aesni_gcm_enc_tfm = aesni_gcm_enc;
aesni_gcm_dec_tfm = aesni_gcm_dec;
}
err = crypto_fpu_init(); err = crypto_fpu_init();
if (err) if (err)