diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-26 13:00:59 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-26 13:00:59 -0700 |
commit | 44a6b8442190cf213081060b610dae2e822f802b (patch) | |
tree | 2280bfe385bef8b6416a6493ea8988a975008165 /arch/x86 | |
parent | 945c40c6b007eb4b07374a38ea37b2a34da306b1 (diff) | |
parent | a43478863b16cb0986fd2ec9d1f1b9ebaaec5922 (diff) | |
download | op-kernel-dev-44a6b8442190cf213081060b610dae2e822f802b.zip op-kernel-dev-44a6b8442190cf213081060b610dae2e822f802b.tar.gz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
- Fixed algorithm construction hang when self-test fails.
- Added SHA variants to talitos AEAD list.
- New driver for Exynos random number generator.
- Performance enhancements for arc4.
- Added hwrng support to caam.
- Added ahash support to caam.
- Fixed bad kfree in aesni-intel.
- Allow aesni-intel in FIPS mode.
- Added atmel driver with support for AES/3DES/SHA.
- Bug fixes for mv_cesa.
- CRC hardware driver for BF60x family processors.
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (66 commits)
crypto: twofish-avx - remove useless instruction
crypto: testmgr - add aead cbc aes hmac sha1,256,512 test vectors
crypto: talitos - add sha224, sha384 and sha512 to existing AEAD algorithms
crypto: talitos - export the talitos_submit function
crypto: talitos - move talitos structures to header file
crypto: atmel - add new tests to tcrypt
crypto: atmel - add Atmel SHA1/SHA256 driver
crypto: atmel - add Atmel DES/TDES driver
crypto: atmel - add Atmel AES driver
ARM: AT91SAM9G45: add crypto peripherals
crypto: testmgr - allow aesni-intel and ghash_clmulni-intel in fips mode
hwrng: exynos - Add support for Exynos random number generator
crypto: aesni-intel - fix wrong kfree pointer
crypto: caam - ERA retrieval and printing for SEC device
crypto: caam - Using alloc_coherent for caam job rings
crypto: algapi - Fix hang on crypto allocation
crypto: arc4 - now arc needs blockcipher support
crypto: caam - one tasklet per job ring
crypto: caam - consolidate memory barriers from job ring en/dequeue
crypto: caam - only query h/w in job ring dequeue path
...
Diffstat (limited to 'arch/x86')
20 files changed, 3266 insertions, 1093 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index e191ac0..e908e5d 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,6 +2,9 @@ # Arch-specific CryptoAPI modules. # +obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o +obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o + obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o @@ -12,8 +15,10 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o +obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o +obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -30,16 +35,11 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o +twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o +serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o - ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o - -# enable AVX support only when $(AS) can actually assemble the instructions -ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes) -AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT -CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT -endif sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c new file mode 100644 index 0000000..43282fe --- /dev/null +++ b/arch/x86/crypto/ablk_helper.c @@ -0,0 +1,149 @@ +/* + * Shared async block cipher helpers + * + * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * Based on aesni-intel_glue.c by: + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include <linux/kernel.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <crypto/algapi.h> +#include <crypto/cryptd.h> +#include <asm/i387.h> +#include <asm/crypto/ablk_helper.h> + +int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; +} +EXPORT_SYMBOL_GPL(ablk_set_key); + +int __ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->encrypt( + &desc, req->dst, req->src, req->nbytes); +} +EXPORT_SYMBOL_GPL(__ablk_encrypt); + +int ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_encrypt(cryptd_req); + } else { + return __ablk_encrypt(req); + } +} +EXPORT_SYMBOL_GPL(ablk_encrypt); + +int ablk_decrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_decrypt(cryptd_req); + } else { + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->decrypt( + &desc, req->dst, req->src, req->nbytes); + } +} +EXPORT_SYMBOL_GPL(ablk_decrypt); + +void ablk_exit(struct crypto_tfm *tfm) +{ + struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ablkcipher(ctx->cryptd_tfm); +} +EXPORT_SYMBOL_GPL(ablk_exit); + +int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) +{ + struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(&cryptd_tfm->base); + + return 0; +} +EXPORT_SYMBOL_GPL(ablk_init_common); + +int ablk_init(struct crypto_tfm *tfm) +{ + char drv_name[CRYPTO_MAX_ALG_NAME]; + + snprintf(drv_name, sizeof(drv_name), "__driver-%s", + crypto_tfm_alg_driver_name(tfm)); + + return ablk_init_common(tfm, drv_name); +} +EXPORT_SYMBOL_GPL(ablk_init); + +MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 8efcf42..59b37de 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <crypto/aes.h> -#include <asm/aes.h> +#include <asm/crypto/aes.h> asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ac7f5cd..34fdcff 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -30,7 +30,8 @@ #include <crypto/ctr.h> #include <asm/cpu_device_id.h> #include <asm/i387.h> -#include <asm/aes.h> +#include <asm/crypto/aes.h> +#include <asm/crypto/ablk_helper.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> #include <linux/workqueue.h> @@ -52,10 +53,6 @@ #define HAS_XTS #endif -struct async_aes_ctx { - struct cryptd_ablkcipher *cryptd_tfm; -}; - /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. @@ -377,87 +374,6 @@ static int ctr_crypt(struct blkcipher_desc *desc, } #endif -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int key_len) -{ - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; - int err; - - crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - err = crypto_ablkcipher_setkey(child, key, key_len); - crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) - & CRYPTO_TFM_RES_MASK); - return err; -} - -static int ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_ablkcipher_encrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - return crypto_blkcipher_crt(desc.tfm)->encrypt( - &desc, req->dst, req->src, req->nbytes); - } -} - -static int ablk_decrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_ablkcipher_decrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - return crypto_blkcipher_crt(desc.tfm)->decrypt( - &desc, req->dst, req->src, req->nbytes); - } -} - -static void ablk_exit(struct crypto_tfm *tfm) -{ - struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ablkcipher(ctx->cryptd_tfm); -} - -static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) -{ - struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); - - return 0; -} - static int ablk_ecb_init(struct crypto_tfm *tfm) { return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); @@ -613,7 +529,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); struct aesni_rfc4106_gcm_ctx *child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child); - u8 *new_key_mem = NULL; + u8 *new_key_align, *new_key_mem = NULL; if (key_len < 4) { crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); @@ -637,9 +553,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, if (!new_key_mem) return -ENOMEM; - new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); - memcpy(new_key_mem, key, key_len); - key = new_key_mem; + new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN); + memcpy(new_key_align, key, key_len); + key = new_key_align; } if (!irq_fpu_usable()) @@ -968,7 +884,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -989,7 +905,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1033,7 +949,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1098,7 +1014,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1126,7 +1042,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1150,7 +1066,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -1174,7 +1090,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 3306dc0..eeb2b3b 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -5,10 +5,6 @@ * * Camellia parts based on code by: * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,9 +30,9 @@ #include <linux/module.h> #include <linux/types.h> #include <crypto/algapi.h> -#include <crypto/b128ops.h> #include <crypto/lrw.h> #include <crypto/xts.h> +#include <asm/crypto/glue_helper.h> #define CAMELLIA_MIN_KEY_SIZE 16 #define CAMELLIA_MAX_KEY_SIZE 32 @@ -1312,307 +1308,128 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, &tfm->crt_flags); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - void (*fn)(struct camellia_ctx *, u8 *, const u8 *), - void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *)) +static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) { - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = CAMELLIA_BLOCK_SIZE; - unsigned int nbytes; - int err; - - err = blkcipher_walk_virt(desc, walk); - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - /* Process two block batch */ - if (nbytes >= bsize * 2) { - do { - fn_2way(ctx, wdst, wsrc); - - wsrc += bsize * 2; - wdst += bsize * 2; - nbytes -= bsize * 2; - } while (nbytes >= bsize * 2); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } - - return err; -} - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way); -} + u128 iv = *src; -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way); -} + camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = CAMELLIA_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); - return nbytes; + u128_xor(&dst[1], &dst[1], &iv); } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) { - struct blkcipher_walk walk; - int err; + be128 ctrblk; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + if (dst != src) + *dst = *src; - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } + u128_to_be128(&ctrblk, iv); + u128_inc(iv); - return err; + camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, + u128 *iv) { - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = CAMELLIA_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ivs[2 - 1]; - u128 last_iv; + be128 ctrblks[2]; - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process two block batch */ - if (nbytes >= bsize * 2) { - do { - nbytes -= bsize * (2 - 1); - src -= 2 - 1; - dst -= 2 - 1; - - ivs[0] = src[0]; - - camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); - - u128_xor(dst + 1, dst + 1, ivs + 0); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * 2); - - if (nbytes < bsize) - goto done; + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; } - /* Handle leftovers */ - for (;;) { - camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; + u128_to_be128(&ctrblks[0], iv); + u128_inc(iv); + u128_to_be128(&ctrblks[1], iv); + u128_inc(iv); - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } - -done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; - - return nbytes; + camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); +static const struct common_glue_ctx camellia_enc = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 2, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } + } } +}; - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } +static const struct common_glue_ctx camellia_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 2, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } + } } +}; - return err; -} +static const struct common_glue_ctx camellia_dec = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 2, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } + } } +}; -static inline void u128_to_be128(be128 *dst, const u128 *src) -{ - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); -} +static const struct common_glue_ctx camellia_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 2, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } + } } +}; -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); + return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); } -static inline void u128_inc(u128 *i) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - i->b++; - if (!i->b) - i->a++; + return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 keystream[CAMELLIA_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - u128 ctrblk; - - memcpy(keystream, src, nbytes); - camellia_enc_blk_xor(ctx, keystream, walk->iv); - memcpy(dst, keystream, nbytes); - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - u128_inc(&ctrblk); - u128_to_be128((be128 *)walk->iv, &ctrblk); + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, + dst, src, nbytes); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = CAMELLIA_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; - be128 ctrblocks[2]; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - /* Process two block batch */ - if (nbytes >= bsize * 2) { - do { - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - } - - /* create ctrblks for parallel encrypt */ - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - u128_to_be128(&ctrblocks[1], &ctrblk); - u128_inc(&ctrblk); - - camellia_enc_blk_xor_2way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += 2; - dst += 2; - nbytes -= bsize * 2; - } while (nbytes >= bsize * 2); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - - camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks); - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - u128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; + return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, + nbytes); } static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE); - - while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) { - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; + return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); } static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c new file mode 100644 index 0000000..4854f0f --- /dev/null +++ b/arch/x86/crypto/glue_helper.c @@ -0,0 +1,307 @@ +/* + * Shared glue code for 128bit block ciphers + * + * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include <linux/module.h> +#include <crypto/b128ops.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/crypto/glue_helper.h> +#include <crypto/scatterwalk.h> + +static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes, i, func_bytes; + bool fpu_enabled = false; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + + for (i = 0; i < gctx->num_funcs; i++) { + func_bytes = bsize * gctx->funcs[i].num_blocks; + + /* Process multi-block batch */ + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.ecb(ctx, wdst, + wsrc); + + wsrc += func_bytes; + wdst += func_bytes; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + return err; +} + +int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return __glue_ecb_crypt_128bit(gctx, desc, &walk); +} +EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit); + +static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + fn(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +int glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit); + +static unsigned int +__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 last_iv; + unsigned int num_blocks, func_bytes; + unsigned int i; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + /* Process multi-block batch */ + if (nbytes >= func_bytes) { + do { + nbytes -= func_bytes - bsize; + src -= num_blocks - 1; + dst -= num_blocks - 1; + + gctx->funcs[i].fn_u.cbc(ctx, dst, src); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + const unsigned int bsize = 128 / 8; + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + return err; +} +EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); + +static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *src = (u8 *)walk->src.virt.addr; + u8 *dst = (u8 *)walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + u128 ctrblk; + u128 tmp; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + memcpy(&tmp, src, nbytes); + fn_ctr(ctx, &tmp, &tmp, &ctrblk); + memcpy(dst, &tmp, nbytes); + + u128_to_be128((be128 *)walk->iv, &ctrblk); +} +EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); + +static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + const unsigned int bsize = 128 / 8; + void *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + unsigned int num_blocks, func_bytes; + unsigned int i; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process multi-block batch */ + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); + + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + const unsigned int bsize = 128 / 8; + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, bsize); + + while ((nbytes = walk.nbytes) >= bsize) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + + if (walk.nbytes) { + glue_ctr_crypt_final_128bit( + gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} +EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); + +MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S new file mode 100644 index 0000000..504106b --- /dev/null +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -0,0 +1,704 @@ +/* + * Serpent Cipher 8-way parallel algorithm (x86_64/AVX) + * + * Copyright (C) 2012 Johannes Goetzfried + * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by + * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "serpent-avx-x86_64-asm_64.S" +.text + +#define CTX %rdi + +/********************************************************************** + 8-way AVX serpent + **********************************************************************/ +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 +#define RE1 %xmm4 + +#define tp %xmm5 + +#define RA2 %xmm6 +#define RB2 %xmm7 +#define RC2 %xmm8 +#define RD2 %xmm9 +#define RE2 %xmm10 + +#define RNOT %xmm11 + +#define RK0 %xmm12 +#define RK1 %xmm13 +#define RK2 %xmm14 +#define RK3 %xmm15 + + +#define S0_1(x0, x1, x2, x3, x4) \ + vpor x0, x3, tp; \ + vpxor x3, x0, x0; \ + vpxor x2, x3, x4; \ + vpxor RNOT, x4, x4; \ + vpxor x1, tp, x3; \ + vpand x0, x1, x1; \ + vpxor x4, x1, x1; \ + vpxor x0, x2, x2; +#define S0_2(x0, x1, x2, x3, x4) \ + vpxor x3, x0, x0; \ + vpor x0, x4, x4; \ + vpxor x2, x0, x0; \ + vpand x1, x2, x2; \ + vpxor x2, x3, x3; \ + vpxor RNOT, x1, x1; \ + vpxor x4, x2, x2; \ + vpxor x2, x1, x1; + +#define S1_1(x0, x1, x2, x3, x4) \ + vpxor x0, x1, tp; \ + vpxor x3, x0, x0; \ + vpxor RNOT, x3, x3; \ + vpand tp, x1, x4; \ + vpor tp, x0, x0; \ + vpxor x2, x3, x3; \ + vpxor x3, x0, x0; \ + vpxor x3, tp, x1; +#define S1_2(x0, x1, x2, x3, x4) \ + vpxor x4, x3, x3; \ + vpor x4, x1, x1; \ + vpxor x2, x4, x4; \ + vpand x0, x2, x2; \ + vpxor x1, x2, x2; \ + vpor x0, x1, x1; \ + vpxor RNOT, x0, x0; \ + vpxor x2, x0, x0; \ + vpxor x1, x4, x4; + +#define S2_1(x0, x1, x2, x3, x4) \ + vpxor RNOT, x3, x3; \ + vpxor x0, x1, x1; \ + vpand x2, x0, tp; \ + vpxor x3, tp, tp; \ + vpor x0, x3, x3; \ + vpxor x1, x2, x2; \ + vpxor x1, x3, x3; \ + vpand tp, x1, x1; +#define S2_2(x0, x1, x2, x3, x4) \ + vpxor x2, tp, tp; \ + vpand x3, x2, x2; \ + vpor x1, x3, x3; \ + vpxor RNOT, tp, tp; \ + vpxor tp, x3, x3; \ + vpxor tp, x0, x4; \ + vpxor x2, tp, x0; \ + vpor x2, x1, x1; + +#define S3_1(x0, x1, x2, x3, x4) \ + vpxor x3, x1, tp; \ + vpor x0, x3, x3; \ + vpand x0, x1, x4; \ + vpxor x2, x0, x0; \ + vpxor tp, x2, x2; \ + vpand x3, tp, x1; \ + vpxor x3, x2, x2; \ + vpor x4, x0, x0; \ + vpxor x3, x4, x4; +#define S3_2(x0, x1, x2, x3, x4) \ + vpxor x0, x1, x1; \ + vpand x3, x0, x0; \ + vpand x4, x3, x3; \ + vpxor x2, x3, x3; \ + vpor x1, x4, x4; \ + vpand x1, x2, x2; \ + vpxor x3, x4, x4; \ + vpxor x3, x0, x0; \ + vpxor x2, x3, x3; + +#define S4_1(x0, x1, x2, x3, x4) \ + vpand x0, x3, tp; \ + vpxor x3, x0, x0; \ + vpxor x2, tp, tp; \ + vpor x3, x2, x2; \ + vpxor x1, x0, x0; \ + vpxor tp, x3, x4; \ + vpor x0, x2, x2; \ + vpxor x1, x2, x2; +#define S4_2(x0, x1, x2, x3, x4) \ + vpand x0, x1, x1; \ + vpxor x4, x1, x1; \ + vpand x2, x4, x4; \ + vpxor tp, x2, x2; \ + vpxor x0, x4, x4; \ + vpor x1, tp, x3; \ + vpxor RNOT, x1, x1; \ + vpxor x0, x3, x3; + +#define S5_1(x0, x1, x2, x3, x4) \ + vpor x0, x1, tp; \ + vpxor tp, x2, x2; \ + vpxor RNOT, x3, x3; \ + vpxor x0, x1, x4; \ + vpxor x2, x0, x0; \ + vpand x4, tp, x1; \ + vpor x3, x4, x4; \ + vpxor x0, x4, x4; +#define S5_2(x0, x1, x2, x3, x4) \ + vpand x3, x0, x0; \ + vpxor x3, x1, x1; \ + vpxor x2, x3, x3; \ + vpxor x1, x0, x0; \ + vpand x4, x2, x2; \ + vpxor x2, x1, x1; \ + vpand x0, x2, x2; \ + vpxor x2, x3, x3; + +#define S6_1(x0, x1, x2, x3, x4) \ + vpxor x0, x3, x3; \ + vpxor x2, x1, tp; \ + vpxor x0, x2, x2; \ + vpand x3, x0, x0; \ + vpor x3, tp, tp; \ + vpxor RNOT, x1, x4; \ + vpxor tp, x0, x0; \ + vpxor x2, tp, x1; +#define S6_2(x0, x1, x2, x3, x4) \ + vpxor x4, x3, x3; \ + vpxor x0, x4, x4; \ + vpand x0, x2, x2; \ + vpxor x1, x4, x4; \ + vpxor x3, x2, x2; \ + vpand x1, x3, x3; \ + vpxor x0, x3, x3; \ + vpxor x2, x1, x1; + +#define S7_1(x0, x1, x2, x3, x4) \ + vpxor RNOT, x1, tp; \ + vpxor RNOT, x0, x0; \ + vpand x2, tp, x1; \ + vpxor x3, x1, x1; \ + vpor tp, x3, x3; \ + vpxor x2, tp, x4; \ + vpxor x3, x2, x2; \ + vpxor x0, x3, x3; \ + vpor x1, x0, x0; +#define S7_2(x0, x1, x2, x3, x4) \ + vpand x0, x2, x2; \ + vpxor x4, x0, x0; \ + vpxor x3, x4, x4; \ + vpand x0, x3, x3; \ + vpxor x1, x4, x4; \ + vpxor x4, x2, x2; \ + vpxor x1, x3, x3; \ + vpor x0, x4, x4; \ + vpxor x1, x4, x4; + +#define SI0_1(x0, x1, x2, x3, x4) \ + vpxor x0, x1, x1; \ + vpor x1, x3, tp; \ + vpxor x1, x3, x4; \ + vpxor RNOT, x0, x0; \ + vpxor tp, x2, x2; \ + vpxor x0, tp, x3; \ + vpand x1, x0, x0; \ + vpxor x2, x0, x0; +#define SI0_2(x0, x1, x2, x3, x4) \ + vpand x3, x2, x2; \ + vpxor x4, x3, x3; \ + vpxor x3, x2, x2; \ + vpxor x3, x1, x1; \ + vpand x0, x3, x3; \ + vpxor x0, x1, x1; \ + vpxor x2, x0, x0; \ + vpxor x3, x4, x4; + +#define SI1_1(x0, x1, x2, x3, x4) \ + vpxor x3, x1, x1; \ + vpxor x2, x0, tp; \ + vpxor RNOT, x2, x2; \ + vpor x1, x0, x4; \ + vpxor x3, x4, x4; \ + vpand x1, x3, x3; \ + vpxor x2, x1, x1; \ + vpand x4, x2, x2; +#define SI1_2(x0, x1, x2, x3, x4) \ + vpxor x1, x4, x4; \ + vpor x3, x1, x1; \ + vpxor tp, x3, x3; \ + vpxor tp, x2, x2; \ + vpor x4, tp, x0; \ + vpxor x4, x2, x2; \ + vpxor x0, x1, x1; \ + vpxor x1, x4, x4; + +#define SI2_1(x0, x1, x2, x3, x4) \ + vpxor x1, x2, x2; \ + vpxor RNOT, x3, tp; \ + vpor x2, tp, tp; \ + vpxor x3, x2, x2; \ + vpxor x0, x3, x4; \ + vpxor x1, tp, x3; \ + vpor x2, x1, x1; \ + vpxor x0, x2, x2; +#define SI2_2(x0, x1, x2, x3, x4) \ + vpxor x4, x1, x1; \ + vpor x3, x4, x4; \ + vpxor x3, x2, x2; \ + vpxor x2, x4, x4; \ + vpand x1, x2, x2; \ + vpxor x3, x2, x2; \ + vpxor x4, x3, x3; \ + vpxor x0, x4, x4; + +#define SI3_1(x0, x1, x2, x3, x4) \ + vpxor x1, x2, x2; \ + vpand x2, x1, tp; \ + vpxor x0, tp, tp; \ + vpor x1, x0, x0; \ + vpxor x3, x1, x4; \ + vpxor x3, x0, x0; \ + vpor tp, x3, x3; \ + vpxor x2, tp, x1; +#define SI3_2(x0, x1, x2, x3, x4) \ + vpxor x3, x1, x1; \ + vpxor x2, x0, x0; \ + vpxor x3, x2, x2; \ + vpand x1, x3, x3; \ + vpxor x0, x1, x1; \ + vpand x2, x0, x0; \ + vpxor x3, x4, x4; \ + vpxor x0, x3, x3; \ + vpxor x1, x0, x0; + +#define SI4_1(x0, x1, x2, x3, x4) \ + vpxor x3, x2, x2; \ + vpand x1, x0, tp; \ + vpxor x2, tp, tp; \ + vpor x3, x2, x2; \ + vpxor RNOT, x0, x4; \ + vpxor tp, x1, x1; \ + vpxor x2, tp, x0; \ + vpand x4, x2, x2; +#define SI4_2(x0, x1, x2, x3, x4) \ + vpxor x0, x2, x2; \ + vpor x4, x0, x0; \ + vpxor x3, x0, x0; \ + vpand x2, x3, x3; \ + vpxor x3, x4, x4; \ + vpxor x1, x3, x3; \ + vpand x0, x1, x1; \ + vpxor x1, x4, x4; \ + vpxor x3, x0, x0; + +#define SI5_1(x0, x1, x2, x3, x4) \ + vpor x2, x1, tp; \ + vpxor x1, x2, x2; \ + vpxor x3, tp, tp; \ + vpand x1, x3, x3; \ + vpxor x3, x2, x2; \ + vpor x0, x3, x3; \ + vpxor RNOT, x0, x0; \ + vpxor x2, x3, x3; \ + vpor x0, x2, x2; +#define SI5_2(x0, x1, x2, x3, x4) \ + vpxor tp, x1, x4; \ + vpxor x4, x2, x2; \ + vpand x0, x4, x4; \ + vpxor tp, x0, x0; \ + vpxor x3, tp, x1; \ + vpand x2, x0, x0; \ + vpxor x3, x2, x2; \ + vpxor x2, x0, x0; \ + vpxor x4, x2, x2; \ + vpxor x3, x4, x4; + +#define SI6_1(x0, x1, x2, x3, x4) \ + vpxor x2, x0, x0; \ + vpand x3, x0, tp; \ + vpxor x3, x2, x2; \ + vpxor x2, tp, tp; \ + vpxor x1, x3, x3; \ + vpor x0, x2, x2; \ + vpxor x3, x2, x2; \ + vpand tp, x3, x3; +#define SI6_2(x0, x1, x2, x3, x4) \ + vpxor RNOT, tp, tp; \ + vpxor x1, x3, x3; \ + vpand x2, x1, x1; \ + vpxor tp, x0, x4; \ + vpxor x4, x3, x3; \ + vpxor x2, x4, x4; \ + vpxor x1, tp, x0; \ + vpxor x0, x2, x2; + +#define SI7_1(x0, x1, x2, x3, x4) \ + vpand x0, x3, tp; \ + vpxor x2, x0, x0; \ + vpor x3, x2, x2; \ + vpxor x1, x3, x4; \ + vpxor RNOT, x0, x0; \ + vpor tp, x1, x1; \ + vpxor x0, x4, x4; \ + vpand x2, x0, x0; \ + vpxor x1, x0, x0; +#define SI7_2(x0, x1, x2, x3, x4) \ + vpand x2, x1, x1; \ + vpxor x2, tp, x3; \ + vpxor x3, x4, x4; \ + vpand x3, x2, x2; \ + vpor x0, x3, x3; \ + vpxor x4, x1, x1; \ + vpxor x4, x3, x3; \ + vpand x0, x4, x4; \ + vpxor x2, x4, x4; + +#define get_key(i, j, t) \ + vbroadcastss (4*(i)+(j))*4(CTX), t; + +#define K2(x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + get_key(i, 1, RK1); \ + get_key(i, 2, RK2); \ + get_key(i, 3, RK3); \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; + +#define LK2(x0, x1, x2, x3, x4, i) \ + vpslld $13, x0 ## 1, x4 ## 1; \ + vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x2 ## 1, x4 ## 1; \ + vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $13, x0 ## 2, x4 ## 2; \ + vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x2 ## 2, x4 ## 2; \ + vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $1, x1 ## 1, x4 ## 1; \ + vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \ + vpor x4 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x0 ## 1, x4 ## 1; \ + vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ + vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ + get_key(i, 1, RK1); \ + vpslld $1, x1 ## 2, x4 ## 2; \ + vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \ + vpor x4 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x0 ## 2, x4 ## 2; \ + vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ + vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ + get_key(i, 3, RK3); \ + vpslld $7, x3 ## 1, x4 ## 1; \ + vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \ + vpor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpslld $7, x1 ## 1, x4 ## 1; \ + vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ + get_key(i, 0, RK0); \ + vpslld $7, x3 ## 2, x4 ## 2; \ + vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \ + vpor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpslld $7, x1 ## 2, x4 ## 2; \ + vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ + get_key(i, 2, RK2); \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpslld $5, x0 ## 1, x4 ## 1; \ + vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpslld $22, x2 ## 1, x4 ## 1; \ + vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; \ + vpslld $5, x0 ## 2, x4 ## 2; \ + vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpslld $22, x2 ## 2, x4 ## 2; \ + vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; + +#define KL2(x0, x1, x2, x3, x4, i) \ + vpxor RK0, x0 ## 1, x0 ## 1; \ + vpxor RK2, x2 ## 1, x2 ## 1; \ + vpsrld $5, x0 ## 1, x4 ## 1; \ + vpslld $(32 - 5), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor RK3, x3 ## 1, x3 ## 1; \ + vpxor RK1, x1 ## 1, x1 ## 1; \ + vpsrld $22, x2 ## 1, x4 ## 1; \ + vpslld $(32 - 22), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpxor x3 ## 1, x2 ## 1, x2 ## 1; \ + vpxor RK0, x0 ## 2, x0 ## 2; \ + vpxor RK2, x2 ## 2, x2 ## 2; \ + vpsrld $5, x0 ## 2, x4 ## 2; \ + vpslld $(32 - 5), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor RK3, x3 ## 2, x3 ## 2; \ + vpxor RK1, x1 ## 2, x1 ## 2; \ + vpsrld $22, x2 ## 2, x4 ## 2; \ + vpslld $(32 - 22), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x3 ## 2, x2 ## 2, x2 ## 2; \ + vpxor x3 ## 1, x0 ## 1, x0 ## 1; \ + vpslld $7, x1 ## 1, x4 ## 1; \ + vpxor x1 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpsrld $1, x1 ## 1, x4 ## 1; \ + vpslld $(32 - 1), x1 ## 1, x1 ## 1; \ + vpor x4 ## 1, x1 ## 1, x1 ## 1; \ + vpxor x3 ## 2, x0 ## 2, x0 ## 2; \ + vpslld $7, x1 ## 2, x4 ## 2; \ + vpxor x1 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x4 ## 2, x2 ## 2, x2 ## 2; \ + vpsrld $1, x1 ## 2, x4 ## 2; \ + vpslld $(32 - 1), x1 ## 2, x1 ## 2; \ + vpor x4 ## 2, x1 ## 2, x1 ## 2; \ + vpsrld $7, x3 ## 1, x4 ## 1; \ + vpslld $(32 - 7), x3 ## 1, x3 ## 1; \ + vpor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpxor x0 ## 1, x1 ## 1, x1 ## 1; \ + vpslld $3, x0 ## 1, x4 ## 1; \ + vpxor x4 ## 1, x3 ## 1, x3 ## 1; \ + vpsrld $7, x3 ## 2, x4 ## 2; \ + vpslld $(32 - 7), x3 ## 2, x3 ## 2; \ + vpor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpxor x0 ## 2, x1 ## 2, x1 ## 2; \ + vpslld $3, x0 ## 2, x4 ## 2; \ + vpxor x4 ## 2, x3 ## 2, x3 ## 2; \ + vpsrld $13, x0 ## 1, x4 ## 1; \ + vpslld $(32 - 13), x0 ## 1, x0 ## 1; \ + vpor x4 ## 1, x0 ## 1, x0 ## 1; \ + vpxor x2 ## 1, x1 ## 1, x1 ## 1; \ + vpxor x2 ## 1, x3 ## 1, x3 ## 1; \ + vpsrld $3, x2 ## 1, x4 ## 1; \ + vpslld $(32 - 3), x2 ## 1, x2 ## 1; \ + vpor x4 ## 1, x2 ## 1, x2 ## 1; \ + vpsrld $13, x0 ## 2, x4 ## 2; \ + vpslld $(32 - 13), x0 ## 2, x0 ## 2; \ + vpor x4 ## 2, x0 ## 2, x0 ## 2; \ + vpxor x2 ## 2, x1 ## 2, x1 ## 2; \ + vpxor x2 ## 2, x3 ## 2, x3 ## 2; \ + vpsrld $3, x2 ## 2, x4 ## 2; \ + vpslld $(32 - 3), x2 ## 2, x2 ## 2; \ + vpor x4 ## 2, x2 ## 2, x2 ## 2; + +#define S(SBOX, x0, x1, x2, x3, x4) \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); + +#define SP(SBOX, x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 2, RK2); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 3, RK3); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + get_key(i, 1, RK1); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + vpunpckldq x1, x0, t0; \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x3; \ + \ + vpunpcklqdq t1, t0, x0; \ + vpunpckhqdq t1, t0, x1; \ + vpunpcklqdq x3, t2, x2; \ + vpunpckhqdq x3, t2, x3; + +#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ + vmovdqu (0*4*4)(in), x0; \ + vmovdqu (1*4*4)(in), x1; \ + vmovdqu (2*4*4)(in), x2; \ + vmovdqu (3*4*4)(in), x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vmovdqu x0, (0*4*4)(out); \ + vmovdqu x1, (1*4*4)(out); \ + vmovdqu x2, (2*4*4)(out); \ + vmovdqu x3, (3*4*4)(out); + +#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpxor (0*4*4)(out), x0, x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor (1*4*4)(out), x1, x1; \ + vmovdqu x1, (1*4*4)(out); \ + vpxor (2*4*4)(out), x2, x2; \ + vmovdqu x2, (2*4*4)(out); \ + vpxor (3*4*4)(out), x3, x3; \ + vmovdqu x3, (3*4*4)(out); + +.align 8 +.global __serpent_enc_blk_8way_avx +.type __serpent_enc_blk_8way_avx,@function; + +__serpent_enc_blk_8way_avx: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + vpcmpeqd RNOT, RNOT, RNOT; + + leaq (4*4*4)(%rdx), %rax; + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 0); + S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); + S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2); + S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3); + S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4); + S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5); + S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6); + S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7); + S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8); + S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9); + S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10); + S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11); + S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12); + S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13); + S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14); + S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15); + S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16); + S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17); + S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18); + S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19); + S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20); + S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21); + S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22); + S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23); + S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24); + S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25); + S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26); + S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27); + S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28); + S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29); + S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30); + S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); + S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); + + leaq (4*4*4)(%rsi), %rax; + + testb %cl, %cl; + jnz __enc_xor8; + + write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; + +__enc_xor8: + xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; + +.align 8 +.global serpent_dec_blk_8way_avx +.type serpent_dec_blk_8way_avx,@function; + +serpent_dec_blk_8way_avx: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + vpcmpeqd RNOT, RNOT, RNOT; + + leaq (4*4*4)(%rdx), %rax; + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 32); + SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); + SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30); + SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29); + SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28); + SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27); + SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26); + SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25); + SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24); + SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23); + SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22); + SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21); + SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20); + SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19); + SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18); + SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17); + SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16); + SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15); + SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14); + SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13); + SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12); + SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11); + SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10); + SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9); + SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8); + SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7); + SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6); + SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5); + SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4); + SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3); + SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2); + SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); + S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); + + leaq (4*4*4)(%rsi), %rax; + write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); + write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); + + ret; diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c new file mode 100644 index 0000000..b36bdac --- /dev/null +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -0,0 +1,636 @@ +/* + * Glue Code for AVX assembler versions of Serpent Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * Glue code based on serpent_sse2_glue.c by: + * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include <linux/module.h> +#include <linux/hardirq.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/algapi.h> +#include <crypto/serpent.h> +#include <crypto/cryptd.h> +#include <crypto/b128ops.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <asm/crypto/serpent-avx.h> +#include <asm/crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> + +static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) +{ + u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + unsigned int j; + + for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) + ivs[j] = src[j]; + + serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) + u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); +} + +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +{ + be128 ctrblk; + + u128_to_be128(&ctrblk, iv); + u128_inc(iv); + + __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + u128_xor(dst, src, (u128 *)&ctrblk); +} + +static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, + u128 *iv) +{ + be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; + unsigned int i; + + for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblks[i], iv); + u128_inc(iv); + } + + serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); +} + +static const struct common_glue_ctx serpent_enc = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + } } +}; + +static const struct common_glue_ctx serpent_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } + } } +}; + +static const struct common_glue_ctx serpent_dec = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + } } +}; + +static const struct common_glue_ctx serpent_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + } } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, + dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, + nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); +} + +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS, + NULL, fpu_enabled, nbytes); +} + +static inline void serpent_fpu_end(bool fpu_enabled) +{ + glue_fpu_end(fpu_enabled); +} + +struct crypt_priv { + struct serpent_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { + serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_encrypt(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { + serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_decrypt(ctx->ctx, srcdst, srcdst); +} + +struct serpent_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct serpent_ctx serpent_ctx; +}; + +static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __serpent_setkey(&ctx->serpent_ctx, key, keylen - + SERPENT_BLOCK_SIZE); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - + SERPENT_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +struct serpent_xts_ctx { + struct serpent_ctx tweak_ctx; + struct serpent_ctx crypt_ctx; +}; + +static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static struct crypto_alg serpent_algs[10] = { { + .cra_name = "__ecb-serpent-avx", + .cra_driver_name = "__driver-ecb-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[0].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-serpent-avx", + .cra_driver_name = "__driver-cbc-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-serpent-avx", + .cra_driver_name = "__driver-ctr-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-serpent-avx", + .cra_driver_name = "__driver-lrw-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[3].cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = lrw_serpent_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-serpent-avx", + .cra_driver_name = "__driver-xts-serpent-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[4].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = xts_serpent_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(serpent)", + .cra_driver_name = "ecb-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[5].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(serpent)", + .cra_driver_name = "cbc-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[6].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(serpent)", + .cra_driver_name = "ctr-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[7].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(serpent)", + .cra_driver_name = "lrw-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[8].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(serpent)", + .cra_driver_name = "xts-serpent-avx", + .cra_priority = 500, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(serpent_algs[9].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init serpent_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) { + printk(KERN_INFO "AVX instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + printk(KERN_INFO "AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); +} + +static void __exit serpent_exit(void) +{ + crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); +} + +module_init(serpent_init); +module_exit(serpent_exit); + +MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("serpent"); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 4b21be8..d679c86 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -41,358 +41,145 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/i387.h> -#include <asm/serpent.h> -#include <crypto/scatterwalk.h> -#include <linux/workqueue.h> -#include <linux/spinlock.h> - -struct async_serpent_ctx { - struct cryptd_ablkcipher *cryptd_tfm; -}; +#include <asm/crypto/serpent-sse2.h> +#include <asm/crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> -static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ - if (fpu_enabled) - return true; - - /* SSE2 is only used when chunk to be processed is large enough, so - * do not enable FPU until it is necessary. - */ - if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS) - return false; - - kernel_fpu_begin(); - return true; -} - -static inline void serpent_fpu_end(bool fpu_enabled) +static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) { - if (fpu_enabled) - kernel_fpu_end(); -} - -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - bool enc) -{ - bool fpu_enabled = false; - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes; - int err; - - err = blkcipher_walk_virt(desc, walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - - /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - if (enc) - serpent_enc_blk_xway(ctx, wdst, wsrc); - else - serpent_dec_blk_xway(ctx, wdst, wsrc); - - wsrc += bsize * SERPENT_PARALLEL_BLOCKS; - wdst += bsize * SERPENT_PARALLEL_BLOCKS; - nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (enc) - __serpent_encrypt(ctx, wdst, wsrc); - else - __serpent_decrypt(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } + u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + unsigned int j; - serpent_fpu_end(fpu_enabled); - return err; -} + for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) + ivs[j] = src[j]; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; + serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, true); + for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) + u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) { - struct blkcipher_walk walk; + be128 ctrblk; - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, false); -} + u128_to_be128(&ctrblk, iv); + u128_inc(iv); -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); - return nbytes; + __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + u128_xor(dst, src, (u128 *)&ctrblk); } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, + u128 *iv) { - struct blkcipher_walk walk; - int err; + be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; + unsigned int i; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); + u128_to_be128(&ctrblks[i], iv); + u128_inc(iv); } - return err; + serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; - u128 last_iv; - int i; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1); - src -= SERPENT_PARALLEL_BLOCKS - 1; - dst -= SERPENT_PARALLEL_BLOCKS - 1; - - for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) - ivs[i] = src[i]; - - serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - - for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) - u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; +static const struct common_glue_ctx serpent_enc = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } + } } +}; - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } +static const struct common_glue_ctx serpent_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } + } } +}; -done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; +static const struct common_glue_ctx serpent_dec = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } + } } +}; - return nbytes; -} +static const struct common_glue_ctx serpent_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = SERPENT_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } + } } +}; -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes)) { - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - serpent_fpu_end(fpu_enabled); - return err; + return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); } -static inline void u128_to_be128(be128 *dst, const u128 *src) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); + return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); } -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, + dst, src, nbytes); } -static inline void u128_inc(u128 *i) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - i->b++; - if (!i->b) - i->a++; + return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, + nbytes); } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[SERPENT_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - __serpent_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); - - crypto_inc(ctrblk, SERPENT_BLOCK_SIZE); + return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) { - struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - const unsigned int bsize = SERPENT_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; - be128 ctrblocks[SERPENT_PARALLEL_BLOCKS]; - int i; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - /* Process multi-block batch */ - if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { - do { - /* create ctrblks for parallel encrypt */ - for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { - if (dst != src) - dst[i] = src[i]; - - u128_to_be128(&ctrblocks[i], &ctrblk); - u128_inc(&ctrblk); - } - - serpent_enc_blk_xway_xor(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += SERPENT_PARALLEL_BLOCKS; - dst += SERPENT_PARALLEL_BLOCKS; - nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; - } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - - __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - u128_xor(dst, dst, (u128 *)ctrblocks); - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - u128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; + return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS, + NULL, fpu_enabled, nbytes); } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static inline void serpent_fpu_end(bool fpu_enabled) { - bool fpu_enabled = false; - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE); - desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - - while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) { - fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - serpent_fpu_end(fpu_enabled); - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; + glue_fpu_end(fpu_enabled); } struct crypt_priv { @@ -596,106 +383,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return ret; } -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int key_len) -{ - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; - int err; - - crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - err = crypto_ablkcipher_setkey(child, key, key_len); - crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) - & CRYPTO_TFM_RES_MASK); - return err; -} - -static int __ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->encrypt( - &desc, req->dst, req->src, req->nbytes); -} - -static int ablk_encrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_encrypt(cryptd_req); - } else { - return __ablk_encrypt(req); - } -} - -static int ablk_decrypt(struct ablkcipher_request *req) -{ - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - - return crypto_ablkcipher_decrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - - return crypto_blkcipher_crt(desc.tfm)->decrypt( - &desc, req->dst, req->src, req->nbytes); - } -} - -static void ablk_exit(struct crypto_tfm *tfm) -{ - struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ablkcipher(ctx->cryptd_tfm); -} - -static int ablk_init(struct crypto_tfm *tfm) -{ - struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); - struct cryptd_ablkcipher *cryptd_tfm; - char drv_name[CRYPTO_MAX_ALG_NAME]; - - snprintf(drv_name, sizeof(drv_name), "__driver-%s", - crypto_tfm_alg_driver_name(tfm)); - - cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); - - return 0; -} - static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ecb-serpent-sse2", .cra_driver_name = "__driver-ecb-serpent-sse2", @@ -808,7 +495,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -830,7 +517,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -853,7 +540,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -877,7 +564,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, @@ -902,7 +589,7 @@ static struct crypto_alg serpent_algs[10] = { { .cra_priority = 400, .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = SERPENT_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index b2c2f57..49d6987 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -468,7 +468,7 @@ W_PRECALC_SSSE3 */ SHA1_VECTOR_ASM sha1_transform_ssse3 -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX .macro W_PRECALC_AVX diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index f916499..4a11a9d 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -35,7 +35,7 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, unsigned int rounds); -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX asmlinkage void sha1_transform_avx(u32 *digest, const char *data, unsigned int rounds); #endif @@ -184,7 +184,7 @@ static struct shash_alg alg = { } }; -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX static bool __init avx_usable(void) { u64 xcr0; @@ -209,7 +209,7 @@ static int __init sha1_ssse3_mod_init(void) if (cpu_has_ssse3) sha1_transform_asm = sha1_transform_ssse3; -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX /* allow AVX to override SSSE3, it's a little faster */ if (avx_usable()) sha1_transform_asm = sha1_transform_avx; diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S new file mode 100644 index 0000000..35f4557 --- /dev/null +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -0,0 +1,300 @@ +/* + * Twofish Cipher 8-way parallel algorithm (AVX/x86_64) + * + * Copyright (C) 2012 Johannes Goetzfried + * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "twofish-avx-x86_64-asm_64.S" +.text + +/* structure of crypto context */ +#define s0 0 +#define s1 1024 +#define s2 2048 +#define s3 3072 +#define w 4096 +#define k 4128 + +/********************************************************************** + 8-way AVX twofish + **********************************************************************/ +#define CTX %rdi + +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 + +#define RA2 %xmm4 +#define RB2 %xmm5 +#define RC2 %xmm6 +#define RD2 %xmm7 + +#define RX %xmm8 +#define RY %xmm9 + +#define RK1 %xmm10 +#define RK2 %xmm11 + +#define RID1 %rax +#define RID1b %al +#define RID2 %rbx +#define RID2b %bl + +#define RGI1 %rdx +#define RGI1bl %dl +#define RGI1bh %dh +#define RGI2 %rcx +#define RGI2bl %cl +#define RGI2bh %ch + +#define RGS1 %r8 +#define RGS1d %r8d +#define RGS2 %r9 +#define RGS2d %r9d +#define RGS3 %r10 +#define RGS3d %r10d + + +#define lookup_32bit(t0, t1, t2, t3, src, dst) \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + movl t0(CTX, RID1, 4), dst ## d; \ + xorl t1(CTX, RID2, 4), dst ## d; \ + shrq $16, src; \ + movb src ## bl, RID1b; \ + movb src ## bh, RID2b; \ + xorl t2(CTX, RID1, 4), dst ## d; \ + xorl t3(CTX, RID2, 4), dst ## d; + +#define G(a, x, t0, t1, t2, t3) \ + vmovq a, RGI1; \ + vpsrldq $8, a, x; \ + vmovq x, RGI2; \ + \ + lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \ + shrq $16, RGI1; \ + lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \ + shlq $32, RGS2; \ + orq RGS1, RGS2; \ + \ + lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \ + shrq $16, RGI2; \ + lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \ + shlq $32, RGS3; \ + orq RGS1, RGS3; \ + \ + vmovq RGS2, x; \ + vpinsrq $1, RGS3, x, x; + +#define encround(a, b, c, d, x, y) \ + G(a, x, s0, s1, s2, s3); \ + G(b, y, s1, s2, s3, s0); \ + vpaddd x, y, x; \ + vpaddd y, x, y; \ + vpaddd x, RK1, x; \ + vpaddd y, RK2, y; \ + vpxor x, c, c; \ + vpsrld $1, c, x; \ + vpslld $(32 - 1), c, c; \ + vpor c, x, c; \ + vpslld $1, d, x; \ + vpsrld $(32 - 1), d, d; \ + vpor d, x, d; \ + vpxor d, y, d; + +#define decround(a, b, c, d, x, y) \ + G(a, x, s0, s1, s2, s3); \ + G(b, y, s1, s2, s3, s0); \ + vpaddd x, y, x; \ + vpaddd y, x, y; \ + vpaddd y, RK2, y; \ + vpxor d, y, d; \ + vpsrld $1, d, y; \ + vpslld $(32 - 1), d, d; \ + vpor d, y, d; \ + vpslld $1, c, y; \ + vpsrld $(32 - 1), c, c; \ + vpor c, y, c; \ + vpaddd x, RK1, x; \ + vpxor x, c, c; + +#define encrypt_round(n, a, b, c, d) \ + vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ + vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ + encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ + encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); + +#define decrypt_round(n, a, b, c, d) \ + vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ + vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ + decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ + decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); + +#define encrypt_cycle(n) \ + encrypt_round((2*n), RA, RB, RC, RD); \ + encrypt_round(((2*n) + 1), RC, RD, RA, RB); + +#define decrypt_cycle(n) \ + decrypt_round(((2*n) + 1), RC, RD, RA, RB); \ + decrypt_round((2*n), RA, RB, RC, RD); + + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + vpunpckldq x1, x0, t0; \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x3; \ + \ + vpunpcklqdq t1, t0, x0; \ + vpunpckhqdq t1, t0, x1; \ + vpunpcklqdq x3, t2, x2; \ + vpunpckhqdq x3, t2, x3; + +#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ + vpxor (0*4*4)(in), wkey, x0; \ + vpxor (1*4*4)(in), wkey, x1; \ + vpxor (2*4*4)(in), wkey, x2; \ + vpxor (3*4*4)(in), wkey, x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpxor x0, wkey, x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor x1, wkey, x1; \ + vmovdqu x1, (1*4*4)(out); \ + vpxor x2, wkey, x2; \ + vmovdqu x2, (2*4*4)(out); \ + vpxor x3, wkey, x3; \ + vmovdqu x3, (3*4*4)(out); + +#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + vpxor x0, wkey, x0; \ + vpxor (0*4*4)(out), x0, x0; \ + vmovdqu x0, (0*4*4)(out); \ + vpxor x1, wkey, x1; \ + vpxor (1*4*4)(out), x1, x1; \ + vmovdqu x1, (1*4*4)(out); \ + vpxor x2, wkey, x2; \ + vpxor (2*4*4)(out), x2, x2; \ + vmovdqu x2, (2*4*4)(out); \ + vpxor x3, wkey, x3; \ + vpxor (3*4*4)(out), x3, x3; \ + vmovdqu x3, (3*4*4)(out); + +.align 8 +.global __twofish_enc_blk_8way +.type __twofish_enc_blk_8way,@function; + +__twofish_enc_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + pushq %rbx; + pushq %rcx; + + vmovdqu w(CTX), RK1; + + leaq (4*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); + + xorq RID1, RID1; + xorq RID2, RID2; + + encrypt_cycle(0); + encrypt_cycle(1); + encrypt_cycle(2); + encrypt_cycle(3); + encrypt_cycle(4); + encrypt_cycle(5); + encrypt_cycle(6); + encrypt_cycle(7); + + vmovdqu (w+4*4)(CTX), RK1; + + popq %rcx; + popq %rbx; + + leaq (4*4*4)(%rsi), %rax; + + testb %cl, %cl; + jnz __enc_xor8; + + outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); + outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + + ret; + +__enc_xor8: + outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); + outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + + ret; + +.align 8 +.global twofish_dec_blk_8way +.type twofish_dec_blk_8way,@function; + +twofish_dec_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %rbx; + + vmovdqu (w+4*4)(CTX), RK1; + + leaq (4*4*4)(%rdx), %rax; + inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); + inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + + xorq RID1, RID1; + xorq RID2, RID2; + + decrypt_cycle(7); + decrypt_cycle(6); + decrypt_cycle(5); + decrypt_cycle(4); + decrypt_cycle(3); + decrypt_cycle(2); + decrypt_cycle(1); + decrypt_cycle(0); + + vmovdqu (w)(CTX), RK1; + + popq %rbx; + + leaq (4*4*4)(%rsi), %rax; + outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); + + ret; diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c new file mode 100644 index 0000000..782b67d --- /dev/null +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -0,0 +1,624 @@ +/* + * Glue Code for AVX assembler version of Twofish Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include <linux/module.h> +#include <linux/hardirq.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/algapi.h> +#include <crypto/twofish.h> +#include <crypto/cryptd.h> +#include <crypto/b128ops.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/i387.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <asm/crypto/twofish.h> +#include <asm/crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> +#include <crypto/scatterwalk.h> +#include <linux/workqueue.h> +#include <linux/spinlock.h> + +#define TWOFISH_PARALLEL_BLOCKS 8 + +static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_3way(ctx, dst, src, false); +} + +/* 8-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_8way(ctx, dst, src, false); +} + +static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_8way(ctx, dst, src, true); +} + +static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + twofish_dec_blk_8way(ctx, dst, src); +} + +static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src) +{ + u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1]; + unsigned int j; + + for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) + ivs[j] = src[j]; + + twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) + u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); +} + +static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src, + u128 *iv) +{ + be128 ctrblks[TWOFISH_PARALLEL_BLOCKS]; + unsigned int i; + + for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblks[i], iv); + u128_inc(iv); + } + + twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); +} + +static const struct common_glue_ctx twofish_enc = { + .num_funcs = 3, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) } + }, { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } + } } +}; + +static const struct common_glue_ctx twofish_ctr = { + .num_funcs = 3, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } + }, { + .num_blocks = 3, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } + } } +}; + +static const struct common_glue_ctx twofish_dec = { + .num_funcs = 3, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } + }, { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } + } } +}; + +static const struct common_glue_ctx twofish_dec_cbc = { + .num_funcs = 3, + .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + + .funcs = { { + .num_blocks = TWOFISH_PARALLEL_BLOCKS, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } + }, { + .num_blocks = 3, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } + } } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, + dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, + nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); +} + +static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL, + fpu_enabled, nbytes); +} + +static inline void twofish_fpu_end(bool fpu_enabled) +{ + glue_fpu_end(fpu_enabled); +} + +struct crypt_priv { + struct twofish_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = TF_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { + twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) + twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); + + nbytes %= bsize * 3; + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + twofish_enc_blk(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = TF_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { + twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) + twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); + + nbytes %= bsize * 3; + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + twofish_dec_blk(ctx->ctx, srcdst, srcdst); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[TWOFISH_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->twofish_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + twofish_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[TWOFISH_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->twofish_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = lrw_crypt(desc, dst, src, nbytes, &req); + twofish_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[TWOFISH_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + twofish_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[TWOFISH_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + ret = xts_crypt(desc, dst, src, nbytes, &req); + twofish_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static struct crypto_alg twofish_algs[10] = { { + .cra_name = "__ecb-twofish-avx", + .cra_driver_name = "__driver-ecb-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[0].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = twofish_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-twofish-avx", + .cra_driver_name = "__driver-cbc-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[1].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = twofish_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "__ctr-twofish-avx", + .cra_driver_name = "__driver-ctr-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[2].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "__lrw-twofish-avx", + .cra_driver_name = "__driver-lrw-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[3].cra_list), + .cra_exit = lrw_twofish_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE + + TF_BLOCK_SIZE, + .max_keysize = TF_MAX_KEY_SIZE + + TF_BLOCK_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = lrw_twofish_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-twofish-avx", + .cra_driver_name = "__driver-xts-twofish-avx", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[4].cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE * 2, + .max_keysize = TF_MAX_KEY_SIZE * 2, + .ivsize = TF_BLOCK_SIZE, + .setkey = xts_twofish_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "ecb(twofish)", + .cra_driver_name = "ecb-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[5].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(twofish)", + .cra_driver_name = "cbc-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[6].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "ctr(twofish)", + .cra_driver_name = "ctr-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[7].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "lrw(twofish)", + .cra_driver_name = "lrw-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[8].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE + + TF_BLOCK_SIZE, + .max_keysize = TF_MAX_KEY_SIZE + + TF_BLOCK_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "xts(twofish)", + .cra_driver_name = "xts-twofish-avx", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(twofish_algs[9].cra_list), + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = TF_MIN_KEY_SIZE * 2, + .max_keysize = TF_MAX_KEY_SIZE * 2, + .ivsize = TF_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +} }; + +static int __init twofish_init(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) { + printk(KERN_INFO "AVX instructions are not detected.\n"); + return -ENODEV; + } + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + printk(KERN_INFO "AVX detected but unusable.\n"); + return -ENODEV; + } + + return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs)); +} + +static void __exit twofish_exit(void) +{ + crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs)); +} + +module_init(twofish_init); +module_exit(twofish_exit); + +MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("twofish"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 922ab24..15f9347 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -3,11 +3,6 @@ * * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -33,20 +28,13 @@ #include <crypto/algapi.h> #include <crypto/twofish.h> #include <crypto/b128ops.h> +#include <asm/crypto/twofish.h> +#include <asm/crypto/glue_helper.h> #include <crypto/lrw.h> #include <crypto/xts.h> -/* regular block cipher functions from twofish_x86_64 module */ -asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); -asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); - -/* 3-way parallel cipher functions */ -asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src, bool xor); -asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, - const u8 *src); +EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); +EXPORT_SYMBOL_GPL(twofish_dec_blk_3way); static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, const u8 *src) @@ -60,311 +48,139 @@ static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst, __twofish_enc_blk_3way(ctx, dst, src, true); } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, - void (*fn)(struct twofish_ctx *, u8 *, const u8 *), - void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *)) -{ - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes; - int err; - - err = blkcipher_walk_virt(desc, walk); - - while ((nbytes = walk->nbytes)) { - u8 *wsrc = walk->src.virt.addr; - u8 *wdst = walk->dst.virt.addr; - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - fn_3way(ctx, wdst, wsrc); - - wsrc += bsize * 3; - wdst += bsize * 3; - nbytes -= bsize * 3; - } while (nbytes >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - fn(ctx, wdst, wsrc); - - wsrc += bsize; - wdst += bsize; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - err = blkcipher_walk_done(desc, walk, nbytes); - } - - return err; -} - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) { - struct blkcipher_walk walk; + u128 ivs[2]; - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way); -} + ivs[0] = src[0]; + ivs[1] = src[1]; -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; + twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); - blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); + u128_xor(&dst[1], &dst[1], &ivs[0]); + u128_xor(&dst[2], &dst[2], &ivs[1]); } +EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 *iv = (u128 *)walk->iv; - - do { - u128_xor(dst, src, iv); - twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); - iv = dst; - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - - u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); - return nbytes; -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) { - struct blkcipher_walk walk; - int err; + be128 ctrblk; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); + if (dst != src) + *dst = *src; - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_encrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } + u128_to_be128(&ctrblk, iv); + u128_inc(iv); - return err; + twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); + u128_xor(dst, dst, (u128 *)&ctrblk); } +EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, + u128 *iv) { - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ivs[3 - 1]; - u128 last_iv; - - /* Start of the last block. */ - src += nbytes / bsize - 1; - dst += nbytes / bsize - 1; - - last_iv = *src; - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - nbytes -= bsize * (3 - 1); - src -= 3 - 1; - dst -= 3 - 1; - - ivs[0] = src[0]; - ivs[1] = src[1]; - - twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); - - u128_xor(dst + 1, dst + 1, ivs + 0); - u128_xor(dst + 2, dst + 2, ivs + 1); - - nbytes -= bsize; - if (nbytes < bsize) - goto done; - - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; - } while (nbytes >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - for (;;) { - twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src); - - nbytes -= bsize; - if (nbytes < bsize) - break; + be128 ctrblks[3]; - u128_xor(dst, dst, src - 1); - src -= 1; - dst -= 1; + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; } -done: - u128_xor(dst, dst, (u128 *)walk->iv); - *(u128 *)walk->iv = last_iv; + u128_to_be128(&ctrblks[0], iv); + u128_inc(iv); + u128_to_be128(&ctrblks[1], iv); + u128_inc(iv); + u128_to_be128(&ctrblks[2], iv); + u128_inc(iv); - return nbytes; + twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); } +EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way); + +static const struct common_glue_ctx twofish_enc = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } + } } +}; -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt(desc, &walk); +static const struct common_glue_ctx twofish_ctr = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) } + } } +}; - while ((nbytes = walk.nbytes)) { - nbytes = __cbc_decrypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } +static const struct common_glue_ctx twofish_dec = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 3, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } + }, { + .num_blocks = 1, + .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } + } } +}; - return err; -} +static const struct common_glue_ctx twofish_dec_cbc = { + .num_funcs = 2, + .fpu_blocks_limit = -1, + + .funcs = { { + .num_blocks = 3, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } + }, { + .num_blocks = 1, + .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } + } } +}; -static inline void u128_to_be128(be128 *dst, const u128 *src) +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = cpu_to_be64(src->a); - dst->b = cpu_to_be64(src->b); + return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); } -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - dst->a = be64_to_cpu(src->a); - dst->b = be64_to_cpu(src->b); + return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); } -static inline void u128_inc(u128 *i) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - i->b++; - if (!i->b) - i->a++; + return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, + dst, src, nbytes); } -static void ctr_crypt_final(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - u8 *ctrblk = walk->iv; - u8 keystream[TF_BLOCK_SIZE]; - u8 *src = walk->src.virt.addr; - u8 *dst = walk->dst.virt.addr; - unsigned int nbytes = walk->nbytes; - - twofish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); - - crypto_inc(ctrblk, TF_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk) -{ - struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - unsigned int bsize = TF_BLOCK_SIZE; - unsigned int nbytes = walk->nbytes; - u128 *src = (u128 *)walk->src.virt.addr; - u128 *dst = (u128 *)walk->dst.virt.addr; - u128 ctrblk; - be128 ctrblocks[3]; - - be128_to_u128(&ctrblk, (be128 *)walk->iv); - - /* Process three block batch */ - if (nbytes >= bsize * 3) { - do { - if (dst != src) { - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - } - - /* create ctrblks for parallel encrypt */ - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - u128_to_be128(&ctrblocks[1], &ctrblk); - u128_inc(&ctrblk); - u128_to_be128(&ctrblocks[2], &ctrblk); - u128_inc(&ctrblk); - - twofish_enc_blk_xor_3way(ctx, (u8 *)dst, - (u8 *)ctrblocks); - - src += 3; - dst += 3; - nbytes -= bsize * 3; - } while (nbytes >= bsize * 3); - - if (nbytes < bsize) - goto done; - } - - /* Handle leftovers */ - do { - if (dst != src) - *dst = *src; - - u128_to_be128(&ctrblocks[0], &ctrblk); - u128_inc(&ctrblk); - - twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); - u128_xor(dst, dst, (u128 *)ctrblocks); - - src += 1; - dst += 1; - nbytes -= bsize; - } while (nbytes >= bsize); - -done: - u128_to_be128((be128 *)walk->iv, &ctrblk); - return nbytes; + return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, + nbytes); } static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct blkcipher_walk walk; - int err; - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE); - - while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) { - nbytes = __ctr_crypt(desc, &walk); - err = blkcipher_walk_done(desc, &walk, nbytes); - } - - if (walk.nbytes) { - ctr_crypt_final(desc, &walk); - err = blkcipher_walk_done(desc, &walk, 0); - } - - return err; + return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); } static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) @@ -397,13 +213,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) twofish_dec_blk(ctx, srcdst, srcdst); } -struct twofish_lrw_ctx { - struct lrw_table_ctx lrw_table; - struct twofish_ctx twofish_ctx; -}; - -static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); int err; @@ -415,6 +226,7 @@ static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE); } +EXPORT_SYMBOL_GPL(lrw_twofish_setkey); static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -450,20 +262,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, return lrw_crypt(desc, dst, src, nbytes, &req); } -static void lrw_exit_tfm(struct crypto_tfm *tfm) +void lrw_twofish_exit_tfm(struct crypto_tfm *tfm) { struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); lrw_free_table(&ctx->lrw_table); } +EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm); -struct twofish_xts_ctx { - struct twofish_ctx tweak_ctx; - struct twofish_ctx crypt_ctx; -}; - -static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; @@ -486,6 +294,7 @@ static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, flags); } +EXPORT_SYMBOL_GPL(xts_twofish_setkey); static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -596,7 +405,7 @@ static struct crypto_alg tf_algs[5] = { { .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(tf_algs[3].cra_list), - .cra_exit = lrw_exit_tfm, + .cra_exit = lrw_twofish_exit_tfm, .cra_u = { .blkcipher = { .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, diff --git a/arch/x86/include/asm/crypto/ablk_helper.h b/arch/x86/include/asm/crypto/ablk_helper.h new file mode 100644 index 0000000..4f93df5 --- /dev/null +++ b/arch/x86/include/asm/crypto/ablk_helper.h @@ -0,0 +1,31 @@ +/* + * Shared async block cipher helpers + */ + +#ifndef _CRYPTO_ABLK_HELPER_H +#define _CRYPTO_ABLK_HELPER_H + +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <crypto/cryptd.h> + +struct async_helper_ctx { + struct cryptd_ablkcipher *cryptd_tfm; +}; + +extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len); + +extern int __ablk_encrypt(struct ablkcipher_request *req); + +extern int ablk_encrypt(struct ablkcipher_request *req); + +extern int ablk_decrypt(struct ablkcipher_request *req); + +extern void ablk_exit(struct crypto_tfm *tfm); + +extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name); + +extern int ablk_init(struct crypto_tfm *tfm); + +#endif /* _CRYPTO_ABLK_HELPER_H */ diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/crypto/aes.h index 80545a1..80545a1 100644 --- a/arch/x86/include/asm/aes.h +++ b/arch/x86/include/asm/crypto/aes.h diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h new file mode 100644 index 0000000..3e408bd --- /dev/null +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -0,0 +1,115 @@ +/* + * Shared glue code for 128bit block ciphers + */ + +#ifndef _CRYPTO_GLUE_HELPER_H +#define _CRYPTO_GLUE_HELPER_H + +#include <linux/kernel.h> +#include <linux/crypto.h> +#include <asm/i387.h> +#include <crypto/b128ops.h> + +typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); +typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); +typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, + u128 *iv); + +#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) +#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) +#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) + +struct common_glue_func_entry { + unsigned int num_blocks; /* number of blocks that @fn will process */ + union { + common_glue_func_t ecb; + common_glue_cbc_func_t cbc; + common_glue_ctr_func_t ctr; + } fn_u; +}; + +struct common_glue_ctx { + unsigned int num_funcs; + int fpu_blocks_limit; /* -1 means fpu not needed at all */ + + /* + * First funcs entry must have largest num_blocks and last funcs entry + * must have num_blocks == 1! + */ + struct common_glue_func_entry funcs[]; +}; + +static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, + struct blkcipher_desc *desc, + bool fpu_enabled, unsigned int nbytes) +{ + if (likely(fpu_blocks_limit < 0)) + return false; + + if (fpu_enabled) + return true; + + /* + * Vector-registers are only used when chunk to be processed is large + * enough, so do not enable FPU until it is necessary. + */ + if (nbytes < bsize * (unsigned int)fpu_blocks_limit) + return false; + + if (desc) { + /* prevent sleeping if FPU is in use */ + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + } + + kernel_fpu_begin(); + return true; +} + +static inline void glue_fpu_end(bool fpu_enabled) +{ + if (fpu_enabled) + kernel_fpu_end(); +} + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes); + +extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, + unsigned int nbytes); + +extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, + unsigned int nbytes); + +extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes); + +#endif /* _CRYPTO_GLUE_HELPER_H */ diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h new file mode 100644 index 0000000..432deed --- /dev/null +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -0,0 +1,32 @@ +#ifndef ASM_X86_SERPENT_AVX_H +#define ASM_X86_SERPENT_AVX_H + +#include <linux/crypto.h> +#include <crypto/serpent.h> + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way_avx(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way_avx(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_8way_avx(ctx, dst, src); +} + +#endif diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/crypto/serpent-sse2.h index d3ef63f..e6e77df 100644 --- a/arch/x86/include/asm/serpent.h +++ b/arch/x86/include/asm/crypto/serpent-sse2.h @@ -1,5 +1,5 @@ -#ifndef ASM_X86_SERPENT_H -#define ASM_X86_SERPENT_H +#ifndef ASM_X86_SERPENT_SSE2_H +#define ASM_X86_SERPENT_SSE2_H #include <linux/crypto.h> #include <crypto/serpent.h> diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h new file mode 100644 index 0000000..9d2c514 --- /dev/null +++ b/arch/x86/include/asm/crypto/twofish.h @@ -0,0 +1,46 @@ +#ifndef ASM_X86_TWOFISH_H +#define ASM_X86_TWOFISH_H + +#include <linux/crypto.h> +#include <crypto/twofish.h> +#include <crypto/lrw.h> +#include <crypto/b128ops.h> + +struct twofish_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct twofish_ctx twofish_ctx; +}; + +struct twofish_xts_ctx { + struct twofish_ctx tweak_ctx; + struct twofish_ctx crypt_ctx; +}; + +/* regular block cipher functions from twofish_x86_64 module */ +asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +/* helpers from twofish_x86_64-3way module */ +extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); +extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, + u128 *iv); +extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, + u128 *iv); + +extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen); + +extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); + +extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen); + +#endif /* ASM_X86_TWOFISH_H */ |