From d64a9255590b1462b95f79aade2cd6193a7f06f7 Mon Sep 17 00:00:00 2001 From: jmg Date: Sat, 16 Nov 2013 09:01:24 +0000 Subject: MFC r257757: fix issues w/ AES-NI on unaligned data blocks... Approved by: re (kib) --- sys/crypto/aesni/aesencdec.h | 12 ++--- sys/crypto/aesni/aesni.h | 39 +++++++-------- sys/crypto/aesni/aesni_wrap.c | 111 +++++++++++++++++++++++++++--------------- 3 files changed, 95 insertions(+), 67 deletions(-) diff --git a/sys/crypto/aesni/aesencdec.h b/sys/crypto/aesni/aesencdec.h index 0c9bf5f..5e4f128 100644 --- a/sys/crypto/aesni/aesencdec.h +++ b/sys/crypto/aesni/aesencdec.h @@ -30,11 +30,10 @@ #include static inline void -aesni_enc8(int rounds, const uint8_t *key_schedule, __m128i a, +aesni_enc8(int rounds, const __m128i *keysched, __m128i a, __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g, __m128i h, __m128i out[8]) { - const __m128i *keysched = (const __m128i *)key_schedule; int i; a ^= keysched[0]; @@ -68,11 +67,10 @@ aesni_enc8(int rounds, const uint8_t *key_schedule, __m128i a, } static inline void -aesni_dec8(int rounds, const uint8_t *key_schedule, __m128i a, +aesni_dec8(int rounds, const __m128i *keysched, __m128i a, __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g, __m128i h, __m128i out[8]) { - const __m128i *keysched = (const __m128i *)key_schedule; int i; a ^= keysched[0]; @@ -106,10 +104,9 @@ aesni_dec8(int rounds, const uint8_t *key_schedule, __m128i a, } static inline __m128i -aesni_enc(int rounds, const uint8_t *key_schedule, const __m128i from) +aesni_enc(int rounds, const __m128i *keysched, const __m128i from) { __m128i tmp; - const __m128i *keysched = (const __m128i *)key_schedule; int i; tmp = from ^ keysched[0]; @@ -121,10 +118,9 @@ aesni_enc(int rounds, const uint8_t *key_schedule, const __m128i from) } static inline __m128i -aesni_dec(int rounds, const uint8_t *key_schedule, const __m128i from) +aesni_dec(int rounds, const __m128i *keysched, const __m128i from) { __m128i tmp; - const __m128i *keysched = (const __m128i *)key_schedule; int i; tmp = from ^ keysched[0]; diff --git a/sys/crypto/aesni/aesni.h b/sys/crypto/aesni/aesni.h index 17ca9c5..99edd42 100644 --- a/sys/crypto/aesni/aesni.h +++ b/sys/crypto/aesni/aesni.h @@ -71,29 +71,30 @@ struct aesni_session { /* * Internal functions, implemented in assembler. */ -void aesni_set_enckey(const uint8_t *userkey, uint8_t *encrypt_schedule, - int number_of_rounds); -void aesni_set_deckey(const uint8_t *encrypt_schedule, - uint8_t *decrypt_schedule, int number_of_rounds); +void aesni_set_enckey(const uint8_t *userkey, + uint8_t *encrypt_schedule /*__aligned(16)*/, int number_of_rounds); +void aesni_set_deckey(const uint8_t *encrypt_schedule /*__aligned(16)*/, + uint8_t *decrypt_schedule /*__aligned(16)*/, int number_of_rounds); /* * Slightly more public interfaces. */ -void aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len, - const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]); -void aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len, - uint8_t *buf, const uint8_t iv[AES_BLOCK_LEN]); -void aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len, - const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]); -void aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len, - const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]); - -void aesni_encrypt_xts(int rounds, const void *data_schedule, - const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to, - const uint8_t iv[AES_BLOCK_LEN]); -void aesni_decrypt_xts(int rounds, const void *data_schedule, - const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to, +void aesni_encrypt_cbc(int rounds, const void *key_schedule /*__aligned(16)*/, + size_t len, const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]); +void aesni_decrypt_cbc(int rounds, const void *key_schedule /*__aligned(16)*/, + size_t len, uint8_t *buf, const uint8_t iv[AES_BLOCK_LEN]); +void aesni_encrypt_ecb(int rounds, const void *key_schedule /*__aligned(16)*/, + size_t len, const uint8_t *from, uint8_t *to); +void aesni_decrypt_ecb(int rounds, const void *key_schedule /*__aligned(16)*/, + size_t len, const uint8_t *from, uint8_t *to); + +void aesni_encrypt_xts(int rounds, const void *data_schedule /*__aligned(16)*/, + const void *tweak_schedule /*__aligned(16)*/, size_t len, + const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]); +void aesni_decrypt_xts(int rounds, const void *data_schedule /*__aligned(16)*/, + const void *tweak_schedule /*__aligned(16)*/, size_t len, + const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]); int aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini); @@ -103,4 +104,4 @@ int aesni_cipher_process(struct aesni_session *ses, uint8_t *aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp, int *allocated); -#endif +#endif /* _AESNI_H_ */ diff --git a/sys/crypto/aesni/aesni_wrap.c b/sys/crypto/aesni/aesni_wrap.c index 197baf7..4f5acf3 100644 --- a/sys/crypto/aesni/aesni_wrap.c +++ b/sys/crypto/aesni/aesni_wrap.c @@ -41,6 +41,10 @@ __FBSDID("$FreeBSD$"); MALLOC_DECLARE(M_AESNI); +struct blocks8 { + __m128i blk[8]; +} __packed; + void aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len, const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]) @@ -65,20 +69,20 @@ aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len, uint8_t *buf, const uint8_t iv[AES_BLOCK_LEN]) { __m128i blocks[8]; - __m128i *bufs; + struct blocks8 *blks; __m128i ivreg, nextiv; size_t i, j, cnt; ivreg = _mm_loadu_si128((const __m128i *)iv); cnt = len / AES_BLOCK_LEN / 8; for (i = 0; i < cnt; i++) { - bufs = (__m128i *)buf; - aesni_dec8(rounds - 1, key_schedule, bufs[0], bufs[1], - bufs[2], bufs[3], bufs[4], bufs[5], bufs[6], - bufs[7], &blocks[0]); + blks = (struct blocks8 *)buf; + aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], + blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], + blks->blk[6], blks->blk[7], &blocks[0]); for (j = 0; j < 8; j++) { - nextiv = bufs[j]; - bufs[j] = blocks[j] ^ ivreg; + nextiv = blks->blk[j]; + blks->blk[j] = blocks[j] ^ ivreg; ivreg = nextiv; } buf += AES_BLOCK_LEN * 8; @@ -86,9 +90,9 @@ aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len, i *= 8; cnt = len / AES_BLOCK_LEN; for (; i < cnt; i++) { - bufs = (__m128i *)buf; - nextiv = bufs[0]; - bufs[0] = aesni_dec(rounds - 1, key_schedule, bufs[0]) ^ ivreg; + nextiv = _mm_loadu_si128((void *)buf); + _mm_storeu_si128((void *)buf, + aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg); ivreg = nextiv; buf += AES_BLOCK_LEN; } @@ -99,15 +103,26 @@ aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len, const uint8_t *from, uint8_t *to) { __m128i tot; - const __m128i *blocks; + __m128i tout[8]; + struct blocks8 *top; + const struct blocks8 *blks; size_t i, cnt; cnt = len / AES_BLOCK_LEN / 8; for (i = 0; i < cnt; i++) { - blocks = (const __m128i *)from; - aesni_enc8(rounds - 1, key_schedule, blocks[0], blocks[1], - blocks[2], blocks[3], blocks[4], blocks[5], blocks[6], - blocks[7], (__m128i *)to); + blks = (const struct blocks8 *)from; + top = (struct blocks8 *)to; + aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], + blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], + blks->blk[6], blks->blk[7], tout); + top->blk[0] = tout[0]; + top->blk[1] = tout[1]; + top->blk[2] = tout[2]; + top->blk[3] = tout[3]; + top->blk[4] = tout[4]; + top->blk[5] = tout[5]; + top->blk[6] = tout[6]; + top->blk[7] = tout[7]; from += AES_BLOCK_LEN * 8; to += AES_BLOCK_LEN * 8; } @@ -127,15 +142,26 @@ aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len, const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]) { __m128i tot; - const __m128i *blocks; + __m128i tout[8]; + const struct blocks8 *blks; + struct blocks8 *top; size_t i, cnt; cnt = len / AES_BLOCK_LEN / 8; for (i = 0; i < cnt; i++) { - blocks = (const __m128i *)from; - aesni_dec8(rounds - 1, key_schedule, blocks[0], blocks[1], - blocks[2], blocks[3], blocks[4], blocks[5], blocks[6], - blocks[7], (__m128i *)to); + blks = (const struct blocks8 *)from; + top = (struct blocks8 *)to; + aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], + blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], + blks->blk[6], blks->blk[7], tout); + top->blk[0] = tout[0]; + top->blk[1] = tout[1]; + top->blk[2] = tout[2]; + top->blk[3] = tout[3]; + top->blk[4] = tout[4]; + top->blk[5] = tout[5]; + top->blk[6] = tout[6]; + top->blk[7] = tout[7]; from += AES_BLOCK_LEN * 8; to += AES_BLOCK_LEN * 8; } @@ -173,31 +199,33 @@ xts_crank_lfsr(__m128i inp) } static void -aesni_crypt_xts_block(int rounds, const void *key_schedule, __m128i *tweak, - const __m128i *from, __m128i *to, int do_encrypt) +aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak, + const uint8_t *from, uint8_t *to, int do_encrypt) { __m128i block; - block = *from ^ *tweak; + block = _mm_loadu_si128((const __m128i *)from) ^ *tweak; if (do_encrypt) block = aesni_enc(rounds - 1, key_schedule, block); else block = aesni_dec(rounds - 1, key_schedule, block); - *to = block ^ *tweak; + _mm_storeu_si128((__m128i *)to, block ^ *tweak); *tweak = xts_crank_lfsr(*tweak); } static void -aesni_crypt_xts_block8(int rounds, const void *key_schedule, __m128i *tweak, - const __m128i *from, __m128i *to, int do_encrypt) +aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak, + const uint8_t *from, uint8_t *to, int do_encrypt) { __m128i tmptweak; __m128i a, b, c, d, e, f, g, h; __m128i tweaks[8]; __m128i tmp[8]; + __m128i *top; + const __m128i *fromp; tmptweak = *tweak; @@ -205,10 +233,12 @@ aesni_crypt_xts_block8(int rounds, const void *key_schedule, __m128i *tweak, * unroll the loop. This lets gcc put values directly in the * register and saves memory accesses. */ + fromp = (const __m128i *)from; #define PREPINP(v, pos) \ do { \ tweaks[(pos)] = tmptweak; \ - (v) = from[(pos)] ^ tmptweak; \ + (v) = _mm_loadu_si128(&fromp[pos]) ^ \ + tmptweak; \ tmptweak = xts_crank_lfsr(tmptweak); \ } while (0) PREPINP(a, 0); @@ -228,20 +258,21 @@ aesni_crypt_xts_block8(int rounds, const void *key_schedule, __m128i *tweak, aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h, tmp); - to[0] = tmp[0] ^ tweaks[0]; - to[1] = tmp[1] ^ tweaks[1]; - to[2] = tmp[2] ^ tweaks[2]; - to[3] = tmp[3] ^ tweaks[3]; - to[4] = tmp[4] ^ tweaks[4]; - to[5] = tmp[5] ^ tweaks[5]; - to[6] = tmp[6] ^ tweaks[6]; - to[7] = tmp[7] ^ tweaks[7]; + top = (__m128i *)to; + _mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]); + _mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]); + _mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]); + _mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]); + _mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]); + _mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]); + _mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]); + _mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]); } static void -aesni_crypt_xts(int rounds, const void *data_schedule, - const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to, - const uint8_t iv[AES_BLOCK_LEN], int do_encrypt) +aesni_crypt_xts(int rounds, const __m128i *data_schedule, + const __m128i *tweak_schedule, size_t len, const uint8_t *from, + uint8_t *to, const uint8_t iv[AES_BLOCK_LEN], int do_encrypt) { __m128i tweakreg; uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16); @@ -264,7 +295,7 @@ aesni_crypt_xts(int rounds, const void *data_schedule, cnt = len / AES_XTS_BLOCKSIZE / 8; for (i = 0; i < cnt; i++) { aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg, - (const __m128i *)from, (__m128i *)to, do_encrypt); + from, to, do_encrypt); from += AES_XTS_BLOCKSIZE * 8; to += AES_XTS_BLOCKSIZE * 8; } @@ -272,7 +303,7 @@ aesni_crypt_xts(int rounds, const void *data_schedule, cnt = len / AES_XTS_BLOCKSIZE; for (; i < cnt; i++) { aesni_crypt_xts_block(rounds, data_schedule, &tweakreg, - (const __m128i *)from, (__m128i *)to, do_encrypt); + from, to, do_encrypt); from += AES_XTS_BLOCKSIZE; to += AES_XTS_BLOCKSIZE; } -- cgit v1.1