summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatt Smith <mgsmith@netgate.com>2015-11-18 10:30:53 -0600
committerMatt Smith <mgsmith@netgate.com>2015-11-18 10:30:53 -0600
commit46d9d0730a66d3f3a205a5b621ce0fa7dec20624 (patch)
tree6d5262aaacd99b7a4d7b60fab7d65ba520ad9939
parentd5adcd9f5d56621a1c4250793234580167990202 (diff)
downloadFreeBSD-src-46d9d0730a66d3f3a205a5b621ce0fa7dec20624.zip
FreeBSD-src-46d9d0730a66d3f3a205a5b621ce0fa7dec20624.tar.gz
Importing pfSense patch aesgcm.hwaccl.diff
-rw-r--r--sys/crypto/aesni/aesni.c281
-rw-r--r--sys/crypto/aesni/aesni.h14
-rw-r--r--sys/crypto/aesni/aesni_ghash.c511
-rw-r--r--sys/crypto/aesni/aesni_wrap.c37
-rw-r--r--sys/modules/aesni/Makefile8
5 files changed, 792 insertions, 59 deletions
diff --git a/sys/crypto/aesni/aesni.c b/sys/crypto/aesni/aesni.c
index 7d7a740..77c0236 100644
--- a/sys/crypto/aesni/aesni.c
+++ b/sys/crypto/aesni/aesni.c
@@ -42,6 +42,8 @@ __FBSDID("$FreeBSD$");
#include <crypto/aesni/aesni.h>
#include <cryptodev_if.h>
+#include <opencrypto/gmac.h>
+
struct aesni_softc {
int32_t cid;
uint32_t sid;
@@ -56,7 +58,7 @@ static void aesni_freesession_locked(struct aesni_softc *sc,
static int aesni_cipher_setup(struct aesni_session *ses,
struct cryptoini *encini);
static int aesni_cipher_process(struct aesni_session *ses,
- struct cryptodesc *enccrd, struct cryptop *crp);
+ struct cryptodesc *enccrd, struct cryptodesc *authcrd, struct cryptop *crp);
MALLOC_DEFINE(M_AESNI, "aesni_data", "AESNI Data");
@@ -79,12 +81,12 @@ aesni_probe(device_t dev)
return (EINVAL);
}
- if ((cpu_feature & CPUID_SSE2) == 0) {
- device_printf(dev, "No SSE2 support but AESNI!?!\n");
+ /*
+ * The SSE4.1 flag lives in cpu_feature2; testing the same bit in
+ * cpu_feature matches an unrelated flag and lets the probe succeed
+ * on CPUs without SSE4.1.
+ */
+ if ((cpu_feature2 & CPUID2_SSE41) == 0) {
+ device_printf(dev, "No SSE4.1 support.\n");
return (EINVAL);
}
- device_set_desc_copy(dev, "AES-CBC,AES-XTS");
+ device_set_desc_copy(dev, "AES-CBC,AES-XTS,AES-GCM");
return (0);
}
@@ -106,6 +108,10 @@ aesni_attach(device_t dev)
rw_init(&sc->lock, "aesni_lock");
crypto_register(sc->cid, CRYPTO_AES_CBC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_XTS, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_RFC4106_GCM_16, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_128_GMAC, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_192_GMAC, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_256_GMAC, 0, 0);
return (0);
}
@@ -127,7 +133,8 @@ aesni_detach(device_t dev)
}
while ((ses = TAILQ_FIRST(&sc->sessions)) != NULL) {
TAILQ_REMOVE(&sc->sessions, ses, next);
- fpu_kern_free_ctx(ses->fpu_ctx);
+ if (ses->fpu_ctx != NULL)
+ fpu_kern_free_ctx(ses->fpu_ctx);
free(ses, M_AESNI);
}
rw_wunlock(&sc->lock);
@@ -144,8 +151,10 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
struct cryptoini *encini;
int error;
- if (sidp == NULL || cri == NULL)
+ if (sidp == NULL || cri == NULL) {
+ printf("no sidp or cri");
return (EINVAL);
+ }
sc = device_get_softc(dev);
ses = NULL;
@@ -153,17 +162,37 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
for (; cri != NULL; cri = cri->cri_next) {
switch (cri->cri_alg) {
case CRYPTO_AES_CBC:
+ if (encini != NULL) {
+ printf("encini already set");
+ return (EINVAL);
+ }
+ encini = cri;
+ break;
case CRYPTO_AES_XTS:
- if (encini != NULL)
+ case CRYPTO_AES_RFC4106_GCM_16:
+ if (encini != NULL) {
+ printf("encini already set");
return (EINVAL);
- encini = cri;
+ }
+ encini = cri;
+ break;
+ case CRYPTO_AES_128_GMAC:
+ case CRYPTO_AES_192_GMAC:
+ case CRYPTO_AES_256_GMAC:
+ /*
+ * nothing to do here, maybe in the future cache some
+ * values for GHASH
+ */
break;
default:
+ printf("unhandled algorithm");
return (EINVAL);
}
}
- if (encini == NULL)
+ if (encini == NULL) {
+ printf("no cipher");
return (EINVAL);
+ }
rw_wlock(&sc->lock);
/*
@@ -195,6 +224,7 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
error = aesni_cipher_setup(ses, encini);
if (error != 0) {
+ printf("setup failed");
rw_wlock(&sc->lock);
aesni_freesession_locked(sc, ses);
rw_wunlock(&sc->lock);
@@ -248,11 +278,13 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
{
struct aesni_softc *sc = device_get_softc(dev);
struct aesni_session *ses = NULL;
- struct cryptodesc *crd, *enccrd;
- int error;
+ struct cryptodesc *crd, *enccrd, *authcrd;
+ int error, needauth;
error = 0;
enccrd = NULL;
+ authcrd = NULL;
+ needauth = 0;
/* Sanity check. */
if (crp == NULL)
@@ -273,11 +305,40 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
}
enccrd = crd;
break;
+
+ case CRYPTO_AES_RFC4106_GCM_16:
+ if (enccrd != NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ enccrd = crd;
+ needauth = 1;
+ break;
+
+ case CRYPTO_AES_128_GMAC:
+ case CRYPTO_AES_192_GMAC:
+ case CRYPTO_AES_256_GMAC:
+ if (authcrd != NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ authcrd = crd;
+ needauth = 1;
+ break;
+
default:
return (EINVAL);
}
}
- if (enccrd == NULL || (enccrd->crd_len % AES_BLOCK_LEN) != 0) {
+
+ /* A cipher descriptor is mandatory; GCM/GMAC also need the auth one. */
+ if (enccrd == NULL || (needauth && authcrd == NULL)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * CBC & XTS can only handle full blocks for now. Select on the
+ * descriptor's algorithm (crd_alg), not on its length.
+ */
+ if ((enccrd->crd_alg == CRYPTO_AES_CBC || enccrd->crd_alg ==
+ CRYPTO_AES_XTS) && (enccrd->crd_len % AES_BLOCK_LEN) != 0) {
error = EINVAL;
goto out;
}
@@ -293,7 +354,7 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
goto out;
}
- error = aesni_cipher_process(ses, enccrd, crp);
+ error = aesni_cipher_process(ses, enccrd, authcrd, crp);
if (error != 0)
goto out;
@@ -366,6 +427,7 @@ aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini)
int error;
td = curthread;
+ critical_enter();
error = fpu_kern_enter(td, ses->fpu_ctx, FPU_KERN_NORMAL |
FPU_KERN_KTHR);
if (error != 0)
@@ -373,76 +435,193 @@ aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini)
error = aesni_cipher_setup_common(ses, encini->cri_key,
encini->cri_klen);
fpu_kern_leave(td, ses->fpu_ctx);
+ critical_exit();
return (error);
}
+#ifdef AESNI_DEBUG
+/*
+ * Debug helper: print len bytes starting at ptr as two-digit lowercase
+ * hex, with no separators and no trailing newline.
+ */
+static void
+aesni_printhexstr(uint8_t *ptr, int len)
+{
+
+ while (len-- > 0)
+ printf("%02hhx", *ptr++);
+}
+#endif
+
static int
aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
- struct cryptop *crp)
+ struct cryptodesc *authcrd, struct cryptop *crp)
{
+ uint8_t tag[GMAC_DIGEST_LEN];
+ uint8_t iv[AES_BLOCK_LEN] __aligned(16);
struct thread *td;
- uint8_t *buf;
- int error, allocated;
+ uint8_t *buf, *authbuf;
+ int error, allocated, authallocated;
+ int ivlen, encflag, i;
+
+ encflag = (enccrd->crd_flags & CRD_F_ENCRYPT) == CRD_F_ENCRYPT;
buf = aesni_cipher_alloc(enccrd, crp, &allocated);
if (buf == NULL)
return (ENOMEM);
- td = curthread;
- error = fpu_kern_enter(td, ses->fpu_ctx, FPU_KERN_NORMAL |
- FPU_KERN_KTHR);
- if (error != 0)
- goto out1;
+ authbuf = NULL;
+ authallocated = 0;
+ if (authcrd != NULL) {
+ authbuf = aesni_cipher_alloc(authcrd, crp, &authallocated);
+ if (authbuf == NULL) {
+ error = ENOMEM;
+ goto out1;
+ }
+ }
- if ((enccrd->crd_flags & CRD_F_KEY_EXPLICIT) != 0) {
- error = aesni_cipher_setup_common(ses, enccrd->crd_key,
- enccrd->crd_klen);
- if (error != 0)
- goto out;
+ /* XXX - validate that enccrd and authcrd have/use same key? */
+ /* Pick the IV length used by the IV setup and the cipher call below. */
+ switch (enccrd->crd_alg) {
+ case CRYPTO_AES_CBC:
+ ivlen = 16;
+ break;
+ case CRYPTO_AES_XTS:
+ ivlen = 8;
+ break;
+ case CRYPTO_AES_RFC4106_GCM_16:
+ ivlen = 12; /* should support arbitrarily larger */
+ break;
+ default:
+ /* Never continue with an uninitialized ivlen. */
+ error = EINVAL;
+ goto out1;
}
- if ((enccrd->crd_flags & CRD_F_ENCRYPT) != 0) {
+ /*
+ * Set up the per-request IV in the local iv[] buffer.
+ *
+ * Encrypt: use the caller's explicit IV if given, otherwise generate
+ * one (nonce + per-session counter for GCM, random bytes for the
+ * others). Unless the caller says the IV is already in the buffer
+ * (CRD_F_IV_PRESENT), write it out at crd_inject. For GCM only the
+ * AESCTR_IVSIZE bytes following the nonce are written, mirroring what
+ * the decrypt path reads back.
+ * NOTE(review): this assumes an explicit GCM crd_iv carries the
+ * nonce in its first AESCTR_NONCESIZE bytes - confirm against callers.
+ */
+ if (encflag) {
if ((enccrd->crd_flags & CRD_F_IV_EXPLICIT) != 0)
- bcopy(enccrd->crd_iv, ses->iv, AES_BLOCK_LEN);
- if ((enccrd->crd_flags & CRD_F_IV_PRESENT) == 0)
- crypto_copyback(crp->crp_flags, crp->crp_buf,
- enccrd->crd_inject, AES_BLOCK_LEN, ses->iv);
- if (ses->algo == CRYPTO_AES_CBC) {
- aesni_encrypt_cbc(ses->rounds, ses->enc_schedule,
- enccrd->crd_len, buf, buf, ses->iv);
- } else /* if (ses->algo == CRYPTO_AES_XTS) */ {
- aesni_encrypt_xts(ses->rounds, ses->enc_schedule,
- ses->xts_schedule, enccrd->crd_len, buf, buf,
- ses->iv);
+ bcopy(enccrd->crd_iv, iv, ivlen);
+ else if (enccrd->crd_alg == CRYPTO_AES_RFC4106_GCM_16) {
+ uint64_t ctr;
+
+ for (i = 0; i < AESCTR_NONCESIZE; i++)
+ iv[i] = ses->nonce[i];
+ /*
+ * Take the counter value from the atomic operation
+ * itself; re-reading the shared counter afterwards
+ * lets two threads pick up the same value and reuse
+ * a GCM nonce.
+ */
+ ctr = atomic_fetchadd_long(&ses->aesgcmcounter, 1) + 1;
+ bcopy(&ctr, iv + AESCTR_NONCESIZE, sizeof(ctr));
+ } else
+ arc4rand(iv, AES_BLOCK_LEN, 0);
+ if ((enccrd->crd_flags & CRD_F_IV_PRESENT) == 0) {
+ if (enccrd->crd_alg == CRYPTO_AES_RFC4106_GCM_16)
+ crypto_copyback(crp->crp_flags, crp->crp_buf,
+ enccrd->crd_inject, AESCTR_IVSIZE,
+ iv + AESCTR_NONCESIZE);
+ else
+ crypto_copyback(crp->crp_flags, crp->crp_buf,
+ enccrd->crd_inject, ivlen, iv);
}
} else {
if ((enccrd->crd_flags & CRD_F_IV_EXPLICIT) != 0)
- bcopy(enccrd->crd_iv, ses->iv, AES_BLOCK_LEN);
+ bcopy(enccrd->crd_iv, iv, ivlen);
+ else {
+ if (enccrd->crd_alg == CRYPTO_AES_RFC4106_GCM_16) {
+ for (i = 0; i < AESCTR_NONCESIZE; i++)
+ iv[i] = ses->nonce[i];
+ crypto_copydata(crp->crp_flags, crp->crp_buf,
+ enccrd->crd_inject, AESCTR_IVSIZE, iv + AESCTR_NONCESIZE);
+ } else
+ crypto_copydata(crp->crp_flags, crp->crp_buf,
+ enccrd->crd_inject, ivlen, iv);
+ }
+ }
+#ifdef AESNI_DEBUG
+ aesni_printhexstr(iv, ivlen);
+ printf("\n");
+#endif
+
+ if (authcrd != NULL && !encflag) {
+ crypto_copydata(crp->crp_flags, crp->crp_buf,
+ authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
+ } else {
+#ifdef AESNI_DEBUG
+ printf("ptag: ");
+ aesni_printhexstr(tag, sizeof tag);
+ printf("\n");
+#endif
+ bzero(tag, sizeof tag);
+ }
+
+ td = curthread;
+
+ critical_enter();
+ error = fpu_kern_enter(td, ses->fpu_ctx, FPU_KERN_NORMAL |
+ FPU_KERN_KTHR);
+ if (error != 0) {
+ /* Leave the critical section before bailing out. */
+ critical_exit();
+ goto out1;
+ }
+ /* Do work */
+ switch (ses->algo) {
+ case CRYPTO_AES_CBC:
+ if (encflag)
+ aesni_encrypt_cbc(ses->rounds, ses->enc_schedule,
+ enccrd->crd_len, buf, buf, iv);
else
- crypto_copydata(crp->crp_flags, crp->crp_buf,
- enccrd->crd_inject, AES_BLOCK_LEN, ses->iv);
- if (ses->algo == CRYPTO_AES_CBC) {
aesni_decrypt_cbc(ses->rounds, ses->dec_schedule,
- enccrd->crd_len, buf, ses->iv);
- } else /* if (ses->algo == CRYPTO_AES_XTS) */ {
+ enccrd->crd_len, buf, iv);
+ break;
+ case CRYPTO_AES_XTS:
+ if (encflag)
+ aesni_encrypt_xts(ses->rounds, ses->enc_schedule,
+ ses->xts_schedule, enccrd->crd_len, buf, buf,
+ iv);
+ else
aesni_decrypt_xts(ses->rounds, ses->dec_schedule,
ses->xts_schedule, enccrd->crd_len, buf, buf,
- ses->iv);
+ iv);
+ break;
+ case CRYPTO_AES_RFC4106_GCM_16:
+#ifdef AESNI_DEBUG
+ printf("GCM: %d\n", encflag);
+ printf("buf(%d): ", enccrd->crd_len);
+ aesni_printhexstr(buf, enccrd->crd_len);
+ printf("\nauthbuf(%d): ", authcrd->crd_len);
+ aesni_printhexstr(authbuf, authcrd->crd_len);
+ printf("\niv: ");
+ aesni_printhexstr(iv, ivlen);
+ printf("\ntag: ");
+ aesni_printhexstr(tag, 16);
+ printf("\nsched: ");
+ aesni_printhexstr(ses->enc_schedule, 16 * (ses->rounds + 1));
+ printf("\n");
+#endif
+ if (encflag)
+ AES_GCM_encrypt(buf, buf, authbuf, iv, tag,
+ enccrd->crd_len, authcrd->crd_len, ivlen,
+ ses->enc_schedule, ses->rounds);
+ else {
+ if (!AES_GCM_decrypt(buf, buf, authbuf, iv, tag,
+ enccrd->crd_len, authcrd->crd_len, ivlen,
+ ses->enc_schedule, ses->rounds))
+ error = EBADMSG;
}
+ break;
}
+ fpu_kern_leave(td, ses->fpu_ctx);
+ critical_exit();
+
if (allocated)
crypto_copyback(crp->crp_flags, crp->crp_buf, enccrd->crd_skip,
enccrd->crd_len, buf);
- if ((enccrd->crd_flags & CRD_F_ENCRYPT) != 0)
- crypto_copydata(crp->crp_flags, crp->crp_buf,
- enccrd->crd_skip + enccrd->crd_len - AES_BLOCK_LEN,
- AES_BLOCK_LEN, ses->iv);
-out:
- fpu_kern_leave(td, ses->fpu_ctx);
+
+#if 0
+ /*
+ * OpenBSD doesn't copy this back. This primes the IV for the next
+ * chain. Why do we not do it for decrypt?
+ */
+ if (encflag && enccrd->crd_alg == CRYPTO_AES_CBC)
+ bcopy(buf + enccrd->crd_len - AES_BLOCK_LEN, iv, AES_BLOCK_LEN);
+
+#endif
+ if (!error && authcrd != NULL) {
+ crypto_copyback(crp->crp_flags, crp->crp_buf,
+ authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
+ }
+
out1:
if (allocated) {
bzero(buf, enccrd->crd_len);
free(buf, M_AESNI);
}
+ if (authallocated)
+ free(authbuf, M_AESNI);
+
return (error);
}
diff --git a/sys/crypto/aesni/aesni.h b/sys/crypto/aesni/aesni.h
index ff1d1a2..a0b1c53 100644
--- a/sys/crypto/aesni/aesni.h
+++ b/sys/crypto/aesni/aesni.h
@@ -56,7 +56,9 @@ struct aesni_session {
uint8_t enc_schedule[AES_SCHED_LEN] __aligned(16);
uint8_t dec_schedule[AES_SCHED_LEN] __aligned(16);
uint8_t xts_schedule[AES_SCHED_LEN] __aligned(16);
- uint8_t iv[AES_BLOCK_LEN];
+ /* AES-GCM needs a counter hence the separated enc/dec IV */
+ uint8_t nonce[4];
+ volatile uint64_t aesgcmcounter;
int algo;
int rounds;
/* uint8_t *ses_ictx; */
@@ -96,6 +98,16 @@ void aesni_decrypt_xts(int rounds, const void *data_schedule /*__aligned(16)*/,
const void *tweak_schedule /*__aligned(16)*/, size_t len,
const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]);
+/* GCM & GHASH functions */
+void AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
+ const unsigned char *addt, const unsigned char *ivec,
+ unsigned char *tag, int nbytes, int abytes, int ibytes,
+ const unsigned char *key, int nr);
+int AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
+ const unsigned char *addt, const unsigned char *ivec,
+ unsigned char *tag, int nbytes, int abytes, int ibytes,
+ const unsigned char *key, int nr);
+
int aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
int keylen);
uint8_t *aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp,
diff --git a/sys/crypto/aesni/aesni_ghash.c b/sys/crypto/aesni/aesni_ghash.c
new file mode 100644
index 0000000..d1b27d1
--- /dev/null
+++ b/sys/crypto/aesni/aesni_ghash.c
@@ -0,0 +1,511 @@
+/*-
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by John-Mark Gurney under
+ * the sponsorship from the FreeBSD Foundation.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *
+ * $Id$
+ *
+ */
+
+/*
+ * Figure 5, 8 and 12 are copied from the Intel white paper: Intel
+ * Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode
+ *
+ * and as such are: Copyright © 2010 Intel Corporation. All rights reserved.
+ *
+ * Please see white paper for complete license.
+ */
+
+#ifdef _KERNEL
+#include <crypto/aesni/aesni.h>
+#else
+#include <stdint.h>
+#endif
+
+#include <wmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
+
+/*
+ * Reverse the bit order inside every byte of the 128-bit value X, in place.
+ * The low and high nibbles of each byte are split with AMASK and looked up
+ * in the MASKH/MASKL tables, then recombined.
+ *
+ * Expands to several statements and evaluates X more than once; it also
+ * relies on hlp1, AMASK, MASKH and MASKL being declared in the scope where
+ * it is used. Only use it as a full statement on a plain variable.
+ */
+#define REFLECT(X) \
+ hlp1 = _mm_srli_epi16(X,4);\
+ X = _mm_and_si128(AMASK,X);\
+ hlp1 = _mm_and_si128(AMASK,hlp1);\
+ X = _mm_shuffle_epi8(MASKH,X);\
+ hlp1 = _mm_shuffle_epi8(MASKL,hlp1);\
+ X = _mm_xor_si128(X,hlp1)
+
+/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
+/*
+ * Multiply a by b with carry-less multiplication and store the reduced
+ * 128-bit product in *res. Despite the name, both AES_GCM_encrypt() and
+ * AES_GCM_decrypt() use it for every GHASH step.
+ *
+ * Three 64x64 carry-less products (low halves, high halves, and the XOR of
+ * the halves) are combined into a 256-bit product held in tmp3 (low) and
+ * tmp6 (high); the shift/XOR sequence that follows folds tmp6 back into
+ * tmp3. Callers pass operands that have been through REFLECT().
+ * res may alias a or b: both are passed by value.
+ */
+static void
+gfmul_decrypt(__m128i a, __m128i b, __m128i * res)
+{
+ __m128i /* tmp0, tmp1, tmp2, */ tmp3, tmp4, tmp5, tmp6,
+ tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
+ __m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
+
+ /* Three partial products: lo*lo, hi*hi, (lo^hi)*(lo^hi). */
+ tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
+ tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
+ tmp4 = _mm_shuffle_epi32(a, 78);
+ tmp5 = _mm_shuffle_epi32(b, 78);
+ tmp4 = _mm_xor_si128(tmp4, a);
+ tmp5 = _mm_xor_si128(tmp5, b);
+ tmp4 = _mm_clmulepi64_si128(tmp4, tmp5, 0x00);
+ tmp4 = _mm_xor_si128(tmp4, tmp3);
+ tmp4 = _mm_xor_si128(tmp4, tmp6);
+ /* Fold the middle term into the low (tmp3) and high (tmp6) halves. */
+ tmp5 = _mm_slli_si128(tmp4, 8);
+ tmp4 = _mm_srli_si128(tmp4, 8);
+ tmp3 = _mm_xor_si128(tmp3, tmp5);
+ tmp6 = _mm_xor_si128(tmp6, tmp4);
+ /* Reduction: shift/XOR the high half back into the low half. */
+ tmp7 = _mm_srli_epi32(tmp6, 31);
+ tmp8 = _mm_srli_epi32(tmp6, 30);
+ tmp9 = _mm_srli_epi32(tmp6, 25);
+ tmp7 = _mm_xor_si128(tmp7, tmp8);
+ tmp7 = _mm_xor_si128(tmp7, tmp9);
+ tmp8 = _mm_shuffle_epi32(tmp7, 147);
+
+ tmp7 = _mm_and_si128(XMMMASK, tmp8);
+ tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
+ tmp3 = _mm_xor_si128(tmp3, tmp8);
+ tmp6 = _mm_xor_si128(tmp6, tmp7);
+ tmp10 = _mm_slli_epi32(tmp6, 1);
+ tmp3 = _mm_xor_si128(tmp3, tmp10);
+ tmp11 = _mm_slli_epi32(tmp6, 2);
+ tmp3 = _mm_xor_si128(tmp3, tmp11);
+ tmp12 = _mm_slli_epi32(tmp6, 7);
+ tmp3 = _mm_xor_si128(tmp3, tmp12);
+
+ *res = _mm_xor_si128(tmp3, tmp6);
+}
+
+/*
+ * AES-GCM encryption: encrypt nbytes from in to out with AES in counter
+ * mode and store the 16-byte authentication tag in tag.
+ *
+ * in, out: source and destination, nbytes each (they may be the same
+ * buffer). No more than nbytes are read from in or written to out.
+ * addt: abytes of additional data that is authenticated but not encrypted.
+ * ivec: ibytes of IV. The 12-byte case takes a fast path that loads 16
+ * bytes from ivec before overwriting the last 4 with the initial
+ * counter, so the buffer must be at least 16 bytes long.
+ * tag: receives 16 bytes.
+ * key: expanded AES key schedule, dereferenced as __m128i KEY[0..nr].
+ * nr: number of AES rounds; the round loops process rounds in pairs and
+ * therefore assume nr is even.
+ */
+void
+AES_GCM_encrypt(const unsigned char *in,
+ unsigned char *out,
+ const unsigned char *addt,
+ const unsigned char *ivec,
+ unsigned char *tag,
+ int nbytes,
+ int abytes,
+ int ibytes,
+ const unsigned char *key,
+ int nr)
+{
+ int i , j, k;
+ __m128i hlp1 /* , hlp2, hlp3, hlp4 */ ;
+ __m128i tmp1 , tmp2, tmp3, tmp4;
+ __m128i H , T;
+ __m128i *KEY = (__m128i *) key;
+ __m128i ctr1 , ctr2, ctr3, ctr4;
+ __m128i last_block = _mm_setzero_si128();
+ __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
+ __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
+ __m128i BSWAP_EPI64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
+ 6, 7);
+ /*
+ * __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ * 10, 11, 12, 13, 14, 15);
+ */
+ __m128i X = _mm_setzero_si128(), Y = _mm_setzero_si128();
+ __m128i AMASK = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
+ __m128i MASKL = _mm_set_epi32(0x0f070b03, 0x0d050901, 0x0e060a02, 0x0c040800);
+ __m128i MASKH = _mm_set_epi32(0xf070b030, 0xd0509010, 0xe060a020, 0xc0408000);
+ __m128i MASKF = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
+
+ /* Derive the hash key H = E(K, 0) and T = E(K, Y0). */
+ if (ibytes == 96 / 8) {
+ Y = _mm_loadu_si128((__m128i *) ivec);
+ Y = _mm_insert_epi32(Y, 0x1000000, 3);
+ /* (Compute E[ZERO, KS] and E[Y0, KS] together */
+ tmp1 = _mm_xor_si128(X, KEY[0]);
+ tmp2 = _mm_xor_si128(Y, KEY[0]);
+ for (j = 1; j < nr - 1; j += 2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j + 1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j + 1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr - 1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[nr - 1]);
+ H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ T = _mm_aesenclast_si128(tmp2, KEY[nr]);
+ REFLECT(H);
+ } else {
+ /* Other IV lengths: Y0 is the hash of the IV and its bit length. */
+ tmp1 = _mm_xor_si128(X, KEY[0]);
+ for (j = 1; j < nr; j++)
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ REFLECT(H);
+ Y = _mm_xor_si128(Y, Y);
+ for (i = 0; i < ibytes / 16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i *) ivec)[i]);
+ REFLECT(tmp1);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul_decrypt(Y, H, &Y);
+ }
+ if (ibytes % 16) {
+ for (j = 0; j < ibytes % 16; j++)
+ ((unsigned char *)&last_block)[j] = ivec[i * 16 + j];
+ tmp1 = last_block;
+ REFLECT(tmp1);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul_decrypt(Y, H, &Y);
+ }
+ tmp1 = _mm_insert_epi64(tmp1, ibytes * 8, 0);
+ tmp1 = _mm_insert_epi64(tmp1, 0, 1);
+ REFLECT(tmp1);
+ tmp1 = _mm_shuffle_epi8(tmp1, MASKF);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul_decrypt(Y, H, &Y);
+ REFLECT(Y);
+ /* Compute E(K, Y0) */
+ tmp1 = _mm_xor_si128(Y, KEY[0]);
+ for (j = 1; j < nr; j++)
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ T = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ }
+
+ /* Hash the additional authenticated data, zero-padding the tail. */
+ for (i = 0; i < abytes / 16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i *) addt)[i]);
+ REFLECT(tmp1);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ }
+ if (abytes % 16) {
+ last_block = _mm_setzero_si128();
+ for (j = 0; j < abytes % 16; j++)
+ ((unsigned char *)&last_block)[j] = addt[i * 16 + j];
+ tmp1 = last_block;
+ REFLECT(tmp1);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ }
+ /* Counter-mode encryption, four blocks at a time, hashing the output. */
+ ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, ONE);
+ ctr2 = _mm_add_epi64(ctr1, ONE);
+ ctr3 = _mm_add_epi64(ctr2, ONE);
+ ctr4 = _mm_add_epi64(ctr3, ONE);
+ for (i = 0; i < nbytes / 16 / 4; i++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
+ tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
+ tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, FOUR);
+ ctr2 = _mm_add_epi64(ctr2, FOUR);
+ ctr3 = _mm_add_epi64(ctr3, FOUR);
+ ctr4 = _mm_add_epi64(ctr4, FOUR);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ tmp2 = _mm_xor_si128(tmp2, KEY[0]);
+ tmp3 = _mm_xor_si128(tmp3, KEY[0]);
+ tmp4 = _mm_xor_si128(tmp4, KEY[0]);
+ for (j = 1; j < nr - 1; j += 2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+ tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
+ tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j + 1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j + 1]);
+ tmp3 = _mm_aesenc_si128(tmp3, KEY[j + 1]);
+ tmp4 = _mm_aesenc_si128(tmp4, KEY[j + 1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr - 1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[nr - 1]);
+ tmp3 = _mm_aesenc_si128(tmp3, KEY[nr - 1]);
+ tmp4 = _mm_aesenc_si128(tmp4, KEY[nr - 1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
+ tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
+ tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
+ tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i *) in)[i * 4 + 0]));
+ tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i *) in)[i * 4 + 1]));
+ tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i *) in)[i * 4 + 2]));
+ tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i *) in)[i * 4 + 3]));
+ _mm_storeu_si128(&((__m128i *) out)[i * 4 + 0], tmp1);
+ _mm_storeu_si128(&((__m128i *) out)[i * 4 + 1], tmp2);
+ _mm_storeu_si128(&((__m128i *) out)[i * 4 + 2], tmp3);
+ _mm_storeu_si128(&((__m128i *) out)[i * 4 + 3], tmp4);
+ REFLECT(tmp1);
+ REFLECT(tmp2);
+ REFLECT(tmp3);
+ REFLECT(tmp4);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ X = _mm_xor_si128(X, tmp2);
+ gfmul_decrypt(X, H, &X);
+ X = _mm_xor_si128(X, tmp3);
+ gfmul_decrypt(X, H, &X);
+ X = _mm_xor_si128(X, tmp4);
+ gfmul_decrypt(X, H, &X);
+ }
+ /* Remaining whole blocks, one at a time. */
+ for (k = i * 4; k < nbytes / 16; k++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, ONE);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ for (j = 1; j < nr - 1; j += 2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j + 1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr - 1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i *) in)[k]));
+ _mm_storeu_si128(&((__m128i *) out)[k], tmp1);
+ REFLECT(tmp1);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ }
+ //If one partial block remains
+ if (nbytes % 16) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ for (j = 1; j < nr - 1; j += 2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j + 1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr - 1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ /*
+ * Stage the tail bytes in last_block instead of doing a full
+ * 16-byte load, which would read past the end of in.
+ */
+ last_block = _mm_setzero_si128();
+ for (j = 0; j < nbytes % 16; j++)
+ ((unsigned char *)&last_block)[j] = in[k * 16 + j];
+ last_block = _mm_xor_si128(last_block, tmp1);
+ for (j = 0; j < nbytes % 16; j++)
+ out[k * 16 + j] = ((unsigned char *)&last_block)[j];
+ /* Zero the keystream-only bytes before hashing the block. */
+ for (; j < 16; j++)
+ ((unsigned char *)&last_block)[j] = 0;
+ tmp1 = last_block;
+ REFLECT(tmp1);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ }
+ /* Final GHASH block: bit lengths of the data and of the AAD. */
+ tmp1 = _mm_insert_epi64(tmp1, nbytes * 8, 0);
+ tmp1 = _mm_insert_epi64(tmp1, abytes * 8, 1);
+ REFLECT(tmp1);
+ tmp1 = _mm_shuffle_epi8(tmp1, MASKF);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ REFLECT(X);
+ T = _mm_xor_si128(X, T);
+ _mm_storeu_si128((__m128i *) tag, T);
+}
+
+/*
+ * AES-GCM decryption: verify the 16-byte tag over addt and the ciphertext
+ * in, and only if it matches decrypt nbytes from in to out.
+ *
+ * in, out: source and destination, nbytes each (they may be the same
+ * buffer). No more than nbytes are read from in or written to out.
+ * addt: abytes of additional authenticated data.
+ * ivec: ibytes of IV. The 12-byte case takes a fast path that loads 16
+ * bytes from ivec before overwriting the last 4 with the initial
+ * counter, so the buffer must be at least 16 bytes long.
+ * tag: the 16-byte tag to verify; it is only read.
+ * key: expanded AES key schedule, dereferenced as __m128i KEY[0..nr].
+ * nr: number of AES rounds; the round loops process rounds in pairs and
+ * therefore assume nr is even.
+ *
+ * Returns 1 on success, or 0 if the tag does not match, in which case
+ * nothing is written to out.
+ */
+int
+AES_GCM_decrypt(const unsigned char *in,
+ unsigned char *out,
+ const unsigned char *addt,
+ const unsigned char *ivec,
+ unsigned char *tag,
+ int nbytes,
+ int abytes,
+ int ibytes,
+ const unsigned char *key,
+ int nr)
+{
+ int i , j, k;
+ __m128i hlp1 /* , hlp2, hlp3, hlp4 */ ;
+ __m128i tmp1 , tmp2, tmp3, tmp4;
+ __m128i H , T;
+ __m128i *KEY = (__m128i *) key;
+ __m128i ctr1 , ctr2, ctr3, ctr4;
+ __m128i last_block = _mm_setzero_si128();
+ __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
+ __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
+ __m128i BSWAP_EPI64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
+ 6, 7);
+ __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15);
+ __m128i X = _mm_setzero_si128(), Y = _mm_setzero_si128();
+ __m128i AMASK = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
+ __m128i MASKL = _mm_set_epi32(0x0f070b03, 0x0d050901, 0x0e060a02, 0x0c040800);
+ __m128i MASKH = _mm_set_epi32(0xf070b030, 0xd0509010, 0xe060a020, 0xc0408000);
+ __m128i MASKF = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
+
+ /* Derive the hash key H = E(K, 0) and T = E(K, Y0). */
+ if (ibytes == 96 / 8) {
+ Y = _mm_loadu_si128((__m128i *) ivec);
+ Y = _mm_insert_epi32(Y, 0x1000000, 3);
+ /* (Compute E[ZERO, KS] and E[Y0, KS] together */
+ tmp1 = _mm_xor_si128(X, KEY[0]);
+ tmp2 = _mm_xor_si128(Y, KEY[0]);
+ for (j = 1; j < nr - 1; j += 2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j + 1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j + 1]);
+ };
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr - 1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[nr - 1]);
+ H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ T = _mm_aesenclast_si128(tmp2, KEY[nr]);
+ REFLECT(H);
+ } else {
+ /* Other IV lengths: Y0 is the hash of the IV and its bit length. */
+ tmp1 = _mm_xor_si128(X, KEY[0]);
+ for (j = 1; j < nr; j++)
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ REFLECT(H);
+ Y = _mm_xor_si128(Y, Y);
+ for (i = 0; i < ibytes / 16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i *) ivec)[i]);
+ REFLECT(tmp1);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul_decrypt(Y, H, &Y);
+ }
+ if (ibytes % 16) {
+ for (j = 0; j < ibytes % 16; j++)
+ ((unsigned char *)&last_block)[j] = ivec[i * 16 + j];
+ tmp1 = last_block;
+ REFLECT(tmp1);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul_decrypt(Y, H, &Y);
+ }
+ tmp1 = _mm_insert_epi64(tmp1, ibytes * 8, 0);
+ tmp1 = _mm_insert_epi64(tmp1, 0, 1);
+ REFLECT(tmp1);
+ tmp1 = _mm_shuffle_epi8(tmp1, MASKF);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul_decrypt(Y, H, &Y);
+ REFLECT(Y);
+ /* Compute E(K, Y0) */
+ tmp1 = _mm_xor_si128(Y, KEY[0]);
+ for (j = 1; j < nr; j++)
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ T = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ }
+ /* Hash the additional authenticated data, zero-padding the tail. */
+ for (i = 0; i < abytes / 16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i *) addt)[i]);
+ REFLECT(tmp1);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ }
+ if (abytes % 16) {
+ last_block = _mm_setzero_si128();
+ for (j = 0; j < abytes % 16; j++)
+ ((unsigned char *)&last_block)[j] = addt[i * 16 + j];
+ tmp1 = last_block;
+ REFLECT(tmp1);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ }
+ /* Hash the ciphertext before touching the output buffer. */
+ for (i = 0; i < nbytes / 16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i *) in)[i]);
+ REFLECT(tmp1);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ }
+ if (nbytes % 16) {
+ last_block = _mm_setzero_si128();
+ for (j = 0; j < nbytes % 16; j++)
+ ((unsigned char *)&last_block)[j] = in[i * 16 + j];
+ tmp1 = last_block;
+ REFLECT(tmp1);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ }
+ /* Final GHASH block: bit lengths of the data and of the AAD. */
+ tmp1 = _mm_insert_epi64(tmp1, nbytes * 8, 0);
+ tmp1 = _mm_insert_epi64(tmp1, abytes * 8, 1);
+ REFLECT(tmp1);
+ tmp1 = _mm_shuffle_epi8(tmp1, MASKF);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul_decrypt(X, H, &X);
+ REFLECT(X);
+ T = _mm_xor_si128(X, T);
+ /*
+ * Compare the computed tag with the supplied one: XOR them and
+ * require every bit of the difference to be zero. Testing
+ * (T & tag) == 0 instead would accept almost any forged tag.
+ */
+ T = _mm_xor_si128(T, _mm_loadu_si128((__m128i *) tag));
+ if (!_mm_testz_si128(T, T))
+ return 0; /* authentication failed */
+ /* Tag verified: run counter mode to recover the plaintext. */
+ ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, ONE);
+ ctr2 = _mm_add_epi64(ctr1, ONE);
+ ctr3 = _mm_add_epi64(ctr2, ONE);
+ ctr4 = _mm_add_epi64(ctr3, ONE);
+ for (i = 0; i < nbytes / 16 / 4; i++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
+ tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
+ tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, FOUR);
+ ctr2 = _mm_add_epi64(ctr2, FOUR);
+ ctr3 = _mm_add_epi64(ctr3, FOUR);
+ ctr4 = _mm_add_epi64(ctr4, FOUR);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ tmp2 = _mm_xor_si128(tmp2, KEY[0]);
+ tmp3 = _mm_xor_si128(tmp3, KEY[0]);
+ tmp4 = _mm_xor_si128(tmp4, KEY[0]);
+ for (j = 1; j < nr - 1; j += 2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+ tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
+ tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j + 1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j + 1]);
+ tmp3 = _mm_aesenc_si128(tmp3, KEY[j + 1]);
+ tmp4 = _mm_aesenc_si128(tmp4, KEY[j + 1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr - 1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[nr - 1]);
+ tmp3 = _mm_aesenc_si128(tmp3, KEY[nr - 1]);
+ tmp4 = _mm_aesenc_si128(tmp4, KEY[nr - 1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
+ tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
+ tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
+ tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i *) in)[i * 4 + 0]));
+ tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i *) in)[i * 4 + 1]));
+ tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i *) in)[i * 4 + 2]));
+ tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i *) in)[i * 4 + 3]));
+ _mm_storeu_si128(&((__m128i *) out)[i * 4 + 0], tmp1);
+ _mm_storeu_si128(&((__m128i *) out)[i * 4 + 1], tmp2);
+ _mm_storeu_si128(&((__m128i *) out)[i * 4 + 2], tmp3);
+ _mm_storeu_si128(&((__m128i *) out)[i * 4 + 3], tmp4);
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
+ tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
+ tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
+ }
+ /* Remaining whole blocks, one at a time. */
+ for (k = i * 4; k < nbytes / 16; k++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, ONE);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ for (j = 1; j < nr - 1; j += 2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j + 1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr - 1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i *) in)[k]));
+ _mm_storeu_si128(&((__m128i *) out)[k], tmp1);
+ }
+ //If one partial block remains
+ if (nbytes % 16) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ for (j = 1; j < nr - 1; j += 2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j + 1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr - 1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ /*
+ * Stage the tail bytes in last_block instead of doing a full
+ * 16-byte load, which would read past the end of in.
+ */
+ last_block = _mm_setzero_si128();
+ for (j = 0; j < nbytes % 16; j++)
+ ((unsigned char *)&last_block)[j] = in[k * 16 + j];
+ last_block = _mm_xor_si128(last_block, tmp1);
+ for (j = 0; j < nbytes % 16; j++)
+ out[k * 16 + j] = ((unsigned char *)&last_block)[j];
+ }
+ return 1;
+ //when successful returns 1
+}
diff --git a/sys/crypto/aesni/aesni_wrap.c b/sys/crypto/aesni/aesni_wrap.c
index 39819a6..83d79fc 100644
--- a/sys/crypto/aesni/aesni_wrap.c
+++ b/sys/crypto/aesni/aesni_wrap.c
@@ -176,10 +176,6 @@ aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
}
}
-#define AES_XTS_BLOCKSIZE 16
-#define AES_XTS_IVSIZE 8
-#define AES_XTS_ALPHA 0x87 /* GF(2^128) generator polynomial */
-
static inline __m128i
xts_crank_lfsr(__m128i inp)
{
@@ -347,6 +343,23 @@ aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
ses->rounds = AES256_ROUNDS;
break;
default:
+ printf("invalid CBC/GCM key length");
+ return (EINVAL);
+ }
+ break;
+ case CRYPTO_AES_RFC4106_GCM_16:
+ switch (keylen) {
+ case 160:
+ ses->rounds = AES128_ROUNDS;
+ break;
+ case 224:
+ ses->rounds = AES192_ROUNDS;
+ break;
+ case 288:
+ ses->rounds = AES256_ROUNDS;
+ break;
+ default:
+ printf("invalid CBC/GCM key length");
return (EINVAL);
}
break;
@@ -359,6 +372,7 @@ aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
ses->rounds = AES256_ROUNDS;
break;
default:
+ printf("invalid XTS key length");
return (EINVAL);
}
break;
@@ -368,11 +382,20 @@ aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
aesni_set_deckey(ses->enc_schedule, ses->dec_schedule, ses->rounds);
- if (ses->algo == CRYPTO_AES_CBC)
- arc4rand(ses->iv, sizeof(ses->iv), 0);
- else /* if (ses->algo == CRYPTO_AES_XTS) */ {
+
+ /* setup IV */
+ switch (ses->algo) {
+ case CRYPTO_AES_CBC:
+ /* Nothing todo */
+ break;
+ case CRYPTO_AES_RFC4106_GCM_16:
+ bcopy(key + ((keylen - 32) / 8), ses->nonce, AESCTR_NONCESIZE);
+ arc4rand((void *)&ses->aesgcmcounter, sizeof(uint64_t), 0);
+ break;
+ case CRYPTO_AES_XTS:
aesni_set_enckey(key + keylen / 16, ses->xts_schedule,
ses->rounds);
+ break;
}
return (0);
diff --git a/sys/modules/aesni/Makefile b/sys/modules/aesni/Makefile
index 26dbedc..e66f941 100644
--- a/sys/modules/aesni/Makefile
+++ b/sys/modules/aesni/Makefile
@@ -15,5 +15,13 @@ aesni_wrap.o: aesni_wrap.c
-mmmx -msse -maes ${.IMPSRC}
${CTFCONVERT_CMD}
+.if ${MACHINE_CPUARCH} == "amd64"
+OBJS+= aesni_ghash.o
+aesni_ghash.o: aesni_ghash.c
+ ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+ -mmmx -mpclmul -msse -msse4.1 -maes ${.IMPSRC}
+ ${CTFCONVERT_CMD}
+.endif
+
.include <bsd.kmod.mk>
OpenPOWER on IntegriCloud