author	jmg <jmg@FreeBSD.org>	2014-12-12 19:56:36 +0000
committer	jmg <jmg@FreeBSD.org>	2014-12-12 19:56:36 +0000
commit	c3ff54cc39aaa96a4ce2b1ee496a649c86143942 (patch)
tree	7e0047b9e1e2005d85782cf44fc29f74f116317e /sys/crypto
parent	8bf652525111dc99d0ac033111fbe20a623483e8 (diff)
Add some new modes to OpenCrypto.  These modes are AES-ICM (which can
be used for counter mode) and AES-GCM.  Both of these modes have been
added to the aesni module.

Included is a set of tests to validate that the software and aesni
module calculate the correct values.  These use the NIST KAT test
vectors.  To run the tests, you will need to install a soon to be
committed port, nist-kat, which installs the vectors.  Using a port is
necessary because the test vectors are around 25MB.

All the man pages were updated.  I have added a new man page,
crypto.7, which describes how to use each mode.  All the new modes and
some other AES modes are present; it would be good for someone else to
go through and document the remaining modes.

A new ioctl was added to support AEAD modes, of which AES-GCM is one.
Without this ioctl, it is not possible to test AEAD modes from
userland.

Add a timing-safe bcmp for comparing MACs.  Previously we were using
bcmp, which could leak timing information and allow messages to be
forged.

Add a minor optimization to the aesni module so that single-segment
mbufs are updated in place instead of being copied.  The aesni module
still needs to be updated to support blocked I/O so that segmented
mbufs do not have to be copied.

We require that the IV be specified for all calls for both GCM and
ICM.  This is to ensure proper use of these functions.

Obtained from:	p4: //depot/projects/opencrypto
Relnotes:	yes
Sponsored by:	FreeBSD Foundation
Sponsored by:	NetGate
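The timing-safe comparison mentioned above lives outside the sys/crypto
subtree, so it does not appear in the diff below.  As a rough
illustration only (a minimal sketch, not the committed helper), a
constant-time MAC compare avoids bcmp()'s early exit like this:

	#include <stddef.h>	/* size_t */

	/*
	 * Minimal sketch of a constant-time comparison for MACs/tags.
	 * Unlike bcmp(), it never returns early, so the running time
	 * does not depend on where (or whether) the buffers differ.
	 */
	static int
	const_time_bcmp(const void *b1, const void *b2, size_t len)
	{
		const unsigned char *p1 = b1, *p2 = b2;
		unsigned char diff = 0;
		size_t i;

		for (i = 0; i < len; i++)
			diff |= p1[i] ^ p2[i];

		return (diff != 0);	/* non-zero if the buffers differ */
	}

The name const_time_bcmp and the exact prototype are illustrative; the
committed function is the timing-safe bcmp referred to above.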
Diffstat (limited to 'sys/crypto')
-rw-r--r--	sys/crypto/aesni/aesni.c	227
-rw-r--r--	sys/crypto/aesni/aesni.h	13
-rw-r--r--	sys/crypto/aesni/aesni_ghash.c	803
-rw-r--r--	sys/crypto/aesni/aesni_wrap.c	129
-rw-r--r--	sys/crypto/via/padlock_hash.c	12
5 files changed, 1128 insertions, 56 deletions
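The new AEAD ioctl is likewise outside the sys/crypto subtree covered by
this diffstat.  For orientation only, here is a hedged sketch of how a
userland test might drive AES-GCM through /dev/crypto.  The structure
and ioctl names (session_op, crypt_aead, CIOCGSESSION, CIOCCRYPTAEAD)
are recalled from the companion opencrypto change; treat the exact
crypt_aead field layout as an assumption rather than a reference.

	#include <sys/ioctl.h>
	#include <crypto/cryptodev.h>	/* CRYPTO_AES_NIST_GCM_16, CIOC* (assumed) */
	#include <stdint.h>
	#include <string.h>

	/*
	 * Sketch only: encrypt buf in place with AES-128-GCM via an open
	 * /dev/crypto descriptor.  crypt_aead field names are assumptions;
	 * error handling is mostly omitted.
	 */
	static int
	gcm_encrypt_sketch(int fd, const uint8_t key[16], const uint8_t iv[12],
	    const uint8_t *aad, size_t aadlen, uint8_t *buf, size_t len,
	    uint8_t tag[16])
	{
		struct session_op sop;
		struct crypt_aead caead;

		memset(&sop, 0, sizeof(sop));
		sop.cipher = CRYPTO_AES_NIST_GCM_16;
		sop.key = (void *)(uintptr_t)key;
		sop.keylen = 16;
		sop.mac = CRYPTO_AES_128_NIST_GMAC;
		sop.mackey = (void *)(uintptr_t)key;
		sop.mackeylen = 16;
		if (ioctl(fd, CIOCGSESSION, &sop) == -1)
			return (-1);

		memset(&caead, 0, sizeof(caead));
		caead.ses = sop.ses;
		caead.op = COP_ENCRYPT;
		caead.len = len;
		caead.aadlen = aadlen;
		caead.ivlen = 12;	/* matches the 12-byte IV the aesni code expects */
		caead.src = (void *)buf;
		caead.dst = (void *)buf;
		caead.aad = (void *)(uintptr_t)aad;
		caead.tag = (void *)tag;
		caead.iv = (void *)(uintptr_t)iv;
		return (ioctl(fd, CIOCCRYPTAEAD, &caead));
	}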
diff --git a/sys/crypto/aesni/aesni.c b/sys/crypto/aesni/aesni.c
index 7d7a740..2535750 100644
--- a/sys/crypto/aesni/aesni.c
+++ b/sys/crypto/aesni/aesni.c
@@ -1,8 +1,13 @@
/*-
* Copyright (c) 2005-2008 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2014 The FreeBSD Foundation
* All rights reserved.
*
+ * Portions of this software were developed by John-Mark Gurney
+ * under sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -39,8 +44,10 @@ __FBSDID("$FreeBSD$");
#include <sys/rwlock.h>
#include <sys/bus.h>
#include <sys/uio.h>
+#include <sys/mbuf.h>
#include <crypto/aesni/aesni.h>
#include <cryptodev_if.h>
+#include <opencrypto/gmac.h>
struct aesni_softc {
int32_t cid;
@@ -56,7 +63,7 @@ static void aesni_freesession_locked(struct aesni_softc *sc,
static int aesni_cipher_setup(struct aesni_session *ses,
struct cryptoini *encini);
static int aesni_cipher_process(struct aesni_session *ses,
- struct cryptodesc *enccrd, struct cryptop *crp);
+ struct cryptodesc *enccrd, struct cryptodesc *authcrd, struct cryptop *crp);
MALLOC_DEFINE(M_AESNI, "aesni_data", "AESNI Data");
@@ -79,12 +86,12 @@ aesni_probe(device_t dev)
return (EINVAL);
}
- if ((cpu_feature & CPUID_SSE2) == 0) {
- device_printf(dev, "No SSE2 support but AESNI!?!\n");
+ if ((cpu_feature2 & CPUID2_SSE41) == 0) {
+ device_printf(dev, "No SSE4.1 support.\n");
return (EINVAL);
}
- device_set_desc_copy(dev, "AES-CBC,AES-XTS");
+ device_set_desc_copy(dev, "AES-CBC,AES-XTS,AES-GCM,AES-ICM");
return (0);
}
@@ -105,6 +112,11 @@ aesni_attach(device_t dev)
rw_init(&sc->lock, "aesni_lock");
crypto_register(sc->cid, CRYPTO_AES_CBC, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_ICM, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_NIST_GCM_16, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_128_NIST_GMAC, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_192_NIST_GMAC, 0, 0);
+ crypto_register(sc->cid, CRYPTO_AES_256_NIST_GMAC, 0, 0);
crypto_register(sc->cid, CRYPTO_AES_XTS, 0, 0);
return (0);
}
@@ -144,8 +156,10 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
struct cryptoini *encini;
int error;
- if (sidp == NULL || cri == NULL)
+ if (sidp == NULL || cri == NULL) {
+ CRYPTDEB("no sidp or cri");
return (EINVAL);
+ }
sc = device_get_softc(dev);
ses = NULL;
@@ -153,17 +167,32 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
for (; cri != NULL; cri = cri->cri_next) {
switch (cri->cri_alg) {
case CRYPTO_AES_CBC:
+ case CRYPTO_AES_ICM:
case CRYPTO_AES_XTS:
- if (encini != NULL)
+ case CRYPTO_AES_NIST_GCM_16:
+ if (encini != NULL) {
+ CRYPTDEB("encini already set");
return (EINVAL);
+ }
encini = cri;
break;
+ case CRYPTO_AES_128_NIST_GMAC:
+ case CRYPTO_AES_192_NIST_GMAC:
+ case CRYPTO_AES_256_NIST_GMAC:
+ /*
+ * nothing to do here, maybe in the future cache some
+ * values for GHASH
+ */
+ break;
default:
+ CRYPTDEB("unhandled algorithm");
return (EINVAL);
}
}
- if (encini == NULL)
+ if (encini == NULL) {
+ CRYPTDEB("no cipher");
return (EINVAL);
+ }
rw_wlock(&sc->lock);
/*
@@ -195,6 +224,7 @@ aesni_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
error = aesni_cipher_setup(ses, encini);
if (error != 0) {
+ CRYPTDEB("setup failed");
rw_wlock(&sc->lock);
aesni_freesession_locked(sc, ses);
rw_wunlock(&sc->lock);
@@ -214,7 +244,7 @@ aesni_freesession_locked(struct aesni_softc *sc, struct aesni_session *ses)
sid = ses->id;
TAILQ_REMOVE(&sc->sessions, ses, next);
ctx = ses->fpu_ctx;
- bzero(ses, sizeof(*ses));
+ *ses = (struct aesni_session){};
ses->id = sid;
ses->fpu_ctx = ctx;
TAILQ_INSERT_HEAD(&sc->sessions, ses, next);
@@ -248,11 +278,13 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
{
struct aesni_softc *sc = device_get_softc(dev);
struct aesni_session *ses = NULL;
- struct cryptodesc *crd, *enccrd;
- int error;
+ struct cryptodesc *crd, *enccrd, *authcrd;
+ int error, needauth;
error = 0;
enccrd = NULL;
+ authcrd = NULL;
+ needauth = 0;
/* Sanity check. */
if (crp == NULL)
@@ -266,6 +298,7 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
for (crd = crp->crp_desc; crd != NULL; crd = crd->crd_next) {
switch (crd->crd_alg) {
case CRYPTO_AES_CBC:
+ case CRYPTO_AES_ICM:
case CRYPTO_AES_XTS:
if (enccrd != NULL) {
error = EINVAL;
@@ -273,11 +306,41 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
}
enccrd = crd;
break;
+
+ case CRYPTO_AES_NIST_GCM_16:
+ if (enccrd != NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ enccrd = crd;
+ needauth = 1;
+ break;
+
+ case CRYPTO_AES_128_NIST_GMAC:
+ case CRYPTO_AES_192_NIST_GMAC:
+ case CRYPTO_AES_256_NIST_GMAC:
+ if (authcrd != NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ authcrd = crd;
+ needauth = 1;
+ break;
+
default:
- return (EINVAL);
+ error = EINVAL;
+ goto out;
}
}
- if (enccrd == NULL || (enccrd->crd_len % AES_BLOCK_LEN) != 0) {
+
+ if (enccrd == NULL || (needauth && authcrd == NULL)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* CBC & XTS can only handle full blocks for now */
+ if ((enccrd->crd_alg == CRYPTO_AES_CBC || enccrd->crd_alg ==
+ CRYPTO_AES_XTS) && (enccrd->crd_len % AES_BLOCK_LEN) != 0) {
error = EINVAL;
goto out;
}
@@ -293,7 +356,7 @@ aesni_process(device_t dev, struct cryptop *crp, int hint __unused)
goto out;
}
- error = aesni_cipher_process(ses, enccrd, crp);
+ error = aesni_cipher_process(ses, enccrd, authcrd, crp);
if (error != 0)
goto out;
@@ -307,21 +370,26 @@ uint8_t *
aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp,
int *allocated)
{
+ struct mbuf *m;
struct uio *uio;
struct iovec *iov;
uint8_t *addr;
- if (crp->crp_flags & CRYPTO_F_IMBUF)
- goto alloc;
- else if (crp->crp_flags & CRYPTO_F_IOV) {
+ if (crp->crp_flags & CRYPTO_F_IMBUF) {
+ m = (struct mbuf *)crp->crp_buf;
+ if (m->m_next != NULL)
+ goto alloc;
+ addr = mtod(m, uint8_t *);
+ } else if (crp->crp_flags & CRYPTO_F_IOV) {
uio = (struct uio *)crp->crp_buf;
if (uio->uio_iovcnt != 1)
goto alloc;
iov = uio->uio_iov;
- addr = (u_char *)iov->iov_base + enccrd->crd_skip;
+ addr = (uint8_t *)iov->iov_base;
} else
- addr = (u_char *)crp->crp_buf;
+ addr = (uint8_t *)crp->crp_buf;
*allocated = 0;
+ addr += enccrd->crd_skip;
return (addr);
alloc:
@@ -376,18 +444,40 @@ aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini)
return (error);
}
+/*
+ * authcrd contains the associated data.
+ */
static int
aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
- struct cryptop *crp)
+ struct cryptodesc *authcrd, struct cryptop *crp)
{
+ uint8_t tag[GMAC_DIGEST_LEN];
struct thread *td;
- uint8_t *buf;
- int error, allocated;
+ uint8_t *buf, *authbuf;
+ int error, allocated, authallocated;
+ int ivlen, encflag;
+
+ encflag = (enccrd->crd_flags & CRD_F_ENCRYPT) == CRD_F_ENCRYPT;
+
+ if ((enccrd->crd_alg == CRYPTO_AES_ICM ||
+ enccrd->crd_alg == CRYPTO_AES_NIST_GCM_16) &&
+ (enccrd->crd_flags & CRD_F_IV_EXPLICIT) == 0)
+ return (EINVAL);
buf = aesni_cipher_alloc(enccrd, crp, &allocated);
if (buf == NULL)
return (ENOMEM);
+ authbuf = NULL;
+ authallocated = 0;
+ if (authcrd != NULL) {
+ authbuf = aesni_cipher_alloc(authcrd, crp, &authallocated);
+ if (authbuf == NULL) {
+ error = ENOMEM;
+ goto out1;
+ }
+ }
+
td = curthread;
error = fpu_kern_enter(td, ses->fpu_ctx, FPU_KERN_NORMAL |
FPU_KERN_KTHR);
@@ -401,42 +491,91 @@ aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
goto out;
}
- if ((enccrd->crd_flags & CRD_F_ENCRYPT) != 0) {
- if ((enccrd->crd_flags & CRD_F_IV_EXPLICIT) != 0)
- bcopy(enccrd->crd_iv, ses->iv, AES_BLOCK_LEN);
- if ((enccrd->crd_flags & CRD_F_IV_PRESENT) == 0)
- crypto_copyback(crp->crp_flags, crp->crp_buf,
- enccrd->crd_inject, AES_BLOCK_LEN, ses->iv);
- if (ses->algo == CRYPTO_AES_CBC) {
+ /* XXX - validate that enccrd and authcrd have/use same key? */
+ switch (enccrd->crd_alg) {
+ case CRYPTO_AES_CBC:
+ case CRYPTO_AES_ICM:
+ ivlen = AES_BLOCK_LEN;
+ break;
+ case CRYPTO_AES_XTS:
+ ivlen = 8;
+ break;
+ case CRYPTO_AES_NIST_GCM_16:
+ ivlen = 12; /* should support arbitrarily larger */
+ break;
+ }
+
+ /* Setup ses->iv */
+ bzero(ses->iv, sizeof ses->iv);
+ if ((enccrd->crd_flags & CRD_F_IV_EXPLICIT) != 0)
+ bcopy(enccrd->crd_iv, ses->iv, ivlen);
+ else if (encflag && ((enccrd->crd_flags & CRD_F_IV_PRESENT) != 0))
+ arc4rand(ses->iv, ivlen, 0);
+ else
+ crypto_copydata(crp->crp_flags, crp->crp_buf,
+ enccrd->crd_inject, ivlen, ses->iv);
+
+ if (authcrd != NULL && !encflag)
+ crypto_copydata(crp->crp_flags, crp->crp_buf,
+ authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
+ else
+ bzero(tag, sizeof tag);
+
+ /* Do work */
+ switch (ses->algo) {
+ case CRYPTO_AES_CBC:
+ if (encflag)
aesni_encrypt_cbc(ses->rounds, ses->enc_schedule,
enccrd->crd_len, buf, buf, ses->iv);
- } else /* if (ses->algo == CRYPTO_AES_XTS) */ {
+ else
+ aesni_decrypt_cbc(ses->rounds, ses->dec_schedule,
+ enccrd->crd_len, buf, ses->iv);
+ break;
+ case CRYPTO_AES_ICM:
+ /* encryption & decryption are the same */
+ aesni_encrypt_icm(ses->rounds, ses->enc_schedule,
+ enccrd->crd_len, buf, buf, ses->iv);
+ break;
+ case CRYPTO_AES_XTS:
+ if (encflag)
aesni_encrypt_xts(ses->rounds, ses->enc_schedule,
ses->xts_schedule, enccrd->crd_len, buf, buf,
ses->iv);
- }
- } else {
- if ((enccrd->crd_flags & CRD_F_IV_EXPLICIT) != 0)
- bcopy(enccrd->crd_iv, ses->iv, AES_BLOCK_LEN);
else
- crypto_copydata(crp->crp_flags, crp->crp_buf,
- enccrd->crd_inject, AES_BLOCK_LEN, ses->iv);
- if (ses->algo == CRYPTO_AES_CBC) {
- aesni_decrypt_cbc(ses->rounds, ses->dec_schedule,
- enccrd->crd_len, buf, ses->iv);
- } else /* if (ses->algo == CRYPTO_AES_XTS) */ {
aesni_decrypt_xts(ses->rounds, ses->dec_schedule,
ses->xts_schedule, enccrd->crd_len, buf, buf,
ses->iv);
+ break;
+ case CRYPTO_AES_NIST_GCM_16:
+ if (encflag)
+ AES_GCM_encrypt(buf, buf, authbuf, ses->iv, tag,
+ enccrd->crd_len, authcrd->crd_len, ivlen,
+ ses->enc_schedule, ses->rounds);
+ else {
+ if (!AES_GCM_decrypt(buf, buf, authbuf, ses->iv, tag,
+ enccrd->crd_len, authcrd->crd_len, ivlen,
+ ses->enc_schedule, ses->rounds))
+ error = EBADMSG;
}
+ break;
}
+
if (allocated)
crypto_copyback(crp->crp_flags, crp->crp_buf, enccrd->crd_skip,
enccrd->crd_len, buf);
- if ((enccrd->crd_flags & CRD_F_ENCRYPT) != 0)
- crypto_copydata(crp->crp_flags, crp->crp_buf,
- enccrd->crd_skip + enccrd->crd_len - AES_BLOCK_LEN,
- AES_BLOCK_LEN, ses->iv);
+
+ /*
+ * OpenBSD doesn't copy this back. This primes the IV for the next
+ * chain. Why do we not do it for decrypt?
+ */
+ if (encflag && enccrd->crd_alg == CRYPTO_AES_CBC)
+ bcopy(buf + enccrd->crd_len - AES_BLOCK_LEN, ses->iv, AES_BLOCK_LEN);
+
+ if (!error && authcrd != NULL) {
+ crypto_copyback(crp->crp_flags, crp->crp_buf,
+ authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
+ }
+
out:
fpu_kern_leave(td, ses->fpu_ctx);
out1:
@@ -444,5 +583,7 @@ out1:
bzero(buf, enccrd->crd_len);
free(buf, M_AESNI);
}
+ if (authallocated)
+ free(authbuf, M_AESNI);
return (error);
}
diff --git a/sys/crypto/aesni/aesni.h b/sys/crypto/aesni/aesni.h
index ff1d1a2..5cd8925 100644
--- a/sys/crypto/aesni/aesni.h
+++ b/sys/crypto/aesni/aesni.h
@@ -88,6 +88,9 @@ void aesni_encrypt_ecb(int rounds, const void *key_schedule /*__aligned(16)*/,
size_t len, const uint8_t *from, uint8_t *to);
void aesni_decrypt_ecb(int rounds, const void *key_schedule /*__aligned(16)*/,
size_t len, const uint8_t *from, uint8_t *to);
+void aesni_encrypt_icm(int rounds, const void *key_schedule /*__aligned(16)*/,
+ size_t len, const uint8_t *from, uint8_t *to,
+ const uint8_t iv[AES_BLOCK_LEN]);
void aesni_encrypt_xts(int rounds, const void *data_schedule /*__aligned(16)*/,
const void *tweak_schedule /*__aligned(16)*/, size_t len,
@@ -96,6 +99,16 @@ void aesni_decrypt_xts(int rounds, const void *data_schedule /*__aligned(16)*/,
const void *tweak_schedule /*__aligned(16)*/, size_t len,
const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]);
+/* GCM & GHASH functions */
+void AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
+ const unsigned char *addt, const unsigned char *ivec,
+ unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
+ const unsigned char *key, int nr);
+int AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
+ const unsigned char *addt, const unsigned char *ivec,
+ unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
+ const unsigned char *key, int nr);
+
int aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
int keylen);
uint8_t *aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp,
diff --git a/sys/crypto/aesni/aesni_ghash.c b/sys/crypto/aesni/aesni_ghash.c
new file mode 100644
index 0000000..005ba81
--- /dev/null
+++ b/sys/crypto/aesni/aesni_ghash.c
@@ -0,0 +1,803 @@
+/*-
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by John-Mark Gurney under
+ * the sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *
+ * $FreeBSD$
+ *
+ */
+
+/*
+ * Figure 5, 8 and 12 are copied from the Intel white paper:
+ * Intel® Carry-Less Multiplication Instruction and its Usage for
+ * Computing the GCM Mode
+ *
+ * and as such are:
+ * Copyright © 2010 Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _KERNEL
+#include <crypto/aesni/aesni.h>
+#else
+#include <stdint.h>
+#endif
+
+#include <wmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+static inline int
+m128icmp(__m128i a, __m128i b)
+{
+ __m128i cmp;
+
+ cmp = _mm_cmpeq_epi32(a, b);
+
+ return _mm_movemask_epi8(cmp) == 0xffff;
+}
+
+#ifdef __i386__
+static inline __m128i
+_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
+{
+
+ if (!ndx) {
+ a = _mm_insert_epi32(a, b, 0);
+ a = _mm_insert_epi32(a, b >> 32, 1);
+ } else {
+ a = _mm_insert_epi32(a, b, 2);
+ a = _mm_insert_epi32(a, b >> 32, 3);
+ }
+
+ return a;
+}
+#endif
+
+/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
+
+/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
+static void
+gfmul(__m128i a, __m128i b, __m128i *res)
+{
+ __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+
+ tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
+ tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
+ tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
+ tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
+
+ tmp4 = _mm_xor_si128(tmp4, tmp5);
+ tmp5 = _mm_slli_si128(tmp4, 8);
+ tmp4 = _mm_srli_si128(tmp4, 8);
+ tmp3 = _mm_xor_si128(tmp3, tmp5);
+ tmp6 = _mm_xor_si128(tmp6, tmp4);
+
+ tmp7 = _mm_srli_epi32(tmp3, 31);
+ tmp8 = _mm_srli_epi32(tmp6, 31);
+ tmp3 = _mm_slli_epi32(tmp3, 1);
+ tmp6 = _mm_slli_epi32(tmp6, 1);
+
+ tmp9 = _mm_srli_si128(tmp7, 12);
+ tmp8 = _mm_slli_si128(tmp8, 4);
+ tmp7 = _mm_slli_si128(tmp7, 4);
+ tmp3 = _mm_or_si128(tmp3, tmp7);
+ tmp6 = _mm_or_si128(tmp6, tmp8);
+ tmp6 = _mm_or_si128(tmp6, tmp9);
+
+ tmp7 = _mm_slli_epi32(tmp3, 31);
+ tmp8 = _mm_slli_epi32(tmp3, 30);
+ tmp9 = _mm_slli_epi32(tmp3, 25);
+
+ tmp7 = _mm_xor_si128(tmp7, tmp8);
+ tmp7 = _mm_xor_si128(tmp7, tmp9);
+ tmp8 = _mm_srli_si128(tmp7, 4);
+ tmp7 = _mm_slli_si128(tmp7, 12);
+ tmp3 = _mm_xor_si128(tmp3, tmp7);
+
+ tmp2 = _mm_srli_epi32(tmp3, 1);
+ tmp4 = _mm_srli_epi32(tmp3, 2);
+ tmp5 = _mm_srli_epi32(tmp3, 7);
+ tmp2 = _mm_xor_si128(tmp2, tmp4);
+ tmp2 = _mm_xor_si128(tmp2, tmp5);
+ tmp2 = _mm_xor_si128(tmp2, tmp8);
+ tmp3 = _mm_xor_si128(tmp3, tmp2);
+ tmp6 = _mm_xor_si128(tmp6, tmp3);
+
+ *res = tmp6;
+}
+
+/*
+ * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
+ * Method */
+static void
+reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
+ __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
+{
+ /*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
+ __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
+ H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp8, tmp9;
+
+ H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
+ H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
+ H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
+ H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
+
+ lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
+ lo = _mm_xor_si128(lo, H3_X3_lo);
+ lo = _mm_xor_si128(lo, H4_X4_lo);
+
+ H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
+ H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
+ H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
+ H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
+
+ hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
+ hi = _mm_xor_si128(hi, H3_X3_hi);
+ hi = _mm_xor_si128(hi, H4_X4_hi);
+
+ tmp0 = _mm_shuffle_epi32(H1, 78);
+ tmp4 = _mm_shuffle_epi32(X1, 78);
+ tmp0 = _mm_xor_si128(tmp0, H1);
+ tmp4 = _mm_xor_si128(tmp4, X1);
+ tmp1 = _mm_shuffle_epi32(H2, 78);
+ tmp5 = _mm_shuffle_epi32(X2, 78);
+ tmp1 = _mm_xor_si128(tmp1, H2);
+ tmp5 = _mm_xor_si128(tmp5, X2);
+ tmp2 = _mm_shuffle_epi32(H3, 78);
+ tmp6 = _mm_shuffle_epi32(X3, 78);
+ tmp2 = _mm_xor_si128(tmp2, H3);
+ tmp6 = _mm_xor_si128(tmp6, X3);
+ tmp3 = _mm_shuffle_epi32(H4, 78);
+ tmp7 = _mm_shuffle_epi32(X4, 78);
+ tmp3 = _mm_xor_si128(tmp3, H4);
+ tmp7 = _mm_xor_si128(tmp7, X4);
+
+ tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
+ tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
+ tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
+ tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
+
+ tmp0 = _mm_xor_si128(tmp0, lo);
+ tmp0 = _mm_xor_si128(tmp0, hi);
+ tmp0 = _mm_xor_si128(tmp1, tmp0);
+ tmp0 = _mm_xor_si128(tmp2, tmp0);
+ tmp0 = _mm_xor_si128(tmp3, tmp0);
+
+ tmp4 = _mm_slli_si128(tmp0, 8);
+ tmp0 = _mm_srli_si128(tmp0, 8);
+
+ lo = _mm_xor_si128(tmp4, lo);
+ hi = _mm_xor_si128(tmp0, hi);
+
+ tmp3 = lo;
+ tmp6 = hi;
+
+ tmp7 = _mm_srli_epi32(tmp3, 31);
+ tmp8 = _mm_srli_epi32(tmp6, 31);
+ tmp3 = _mm_slli_epi32(tmp3, 1);
+ tmp6 = _mm_slli_epi32(tmp6, 1);
+
+ tmp9 = _mm_srli_si128(tmp7, 12);
+ tmp8 = _mm_slli_si128(tmp8, 4);
+ tmp7 = _mm_slli_si128(tmp7, 4);
+ tmp3 = _mm_or_si128(tmp3, tmp7);
+ tmp6 = _mm_or_si128(tmp6, tmp8);
+ tmp6 = _mm_or_si128(tmp6, tmp9);
+
+ tmp7 = _mm_slli_epi32(tmp3, 31);
+ tmp8 = _mm_slli_epi32(tmp3, 30);
+ tmp9 = _mm_slli_epi32(tmp3, 25);
+
+ tmp7 = _mm_xor_si128(tmp7, tmp8);
+ tmp7 = _mm_xor_si128(tmp7, tmp9);
+ tmp8 = _mm_srli_si128(tmp7, 4);
+ tmp7 = _mm_slli_si128(tmp7, 12);
+ tmp3 = _mm_xor_si128(tmp3, tmp7);
+
+ tmp2 = _mm_srli_epi32(tmp3, 1);
+ tmp4 = _mm_srli_epi32(tmp3, 2);
+ tmp5 = _mm_srli_epi32(tmp3, 7);
+ tmp2 = _mm_xor_si128(tmp2, tmp4);
+ tmp2 = _mm_xor_si128(tmp2, tmp5);
+ tmp2 = _mm_xor_si128(tmp2, tmp8);
+ tmp3 = _mm_xor_si128(tmp3, tmp2);
+ tmp6 = _mm_xor_si128(tmp6, tmp3);
+
+ *res = tmp6;
+}
+
+/*
+ * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
+ * Every Four Blocks
+ */
+/*
+ * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
+ * 2^32-256*8*16 bytes.
+ */
+void
+AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
+ const unsigned char *addt, const unsigned char *ivec,
+ unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
+ const unsigned char *key, int nr)
+{
+ int i, j ,k;
+ __m128i tmp1, tmp2, tmp3, tmp4;
+ __m128i tmp5, tmp6, tmp7, tmp8;
+ __m128i H, H2, H3, H4, Y, T;
+ __m128i *KEY = (__m128i*)key;
+ __m128i ctr1, ctr2, ctr3, ctr4;
+ __m128i ctr5, ctr6, ctr7, ctr8;
+ __m128i last_block = _mm_setzero_si128();
+ __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
+ __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
+ __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
+ 7);
+ __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
+ 15);
+ __m128i X = _mm_setzero_si128();
+
+ if (ibytes == 96/8) {
+ Y = _mm_loadu_si128((__m128i*)ivec);
+ Y = _mm_insert_epi32(Y, 0x1000000, 3);
+ /* Compute E[ZERO, KS] and E[Y0, KS] together */
+ tmp1 = _mm_xor_si128(X, KEY[0]);
+ tmp2 = _mm_xor_si128(Y, KEY[0]);
+ for (j=1; j < nr-1; j+=2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
+
+ H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ T = _mm_aesenclast_si128(tmp2, KEY[nr]);
+
+ H = _mm_shuffle_epi8(H, BSWAP_MASK);
+ } else {
+ tmp1 = _mm_xor_si128(X, KEY[0]);
+ for (j=1; j <nr; j++)
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+
+ H = _mm_shuffle_epi8(H, BSWAP_MASK);
+ Y = _mm_setzero_si128();
+
+ for (i=0; i < ibytes/16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul(Y, H, &Y);
+ }
+ if (ibytes%16) {
+ for (j=0; j < ibytes%16; j++)
+ ((unsigned char*)&last_block)[j] = ivec[i*16+j];
+ tmp1 = last_block;
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul(Y, H, &Y);
+ }
+ tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
+ tmp1 = _mm_insert_epi64(tmp1, 0, 1);
+
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul(Y, H, &Y);
+ Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
+ tmp1 = _mm_xor_si128(Y, KEY[0]);
+ for (j=1; j < nr; j++)
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ T = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ }
+
+ gfmul(H,H,&H2);
+ gfmul(H,H2,&H3);
+ gfmul(H,H3,&H4);
+
+ for (i=0; i<abytes/16/4; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
+ tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
+ tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
+ tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);
+
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
+ tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
+ tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
+ tmp1 = _mm_xor_si128(X, tmp1);
+
+ reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
+ }
+ for (i=i*4; i<abytes/16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ X = _mm_xor_si128(X,tmp1);
+ gfmul(X, H, &X);
+ }
+ if (abytes%16) {
+ last_block = _mm_setzero_si128();
+ for (j=0; j<abytes%16; j++)
+ ((unsigned char*)&last_block)[j] = addt[i*16+j];
+ tmp1 = last_block;
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ X =_mm_xor_si128(X,tmp1);
+ gfmul(X,H,&X);
+ }
+
+ ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, ONE);
+ ctr2 = _mm_add_epi64(ctr1, ONE);
+ ctr3 = _mm_add_epi64(ctr2, ONE);
+ ctr4 = _mm_add_epi64(ctr3, ONE);
+ ctr5 = _mm_add_epi64(ctr4, ONE);
+ ctr6 = _mm_add_epi64(ctr5, ONE);
+ ctr7 = _mm_add_epi64(ctr6, ONE);
+ ctr8 = _mm_add_epi64(ctr7, ONE);
+
+ for (i=0; i<nbytes/16/8; i++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
+ tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
+ tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
+ tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
+ tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
+ tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
+ tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
+
+ ctr1 = _mm_add_epi64(ctr1, EIGHT);
+ ctr2 = _mm_add_epi64(ctr2, EIGHT);
+ ctr3 = _mm_add_epi64(ctr3, EIGHT);
+ ctr4 = _mm_add_epi64(ctr4, EIGHT);
+ ctr5 = _mm_add_epi64(ctr5, EIGHT);
+ ctr6 = _mm_add_epi64(ctr6, EIGHT);
+ ctr7 = _mm_add_epi64(ctr7, EIGHT);
+ ctr8 = _mm_add_epi64(ctr8, EIGHT);
+
+ tmp1 =_mm_xor_si128(tmp1, KEY[0]);
+ tmp2 =_mm_xor_si128(tmp2, KEY[0]);
+ tmp3 =_mm_xor_si128(tmp3, KEY[0]);
+ tmp4 =_mm_xor_si128(tmp4, KEY[0]);
+ tmp5 =_mm_xor_si128(tmp5, KEY[0]);
+ tmp6 =_mm_xor_si128(tmp6, KEY[0]);
+ tmp7 =_mm_xor_si128(tmp7, KEY[0]);
+ tmp8 =_mm_xor_si128(tmp8, KEY[0]);
+
+ for (j=1; j<nr; j++) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+ tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
+ tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
+ tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
+ tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
+ tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
+ tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
+ }
+ tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
+ tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
+ tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
+ tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
+ tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
+ tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
+ tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);
+
+ tmp1 = _mm_xor_si128(tmp1,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
+ tmp2 = _mm_xor_si128(tmp2,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
+ tmp3 = _mm_xor_si128(tmp3,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
+ tmp4 = _mm_xor_si128(tmp4,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
+ tmp5 = _mm_xor_si128(tmp5,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
+ tmp6 = _mm_xor_si128(tmp6,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
+ tmp7 = _mm_xor_si128(tmp7,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
+ tmp8 = _mm_xor_si128(tmp8,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+7]));
+
+ _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
+
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
+ tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
+ tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
+ tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
+ tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
+ tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
+ tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
+
+ tmp1 = _mm_xor_si128(X, tmp1);
+
+ reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
+
+ tmp5 = _mm_xor_si128(X, tmp5);
+ reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
+ }
+ for (k=i*8; k<nbytes/16; k++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, ONE);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ for (j=1; j<nr-1; j+=2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
+ _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul(X,H,&X);
+ }
+ //If one incomplete block remains
+ if (nbytes%16) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ for (j=1; j<nr-1; j+=2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
+ last_block = tmp1;
+ for (j=0; j<nbytes%16; j++)
+ out[k*16+j] = ((unsigned char*)&last_block)[j];
+ for ((void)j; j<16; j++)
+ ((unsigned char*)&last_block)[j] = 0;
+ tmp1 = last_block;
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul(X, H, &X);
+ }
+ tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
+ tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
+
+ X = _mm_xor_si128(X, tmp1);
+ gfmul(X,H,&X);
+ X = _mm_shuffle_epi8(X, BSWAP_MASK);
+ T = _mm_xor_si128(X, T);
+ _mm_storeu_si128((__m128i*)tag, T);
+}
+
+/* My modification of _encrypt to be _decrypt */
+int
+AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
+ const unsigned char *addt, const unsigned char *ivec,
+ unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
+ const unsigned char *key, int nr)
+{
+ int i, j ,k;
+ __m128i tmp1, tmp2, tmp3, tmp4;
+ __m128i tmp5, tmp6, tmp7, tmp8;
+ __m128i H, H2, H3, H4, Y, T;
+ __m128i *KEY = (__m128i*)key;
+ __m128i ctr1, ctr2, ctr3, ctr4;
+ __m128i ctr5, ctr6, ctr7, ctr8;
+ __m128i last_block = _mm_setzero_si128();
+ __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
+ __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
+ __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
+ 7);
+ __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
+ 15);
+ __m128i X = _mm_setzero_si128();
+
+ if (ibytes == 96/8) {
+ Y = _mm_loadu_si128((__m128i*)ivec);
+ Y = _mm_insert_epi32(Y, 0x1000000, 3);
+ /* Compute E[ZERO, KS] and E[Y0, KS] together */
+ tmp1 = _mm_xor_si128(X, KEY[0]);
+ tmp2 = _mm_xor_si128(Y, KEY[0]);
+ for (j=1; j < nr-1; j+=2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
+
+ H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ T = _mm_aesenclast_si128(tmp2, KEY[nr]);
+
+ H = _mm_shuffle_epi8(H, BSWAP_MASK);
+ } else {
+ tmp1 = _mm_xor_si128(X, KEY[0]);
+ for (j=1; j <nr; j++)
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ H = _mm_aesenclast_si128(tmp1, KEY[nr]);
+
+ H = _mm_shuffle_epi8(H, BSWAP_MASK);
+ Y = _mm_setzero_si128();
+
+ for (i=0; i < ibytes/16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul(Y, H, &Y);
+ }
+ if (ibytes%16) {
+ for (j=0; j < ibytes%16; j++)
+ ((unsigned char*)&last_block)[j] = ivec[i*16+j];
+ tmp1 = last_block;
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul(Y, H, &Y);
+ }
+ tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
+ tmp1 = _mm_insert_epi64(tmp1, 0, 1);
+
+ Y = _mm_xor_si128(Y, tmp1);
+ gfmul(Y, H, &Y);
+ Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
+ tmp1 = _mm_xor_si128(Y, KEY[0]);
+ for (j=1; j < nr; j++)
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ T = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ }
+
+ gfmul(H,H,&H2);
+ gfmul(H,H2,&H3);
+ gfmul(H,H3,&H4);
+
+ for (i=0; i<abytes/16/4; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
+ tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
+ tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
+ tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);
+
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
+ tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
+ tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
+
+ tmp1 = _mm_xor_si128(X, tmp1);
+
+ reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
+ }
+ for (i=i*4; i<abytes/16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ X = _mm_xor_si128(X,tmp1);
+ gfmul(X, H, &X);
+ }
+ if (abytes%16) {
+ last_block = _mm_setzero_si128();
+ for (j=0; j<abytes%16; j++)
+ ((unsigned char*)&last_block)[j] = addt[i*16+j];
+ tmp1 = last_block;
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ X =_mm_xor_si128(X,tmp1);
+ gfmul(X,H,&X);
+ }
+
+ /* This is where we validate the cipher text before decrypt */
+ for (i = 0; i<nbytes/16/4; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i*)in)[i*4]);
+ tmp2 = _mm_loadu_si128(&((__m128i*)in)[i*4+1]);
+ tmp3 = _mm_loadu_si128(&((__m128i*)in)[i*4+2]);
+ tmp4 = _mm_loadu_si128(&((__m128i*)in)[i*4+3]);
+
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
+ tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
+ tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
+
+ tmp1 = _mm_xor_si128(X, tmp1);
+
+ reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
+ }
+ for (i = i*4; i<nbytes/16; i++) {
+ tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul(X,H,&X);
+ }
+ if (nbytes%16) {
+ last_block = _mm_setzero_si128();
+ for (j=0; j<nbytes%16; j++)
+ ((unsigned char*)&last_block)[j] = in[i*16+j];
+ tmp1 = last_block;
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ X = _mm_xor_si128(X, tmp1);
+ gfmul(X, H, &X);
+ }
+
+ tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
+ tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
+
+ X = _mm_xor_si128(X, tmp1);
+ gfmul(X,H,&X);
+ X = _mm_shuffle_epi8(X, BSWAP_MASK);
+ T = _mm_xor_si128(X, T);
+
+ if (!m128icmp(T, _mm_loadu_si128((__m128i*)tag)))
+ return 0; //in case the authentication failed
+
+ ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, ONE);
+ ctr2 = _mm_add_epi64(ctr1, ONE);
+ ctr3 = _mm_add_epi64(ctr2, ONE);
+ ctr4 = _mm_add_epi64(ctr3, ONE);
+ ctr5 = _mm_add_epi64(ctr4, ONE);
+ ctr6 = _mm_add_epi64(ctr5, ONE);
+ ctr7 = _mm_add_epi64(ctr6, ONE);
+ ctr8 = _mm_add_epi64(ctr7, ONE);
+
+ for (i=0; i<nbytes/16/8; i++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
+ tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
+ tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
+ tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
+ tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
+ tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
+ tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
+
+ ctr1 = _mm_add_epi64(ctr1, EIGHT);
+ ctr2 = _mm_add_epi64(ctr2, EIGHT);
+ ctr3 = _mm_add_epi64(ctr3, EIGHT);
+ ctr4 = _mm_add_epi64(ctr4, EIGHT);
+ ctr5 = _mm_add_epi64(ctr5, EIGHT);
+ ctr6 = _mm_add_epi64(ctr6, EIGHT);
+ ctr7 = _mm_add_epi64(ctr7, EIGHT);
+ ctr8 = _mm_add_epi64(ctr8, EIGHT);
+
+ tmp1 =_mm_xor_si128(tmp1, KEY[0]);
+ tmp2 =_mm_xor_si128(tmp2, KEY[0]);
+ tmp3 =_mm_xor_si128(tmp3, KEY[0]);
+ tmp4 =_mm_xor_si128(tmp4, KEY[0]);
+ tmp5 =_mm_xor_si128(tmp5, KEY[0]);
+ tmp6 =_mm_xor_si128(tmp6, KEY[0]);
+ tmp7 =_mm_xor_si128(tmp7, KEY[0]);
+ tmp8 =_mm_xor_si128(tmp8, KEY[0]);
+
+ for (j=1; j<nr; j++) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
+ tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
+ tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
+ tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
+ tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
+ tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
+ tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
+ }
+ tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
+ tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
+ tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
+ tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
+ tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
+ tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
+ tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);
+
+ tmp1 = _mm_xor_si128(tmp1,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
+ tmp2 = _mm_xor_si128(tmp2,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
+ tmp3 = _mm_xor_si128(tmp3,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
+ tmp4 = _mm_xor_si128(tmp4,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
+ tmp5 = _mm_xor_si128(tmp5,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
+ tmp6 = _mm_xor_si128(tmp6,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
+ tmp7 = _mm_xor_si128(tmp7,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
+ tmp8 = _mm_xor_si128(tmp8,
+ _mm_loadu_si128(&((__m128i*)in)[i*8+7]));
+
+ _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
+ _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
+
+ tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
+ tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
+ tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
+ tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
+ tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
+ tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
+ tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
+ tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
+ }
+ for (k=i*8; k<nbytes/16; k++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ ctr1 = _mm_add_epi64(ctr1, ONE);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ for (j=1; j<nr-1; j+=2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
+ _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
+ }
+ //If one incomplete block remains
+ if (nbytes%16) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tmp1 = _mm_xor_si128(tmp1, KEY[0]);
+ for (j=1; j<nr-1; j+=2) {
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
+ }
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
+ tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
+ tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
+ last_block = tmp1;
+ for (j=0; j<nbytes%16; j++)
+ out[k*16+j] = ((unsigned char*)&last_block)[j];
+ }
+ return 1; //returns 1 when successful
+}
diff --git a/sys/crypto/aesni/aesni_wrap.c b/sys/crypto/aesni/aesni_wrap.c
index 39819a6..e5e2a69 100644
--- a/sys/crypto/aesni/aesni_wrap.c
+++ b/sys/crypto/aesni/aesni_wrap.c
@@ -3,8 +3,13 @@
* Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
* Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
* Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
+ * Copyright (c) 2014 The FreeBSD Foundation
* All rights reserved.
*
+ * Portions of this software were developed by John-Mark Gurney
+ * under sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -29,15 +34,18 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-
+
#include <sys/param.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <crypto/aesni/aesni.h>
-
+
+#include <opencrypto/gmac.h>
+
#include "aesencdec.h"
+#include <smmintrin.h>
MALLOC_DECLARE(M_AESNI);
@@ -176,6 +184,104 @@ aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
}
}
+/*
+ * mixed endian increment, low 64bits stored in hi word to be compatible
+ * with _icm's BSWAP.
+ */
+static inline __m128i
+nextc(__m128i x)
+{
+ const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
+ const __m128i ZERO = _mm_setzero_si128();
+
+ x = _mm_add_epi64(x, ONE);
+ __m128i t = _mm_cmpeq_epi64(x, ZERO);
+ t = _mm_unpackhi_epi64(t, ZERO);
+ x = _mm_sub_epi64(x, t);
+
+ return x;
+}
+
+void
+aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
+ const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN])
+{
+ __m128i tot;
+ __m128i tmp1, tmp2, tmp3, tmp4;
+ __m128i tmp5, tmp6, tmp7, tmp8;
+ __m128i ctr1, ctr2, ctr3, ctr4;
+ __m128i ctr5, ctr6, ctr7, ctr8;
+ __m128i BSWAP_EPI64;
+ __m128i tout[8];
+ struct blocks8 *top;
+ const struct blocks8 *blks;
+ size_t i, cnt;
+
+ BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
+
+ ctr1 = _mm_loadu_si128((__m128i*)iv);
+ ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+
+ cnt = len / AES_BLOCK_LEN / 8;
+ for (i = 0; i < cnt; i++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ ctr2 = nextc(ctr1);
+ tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
+ ctr3 = nextc(ctr2);
+ tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
+ ctr4 = nextc(ctr3);
+ tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
+ ctr5 = nextc(ctr4);
+ tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
+ ctr6 = nextc(ctr5);
+ tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
+ ctr7 = nextc(ctr6);
+ tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
+ ctr8 = nextc(ctr7);
+ tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
+ ctr1 = nextc(ctr8);
+
+ blks = (const struct blocks8 *)from;
+ top = (struct blocks8 *)to;
+ aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
+ tmp5, tmp6, tmp7, tmp8, tout);
+
+ top->blk[0] = blks->blk[0] ^ tout[0];
+ top->blk[1] = blks->blk[1] ^ tout[1];
+ top->blk[2] = blks->blk[2] ^ tout[2];
+ top->blk[3] = blks->blk[3] ^ tout[3];
+ top->blk[4] = blks->blk[4] ^ tout[4];
+ top->blk[5] = blks->blk[5] ^ tout[5];
+ top->blk[6] = blks->blk[6] ^ tout[6];
+ top->blk[7] = blks->blk[7] ^ tout[7];
+
+ from += AES_BLOCK_LEN * 8;
+ to += AES_BLOCK_LEN * 8;
+ }
+ i *= 8;
+ cnt = len / AES_BLOCK_LEN;
+ for (; i < cnt; i++) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ ctr1 = nextc(ctr1);
+
+ tot = aesni_enc(rounds - 1, key_schedule, tmp1);
+
+ tot = tot ^ _mm_loadu_si128((const __m128i *)from);
+ _mm_storeu_si128((__m128i *)to, tot);
+
+ from += AES_BLOCK_LEN;
+ to += AES_BLOCK_LEN;
+ }
+
+ /* handle remaining partial round */
+ if (len % AES_BLOCK_LEN != 0) {
+ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
+ tot = aesni_enc(rounds - 1, key_schedule, tmp1);
+ tot = tot ^ _mm_loadu_si128((const __m128i *)from);
+ memcpy(to, &tot, len % AES_BLOCK_LEN);
+ }
+}
+
#define AES_XTS_BLOCKSIZE 16
#define AES_XTS_IVSIZE 8
#define AES_XTS_ALPHA 0x87 /* GF(2^128) generator polynomial */
@@ -333,8 +439,15 @@ int
aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
int keylen)
{
+ int decsched;
+
+ decsched = 1;
switch (ses->algo) {
+ case CRYPTO_AES_ICM:
+ case CRYPTO_AES_NIST_GCM_16:
+ decsched = 0;
+ /* FALLTHROUGH */
case CRYPTO_AES_CBC:
switch (keylen) {
case 128:
@@ -347,6 +460,7 @@ aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
ses->rounds = AES256_ROUNDS;
break;
default:
+ CRYPTDEB("invalid CBC/ICM/GCM key length");
return (EINVAL);
}
break;
@@ -359,6 +473,7 @@ aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
ses->rounds = AES256_ROUNDS;
break;
default:
+ CRYPTDEB("invalid XTS key length");
return (EINVAL);
}
break;
@@ -367,13 +482,13 @@ aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
}
aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
- aesni_set_deckey(ses->enc_schedule, ses->dec_schedule, ses->rounds);
- if (ses->algo == CRYPTO_AES_CBC)
- arc4rand(ses->iv, sizeof(ses->iv), 0);
- else /* if (ses->algo == CRYPTO_AES_XTS) */ {
+ if (decsched)
+ aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
+ ses->rounds);
+
+ if (ses->algo == CRYPTO_AES_XTS)
aesni_set_enckey(key + keylen / 16, ses->xts_schedule,
ses->rounds);
- }
return (0);
}
diff --git a/sys/crypto/via/padlock_hash.c b/sys/crypto/via/padlock_hash.c
index 9dffc40..c952b63 100644
--- a/sys/crypto/via/padlock_hash.c
+++ b/sys/crypto/via/padlock_hash.c
@@ -75,7 +75,7 @@ struct padlock_sha_ctx {
CTASSERT(sizeof(struct padlock_sha_ctx) <= sizeof(union authctx));
static void padlock_sha_init(struct padlock_sha_ctx *ctx);
-static int padlock_sha_update(struct padlock_sha_ctx *ctx, uint8_t *buf,
+static int padlock_sha_update(struct padlock_sha_ctx *ctx, const uint8_t *buf,
uint16_t bufsize);
static void padlock_sha1_final(uint8_t *hash, struct padlock_sha_ctx *ctx);
static void padlock_sha256_final(uint8_t *hash, struct padlock_sha_ctx *ctx);
@@ -83,16 +83,16 @@ static void padlock_sha256_final(uint8_t *hash, struct padlock_sha_ctx *ctx);
static struct auth_hash padlock_hmac_sha1 = {
CRYPTO_SHA1_HMAC, "HMAC-SHA1",
20, SHA1_HASH_LEN, SHA1_HMAC_BLOCK_LEN, sizeof(struct padlock_sha_ctx),
- (void (*)(void *))padlock_sha_init,
- (int (*)(void *, uint8_t *, uint16_t))padlock_sha_update,
+ (void (*)(void *))padlock_sha_init, NULL, NULL,
+ (int (*)(void *, const uint8_t *, uint16_t))padlock_sha_update,
(void (*)(uint8_t *, void *))padlock_sha1_final
};
static struct auth_hash padlock_hmac_sha256 = {
CRYPTO_SHA2_256_HMAC, "HMAC-SHA2-256",
32, SHA2_256_HASH_LEN, SHA2_256_HMAC_BLOCK_LEN, sizeof(struct padlock_sha_ctx),
- (void (*)(void *))padlock_sha_init,
- (int (*)(void *, uint8_t *, uint16_t))padlock_sha_update,
+ (void (*)(void *))padlock_sha_init, NULL, NULL,
+ (int (*)(void *, const uint8_t *, uint16_t))padlock_sha_update,
(void (*)(uint8_t *, void *))padlock_sha256_final
};
@@ -167,7 +167,7 @@ padlock_sha_init(struct padlock_sha_ctx *ctx)
}
static int
-padlock_sha_update(struct padlock_sha_ctx *ctx, uint8_t *buf, uint16_t bufsize)
+padlock_sha_update(struct padlock_sha_ctx *ctx, const uint8_t *buf, uint16_t bufsize)
{
if (ctx->psc_size - ctx->psc_offset < bufsize) {