Diffstat (limited to 'lib')
-rw-r--r--  lib/Kconfig.debug            |  27
-rw-r--r--  lib/assoc_array.c            |   2
-rw-r--r--  lib/debugobjects.c           |   3
-rw-r--r--  lib/errseq.c                 |  17
-rw-r--r--  lib/kobject_uevent.c         |   2
-rw-r--r--  lib/locking-selftest.c       | 123
-rw-r--r--  lib/radix-tree.c             |   1
-rw-r--r--  lib/raid6/Makefile           |   4
-rw-r--r--  lib/raid6/algos.c            |   3
-rw-r--r--  lib/raid6/neon.uc            |  33
-rw-r--r--  lib/raid6/recov_neon.c       | 110
-rw-r--r--  lib/raid6/recov_neon_inner.c | 117
-rw-r--r--  lib/swiotlb.c                |  57
-rw-r--r--  lib/test_firmware.c          | 710
14 files changed, 1181 insertions, 28 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c617b9d..7396f50 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -374,6 +374,9 @@ config STACK_VALIDATION
pointers (if CONFIG_FRAME_POINTER is enabled). This helps ensure
that runtime stack traces are more reliable.
+ This is also a prerequisite for generation of ORC unwind data, which
+ is needed for CONFIG_ORC_UNWINDER.
+
For more information, see
tools/objtool/Documentation/stack-validation.txt.
@@ -1088,6 +1091,8 @@ config PROVE_LOCKING
select DEBUG_MUTEXES
select DEBUG_RT_MUTEXES if RT_MUTEXES
select DEBUG_LOCK_ALLOC
+ select LOCKDEP_CROSSRELEASE
+ select LOCKDEP_COMPLETIONS
select TRACE_IRQFLAGS
default n
help
@@ -1128,7 +1133,7 @@ config LOCKDEP
bool
depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
select STACKTRACE
- select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE
+ select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE && !X86
select KALLSYMS
select KALLSYMS_ALL
@@ -1157,6 +1162,22 @@ config LOCK_STAT
CONFIG_LOCK_STAT defines "contended" and "acquired" lock events.
(CONFIG_LOCKDEP defines "acquire" and "release" events.)
+config LOCKDEP_CROSSRELEASE
+ bool
+ help
+ This makes lockdep work for crosslocks: locks that may be released
+ in a different context from the one that acquired them. Normally a
+ lock must be released in the same context that acquired it.
+ Relaxing this constraint allows synchronization primitives such as
+ page locks or completions to use the lock correctness detector,
+ lockdep.
+
+config LOCKDEP_COMPLETIONS
+ bool
+ help
+ A deadlock caused by wait_for_completion() and complete() can be
+ detected by lockdep using the crossrelease feature.
+
config DEBUG_LOCKDEP
bool "Lock dependency engine debugging"
depends on DEBUG_KERNEL && LOCKDEP
@@ -1547,7 +1568,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER
depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
depends on !X86_64
select STACKTRACE
- select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE
+ select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE && !X86
help
Provide stacktrace filter for fault-injection capabilities
@@ -1556,7 +1577,7 @@ config LATENCYTOP
depends on DEBUG_KERNEL
depends on STACKTRACE_SUPPORT
depends on PROC_FS
- select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
+ select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86
select KALLSYMS
select KALLSYMS_ALL
select STACKTRACE
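
For context on the LOCKDEP_CROSSRELEASE and LOCKDEP_COMPLETIONS entries added above: a completion is the canonical "crosslock", because the waiter and the signaller run in different contexts. The following is a minimal, purely illustrative sketch (the function names are made up, not part of this patch):

#include <linux/completion.h>
#include <linux/workqueue.h>

static DECLARE_COMPLETION(setup_done);

/* Context A: waits; conceptually this is where the crosslock is "acquired". */
static void consumer(void)
{
        /* Blocks until some other context signals completion. */
        wait_for_completion(&setup_done);
}

/* Context B: a different task or workqueue "releases" the crosslock. */
static void producer(struct work_struct *work)
{
        complete(&setup_done);
}

With crossrelease enabled, lockdep can track the dependency between the waiting context and the completing context and report deadlocks involving such primitives.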
diff --git a/lib/assoc_array.c b/lib/assoc_array.c
index 59fd7c0..155c55d 100644
--- a/lib/assoc_array.c
+++ b/lib/assoc_array.c
@@ -1,6 +1,6 @@
/* Generic associative array implementation.
*
- * See Documentation/assoc_array.txt for information.
+ * See Documentation/core-api/assoc_array.rst for information.
*
* Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 17afb04..2f5349c 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -18,6 +18,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/hash.h>
+#include <linux/kmemleak.h>
#define ODEBUG_HASH_BITS 14
#define ODEBUG_HASH_SIZE (1 << ODEBUG_HASH_BITS)
@@ -110,6 +111,7 @@ static void fill_pool(void)
if (!new)
return;
+ kmemleak_ignore(new);
raw_spin_lock_irqsave(&pool_lock, flags);
hlist_add_head(&new->node, &obj_pool);
debug_objects_allocated++;
@@ -1080,6 +1082,7 @@ static int __init debug_objects_replace_static_objects(void)
obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL);
if (!obj)
goto free;
+ kmemleak_ignore(obj);
hlist_add_head(&obj->node, &objects);
}
diff --git a/lib/errseq.c b/lib/errseq.c
index 841fa24..7b900c2 100644
--- a/lib/errseq.c
+++ b/lib/errseq.c
@@ -41,23 +41,20 @@
#define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1))
/**
- * __errseq_set - set a errseq_t for later reporting
+ * errseq_set - set an errseq_t for later reporting
* @eseq: errseq_t field that should be set
- * @err: error to set
+ * @err: error to set (must be between -1 and -MAX_ERRNO)
*
* This function sets the error in *eseq, and increments the sequence counter
* if the last sequence was sampled at some point in the past.
*
* Any error set will always overwrite an existing error.
*
- * Most callers will want to use the errseq_set inline wrapper to efficiently
- * handle the common case where err is 0.
- *
- * We do return an errseq_t here, primarily for debugging purposes. The return
- * value should not be used as a previously sampled value in later calls as it
- * will not have the SEEN flag set.
+ * We do return the latest value here, primarily for debugging purposes. The
+ * return value should not be used as a previously sampled value in later calls
+ * as it will not have the SEEN flag set.
*/
-errseq_t __errseq_set(errseq_t *eseq, int err)
+errseq_t errseq_set(errseq_t *eseq, int err)
{
errseq_t cur, old;
@@ -107,7 +104,7 @@ errseq_t __errseq_set(errseq_t *eseq, int err)
}
return cur;
}
-EXPORT_SYMBOL(__errseq_set);
+EXPORT_SYMBOL(errseq_set);
/**
* errseq_sample - grab current errseq_t value
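
Taken together with errseq_sample() and errseq_check() (declared in include/linux/errseq.h), the renamed errseq_set() is typically used so that each reader only reports errors raised since it last sampled the value. A rough usage sketch follows; the structure and function names are invented for illustration and are not part of this patch:

#include <linux/errseq.h>

/* Hypothetical object that records errors for many independent readers. */
struct my_mapping {
        errseq_t wb_err;
};

/* Writer side: record an error for later reporting. */
static void my_mark_error(struct my_mapping *m, int err)
{
        errseq_set(&m->wb_err, err);
}

/*
 * Reader side: the caller remembers a value from errseq_sample() and later
 * asks whether any error has been recorded since that point.
 */
static int my_check_since(struct my_mapping *m, errseq_t since)
{
        return errseq_check(&m->wb_err, since);
}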
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 719c155..e590523 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -52,6 +52,8 @@ static const char *kobject_actions[] = {
[KOBJ_MOVE] = "move",
[KOBJ_ONLINE] = "online",
[KOBJ_OFFLINE] = "offline",
+ [KOBJ_BIND] = "bind",
+ [KOBJ_UNBIND] = "unbind",
};
static int kobject_action_type(const char *buf, size_t count,
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 6f2b135..cd0b5c9 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -363,6 +363,103 @@ static void rsem_AA3(void)
}
/*
+ * read_lock(A)
+ * spin_lock(B)
+ *                     spin_lock(B)
+ *                     write_lock(A)
+ */
+static void rlock_ABBA1(void)
+{
+ RL(X1);
+ L(Y1);
+ U(Y1);
+ RU(X1);
+
+ L(Y1);
+ WL(X1);
+ WU(X1);
+ U(Y1); // should fail
+}
+
+static void rwsem_ABBA1(void)
+{
+ RSL(X1);
+ ML(Y1);
+ MU(Y1);
+ RSU(X1);
+
+ ML(Y1);
+ WSL(X1);
+ WSU(X1);
+ MU(Y1); // should fail
+}
+
+/*
+ * read_lock(A)
+ * spin_lock(B)
+ *                     spin_lock(B)
+ *                     read_lock(A)
+ */
+static void rlock_ABBA2(void)
+{
+ RL(X1);
+ L(Y1);
+ U(Y1);
+ RU(X1);
+
+ L(Y1);
+ RL(X1);
+ RU(X1);
+ U(Y1); // should NOT fail
+}
+
+static void rwsem_ABBA2(void)
+{
+ RSL(X1);
+ ML(Y1);
+ MU(Y1);
+ RSU(X1);
+
+ ML(Y1);
+ RSL(X1);
+ RSU(X1);
+ MU(Y1); // should fail
+}
+
+
+/*
+ * write_lock(A)
+ * spin_lock(B)
+ *                     spin_lock(B)
+ *                     write_lock(A)
+ */
+static void rlock_ABBA3(void)
+{
+ WL(X1);
+ L(Y1);
+ U(Y1);
+ WU(X1);
+
+ L(Y1);
+ WL(X1);
+ WU(X1);
+ U(Y1); // should fail
+}
+
+static void rwsem_ABBA3(void)
+{
+ WSL(X1);
+ ML(Y1);
+ MU(Y1);
+ WSU(X1);
+
+ ML(Y1);
+ WSL(X1);
+ WSU(X1);
+ MU(Y1); // should fail
+}
+
+/*
* ABBA deadlock:
*/
@@ -1056,8 +1153,6 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
if (debug_locks != expected) {
unexpected_testcase_failures++;
pr_cont("FAILED|");
-
- dump_stack();
} else {
testcase_successes++;
pr_cont(" ok |");
@@ -1933,6 +2028,30 @@ void locking_selftest(void)
dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
pr_cont("\n");
+ print_testname("mixed read-lock/lock-write ABBA");
+ pr_cont(" |");
+ dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK);
+ /*
+ * Lockdep does indeed fail here, but there's nothing we can do about
+ * that now. Don't kill lockdep for it.
+ */
+ unexpected_testcase_failures--;
+
+ pr_cont(" |");
+ dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM);
+
+ print_testname("mixed read-lock/lock-read ABBA");
+ pr_cont(" |");
+ dotest(rlock_ABBA2, SUCCESS, LOCKTYPE_RWLOCK);
+ pr_cont(" |");
+ dotest(rwsem_ABBA2, FAILURE, LOCKTYPE_RWSEM);
+
+ print_testname("mixed write-lock/lock-write ABBA");
+ pr_cont(" |");
+ dotest(rlock_ABBA3, FAILURE, LOCKTYPE_RWLOCK);
+ pr_cont(" |");
+ dotest(rwsem_ABBA3, FAILURE, LOCKTYPE_RWSEM);
+
printk(" --------------------------------------------------------------------------\n");
/*
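
For readers unfamiliar with the selftest macros, the first new scenario (rlock_ABBA1) corresponds roughly to the following two contexts, spelled out with the underlying primitives; the lock names and functions here are illustrative only:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(lock_a);
static DEFINE_SPINLOCK(lock_b);

/* Context 1: takes A for read, then B. */
static void ctx1(void)
{
        read_lock(&lock_a);
        spin_lock(&lock_b);
        spin_unlock(&lock_b);
        read_unlock(&lock_a);
}

/*
 * Context 2: takes B, then A for write. Run concurrently with ctx1 this
 * can deadlock: ctx1 holds A (read) and waits for B, while ctx2 holds B
 * and waits for the write lock on A, which cannot be granted until ctx1
 * drops its read lock. Lockdep is expected to flag this ordering.
 */
static void ctx2(void)
{
        spin_lock(&lock_b);
        write_lock(&lock_a);
        write_unlock(&lock_a);
        spin_unlock(&lock_b);
}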
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index c191b42..9717e2a 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2022,6 +2022,7 @@ void radix_tree_iter_delete(struct radix_tree_root *root,
if (__radix_tree_delete(root, iter->node, slot))
iter->index = iter->next_index;
}
+EXPORT_SYMBOL(radix_tree_iter_delete);
/**
* radix_tree_delete_item - delete an item from a radix tree
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 3057011..a93adf6 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -5,7 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
-raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
+raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
@@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding
ifeq ($(ARCH),arm)
NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
endif
+CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
ifeq ($(ARCH),arm64)
+CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 7857049..4769947 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -113,6 +113,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
#ifdef CONFIG_S390
&raid6_recov_s390xc,
#endif
+#if defined(CONFIG_KERNEL_MODE_NEON)
+ &raid6_recov_neon,
+#endif
&raid6_recov_intx1,
NULL
};
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc
index 4fa51b7..d5242f5 100644
--- a/lib/raid6/neon.uc
+++ b/lib/raid6/neon.uc
@@ -46,8 +46,12 @@ static inline unative_t SHLBYTE(unative_t v)
*/
static inline unative_t MASK(unative_t v)
{
- const uint8x16_t temp = NBYTES(0);
- return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp);
+ return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
+}
+
+static inline unative_t PMUL(unative_t v, unative_t u)
+{
+ return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);
}
void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
@@ -110,7 +114,30 @@ void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
wq$$ = veorq_u8(w1$$, wd$$);
}
/* P/Q left side optimization */
- for ( z = start-1 ; z >= 0 ; z-- ) {
+ for ( z = start-1 ; z >= 3 ; z -= 4 ) {
+ w2$$ = vshrq_n_u8(wq$$, 4);
+ w1$$ = vshlq_n_u8(wq$$, 4);
+
+ w2$$ = PMUL(w2$$, x1d);
+ wq$$ = veorq_u8(w1$$, w2$$);
+ }
+
+ switch (z) {
+ case 2:
+ w2$$ = vshrq_n_u8(wq$$, 5);
+ w1$$ = vshlq_n_u8(wq$$, 3);
+
+ w2$$ = PMUL(w2$$, x1d);
+ wq$$ = veorq_u8(w1$$, w2$$);
+ break;
+ case 1:
+ w2$$ = vshrq_n_u8(wq$$, 6);
+ w1$$ = vshlq_n_u8(wq$$, 2);
+
+ w2$$ = PMUL(w2$$, x1d);
+ wq$$ = veorq_u8(w1$$, w2$$);
+ break;
+ case 0:
w2$$ = MASK(wq$$);
w1$$ = SHLBYTE(wq$$);
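
The loop rewrite above batches four multiply-by-2 steps of the P/Q left-side optimization into a single multiply by x^4 in GF(2^8): the nibble that overflows past bit 7 is folded back in with a carry-less multiply by 0x1d (the low bits of the RAID-6 field polynomial 0x11d), which is what the new PMUL()/vmulq_p8() path computes 16 bytes at a time. A scalar sketch of that single step, for reference only and not part of the patch:

#include <stdint.h>

/* Multiply one GF(2^8) element (polynomial 0x11d) by x^4, i.e. by 16. */
static uint8_t gf256_mul_x4(uint8_t v)
{
        uint8_t hi = v >> 4;            /* bits that would overflow past bit 7 */
        uint8_t lo = (uint8_t)(v << 4); /* low nibble shifted into place */
        uint8_t fold = 0;
        int i;

        /*
         * Carry-less multiply of the overflow by 0x1d folds it back into
         * the field; the product has degree <= 7, so no further reduction
         * is needed.
         */
        for (i = 0; i < 4; i++)
                if (hi & (1 << i))
                        fold ^= (uint8_t)(0x1d << i);

        return lo ^ fold;
}

The switch cases in the patch handle the remaining one, two, or three doubling steps (multiplying by x, x^2, or x^3) with the same shift-and-fold idea.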
diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c
new file mode 100644
index 0000000..eeb5c40
--- /dev/null
+++ b/lib/raid6/recov_neon.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/raid/pq.h>
+
+#ifdef __KERNEL__
+#include <asm/neon.h>
+#else
+#define kernel_neon_begin()
+#define kernel_neon_end()
+#define cpu_has_neon() (1)
+#endif
+
+static int raid6_has_neon(void)
+{
+ return cpu_has_neon();
+}
+
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+ uint8_t *dq, const uint8_t *pbmul,
+ const uint8_t *qmul);
+
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+ const uint8_t *qmul);
+
+static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks - 2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks - 2] = p;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ kernel_neon_begin();
+ __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
+ kernel_neon_end();
+}
+
+static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_neon_begin();
+ __raid6_datap_recov_neon(bytes, p, q, dq, qmul);
+ kernel_neon_end();
+}
+
+const struct raid6_recov_calls raid6_recov_neon = {
+ .data2 = raid6_2data_recov_neon,
+ .datap = raid6_datap_recov_neon,
+ .valid = raid6_has_neon,
+ .name = "neon",
+ .priority = 10,
+};
diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c
new file mode 100644
index 0000000..8cd20c9
--- /dev/null
+++ b/lib/raid6/recov_neon_inner.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <arm_neon.h>
+
+static const uint8x16_t x0f = {
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+};
+
+#ifdef CONFIG_ARM
+/*
+ * AArch32 does not provide this intrinsic natively because it does not
+ * implement the underlying instruction. AArch32 only provides a 64-bit
+ * wide vtbl.8 instruction, so use that instead.
+ */
+static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
+{
+ union {
+ uint8x16_t val;
+ uint8x8x2_t pair;
+ } __a = { a };
+
+ return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
+ vtbl2_u8(__a.pair, vget_high_u8(b)));
+}
+#endif
+
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+ uint8_t *dq, const uint8_t *pbmul,
+ const uint8_t *qmul)
+{
+ uint8x16_t pm0 = vld1q_u8(pbmul);
+ uint8x16_t pm1 = vld1q_u8(pbmul + 16);
+ uint8x16_t qm0 = vld1q_u8(qmul);
+ uint8x16_t qm1 = vld1q_u8(qmul + 16);
+
+ /*
+ * while ( bytes-- ) {
+ * uint8_t px, qx, db;
+ *
+ * px = *p ^ *dp;
+ * qx = qmul[*q ^ *dq];
+ * *dq++ = db = pbmul[px] ^ qx;
+ * *dp++ = db ^ px;
+ * p++; q++;
+ * }
+ */
+
+ while (bytes) {
+ uint8x16_t vx, vy, px, qx, db;
+
+ px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
+ vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
+
+ vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+ vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
+ vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+ qx = veorq_u8(vx, vy);
+
+ vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
+ vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
+ vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
+ vx = veorq_u8(vx, vy);
+ db = veorq_u8(vx, qx);
+
+ vst1q_u8(dq, db);
+ vst1q_u8(dp, veorq_u8(db, px));
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dp += 16;
+ dq += 16;
+ }
+}
+
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+ const uint8_t *qmul)
+{
+ uint8x16_t qm0 = vld1q_u8(qmul);
+ uint8x16_t qm1 = vld1q_u8(qmul + 16);
+
+ /*
+ * while (bytes--) {
+ * *p++ ^= *dq = qmul[*q ^ *dq];
+ * q++; dq++;
+ * }
+ */
+
+ while (bytes) {
+ uint8x16_t vx, vy;
+
+ vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
+
+ vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+ vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
+ vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+ vx = veorq_u8(vx, vy);
+ vy = veorq_u8(vx, vld1q_u8(p));
+
+ vst1q_u8(dq, vx);
+ vst1q_u8(p, vy);
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dq += 16;
+ }
+}
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index a8d74a7..8c6c83e 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -30,6 +30,7 @@
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>
+#include <linux/mem_encrypt.h>
#include <asm/io.h>
#include <asm/dma.h>
@@ -155,6 +156,15 @@ unsigned long swiotlb_size_or_default(void)
return size ? size : (IO_TLB_DEFAULT_SIZE);
}
+void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { }
+
+/* For swiotlb, clear memory encryption mask from dma addresses */
+static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev,
+ phys_addr_t address)
+{
+ return __sme_clr(phys_to_dma(hwdev, address));
+}
+
/* Note that this doesn't work with highmem page */
static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
volatile void *address)
@@ -183,6 +193,31 @@ void swiotlb_print_info(void)
bytes >> 20, vstart, vend - 1);
}
+/*
+ * Early SWIOTLB allocation may be too early to allow an architecture to
+ * perform the desired operations. This function allows the architecture to
+ * call SWIOTLB when the operations are possible. It needs to be called
+ * before the SWIOTLB memory is used.
+ */
+void __init swiotlb_update_mem_attributes(void)
+{
+ void *vaddr;
+ unsigned long bytes;
+
+ if (no_iotlb_memory || late_alloc)
+ return;
+
+ vaddr = phys_to_virt(io_tlb_start);
+ bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+ swiotlb_set_mem_attributes(vaddr, bytes);
+ memset(vaddr, 0, bytes);
+
+ vaddr = phys_to_virt(io_tlb_overflow_buffer);
+ bytes = PAGE_ALIGN(io_tlb_overflow);
+ swiotlb_set_mem_attributes(vaddr, bytes);
+ memset(vaddr, 0, bytes);
+}
+
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
{
void *v_overflow_buffer;
@@ -320,6 +355,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
io_tlb_start = virt_to_phys(tlb);
io_tlb_end = io_tlb_start + bytes;
+ swiotlb_set_mem_attributes(tlb, bytes);
memset(tlb, 0, bytes);
/*
@@ -330,6 +366,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
if (!v_overflow_buffer)
goto cleanup2;
+ swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow);
+ memset(v_overflow_buffer, 0, io_tlb_overflow);
io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
/*
@@ -469,6 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
if (no_iotlb_memory)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ if (sme_active())
+ pr_warn_once("SME is active and system is using DMA bounce buffers\n");
+
mask = dma_get_seg_boundary(hwdev);
tbl_dma_addr &= mask;
@@ -581,7 +622,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
return SWIOTLB_MAP_ERROR;
}
- start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
+ start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start);
return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
dir, attrs);
}
@@ -702,7 +743,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
goto err_warn;
ret = phys_to_virt(paddr);
- dev_addr = phys_to_dma(hwdev, paddr);
+ dev_addr = swiotlb_phys_to_dma(hwdev, paddr);
/* Confirm address can be DMA'd by device */
if (dev_addr + size - 1 > dma_mask) {
@@ -812,10 +853,10 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
map = map_single(dev, phys, size, dir, attrs);
if (map == SWIOTLB_MAP_ERROR) {
swiotlb_full(dev, size, dir, 1);
- return phys_to_dma(dev, io_tlb_overflow_buffer);
+ return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
}
- dev_addr = phys_to_dma(dev, map);
+ dev_addr = swiotlb_phys_to_dma(dev, map);
/* Ensure that the address returned is DMA'ble */
if (dma_capable(dev, dev_addr, size))
@@ -824,7 +865,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
attrs |= DMA_ATTR_SKIP_CPU_SYNC;
swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
- return phys_to_dma(dev, io_tlb_overflow_buffer);
+ return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
}
EXPORT_SYMBOL_GPL(swiotlb_map_page);
@@ -958,7 +999,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
sg_dma_len(sgl) = 0;
return 0;
}
- sg->dma_address = phys_to_dma(hwdev, map);
+ sg->dma_address = swiotlb_phys_to_dma(hwdev, map);
} else
sg->dma_address = dev_addr;
sg_dma_len(sg) = sg->length;
@@ -1026,7 +1067,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
int
swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
{
- return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer));
+ return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer));
}
EXPORT_SYMBOL(swiotlb_dma_mapping_error);
@@ -1039,6 +1080,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error);
int
swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
- return phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+ return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
}
EXPORT_SYMBOL(swiotlb_dma_supported);
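
The swiotlb_set_mem_attributes() stub added above is a __weak no-op so that architectures which encrypt memory by default can override it and make the bounce buffers accessible to devices. As a hedged illustration only (not part of this patch), an SME-capable x86 override could plausibly look like the following, assuming a set_memory_decrypted()-style helper; the real arch implementation may differ:

#include <linux/mm.h>
#include <asm/set_memory.h>

/*
 * Arch override: mark the SWIOTLB bounce buffers as unencrypted so that
 * devices can DMA to and from them. Illustrative sketch only.
 */
void __init swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
{
        WARN(PAGE_ALIGN(size) != size,
             "size is not page-aligned (%#lx)\n", size);

        set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
}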
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index 09371b0..64a4c76 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -19,10 +19,85 @@
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+
+#define TEST_FIRMWARE_NAME "test-firmware.bin"
+#define TEST_FIRMWARE_NUM_REQS 4
static DEFINE_MUTEX(test_fw_mutex);
static const struct firmware *test_firmware;
+struct test_batched_req {
+ u8 idx;
+ int rc;
+ bool sent;
+ const struct firmware *fw;
+ const char *name;
+ struct completion completion;
+ struct task_struct *task;
+ struct device *dev;
+};
+
+/**
+ * test_config - represents configuration for the test for different triggers
+ *
+ * @name: the name of the firmware file to look for
+ * @sync_direct: if true, the sync trigger uses request_firmware_direct()
+ * instead of request_firmware().
+ * @send_uevent: whether or not to send a uevent for async requests
+ * @num_requests: number of requests to try per test case. This is trigger
+ * specific.
+ * @reqs: stores all requests information
+ * @read_fw_idx: index of the request whose firmware results are read back
+ * through the read_firmware trigger.
+ * @test_result: a test may use this to collect the result of the
+ * request_firmware*() calls used in its tests. In order of priority we
+ * always keep the first setup error, if any. If no setup errors were
+ * found then we move on to the first error encountered while running the
+ * API. Note that for async calls this will typically be a successful
+ * result (0) unless of course you've used bogus parameters or the system
+ * is out of memory. In the async case the callback is expected to do a
+ * bit more homework to figure out what happened; unfortunately the only
+ * information passed on error today is the fact that no firmware was
+ * found, so we can only assume -ENOENT on async calls if the firmware is
+ * NULL.
+ *
+ * Errors you can expect:
+ *
+ * API specific:
+ *
+ * 0: success for sync, for async it means request was sent
+ * -EINVAL: invalid parameters or request
+ * -ENOENT: files not found
+ *
+ * System environment:
+ *
+ * -ENOMEM: memory pressure on system
+ * -ENODEV: out of number of devices to test
+ * -EINVAL: an unexpected error has occurred
+ * @req_firmware: if @sync_direct is true this is set to
+ * request_firmware_direct(), otherwise request_firmware()
+ */
+struct test_config {
+ char *name;
+ bool sync_direct;
+ bool send_uevent;
+ u8 num_requests;
+ u8 read_fw_idx;
+
+ /*
+ * These below don't belong here, but we'll move them once we create
+ * a struct fw_test_device and stuff the misc_dev under there later.
+ */
+ struct test_batched_req *reqs;
+ int test_result;
+ int (*req_firmware)(const struct firmware **fw, const char *name,
+ struct device *device);
+};
+
+struct test_config *test_fw_config;
+
static ssize_t test_fw_misc_read(struct file *f, char __user *buf,
size_t size, loff_t *offset)
{
@@ -42,6 +117,338 @@ static const struct file_operations test_fw_fops = {
.read = test_fw_misc_read,
};
+static void __test_release_all_firmware(void)
+{
+ struct test_batched_req *req;
+ u8 i;
+
+ if (!test_fw_config->reqs)
+ return;
+
+ for (i = 0; i < test_fw_config->num_requests; i++) {
+ req = &test_fw_config->reqs[i];
+ if (req->fw)
+ release_firmware(req->fw);
+ }
+
+ vfree(test_fw_config->reqs);
+ test_fw_config->reqs = NULL;
+}
+
+static void test_release_all_firmware(void)
+{
+ mutex_lock(&test_fw_mutex);
+ __test_release_all_firmware();
+ mutex_unlock(&test_fw_mutex);
+}
+
+
+static void __test_firmware_config_free(void)
+{
+ __test_release_all_firmware();
+ kfree_const(test_fw_config->name);
+ test_fw_config->name = NULL;
+}
+
+/*
+ * XXX: move to kstrncpy() once merged.
+ *
+ * Users should use kfree_const() when freeing these.
+ */
+static int __kstrncpy(char **dst, const char *name, size_t count, gfp_t gfp)
+{
+ *dst = kstrndup(name, count, gfp);
+ if (!*dst)
+ return -ENOSPC;
+ return count;
+}
+
+static int __test_firmware_config_init(void)
+{
+ int ret;
+
+ ret = __kstrncpy(&test_fw_config->name, TEST_FIRMWARE_NAME,
+ strlen(TEST_FIRMWARE_NAME), GFP_KERNEL);
+ if (ret < 0)
+ goto out;
+
+ test_fw_config->num_requests = TEST_FIRMWARE_NUM_REQS;
+ test_fw_config->send_uevent = true;
+ test_fw_config->sync_direct = false;
+ test_fw_config->req_firmware = request_firmware;
+ test_fw_config->test_result = 0;
+ test_fw_config->reqs = NULL;
+
+ return 0;
+
+out:
+ __test_firmware_config_free();
+ return ret;
+}
+
+static ssize_t reset_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret;
+
+ mutex_lock(&test_fw_mutex);
+
+ __test_firmware_config_free();
+
+ ret = __test_firmware_config_init();
+ if (ret < 0) {
+ ret = -ENOMEM;
+ pr_err("could not alloc settings for config trigger: %d\n",
+ ret);
+ goto out;
+ }
+
+ pr_info("reset\n");
+ ret = count;
+
+out:
+ mutex_unlock(&test_fw_mutex);
+
+ return ret;
+}
+static DEVICE_ATTR_WO(reset);
+
+static ssize_t config_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ int len = 0;
+
+ mutex_lock(&test_fw_mutex);
+
+ len += snprintf(buf, PAGE_SIZE,
+ "Custom trigger configuration for: %s\n",
+ dev_name(dev));
+
+ if (test_fw_config->name)
+ len += snprintf(buf + len, PAGE_SIZE - len,
+ "name:\t%s\n",
+ test_fw_config->name);
+ else
+ len += snprintf(buf + len, PAGE_SIZE - len,
+ "name:\tEMPTY\n");
+
+ len += snprintf(buf + len, PAGE_SIZE - len,
+ "num_requests:\t%u\n", test_fw_config->num_requests);
+
+ len += snprintf(buf + len, PAGE_SIZE - len,
+ "send_uevent:\t\t%s\n",
+ test_fw_config->send_uevent ?
+ "FW_ACTION_HOTPLUG" :
+ "FW_ACTION_NOHOTPLUG");
+ len += snprintf(buf + len, PAGE_SIZE - len,
+ "sync_direct:\t\t%s\n",
+ test_fw_config->sync_direct ? "true" : "false");
+ len += snprintf(buf + len, PAGE_SIZE - len,
+ "read_fw_idx:\t%u\n", test_fw_config->read_fw_idx);
+
+ mutex_unlock(&test_fw_mutex);
+
+ return len;
+}
+static DEVICE_ATTR_RO(config);
+
+static ssize_t config_name_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret;
+
+ mutex_lock(&test_fw_mutex);
+ kfree_const(test_fw_config->name);
+ ret = __kstrncpy(&test_fw_config->name, buf, count, GFP_KERNEL);
+ mutex_unlock(&test_fw_mutex);
+
+ return ret;
+}
+
+/*
+ * As per sysfs_kf_seq_show() the buf is max PAGE_SIZE.
+ */
+static ssize_t config_test_show_str(char *dst,
+ char *src)
+{
+ int len;
+
+ mutex_lock(&test_fw_mutex);
+ len = snprintf(dst, PAGE_SIZE, "%s\n", src);
+ mutex_unlock(&test_fw_mutex);
+
+ return len;
+}
+
+static int test_dev_config_update_bool(const char *buf, size_t size,
+ bool *cfg)
+{
+ int ret;
+
+ mutex_lock(&test_fw_mutex);
+ if (strtobool(buf, cfg) < 0)
+ ret = -EINVAL;
+ else
+ ret = size;
+ mutex_unlock(&test_fw_mutex);
+
+ return ret;
+}
+
+static ssize_t
+test_dev_config_show_bool(char *buf,
+ bool config)
+{
+ bool val;
+
+ mutex_lock(&test_fw_mutex);
+ val = config;
+ mutex_unlock(&test_fw_mutex);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t test_dev_config_show_int(char *buf, int cfg)
+{
+ int val;
+
+ mutex_lock(&test_fw_mutex);
+ val = cfg;
+ mutex_unlock(&test_fw_mutex);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg)
+{
+ int ret;
+ long new;
+
+ ret = kstrtol(buf, 10, &new);
+ if (ret)
+ return ret;
+
+ if (new > U8_MAX)
+ return -EINVAL;
+
+ mutex_lock(&test_fw_mutex);
+ *(u8 *)cfg = new;
+ mutex_unlock(&test_fw_mutex);
+
+ /* Always return full write size even if we didn't consume all */
+ return size;
+}
+
+static ssize_t test_dev_config_show_u8(char *buf, u8 cfg)
+{
+ u8 val;
+
+ mutex_lock(&test_fw_mutex);
+ val = cfg;
+ mutex_unlock(&test_fw_mutex);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t config_name_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return config_test_show_str(buf, test_fw_config->name);
+}
+static DEVICE_ATTR(config_name, 0644, config_name_show, config_name_store);
+
+static ssize_t config_num_requests_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int rc;
+
+ mutex_lock(&test_fw_mutex);
+ if (test_fw_config->reqs) {
+ pr_err("Must call release_all_firmware prior to changing config\n");
+ rc = -EINVAL;
+ mutex_unlock(&test_fw_mutex);
+ goto out;
+ }
+ mutex_unlock(&test_fw_mutex);
+
+ rc = test_dev_config_update_u8(buf, count,
+ &test_fw_config->num_requests);
+
+out:
+ return rc;
+}
+
+static ssize_t config_num_requests_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return test_dev_config_show_u8(buf, test_fw_config->num_requests);
+}
+static DEVICE_ATTR(config_num_requests, 0644, config_num_requests_show,
+ config_num_requests_store);
+
+static ssize_t config_sync_direct_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int rc = test_dev_config_update_bool(buf, count,
+ &test_fw_config->sync_direct);
+
+ if (rc == count)
+ test_fw_config->req_firmware = test_fw_config->sync_direct ?
+ request_firmware_direct :
+ request_firmware;
+ return rc;
+}
+
+static ssize_t config_sync_direct_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return test_dev_config_show_bool(buf, test_fw_config->sync_direct);
+}
+static DEVICE_ATTR(config_sync_direct, 0644, config_sync_direct_show,
+ config_sync_direct_store);
+
+static ssize_t config_send_uevent_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ return test_dev_config_update_bool(buf, count,
+ &test_fw_config->send_uevent);
+}
+
+static ssize_t config_send_uevent_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return test_dev_config_show_bool(buf, test_fw_config->send_uevent);
+}
+static DEVICE_ATTR(config_send_uevent, 0644, config_send_uevent_show,
+ config_send_uevent_store);
+
+static ssize_t config_read_fw_idx_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ return test_dev_config_update_u8(buf, count,
+ &test_fw_config->read_fw_idx);
+}
+
+static ssize_t config_read_fw_idx_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return test_dev_config_show_u8(buf, test_fw_config->read_fw_idx);
+}
+static DEVICE_ATTR(config_read_fw_idx, 0644, config_read_fw_idx_show,
+ config_read_fw_idx_store);
+
+
static ssize_t trigger_request_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -170,12 +577,301 @@ out:
}
static DEVICE_ATTR_WO(trigger_custom_fallback);
+static int test_fw_run_batch_request(void *data)
+{
+ struct test_batched_req *req = data;
+
+ if (!req) {
+ test_fw_config->test_result = -EINVAL;
+ return -EINVAL;
+ }
+
+ req->rc = test_fw_config->req_firmware(&req->fw, req->name, req->dev);
+ if (req->rc) {
+ pr_info("#%u: batched sync load failed: %d\n",
+ req->idx, req->rc);
+ if (!test_fw_config->test_result)
+ test_fw_config->test_result = req->rc;
+ } else if (req->fw) {
+ req->sent = true;
+ pr_info("#%u: batched sync loaded %zu\n",
+ req->idx, req->fw->size);
+ }
+ complete(&req->completion);
+
+ req->task = NULL;
+
+ return 0;
+}
+
+/*
+ * We use a kthread as otherwise the kernel serializes all our sync requests
+ * and we would not be able to mimic batched requests on a sync call. Batched
+ * requests on a sync call can for instance happen on a device driver when
+ * multiple cards are used and firmware loading happens outside of probe.
+ */
+static ssize_t trigger_batched_requests_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct test_batched_req *req;
+ int rc;
+ u8 i;
+
+ mutex_lock(&test_fw_mutex);
+
+ test_fw_config->reqs = vzalloc(sizeof(struct test_batched_req) *
+ test_fw_config->num_requests * 2);
+ if (!test_fw_config->reqs) {
+ rc = -ENOMEM;
+ goto out_unlock;
+ }
+
+ pr_info("batched sync firmware loading '%s' %u times\n",
+ test_fw_config->name, test_fw_config->num_requests);
+
+ for (i = 0; i < test_fw_config->num_requests; i++) {
+ req = &test_fw_config->reqs[i];
+ if (!req) {
+ WARN_ON(1);
+ rc = -ENOMEM;
+ goto out_bail;
+ }
+ req->fw = NULL;
+ req->idx = i;
+ req->name = test_fw_config->name;
+ req->dev = dev;
+ init_completion(&req->completion);
+ req->task = kthread_run(test_fw_run_batch_request, req,
+ "%s-%u", KBUILD_MODNAME, req->idx);
+ if (!req->task || IS_ERR(req->task)) {
+ pr_err("Setting up thread %u failed\n", req->idx);
+ req->task = NULL;
+ rc = -ENOMEM;
+ goto out_bail;
+ }
+ }
+
+ rc = count;
+
+ /*
+ * We require an explicit release to give later requests more time to
+ * arrive and to improve our chances of forcing a batched request. If we
+ * instead called release_firmware() right away, a successful firmware
+ * request might miss the opportunity to become a batched request.
+ */
+
+out_bail:
+ for (i = 0; i < test_fw_config->num_requests; i++) {
+ req = &test_fw_config->reqs[i];
+ if (req->task || req->sent)
+ wait_for_completion(&req->completion);
+ }
+
+ /* Override any worker error if we had a general setup error */
+ if (rc < 0)
+ test_fw_config->test_result = rc;
+
+out_unlock:
+ mutex_unlock(&test_fw_mutex);
+
+ return rc;
+}
+static DEVICE_ATTR_WO(trigger_batched_requests);
+
+/*
+ * We wait for each callback to return with the lock held, no need to lock here
+ */
+static void trigger_batched_cb(const struct firmware *fw, void *context)
+{
+ struct test_batched_req *req = context;
+
+ if (!req) {
+ test_fw_config->test_result = -EINVAL;
+ return;
+ }
+
+ /* forces *some* batched requests to queue up */
+ if (!req->idx)
+ ssleep(2);
+
+ req->fw = fw;
+
+ /*
+ * Unfortunately the firmware API gives us nothing other than a null FW
+ * if the firmware was not found on async requests. Best we can do is
+ * just assume -ENOENT. A better API would pass the actual return
+ * value to the callback.
+ */
+ if (!fw && !test_fw_config->test_result)
+ test_fw_config->test_result = -ENOENT;
+
+ complete(&req->completion);
+}
+
+static
+ssize_t trigger_batched_requests_async_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct test_batched_req *req;
+ bool send_uevent;
+ int rc;
+ u8 i;
+
+ mutex_lock(&test_fw_mutex);
+
+ test_fw_config->reqs = vzalloc(sizeof(struct test_batched_req) *
+ test_fw_config->num_requests * 2);
+ if (!test_fw_config->reqs) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ pr_info("batched async loading '%s' %u times\n",
+ test_fw_config->name, test_fw_config->num_requests);
+
+ send_uevent = test_fw_config->send_uevent ? FW_ACTION_HOTPLUG :
+ FW_ACTION_NOHOTPLUG;
+
+ for (i = 0; i < test_fw_config->num_requests; i++) {
+ req = &test_fw_config->reqs[i];
+ if (!req) {
+ WARN_ON(1);
+ goto out_bail;
+ }
+ req->name = test_fw_config->name;
+ req->fw = NULL;
+ req->idx = i;
+ init_completion(&req->completion);
+ rc = request_firmware_nowait(THIS_MODULE, send_uevent,
+ req->name,
+ dev, GFP_KERNEL, req,
+ trigger_batched_cb);
+ if (rc) {
+ pr_info("#%u: batched async load failed setup: %d\n",
+ i, rc);
+ req->rc = rc;
+ goto out_bail;
+ } else
+ req->sent = true;
+ }
+
+ rc = count;
+
+out_bail:
+
+ /*
+ * We require an explicit release to give later requests more time to
+ * arrive and to improve our chances of forcing a batched request. If we
+ * instead called release_firmware() right away, a successful firmware
+ * request might miss the opportunity to become a batched request.
+ */
+
+ for (i = 0; i < test_fw_config->num_requests; i++) {
+ req = &test_fw_config->reqs[i];
+ if (req->sent)
+ wait_for_completion(&req->completion);
+ }
+
+ /* Override any worker error if we had a general setup error */
+ if (rc < 0)
+ test_fw_config->test_result = rc;
+
+out:
+ mutex_unlock(&test_fw_mutex);
+
+ return rc;
+}
+static DEVICE_ATTR_WO(trigger_batched_requests_async);
+
+static ssize_t test_result_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return test_dev_config_show_int(buf, test_fw_config->test_result);
+}
+static DEVICE_ATTR_RO(test_result);
+
+static ssize_t release_all_firmware_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ test_release_all_firmware();
+ return count;
+}
+static DEVICE_ATTR_WO(release_all_firmware);
+
+static ssize_t read_firmware_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct test_batched_req *req;
+ u8 idx;
+ ssize_t rc = 0;
+
+ mutex_lock(&test_fw_mutex);
+
+ idx = test_fw_config->read_fw_idx;
+ if (idx >= test_fw_config->num_requests) {
+ rc = -ERANGE;
+ goto out;
+ }
+
+ if (!test_fw_config->reqs) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ req = &test_fw_config->reqs[idx];
+ if (!req->fw) {
+ pr_err("#%u: failed to async load firmware\n", idx);
+ rc = -ENOENT;
+ goto out;
+ }
+
+ pr_info("#%u: loaded %zu\n", idx, req->fw->size);
+
+ if (req->fw->size > PAGE_SIZE) {
+ pr_err("Testing interface must use PAGE_SIZE firmware for now\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ memcpy(buf, req->fw->data, req->fw->size);
+ rc = req->fw->size;
+out:
+ mutex_unlock(&test_fw_mutex);
+
+ return rc;
+}
+static DEVICE_ATTR_RO(read_firmware);
+
#define TEST_FW_DEV_ATTR(name) &dev_attr_##name.attr
static struct attribute *test_dev_attrs[] = {
+ TEST_FW_DEV_ATTR(reset),
+
+ TEST_FW_DEV_ATTR(config),
+ TEST_FW_DEV_ATTR(config_name),
+ TEST_FW_DEV_ATTR(config_num_requests),
+ TEST_FW_DEV_ATTR(config_sync_direct),
+ TEST_FW_DEV_ATTR(config_send_uevent),
+ TEST_FW_DEV_ATTR(config_read_fw_idx),
+
+ /* These don't use the config at all - they could be ported! */
TEST_FW_DEV_ATTR(trigger_request),
TEST_FW_DEV_ATTR(trigger_async_request),
TEST_FW_DEV_ATTR(trigger_custom_fallback),
+
+ /* These use the config and can use the test_result */
+ TEST_FW_DEV_ATTR(trigger_batched_requests),
+ TEST_FW_DEV_ATTR(trigger_batched_requests_async),
+
+ TEST_FW_DEV_ATTR(release_all_firmware),
+ TEST_FW_DEV_ATTR(test_result),
+ TEST_FW_DEV_ATTR(read_firmware),
NULL,
};
@@ -192,8 +888,17 @@ static int __init test_firmware_init(void)
{
int rc;
+ test_fw_config = kzalloc(sizeof(struct test_config), GFP_KERNEL);
+ if (!test_fw_config)
+ return -ENOMEM;
+
+ rc = __test_firmware_config_init();
+ if (rc) {
+ kfree(test_fw_config);
+ return rc;
+ }
+
rc = misc_register(&test_fw_misc_device);
if (rc) {
+ kfree(test_fw_config);
pr_err("could not register misc device: %d\n", rc);
return rc;
}
@@ -207,8 +912,13 @@ module_init(test_firmware_init);
static void __exit test_firmware_exit(void)
{
+ mutex_lock(&test_fw_mutex);
release_firmware(test_firmware);
misc_deregister(&test_fw_misc_device);
+ __test_firmware_config_free();
+ kfree(test_fw_config);
+ mutex_unlock(&test_fw_mutex);
+
pr_warn("removed interface\n");
}
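
The new attributes are exercised from user space through sysfs. As a rough illustration, assuming the usual misc-device path /sys/devices/virtual/misc/test_firmware/ (the path and the small helper below are assumptions, not part of the patch), one batched sync run could be driven like this:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define TEST_DIR "/sys/devices/virtual/misc/test_firmware/"

/* Write a string to one of the test_firmware sysfs attributes. */
static int write_attr(const char *attr, const char *val)
{
        char path[256];
        int fd, ret;

        snprintf(path, sizeof(path), TEST_DIR "%s", attr);
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        ret = write(fd, val, strlen(val)) < 0 ? -1 : 0;
        close(fd);
        return ret;
}

int main(void)
{
        /* Reset the config, request 8 batched sync loads, then release. */
        write_attr("reset", "1");
        write_attr("config_num_requests", "8");
        write_attr("trigger_batched_requests", "1");
        write_attr("release_all_firmware", "1");
        return 0;
}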