35 files changed, 2303 insertions, 158 deletions
diff --git a/lib/Kconfig b/lib/Kconfig
index 0d49ed0..b1445b2 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -570,9 +570,6 @@ config ARCH_HAS_PMEM_API
 config ARCH_HAS_UACCESS_FLUSHCACHE
 	bool
 
-config ARCH_HAS_MMIO_FLUSH
-	bool
-
 config STACKDEPOT
 	bool
 	select STACKTRACE
@@ -586,4 +583,7 @@ config PARMAN
 config PRIME_NUMBERS
 	tristate
 
+config STRING_SELFTEST
+	bool "Test string functions"
+
 endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 98fe715..b19c491 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -374,6 +374,9 @@ config STACK_VALIDATION
 	  pointers (if CONFIG_FRAME_POINTER is enabled).  This helps ensure
 	  that runtime stack traces are more reliable.
 
+	  This is also a prerequisite for generation of ORC unwind data, which
+	  is needed for CONFIG_ORC_UNWINDER.
+
 	  For more information, see
 	  tools/objtool/Documentation/stack-validation.txt.
 
@@ -798,6 +801,13 @@ config HARDLOCKUP_DETECTOR_PERF
 	select SOFTLOCKUP_DETECTOR
 
 #
+# Enables a timestamp based low pass filter to compensate for perf based
+# hard lockup detection which runs too fast due to turbo modes.
+#
+config HARDLOCKUP_CHECK_TIMESTAMP
+	bool
+
+#
 # arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard
 # lockup detector rather than the perf based detector.
 #
@@ -1081,6 +1091,8 @@ config PROVE_LOCKING
 	select DEBUG_MUTEXES
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
+	select LOCKDEP_CROSSRELEASE
+	select LOCKDEP_COMPLETIONS
 	select TRACE_IRQFLAGS
 	default n
 	help
@@ -1121,7 +1133,7 @@ config LOCKDEP
 	bool
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
 	select STACKTRACE
-	select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE
+	select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE && !X86
 	select KALLSYMS
 	select KALLSYMS_ALL
 
@@ -1150,6 +1162,22 @@ config LOCK_STAT
 	 CONFIG_LOCK_STAT defines "contended" and "acquired" lock events.
 	 (CONFIG_LOCKDEP defines "acquire" and "release" events.)
 
+config LOCKDEP_CROSSRELEASE
+	bool
+	help
+	 This makes lockdep work for crosslock which is a lock allowed to
+	 be released in a different context from the acquisition context.
+	 Normally a lock must be released in the context acquiring the lock.
+	 However, relexing this constraint helps synchronization primitives
+	 such as page locks or completions can use the lock correctness
+	 detector, lockdep.
+
+config LOCKDEP_COMPLETIONS
+	bool
+	help
+	 A deadlock caused by wait_for_completion() and complete() can be
+	 detected by lockdep using crossrelease feature.
+
 config DEBUG_LOCKDEP
 	bool "Lock dependency engine debugging"
 	depends on DEBUG_KERNEL && LOCKDEP
@@ -1540,7 +1568,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 	depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
 	depends on !X86_64
 	select STACKTRACE
-	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE
+	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE && !X86
 	help
 	  Provide stacktrace filter for fault-injection capabilities
 
@@ -1549,7 +1577,7 @@ config LATENCYTOP
 	depends on DEBUG_KERNEL
 	depends on STACKTRACE_SUPPORT
 	depends on PROC_FS
-	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
+	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86
 	select KALLSYMS
 	select KALLSYMS_ALL
 	select STACKTRACE
@@ -1902,6 +1930,17 @@ config TEST_KMOD
 
 	  If unsure, say N.
 
+config TEST_DEBUG_VIRTUAL
+	tristate "Test CONFIG_DEBUG_VIRTUAL feature"
+	depends on DEBUG_VIRTUAL
+	help
+	  Test the kernel's ability to detect incorrect calls to
+	  virt_to_phys() done against the non-linear part of the
+	  kernel's virtual address map.
+
+	  If unsure, say N.
+
+
 source "samples/Kconfig"
 
 source "lib/Kconfig.kgdb"
diff --git a/lib/Makefile b/lib/Makefile
index d5c8a4f..dafa796 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
 obj-$(CONFIG_TEST_UUID) += test_uuid.o
 obj-$(CONFIG_TEST_PARMAN) += test_parman.o
 obj-$(CONFIG_TEST_KMOD) += test_kmod.o
+obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
diff --git a/lib/assoc_array.c b/lib/assoc_array.c
index 59fd7c0..155c55d 100644
--- a/lib/assoc_array.c
+++ b/lib/assoc_array.c
@@ -1,6 +1,6 @@
 /* Generic associative array implementation.
  *
- * See Documentation/assoc_array.txt for information.
+ * See Documentation/core-api/assoc_array.rst for information.
  *
  * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 9a53280..c82c61b 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -513,7 +513,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
 		int nmaskbits)
 {
 	unsigned int a, b, old_a, old_b;
-	unsigned int group_size, used_size;
+	unsigned int group_size, used_size, off;
 	int c, old_c, totaldigits, ndigits;
 	const char __user __force *ubuf = (const char __user __force *)buf;
 	int at_start, in_range, in_partial_range;
@@ -599,6 +599,8 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
 			a = old_a;
 			b = old_b;
 			old_a = old_b = 0;
+		} else {
+			used_size = group_size = b - a + 1;
 		}
 		/* if no digit is after '-', it's wrong*/
 		if (at_start && in_range)
@@ -608,17 +610,9 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
 		if (b >= nmaskbits)
 			return -ERANGE;
 		while (a <= b) {
-			if (in_partial_range) {
-				static int pos_in_group = 1;
-
-				if (pos_in_group <= used_size)
-					set_bit(a, maskp);
-
-				if (a == b || ++pos_in_group > group_size)
-					pos_in_group = 1;
-			} else
-				set_bit(a, maskp);
-			a++;
+			off = min(b - a + 1, used_size);
+			bitmap_set(maskp, a, off);
+			a += group_size;
 		}
 	} while (buflen && c == ',');
 	return 0;
diff --git a/lib/cmdline.c b/lib/cmdline.c
index 4c0888c..171c19b 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -244,5 +244,4 @@ char *next_arg(char *args, char **param, char **val)
 
 	/* Chew up trailing spaces. */
 	return skip_spaces(next);
-	//return next;
 }
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 4731a08..8b1a1bd 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -6,6 +6,22 @@
 #include <linux/bootmem.h>
 
 /**
+ * cpumask_next - get the next cpu in a cpumask
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @srcp: the cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set.
+ */
+unsigned int cpumask_next(int n, const struct cpumask *srcp)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
+	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
+}
+EXPORT_SYMBOL(cpumask_next);
+
+/**
  * cpumask_next_and - get the next cpu in *src1p & *src2p
  * @n: the cpu prior to the place to search (ie. return will be > @n)
  * @src1p: the first cpumask pointer
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 17afb04..2f5349c 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -18,6 +18,7 @@
 #include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/hash.h>
+#include <linux/kmemleak.h>
 
 #define ODEBUG_HASH_BITS	14
 #define ODEBUG_HASH_SIZE	(1 << ODEBUG_HASH_BITS)
@@ -110,6 +111,7 @@ static void fill_pool(void)
 		if (!new)
 			return;
 
+		kmemleak_ignore(new);
 		raw_spin_lock_irqsave(&pool_lock, flags);
 		hlist_add_head(&new->node, &obj_pool);
 		debug_objects_allocated++;
@@ -1080,6 +1082,7 @@ static int __init debug_objects_replace_static_objects(void)
 		obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL);
 		if (!obj)
 			goto free;
+		kmemleak_ignore(obj);
 		hlist_add_head(&obj->node, &objects);
 	}
 
diff --git a/lib/errseq.c b/lib/errseq.c
index 841fa24..7b900c2 100644
--- a/lib/errseq.c
+++ b/lib/errseq.c
@@ -41,23 +41,20 @@
 #define ERRSEQ_CTR_INC		(1 << (ERRSEQ_SHIFT + 1))
 
 /**
- * __errseq_set - set a errseq_t for later reporting
+ * errseq_set - set a errseq_t for later reporting
  * @eseq: errseq_t field that should be set
- * @err: error to set
+ * @err: error to set (must be between -1 and -MAX_ERRNO)
  *
  * This function sets the error in *eseq, and increments the sequence counter
  * if the last sequence was sampled at some point in the past.
  *
  * Any error set will always overwrite an existing error.
  *
- * Most callers will want to use the errseq_set inline wrapper to efficiently
- * handle the common case where err is 0.
- *
- * We do return an errseq_t here, primarily for debugging purposes. The return
- * value should not be used as a previously sampled value in later calls as it
- * will not have the SEEN flag set.
+ * We do return the latest value here, primarily for debugging purposes. The
+ * return value should not be used as a previously sampled value in later calls
+ * as it will not have the SEEN flag set.
  */
-errseq_t __errseq_set(errseq_t *eseq, int err)
+errseq_t errseq_set(errseq_t *eseq, int err)
 {
 	errseq_t cur, old;
 
@@ -107,7 +104,7 @@ errseq_t __errseq_set(errseq_t *eseq, int err)
 	}
 	return cur;
 }
-EXPORT_SYMBOL(__errseq_set);
+EXPORT_SYMBOL(errseq_set);
 
 /**
  * errseq_sample - grab current errseq_t value
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 992457b..81b70ed 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -9,6 +9,7 @@
 
 #include <linux/types.h>
 #include <linux/ctype.h>
+#include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <asm/unaligned.h>
@@ -42,7 +43,7 @@ EXPORT_SYMBOL(hex_to_bin);
  * @src: ascii hexadecimal string
  * @count: result length
  *
- * Return 0 on success, -1 in case of bad input.
+ * Return 0 on success, -EINVAL in case of bad input.
  */
 int hex2bin(u8 *dst, const char *src, size_t count)
 {
@@ -51,7 +52,7 @@ int hex2bin(u8 *dst, const char *src, size_t count)
 		int lo = hex_to_bin(*src++);
 
 		if ((hi < 0) || (lo < 0))
-			return -1;
+			return -EINVAL;
 
 		*dst++ = (hi << 4) | lo;
 	}
diff --git a/lib/idr.c b/lib/idr.c
index b13682b..f9adf48 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -7,45 +7,32 @@
 DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap);
 static DEFINE_SPINLOCK(simple_ida_lock);
 
-/**
- * idr_alloc - allocate an id
- * @idr: idr handle
- * @ptr: pointer to be associated with the new id
- * @start: the minimum id (inclusive)
- * @end: the maximum id (exclusive)
- * @gfp: memory allocation flags
- *
- * Allocates an unused ID in the range [start, end).  Returns -ENOSPC
- * if there are no unused IDs in that range.
- *
- * Note that @end is treated as max when <= 0.  This is to always allow
- * using @start + N as @end as long as N is inside integer range.
- *
- * Simultaneous modifications to the @idr are not allowed and should be
- * prevented by the user, usually with a lock.  idr_alloc() may be called
- * concurrently with read-only accesses to the @idr, such as idr_find() and
- * idr_for_each_entry().
- */
-int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
+int idr_alloc_cmn(struct idr *idr, void *ptr, unsigned long *index,
+		  unsigned long start, unsigned long end, gfp_t gfp,
+		  bool ext)
 {
-	void __rcu **slot;
 	struct radix_tree_iter iter;
+	void __rcu **slot;
 
-	if (WARN_ON_ONCE(start < 0))
-		return -EINVAL;
 	if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
 		return -EINVAL;
 
 	radix_tree_iter_init(&iter, start);
-	slot = idr_get_free(&idr->idr_rt, &iter, gfp, end);
+	if (ext)
+		slot = idr_get_free_ext(&idr->idr_rt, &iter, gfp, end);
+	else
+		slot = idr_get_free(&idr->idr_rt, &iter, gfp, end);
 	if (IS_ERR(slot))
 		return PTR_ERR(slot);
 
 	radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
 	radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);
-	return iter.index;
+
+	if (index)
+		*index = iter.index;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(idr_alloc);
+EXPORT_SYMBOL_GPL(idr_alloc_cmn);
 
 /**
  * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion
@@ -134,6 +121,20 @@ void *idr_get_next(struct idr *idr, int *nextid)
 }
 EXPORT_SYMBOL(idr_get_next);
 
+void *idr_get_next_ext(struct idr *idr, unsigned long *nextid)
+{
+	struct radix_tree_iter iter;
+	void __rcu **slot;
+
+	slot = radix_tree_iter_find(&idr->idr_rt, &iter, *nextid);
+	if (!slot)
+		return NULL;
+
+	*nextid = iter.index;
+	return rcu_dereference_raw(*slot);
+}
+EXPORT_SYMBOL(idr_get_next_ext);
+
 /**
  * idr_replace - replace pointer for given id
  * @idr: idr handle
@@ -150,12 +151,19 @@ EXPORT_SYMBOL(idr_get_next);
  */
 void *idr_replace(struct idr *idr, void *ptr, int id)
 {
+	if (id < 0)
+		return ERR_PTR(-EINVAL);
+
+	return idr_replace_ext(idr, ptr, id);
+}
+EXPORT_SYMBOL(idr_replace);
+
+void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id)
+{
 	struct radix_tree_node *node;
 	void __rcu **slot = NULL;
 	void *entry;
 
-	if (WARN_ON_ONCE(id < 0))
-		return ERR_PTR(-EINVAL);
 	if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
 		return ERR_PTR(-EINVAL);
 
@@ -167,7 +175,7 @@ void *idr_replace(struct idr *idr, void *ptr, int id)
 
 	return entry;
 }
-EXPORT_SYMBOL(idr_replace);
+EXPORT_SYMBOL(idr_replace_ext);
 
 /**
  * DOC: IDA description
diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c
index df495fe..0e343fd 100644
--- a/lib/interval_tree_test.c
+++ b/lib/interval_tree_test.c
@@ -19,14 +19,14 @@ __param(bool, search_all, false, "Searches will iterate all nodes in the tree");
 
 __param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint");
 
-static struct rb_root root = RB_ROOT;
+static struct rb_root_cached root = RB_ROOT_CACHED;
 static struct interval_tree_node *nodes = NULL;
 static u32 *queries = NULL;
 
 static struct rnd_state rnd;
 
 static inline unsigned long
-search(struct rb_root *root, unsigned long start, unsigned long last)
+search(struct rb_root_cached *root, unsigned long start, unsigned long last)
 {
 	struct interval_tree_node *node;
 	unsigned long results = 0;
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 719c155..e590523 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -52,6 +52,8 @@ static const char *kobject_actions[] = {
 	[KOBJ_MOVE] =		"move",
 	[KOBJ_ONLINE] =		"online",
 	[KOBJ_OFFLINE] =	"offline",
+	[KOBJ_BIND] =		"bind",
+	[KOBJ_UNBIND] =		"unbind",
 };
 
 static int kobject_action_type(const char *buf, size_t count,
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 6f2b135..cd0b5c9 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -363,6 +363,103 @@ static void rsem_AA3(void)
 }
 
 /*
+ * read_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		write_lock(A)
+ */
+static void rlock_ABBA1(void)
+{
+	RL(X1);
+	L(Y1);
+	U(Y1);
+	RU(X1);
+
+	L(Y1);
+	WL(X1);
+	WU(X1);
+	U(Y1); // should fail
+}
+
+static void rwsem_ABBA1(void)
+{
+	RSL(X1);
+	ML(Y1);
+	MU(Y1);
+	RSU(X1);
+
+	ML(Y1);
+	WSL(X1);
+	WSU(X1);
+	MU(Y1); // should fail
+}
+
+/*
+ * read_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		read_lock(A)
+ */
+static void rlock_ABBA2(void)
+{
+	RL(X1);
+	L(Y1);
+	U(Y1);
+	RU(X1);
+
+	L(Y1);
+	RL(X1);
+	RU(X1);
+	U(Y1); // should NOT fail
+}
+
+static void rwsem_ABBA2(void)
+{
+	RSL(X1);
+	ML(Y1);
+	MU(Y1);
+	RSU(X1);
+
+	ML(Y1);
+	RSL(X1);
+	RSU(X1);
+	MU(Y1); // should fail
+}
+
+
+/*
+ * write_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		write_lock(A)
+ */
+static void rlock_ABBA3(void)
+{
+	WL(X1);
+	L(Y1);
+	U(Y1);
+	WU(X1);
+
+	L(Y1);
+	WL(X1);
+	WU(X1);
+	U(Y1); // should fail
+}
+
+static void rwsem_ABBA3(void)
+{
+	WSL(X1);
+	ML(Y1);
+	MU(Y1);
+	WSU(X1);
+
+	ML(Y1);
+	WSL(X1);
+	WSU(X1);
+	MU(Y1); // should fail
+}
+
+/*
  * ABBA deadlock:
  */
 
@@ -1056,8 +1153,6 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
 	if (debug_locks != expected) {
 		unexpected_testcase_failures++;
 		pr_cont("FAILED|");
-
-		dump_stack();
 	} else {
 		testcase_successes++;
 		pr_cont("  ok  |");
@@ -1933,6 +2028,30 @@ void locking_selftest(void)
 	dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
 	pr_cont("\n");
 
+	print_testname("mixed read-lock/lock-write ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK);
+	/*
+	 * Lockdep does indeed fail here, but there's nothing we can do about
+	 * that now.  Don't kill lockdep for it.
+	 */
+	unexpected_testcase_failures--;
+
+	pr_cont("             |");
+	dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM);
+
+	print_testname("mixed read-lock/lock-read ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA2, SUCCESS, LOCKTYPE_RWLOCK);
+	pr_cont("             |");
+	dotest(rwsem_ABBA2, FAILURE, LOCKTYPE_RWSEM);
+
+	print_testname("mixed write-lock/lock-write ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA3, FAILURE, LOCKTYPE_RWLOCK);
+	pr_cont("             |");
+	dotest(rwsem_ABBA3, FAILURE, LOCKTYPE_RWSEM);
+
 	printk("  --------------------------------------------------------------------------\n");
 
 	/*
diff --git a/lib/mpi/longlong.h b/lib/mpi/longlong.h
index 9333650..57fd45a 100644
--- a/lib/mpi/longlong.h
+++ b/lib/mpi/longlong.h
@@ -176,8 +176,8 @@ extern UDItype __udiv_qrnnd(UDItype *, UDItype, UDItype, UDItype);
 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
 	__asm__ ("adds %1, %4, %5\n" \
 		"adc  %0, %2, %3" \
-	: "=r" ((USItype)(sh)), \
-		"=&r" ((USItype)(sl)) \
+	: "=r" (sh), \
+		"=&r" (sl) \
 	: "%r" ((USItype)(ah)), \
 		"rI" ((USItype)(bh)), \
 		"%r" ((USItype)(al)), \
@@ -185,15 +185,15 @@ extern UDItype __udiv_qrnnd(UDItype *, UDItype, UDItype, UDItype);
 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
 	__asm__ ("subs %1, %4, %5\n" \
 		"sbc  %0, %2, %3" \
-	: "=r" ((USItype)(sh)), \
-		"=&r" ((USItype)(sl)) \
+	: "=r" (sh), \
+		"=&r" (sl) \
 	: "r" ((USItype)(ah)), \
 		"rI" ((USItype)(bh)), \
 		"r" ((USItype)(al)), \
 		"rI" ((USItype)(bl)))
 #if defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__
 #define umul_ppmm(xh, xl, a, b) \
-	__asm__ ("%@ Inlined umul_ppmm\n" \
+	__asm__ ("@ Inlined umul_ppmm\n" \
 		"mov	%|r0, %2, lsr #16		@ AAAA\n" \
 		"mov	%|r2, %3, lsr #16		@ BBBB\n" \
 		"bic	%|r1, %2, %|r0, lsl #16		@ aaaa\n" \
@@ -206,19 +206,19 @@ extern UDItype __udiv_qrnnd(UDItype *, UDItype, UDItype, UDItype);
 		"addcs	%|r2, %|r2, #65536\n" \
 		"adds	%1, %|r1, %|r0, lsl #16\n" \
 		"adc	%0, %|r2, %|r0, lsr #16" \
-	: "=&r" ((USItype)(xh)), \
-		"=r" ((USItype)(xl)) \
+	: "=&r" (xh), \
+		"=r" (xl) \
 	: "r" ((USItype)(a)), \
 		"r" ((USItype)(b)) \
 	: "r0", "r1", "r2")
 #else
 #define umul_ppmm(xh, xl, a, b) \
-	__asm__ ("%@ Inlined umul_ppmm\n" \
-		"umull %r1, %r0, %r2, %r3" \
-	: "=&r" ((USItype)(xh)), \
-			"=&r" ((USItype)(xl)) \
+	__asm__ ("@ Inlined umul_ppmm\n" \
+		"umull %1, %0, %2, %3" \
+	: "=&r" (xh), \
+		"=&r" (xl) \
 	: "r" ((USItype)(a)), \
-			"r" ((USItype)(b)) \
+		"r" ((USItype)(b)) \
 	: "r0", "r1")
 #endif
 #define UMUL_TIME 20
diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c
index 5a0f75a..eead4b3 100644
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -364,11 +364,11 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes)
 	}
 
 	miter.consumed = lzeros;
-	sg_miter_stop(&miter);
 
 	nbytes -= lzeros;
 	nbits = nbytes * 8;
 	if (nbits > MAX_EXTERN_MPI_BITS) {
+		sg_miter_stop(&miter);
 		pr_info("MPI: mpi too large (%u bits)\n", nbits);
 		return NULL;
 	}
@@ -376,6 +376,8 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes)
 	if (nbytes > 0)
 		nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8);
 
+	sg_miter_stop(&miter);
+
 	nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB);
 	val = mpi_alloc(nlimbs);
 	if (!val)
diff --git a/lib/nlattr.c b/lib/nlattr.c
index fb52435..927c2f1 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -27,6 +27,30 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
 	[NLA_S64]	= sizeof(s64),
 };
 
+static int validate_nla_bitfield32(const struct nlattr *nla,
+				   u32 *valid_flags_allowed)
+{
+	const struct nla_bitfield32 *bf = nla_data(nla);
+	u32 *valid_flags_mask = valid_flags_allowed;
+
+	if (!valid_flags_allowed)
+		return -EINVAL;
+
+	/*disallow invalid bit selector */
+	if (bf->selector & ~*valid_flags_mask)
+		return -EINVAL;
+
+	/*disallow invalid bit values */
+	if (bf->value & ~*valid_flags_mask)
+		return -EINVAL;
+
+	/*disallow valid bit values that are not selected*/
+	if (bf->value & ~bf->selector)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int validate_nla(const struct nlattr *nla, int maxtype,
 			const struct nla_policy *policy)
 {
@@ -46,6 +70,12 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 			return -ERANGE;
 		break;
 
+	case NLA_BITFIELD32:
+		if (attrlen != sizeof(struct nla_bitfield32))
+			return -ERANGE;
+
+		return validate_nla_bitfield32(nla, pt->validation_data);
+
 	case NLA_NUL_STRING:
 		if (pt->len)
 			minlen = min_t(int, attrlen, pt->len + 1);
@@ -272,6 +302,30 @@ size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize)
 EXPORT_SYMBOL(nla_strlcpy);
 
 /**
+ * nla_strdup - Copy string attribute payload into a newly allocated buffer
+ * @nla: attribute to copy the string from
+ * @flags: the type of memory to allocate (see kmalloc).
+ *
+ * Returns a pointer to the allocated buffer or NULL on error.
+ */
+char *nla_strdup(const struct nlattr *nla, gfp_t flags)
+{
+	size_t srclen = nla_len(nla);
+	char *src = nla_data(nla), *dst;
+
+	if (srclen > 0 && src[srclen - 1] == '\0')
+		srclen--;
+
+	dst = kmalloc(srclen + 1, flags);
+	if (dst != NULL) {
+		memcpy(dst, src, srclen);
+		dst[srclen] = '\0';
+	}
+	return dst;
+}
+EXPORT_SYMBOL(nla_strdup);
+
+/**
  * nla_memcpy - Copy a netlink attribute into another memory area
  * @dest: where to copy to memcpy
  * @src: netlink attribute to copy from
diff --git a/lib/oid_registry.c b/lib/oid_registry.c
index 318f382..41b9e50 100644
--- a/lib/oid_registry.c
+++ b/lib/oid_registry.c
@@ -142,9 +142,9 @@ int sprint_oid(const void *data, size_t datasize, char *buffer, size_t bufsize)
 		}
 		ret += count = snprintf(buffer, bufsize, ".%lu", num);
 		buffer += count;
-		bufsize -= count;
-		if (bufsize == 0)
+		if (bufsize <= count)
 			return -ENOBUFS;
+		bufsize -= count;
 	}
 
 	return ret;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 898e879..8b1feca 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -463,7 +463,7 @@ radix_tree_node_free(struct radix_tree_node *node)
  * To make use of this facility, the radix tree must be initialised without
  * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
  */
-static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
+static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
 {
 	struct radix_tree_preload *rtp;
 	struct radix_tree_node *node;
@@ -2022,6 +2022,7 @@ void radix_tree_iter_delete(struct radix_tree_root *root,
 	if (__radix_tree_delete(root, iter->node, slot))
 		iter->index = iter->next_index;
 }
+EXPORT_SYMBOL(radix_tree_iter_delete);
 
 /**
  * radix_tree_delete_item - delete an item from a radix tree
@@ -2103,7 +2104,8 @@ EXPORT_SYMBOL(radix_tree_tagged);
  */
 void idr_preload(gfp_t gfp_mask)
 {
-	__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE);
+	if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
+		preempt_disable();
 }
 EXPORT_SYMBOL(idr_preload);
 
@@ -2117,13 +2119,13 @@ EXPORT_SYMBOL(idr_preload);
  */
 int ida_pre_get(struct ida *ida, gfp_t gfp)
 {
-	__radix_tree_preload(gfp, IDA_PRELOAD_SIZE);
 	/*
 	 * The IDA API has no preload_end() equivalent.  Instead,
 	 * ida_get_new() can return -EAGAIN, prompting the caller
 	 * to return to the ida_pre_get() step.
 	 */
-	preempt_enable();
+	if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
+		preempt_enable();
 
 	if (!this_cpu_read(ida_bitmap)) {
 		struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
@@ -2137,13 +2139,13 @@ int ida_pre_get(struct ida *ida, gfp_t gfp)
 }
 EXPORT_SYMBOL(ida_pre_get);
 
-void __rcu **idr_get_free(struct radix_tree_root *root,
-			struct radix_tree_iter *iter, gfp_t gfp, int end)
+void __rcu **idr_get_free_cmn(struct radix_tree_root *root,
+			      struct radix_tree_iter *iter, gfp_t gfp,
+			      unsigned long max)
 {
 	struct radix_tree_node *node = NULL, *child;
 	void __rcu **slot = (void __rcu **)&root->rnode;
 	unsigned long maxindex, start = iter->next_index;
-	unsigned long max = end > 0 ? end - 1 : INT_MAX;
 	unsigned int shift, offset = 0;
 
  grow:
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 3057011..a93adf6 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -5,7 +5,7 @@ raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
-raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
+raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
 
@@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding
 ifeq ($(ARCH),arm)
 NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
 endif
+CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
 ifeq ($(ARCH),arm64)
+CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 7857049..4769947 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -113,6 +113,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #ifdef CONFIG_S390
 	&raid6_recov_s390xc,
 #endif
+#if defined(CONFIG_KERNEL_MODE_NEON)
+	&raid6_recov_neon,
+#endif
 	&raid6_recov_intx1,
 	NULL
 };
diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c
index f524a79..46df797 100644
--- a/lib/raid6/avx512.c
+++ b/lib/raid6/avx512.c
@@ -29,7 +29,7 @@
 
 static const struct raid6_avx512_constants {
 	u64 x1d[8];
-} raid6_avx512_constants __aligned(512) = {
+} raid6_avx512_constants __aligned(512/8) = {
 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc
index 4fa51b7..d5242f5 100644
--- a/lib/raid6/neon.uc
+++ b/lib/raid6/neon.uc
@@ -46,8 +46,12 @@ static inline unative_t SHLBYTE(unative_t v)
  */
 static inline unative_t MASK(unative_t v)
 {
-	const uint8x16_t temp = NBYTES(0);
-	return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp);
+	return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
+}
+
+static inline unative_t PMUL(unative_t v, unative_t u)
+{
+	return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);
 }
 
 void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
@@ -110,7 +114,30 @@ void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
 			wq$$ = veorq_u8(w1$$, wd$$);
 		}
 		/* P/Q left side optimization */
-		for ( z = start-1 ; z >= 0 ; z-- ) {
+		for ( z = start-1 ; z >= 3 ; z -= 4 ) {
+			w2$$ = vshrq_n_u8(wq$$, 4);
+			w1$$ = vshlq_n_u8(wq$$, 4);
+
+			w2$$ = PMUL(w2$$, x1d);
+			wq$$ = veorq_u8(w1$$, w2$$);
+		}
+
+		switch (z) {
+		case 2:
+			w2$$ = vshrq_n_u8(wq$$, 5);
+			w1$$ = vshlq_n_u8(wq$$, 3);
+
+			w2$$ = PMUL(w2$$, x1d);
+			wq$$ = veorq_u8(w1$$, w2$$);
+			break;
+		case 1:
+			w2$$ = vshrq_n_u8(wq$$, 6);
+			w1$$ = vshlq_n_u8(wq$$, 2);
+
+			w2$$ = PMUL(w2$$, x1d);
+			wq$$ = veorq_u8(w1$$, w2$$);
+			break;
+		case 0:
 			w2$$ = MASK(wq$$);
 			w1$$ = SHLBYTE(wq$$);
 
diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c
new file mode 100644
index 0000000..eeb5c40
--- /dev/null
+++ b/lib/raid6/recov_neon.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/raid/pq.h>
+
+#ifdef __KERNEL__
+#include <asm/neon.h>
+#else
+#define kernel_neon_begin()
+#define kernel_neon_end()
+#define cpu_has_neon()		(1)
+#endif
+
+static int raid6_has_neon(void)
+{
+	return cpu_has_neon();
+}
+
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+			      uint8_t *dq, const uint8_t *pbmul,
+			      const uint8_t *qmul);
+
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+			      const uint8_t *qmul);
+
+static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
+		int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]     = dp;
+	ptrs[failb]     = dq;
+	ptrs[disks - 2] = p;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+					 raid6_gfexp[failb]]];
+
+	kernel_neon_begin();
+	__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
+	kernel_neon_end();
+}
+
+static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
+		void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]     = dq;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_neon_begin();
+	__raid6_datap_recov_neon(bytes, p, q, dq, qmul);
+	kernel_neon_end();
+}
+
+const struct raid6_recov_calls raid6_recov_neon = {
+	.data2		= raid6_2data_recov_neon,
+	.datap		= raid6_datap_recov_neon,
+	.valid		= raid6_has_neon,
+	.name		= "neon",
+	.priority	= 10,
+};
diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c
new file mode 100644
index 0000000..8cd20c9
--- /dev/null
+++ b/lib/raid6/recov_neon_inner.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <arm_neon.h>
+
+static const uint8x16_t x0f = {
+	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+};
+
+#ifdef CONFIG_ARM
+/*
+ * AArch32 does not provide this intrinsic natively because it does not
+ * implement the underlying instruction. AArch32 only provides a 64-bit
+ * wide vtbl.8 instruction, so use that instead.
+ */
+static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
+{
+	union {
+		uint8x16_t	val;
+		uint8x8x2_t	pair;
+	} __a = { a };
+
+	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
+			   vtbl2_u8(__a.pair, vget_high_u8(b)));
+}
+#endif
+
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+			      uint8_t *dq, const uint8_t *pbmul,
+			      const uint8_t *qmul)
+{
+	uint8x16_t pm0 = vld1q_u8(pbmul);
+	uint8x16_t pm1 = vld1q_u8(pbmul + 16);
+	uint8x16_t qm0 = vld1q_u8(qmul);
+	uint8x16_t qm1 = vld1q_u8(qmul + 16);
+
+	/*
+	 * while ( bytes-- ) {
+	 *	uint8_t px, qx, db;
+	 *
+	 *	px    = *p ^ *dp;
+	 *	qx    = qmul[*q ^ *dq];
+	 *	*dq++ = db = pbmul[px] ^ qx;
+	 *	*dp++ = db ^ px;
+	 *	p++; q++;
+	 * }
+	 */
+
+	while (bytes) {
+		uint8x16_t vx, vy, px, qx, db;
+
+		px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
+		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
+
+		vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
+		vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+		qx = veorq_u8(vx, vy);
+
+		vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
+		vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
+		vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
+		vx = veorq_u8(vx, vy);
+		db = veorq_u8(vx, qx);
+
+		vst1q_u8(dq, db);
+		vst1q_u8(dp, veorq_u8(db, px));
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dp += 16;
+		dq += 16;
+	}
+}
+
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+			      const uint8_t *qmul)
+{
+	uint8x16_t qm0 = vld1q_u8(qmul);
+	uint8x16_t qm1 = vld1q_u8(qmul + 16);
+
+	/*
+	 * while (bytes--) {
+	 *	*p++ ^= *dq = qmul[*q ^ *dq];
+	 *	q++; dq++;
+	 * }
+	 */
+
+	while (bytes) {
+		uint8x16_t vx, vy;
+
+		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
+
+		vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
+		vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+		vx = veorq_u8(vx, vy);
+		vy = veorq_u8(vx, vld1q_u8(p));
+
+		vst1q_u8(dq, vx);
+		vst1q_u8(p, vy);
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dq += 16;
+	}
+}
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 4ba2828..ba4a9d1 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -95,22 +95,35 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
 
 static __always_inline void
 __rb_insert(struct rb_node *node, struct rb_root *root,
+	    bool newleft, struct rb_node **leftmost,
 	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
 	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
 
+	if (newleft)
+		*leftmost = node;
+
 	while (true) {
 		/*
-		 * Loop invariant: node is red
-		 *
-		 * If there is a black parent, we are done.
-		 * Otherwise, take some corrective action as we don't
-		 * want a red root or two consecutive red nodes.
+		 * Loop invariant: node is red.
 		 */
-		if (!parent) {
+		if (unlikely(!parent)) {
+			/*
+			 * The inserted node is root. Either this is the
+			 * first node, or we recursed at Case 1 below and
+			 * are no longer violating 4).
+			 */
 			rb_set_parent_color(node, NULL, RB_BLACK);
 			break;
-		} else if (rb_is_black(parent))
+		}
+
+		/*
+		 * If there is a black parent, we are done.
+		 * Otherwise, take some corrective action as,
+		 * per 4), we don't want a red root or two
+		 * consecutive red nodes.
+		 */
+		if(rb_is_black(parent))
 			break;
 
 		gparent = rb_red_parent(parent);
@@ -119,7 +132,7 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 		if (parent != tmp) {	/* parent == gparent->rb_left */
 			if (tmp && rb_is_red(tmp)) {
 				/*
-				 * Case 1 - color flips
+				 * Case 1 - node's uncle is red (color flips).
 				 *
 				 *       G            g
 				 *      / \          / \
@@ -142,7 +155,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			tmp = parent->rb_right;
 			if (node == tmp) {
 				/*
-				 * Case 2 - left rotate at parent
+				 * Case 2 - node's uncle is black and node is
+				 * the parent's right child (left rotate at parent).
 				 *
 				 *      G             G
 				 *     / \           / \
@@ -166,7 +180,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			}
 
 			/*
-			 * Case 3 - right rotate at gparent
+			 * Case 3 - node's uncle is black and node is
+			 * the parent's left child (right rotate at gparent).
 			 *
 			 *        G           P
 			 *       / \         / \
@@ -434,19 +449,38 @@ static const struct rb_augment_callbacks dummy_callbacks = {
 
 void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
-	__rb_insert(node, root, dummy_rotate);
+	__rb_insert(node, root, false, NULL, dummy_rotate);
 }
 EXPORT_SYMBOL(rb_insert_color);
 
 void rb_erase(struct rb_node *node, struct rb_root *root)
 {
 	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
+	rebalance = __rb_erase_augmented(node, root,
+					 NULL, &dummy_callbacks);
 	if (rebalance)
 		____rb_erase_color(rebalance, root, dummy_rotate);
 }
 EXPORT_SYMBOL(rb_erase);
 
+void rb_insert_color_cached(struct rb_node *node,
+			    struct rb_root_cached *root, bool leftmost)
+{
+	__rb_insert(node, &root->rb_root, leftmost,
+		    &root->rb_leftmost, dummy_rotate);
+}
+EXPORT_SYMBOL(rb_insert_color_cached);
+
+void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
+{
+	struct rb_node *rebalance;
+	rebalance = __rb_erase_augmented(node, &root->rb_root,
+					 &root->rb_leftmost, &dummy_callbacks);
+	if (rebalance)
+		____rb_erase_color(rebalance, &root->rb_root, dummy_rotate);
+}
+EXPORT_SYMBOL(rb_erase_cached);
+
 /*
  * Augmented rbtree manipulation functions.
  *
@@ -455,9 +489,10 @@ EXPORT_SYMBOL(rb_erase);
  */
 
 void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+			   bool newleft, struct rb_node **leftmost,
 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-	__rb_insert(node, root, augment_rotate);
+	__rb_insert(node, root, newleft, leftmost, augment_rotate);
 }
 EXPORT_SYMBOL(__rb_insert_augmented);
 
@@ -502,7 +537,7 @@ struct rb_node *rb_next(const struct rb_node *node)
 	 * as we can.
 	 */
 	if (node->rb_right) {
-		node = node->rb_right; 
+		node = node->rb_right;
 		while (node->rb_left)
 			node=node->rb_left;
 		return (struct rb_node *)node;
@@ -534,7 +569,7 @@ struct rb_node *rb_prev(const struct rb_node *node)
 	 * as we can.
 	 */
 	if (node->rb_left) {
-		node = node->rb_left; 
+		node = node->rb_left;
 		while (node->rb_right)
 			node=node->rb_right;
 		return (struct rb_node *)node;
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 8b3c9dc..191a238 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -1,11 +1,18 @@
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/rbtree_augmented.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 #include <asm/timex.h>
 
-#define NODES       100
-#define PERF_LOOPS  100000
-#define CHECK_LOOPS 100
+#define __param(type, name, init, msg)		\
+	static type name = init;		\
+	module_param(name, type, 0444);		\
+	MODULE_PARM_DESC(name, msg);
+
+__param(int, nnodes, 100, "Number of nodes in the rb-tree");
+__param(int, perf_loops, 100000, "Number of iterations modifying the rb-tree");
+__param(int, check_loops, 100, "Number of iterations modifying and verifying the rb-tree");
 
 struct test_node {
 	u32 key;
@@ -16,14 +23,14 @@ struct test_node {
 	u32 augmented;
 };
 
-static struct rb_root root = RB_ROOT;
-static struct test_node nodes[NODES];
+static struct rb_root_cached root = RB_ROOT_CACHED;
+static struct test_node *nodes = NULL;
 
 static struct rnd_state rnd;
 
-static void insert(struct test_node *node, struct rb_root *root)
+static void insert(struct test_node *node, struct rb_root_cached *root)
 {
-	struct rb_node **new = &root->rb_node, *parent = NULL;
+	struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
 	u32 key = node->key;
 
 	while (*new) {
@@ -35,14 +42,40 @@ static void insert(struct test_node *node, struct rb_root *root)
 	}
 
 	rb_link_node(&node->rb, parent, new);
-	rb_insert_color(&node->rb, root);
+	rb_insert_color(&node->rb, &root->rb_root);
+}
+
+static void insert_cached(struct test_node *node, struct rb_root_cached *root)
+{
+	struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
+	u32 key = node->key;
+	bool leftmost = true;
+
+	while (*new) {
+		parent = *new;
+		if (key < rb_entry(parent, struct test_node, rb)->key)
+			new = &parent->rb_left;
+		else {
+			new = &parent->rb_right;
+			leftmost = false;
+		}
+	}
+
+	rb_link_node(&node->rb, parent, new);
+	rb_insert_color_cached(&node->rb, root, leftmost);
 }
 
-static inline void erase(struct test_node *node, struct rb_root *root)
+static inline void erase(struct test_node *node, struct rb_root_cached *root)
 {
-	rb_erase(&node->rb, root);
+	rb_erase(&node->rb, &root->rb_root);
 }
 
+static inline void erase_cached(struct test_node *node, struct rb_root_cached *root)
+{
+	rb_erase_cached(&node->rb, root);
+}
+
+
 static inline u32 augment_recompute(struct test_node *node)
 {
 	u32 max = node->val, child_augmented;
@@ -64,9 +97,10 @@ static inline u32 augment_recompute(struct test_node *node)
 RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb,
 		     u32, augmented, augment_recompute)
 
-static void insert_augmented(struct test_node *node, struct rb_root *root)
+static void insert_augmented(struct test_node *node,
+			     struct rb_root_cached *root)
 {
-	struct rb_node **new = &root->rb_node, *rb_parent = NULL;
+	struct rb_node **new = &root->rb_root.rb_node, *rb_parent = NULL;
 	u32 key = node->key;
 	u32 val = node->val;
 	struct test_node *parent;
@@ -84,18 +118,53 @@ static void insert_augmented(struct test_node *node, struct rb_root *root)
 
 	node->augmented = val;
 	rb_link_node(&node->rb, rb_parent, new);
-	rb_insert_augmented(&node->rb, root, &augment_callbacks);
+	rb_insert_augmented(&node->rb, &root->rb_root, &augment_callbacks);
+}
+
+static void insert_augmented_cached(struct test_node *node,
+				    struct rb_root_cached *root)
+{
+	struct rb_node **new = &root->rb_root.rb_node, *rb_parent = NULL;
+	u32 key = node->key;
+	u32 val = node->val;
+	struct test_node *parent;
+	bool leftmost = true;
+
+	while (*new) {
+		rb_parent = *new;
+		parent = rb_entry(rb_parent, struct test_node, rb);
+		if (parent->augmented < val)
+			parent->augmented = val;
+		if (key < parent->key)
+			new = &parent->rb.rb_left;
+		else {
+			new = &parent->rb.rb_right;
+			leftmost = false;
+		}
+	}
+
+	node->augmented = val;
+	rb_link_node(&node->rb, rb_parent, new);
+	rb_insert_augmented_cached(&node->rb, root,
+				   leftmost, &augment_callbacks);
+}
+
+
+static void erase_augmented(struct test_node *node, struct rb_root_cached *root)
+{
+	rb_erase_augmented(&node->rb, &root->rb_root, &augment_callbacks);
 }
 
-static void erase_augmented(struct test_node *node, struct rb_root *root)
+static void erase_augmented_cached(struct test_node *node,
+				   struct rb_root_cached *root)
 {
-	rb_erase_augmented(&node->rb, root, &augment_callbacks);
+	rb_erase_augmented_cached(&node->rb, root, &augment_callbacks);
 }
 
 static void init(void)
 {
 	int i;
-	for (i = 0; i < NODES; i++) {
+	for (i = 0; i < nnodes; i++) {
 		nodes[i].key = prandom_u32_state(&rnd);
 		nodes[i].val = prandom_u32_state(&rnd);
 	}
@@ -118,7 +187,7 @@ static void check_postorder_foreach(int nr_nodes)
 {
 	struct test_node *cur, *n;
 	int count = 0;
-	rbtree_postorder_for_each_entry_safe(cur, n, &root, rb)
+	rbtree_postorder_for_each_entry_safe(cur, n, &root.rb_root, rb)
 		count++;
 
 	WARN_ON_ONCE(count != nr_nodes);
@@ -128,7 +197,7 @@ static void check_postorder(int nr_nodes)
 {
 	struct rb_node *rb;
 	int count = 0;
-	for (rb = rb_first_postorder(&root); rb; rb = rb_next_postorder(rb))
+	for (rb = rb_first_postorder(&root.rb_root); rb; rb = rb_next_postorder(rb))
 		count++;
 
 	WARN_ON_ONCE(count != nr_nodes);
@@ -140,7 +209,7 @@ static void check(int nr_nodes)
 	int count = 0, blacks = 0;
 	u32 prev_key = 0;
 
-	for (rb = rb_first(&root); rb; rb = rb_next(rb)) {
+	for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) {
 		struct test_node *node = rb_entry(rb, struct test_node, rb);
 		WARN_ON_ONCE(node->key < prev_key);
 		WARN_ON_ONCE(is_red(rb) &&
@@ -155,7 +224,7 @@ static void check(int nr_nodes)
 	}
 
 	WARN_ON_ONCE(count != nr_nodes);
-	WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1);
+	WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root.rb_root))) - 1);
 
 	check_postorder(nr_nodes);
 	check_postorder_foreach(nr_nodes);
@@ -166,7 +235,7 @@ static void check_augmented(int nr_nodes)
 	struct rb_node *rb;
 
 	check(nr_nodes);
-	for (rb = rb_first(&root); rb; rb = rb_next(rb)) {
+	for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) {
 		struct test_node *node = rb_entry(rb, struct test_node, rb);
 		WARN_ON_ONCE(node->augmented != augment_recompute(node));
 	}
@@ -176,6 +245,11 @@ static int __init rbtree_test_init(void)
 {
 	int i, j;
 	cycles_t time1, time2, time;
+	struct rb_node *node;
+
+	nodes = kmalloc(nnodes * sizeof(*nodes), GFP_KERNEL);
+	if (!nodes)
+		return -ENOMEM;
 
 	printk(KERN_ALERT "rbtree testing");
 
@@ -184,27 +258,88 @@ static int __init rbtree_test_init(void)
 
 	time1 = get_cycles();
 
-	for (i = 0; i < PERF_LOOPS; i++) {
-		for (j = 0; j < NODES; j++)
+	for (i = 0; i < perf_loops; i++) {
+		for (j = 0; j < nnodes; j++)
 			insert(nodes + j, &root);
-		for (j = 0; j < NODES; j++)
+		for (j = 0; j < nnodes; j++)
 			erase(nodes + j, &root);
 	}
 
 	time2 = get_cycles();
 	time = time2 - time1;
 
-	time = div_u64(time, PERF_LOOPS);
-	printk(" -> %llu cycles\n", (unsigned long long)time);
+	time = div_u64(time, perf_loops);
+	printk(" -> test 1 (latency of nnodes insert+delete): %llu cycles\n",
+	       (unsigned long long)time);
+
+	time1 = get_cycles();
+
+	for (i = 0; i < perf_loops; i++) {
+		for (j = 0; j < nnodes; j++)
+			insert_cached(nodes + j, &root);
+		for (j = 0; j < nnodes; j++)
+			erase_cached(nodes + j, &root);
+	}
+
+	time2 = get_cycles();
+	time = time2 - time1;
+
+	time = div_u64(time, perf_loops);
+	printk(" -> test 2 (latency of nnodes cached insert+delete): %llu cycles\n",
+	       (unsigned long long)time);
+
+	for (i = 0; i < nnodes; i++)
+		insert(nodes + i, &root);
+
+	time1 = get_cycles();
+
+	for (i = 0; i < perf_loops; i++) {
+		for (node = rb_first(&root.rb_root); node; node = rb_next(node))
+			;
+	}
+
+	time2 = get_cycles();
+	time = time2 - time1;
+
+	time = div_u64(time, perf_loops);
+	printk(" -> test 3 (latency of inorder traversal): %llu cycles\n",
+	       (unsigned long long)time);
+
+	time1 = get_cycles();
+
+	for (i = 0; i < perf_loops; i++)
+		node = rb_first(&root.rb_root);
+
+	time2 = get_cycles();
+	time = time2 - time1;
+
+	time = div_u64(time, perf_loops);
+	printk(" -> test 4 (latency to fetch first node)\n");
+	printk("        non-cached: %llu cycles\n", (unsigned long long)time);
+
+	time1 = get_cycles();
+
+	for (i = 0; i < perf_loops; i++)
+		node = rb_first_cached(&root);
+
+	time2 = get_cycles();
+	time = time2 - time1;
+
+	time = div_u64(time, perf_loops);
+	printk("        cached: %llu cycles\n", (unsigned long long)time);
 
-	for (i = 0; i < CHECK_LOOPS; i++) {
+	for (i = 0; i < nnodes; i++)
+		erase(nodes + i, &root);
+
+	/* run checks */
+	for (i = 0; i < check_loops; i++) {
 		init();
-		for (j = 0; j < NODES; j++) {
+		for (j = 0; j < nnodes; j++) {
 			check(j);
 			insert(nodes + j, &root);
 		}
-		for (j = 0; j < NODES; j++) {
-			check(NODES - j);
+		for (j = 0; j < nnodes; j++) {
+			check(nnodes - j);
 			erase(nodes + j, &root);
 		}
 		check(0);
@@ -216,32 +351,49 @@ static int __init rbtree_test_init(void)
 
 	time1 = get_cycles();
 
-	for (i = 0; i < PERF_LOOPS; i++) {
-		for (j = 0; j < NODES; j++)
+	for (i = 0; i < perf_loops; i++) {
+		for (j = 0; j < nnodes; j++)
 			insert_augmented(nodes + j, &root);
-		for (j = 0; j < NODES; j++)
+		for (j = 0; j < nnodes; j++)
 			erase_augmented(nodes + j, &root);
 	}
 
 	time2 = get_cycles();
 	time = time2 - time1;
 
-	time = div_u64(time, PERF_LOOPS);
-	printk(" -> %llu cycles\n", (unsigned long long)time);
+	time = div_u64(time, perf_loops);
+	printk(" -> test 1 (latency of nnodes insert+delete): %llu cycles\n", (unsigned long long)time);
+
+	time1 = get_cycles();
+
+	for (i = 0; i < perf_loops; i++) {
+		for (j = 0; j < nnodes; j++)
+			insert_augmented_cached(nodes + j, &root);
+		for (j = 0; j < nnodes; j++)
+			erase_augmented_cached(nodes + j, &root);
+	}
+
+	time2 = get_cycles();
+	time = time2 - time1;
+
+	time = div_u64(time, perf_loops);
+	printk(" -> test 2 (latency of nnodes cached insert+delete): %llu cycles\n", (unsigned long long)time);
 
-	for (i = 0; i < CHECK_LOOPS; i++) {
+	for (i = 0; i < check_loops; i++) {
 		init();
-		for (j = 0; j < NODES; j++) {
+		for (j = 0; j < nnodes; j++) {
 			check_augmented(j);
 			insert_augmented(nodes + j, &root);
 		}
-		for (j = 0; j < NODES; j++) {
-			check_augmented(NODES - j);
+		for (j = 0; j < nnodes; j++) {
+			check_augmented(nnodes - j);
 			erase_augmented(nodes + j, &root);
 		}
 		check_augmented(0);
 	}
 
+	kfree(nodes);
+
 	return -EAGAIN; /* Fail will directly unload the module */
 }
 
diff --git a/lib/string.c b/lib/string.c
index ebbb99c..9921dc2 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -723,6 +723,72 @@ void memzero_explicit(void *s, size_t count)
 }
 EXPORT_SYMBOL(memzero_explicit);
 
+#ifndef __HAVE_ARCH_MEMSET16
+/**
+ * memset16() - Fill a memory area with a uint16_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint16_t instead
+ * of a byte.  Remember that @count is the number of uint16_ts to
+ * store, not the number of bytes.
+ */
+void *memset16(uint16_t *s, uint16_t v, size_t count)
+{
+	uint16_t *xs = s;
+
+	while (count--)
+		*xs++ = v;
+	return s;
+}
+EXPORT_SYMBOL(memset16);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET32
+/**
+ * memset32() - Fill a memory area with a uint32_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint32_t instead
+ * of a byte.  Remember that @count is the number of uint32_ts to
+ * store, not the number of bytes.
+ */
+void *memset32(uint32_t *s, uint32_t v, size_t count)
+{
+	uint32_t *xs = s;
+
+	while (count--)
+		*xs++ = v;
+	return s;
+}
+EXPORT_SYMBOL(memset32);
+#endif
+
+#ifndef __HAVE_ARCH_MEMSET64
+/**
+ * memset64() - Fill a memory area with a uint64_t
+ * @s: Pointer to the start of the area.
+ * @v: The value to fill the area with
+ * @count: The number of values to store
+ *
+ * Differs from memset() in that it fills with a uint64_t instead
+ * of a byte.  Remember that @count is the number of uint64_ts to
+ * store, not the number of bytes.
+ */
+void *memset64(uint64_t *s, uint64_t v, size_t count)
+{
+	uint64_t *xs = s;
+
+	while (count--)
+		*xs++ = v;
+	return s;
+}
+EXPORT_SYMBOL(memset64);
+#endif
+
 #ifndef __HAVE_ARCH_MEMCPY
 /**
  * memcpy - Copy one area of memory to another
@@ -985,3 +1051,144 @@ void fortify_panic(const char *name)
 	BUG();
 }
 EXPORT_SYMBOL(fortify_panic);
+
+#ifdef CONFIG_STRING_SELFTEST
+#include <linux/slab.h>
+#include <linux/module.h>
+
+static __init int memset16_selftest(void)
+{
+	unsigned i, j, k;
+	u16 v, *p;
+
+	p = kmalloc(256 * 2 * 2, GFP_KERNEL);
+	if (!p)
+		return -1;
+
+	for (i = 0; i < 256; i++) {
+		for (j = 0; j < 256; j++) {
+			memset(p, 0xa1, 256 * 2 * sizeof(v));
+			memset16(p + i, 0xb1b2, j);
+			for (k = 0; k < 512; k++) {
+				v = p[k];
+				if (k < i) {
+					if (v != 0xa1a1)
+						goto fail;
+				} else if (k < i + j) {
+					if (v != 0xb1b2)
+						goto fail;
+				} else {
+					if (v != 0xa1a1)
+						goto fail;
+				}
+			}
+		}
+	}
+
+fail:
+	kfree(p);
+	if (i < 256)
+		return (i << 24) | (j << 16) | k;
+	return 0;
+}
+
+static __init int memset32_selftest(void)
+{
+	unsigned i, j, k;
+	u32 v, *p;
+
+	p = kmalloc(256 * 2 * 4, GFP_KERNEL);
+	if (!p)
+		return -1;
+
+	for (i = 0; i < 256; i++) {
+		for (j = 0; j < 256; j++) {
+			memset(p, 0xa1, 256 * 2 * sizeof(v));
+			memset32(p + i, 0xb1b2b3b4, j);
+			for (k = 0; k < 512; k++) {
+				v = p[k];
+				if (k < i) {
+					if (v != 0xa1a1a1a1)
+						goto fail;
+				} else if (k < i + j) {
+					if (v != 0xb1b2b3b4)
+						goto fail;
+				} else {
+					if (v != 0xa1a1a1a1)
+						goto fail;
+				}
+			}
+		}
+	}
+
+fail:
+	kfree(p);
+	if (i < 256)
+		return (i << 24) | (j << 16) | k;
+	return 0;
+}
+
+static __init int memset64_selftest(void)
+{
+	unsigned i, j, k;
+	u64 v, *p;
+
+	p = kmalloc(256 * 2 * 8, GFP_KERNEL);
+	if (!p)
+		return -1;
+
+	for (i = 0; i < 256; i++) {
+		for (j = 0; j < 256; j++) {
+			memset(p, 0xa1, 256 * 2 * sizeof(v));
+			memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j);
+			for (k = 0; k < 512; k++) {
+				v = p[k];
+				if (k < i) {
+					if (v != 0xa1a1a1a1a1a1a1a1ULL)
+						goto fail;
+				} else if (k < i + j) {
+					if (v != 0xb1b2b3b4b5b6b7b8ULL)
+						goto fail;
+				} else {
+					if (v != 0xa1a1a1a1a1a1a1a1ULL)
+						goto fail;
+				}
+			}
+		}
+	}
+
+fail:
+	kfree(p);
+	if (i < 256)
+		return (i << 24) | (j << 16) | k;
+	return 0;
+}
+
+static __init int string_selftest_init(void)
+{
+	int test, subtest;
+
+	test = 1;
+	subtest = memset16_selftest();
+	if (subtest)
+		goto fail;
+
+	test = 2;
+	subtest = memset32_selftest();
+	if (subtest)
+		goto fail;
+
+	test = 3;
+	subtest = memset64_selftest();
+	if (subtest)
+		goto fail;
+
+	pr_info("String selftests succeeded\n");
+	return 0;
+fail:
+	pr_crit("String selftest failure %d.%08x\n", test, subtest);
+	return 0;
+}
+
+module_init(string_selftest_init);
+#endif	/* CONFIG_STRING_SELFTEST */
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index ecaac2c..29c490e 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -576,7 +576,7 @@ char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp)
 	char *buffer, *quoted;
 	int i, res;
 
-	buffer = kmalloc(PAGE_SIZE, GFP_TEMPORARY);
+	buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buffer)
 		return NULL;
 
@@ -612,7 +612,7 @@ char *kstrdup_quotable_file(struct file *file, gfp_t gfp)
 		return kstrdup("<unknown>", gfp);
 
 	/* We add 11 spaces for ' (deleted)' to be appended */
-	temp = kmalloc(PATH_MAX + 11, GFP_TEMPORARY);
+	temp = kmalloc(PATH_MAX + 11, GFP_KERNEL);
 	if (!temp)
 		return kstrdup("<no_memory>", gfp);
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index a8d74a7..8c6c83e 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <linux/gfp.h>
 #include <linux/scatterlist.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -155,6 +156,15 @@ unsigned long swiotlb_size_or_default(void)
 	return size ? size : (IO_TLB_DEFAULT_SIZE);
 }
 
+void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { }
+
+/* For swiotlb, clear memory encryption mask from dma addresses */
+static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev,
+				      phys_addr_t address)
+{
+	return __sme_clr(phys_to_dma(hwdev, address));
+}
+
 /* Note that this doesn't work with highmem page */
 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
 				      volatile void *address)
@@ -183,6 +193,31 @@ void swiotlb_print_info(void)
 	       bytes >> 20, vstart, vend - 1);
 }
 
+/*
+ * Early SWIOTLB allocation may be too early to allow an architecture to
+ * perform the desired operations.  This function allows the architecture to
+ * call SWIOTLB when the operations are possible.  It needs to be called
+ * before the SWIOTLB memory is used.
+ */
+void __init swiotlb_update_mem_attributes(void)
+{
+	void *vaddr;
+	unsigned long bytes;
+
+	if (no_iotlb_memory || late_alloc)
+		return;
+
+	vaddr = phys_to_virt(io_tlb_start);
+	bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+	swiotlb_set_mem_attributes(vaddr, bytes);
+	memset(vaddr, 0, bytes);
+
+	vaddr = phys_to_virt(io_tlb_overflow_buffer);
+	bytes = PAGE_ALIGN(io_tlb_overflow);
+	swiotlb_set_mem_attributes(vaddr, bytes);
+	memset(vaddr, 0, bytes);
+}
+
 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
 	void *v_overflow_buffer;
@@ -320,6 +355,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	io_tlb_start = virt_to_phys(tlb);
 	io_tlb_end = io_tlb_start + bytes;
 
+	swiotlb_set_mem_attributes(tlb, bytes);
 	memset(tlb, 0, bytes);
 
 	/*
@@ -330,6 +366,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	if (!v_overflow_buffer)
 		goto cleanup2;
 
+	swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow);
+	memset(v_overflow_buffer, 0, io_tlb_overflow);
 	io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
 
 	/*
@@ -469,6 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 	if (no_iotlb_memory)
 		panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
 
+	if (sme_active())
+		pr_warn_once("SME is active and system is using DMA bounce buffers\n");
+
 	mask = dma_get_seg_boundary(hwdev);
 
 	tbl_dma_addr &= mask;
@@ -581,7 +622,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
 		return SWIOTLB_MAP_ERROR;
 	}
 
-	start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
+	start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start);
 	return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
 				      dir, attrs);
 }
@@ -702,7 +743,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			goto err_warn;
 
 		ret = phys_to_virt(paddr);
-		dev_addr = phys_to_dma(hwdev, paddr);
+		dev_addr = swiotlb_phys_to_dma(hwdev, paddr);
 
 		/* Confirm address can be DMA'd by device */
 		if (dev_addr + size - 1 > dma_mask) {
@@ -812,10 +853,10 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	map = map_single(dev, phys, size, dir, attrs);
 	if (map == SWIOTLB_MAP_ERROR) {
 		swiotlb_full(dev, size, dir, 1);
-		return phys_to_dma(dev, io_tlb_overflow_buffer);
+		return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
 	}
 
-	dev_addr = phys_to_dma(dev, map);
+	dev_addr = swiotlb_phys_to_dma(dev, map);
 
 	/* Ensure that the address returned is DMA'ble */
 	if (dma_capable(dev, dev_addr, size))
@@ -824,7 +865,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	attrs |= DMA_ATTR_SKIP_CPU_SYNC;
 	swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
 
-	return phys_to_dma(dev, io_tlb_overflow_buffer);
+	return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
 }
 EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
@@ -958,7 +999,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 				sg_dma_len(sgl) = 0;
 				return 0;
 			}
-			sg->dma_address = phys_to_dma(hwdev, map);
+			sg->dma_address = swiotlb_phys_to_dma(hwdev, map);
 		} else
 			sg->dma_address = dev_addr;
 		sg_dma_len(sg) = sg->length;
@@ -1026,7 +1067,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
-	return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer));
+	return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer));
 }
 EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 
@@ -1039,6 +1080,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 int
 swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-	return phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+	return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
 EXPORT_SYMBOL(swiotlb_dma_supported);
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 2526a29..aa1f266 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -165,6 +165,96 @@ static void __init test_zero_fill_copy(void)
 	expect_eq_pbl("128-1023", bmap2, 1024);
 }
 
+#define PARSE_TIME 0x1
+
+struct test_bitmap_parselist{
+	const int errno;
+	const char *in;
+	const unsigned long *expected;
+	const int nbits;
+	const int flags;
+};
+
+static const unsigned long exp[] __initconst = {
+	BITMAP_FROM_U64(1),
+	BITMAP_FROM_U64(2),
+	BITMAP_FROM_U64(0x0000ffff),
+	BITMAP_FROM_U64(0xffff0000),
+	BITMAP_FROM_U64(0x55555555),
+	BITMAP_FROM_U64(0xaaaaaaaa),
+	BITMAP_FROM_U64(0x11111111),
+	BITMAP_FROM_U64(0x22222222),
+	BITMAP_FROM_U64(0xffffffff),
+	BITMAP_FROM_U64(0xfffffffe),
+	BITMAP_FROM_U64(0x3333333311111111ULL),
+	BITMAP_FROM_U64(0xffffffff77777777ULL)
+};
+
+static const unsigned long exp2[] __initconst = {
+	BITMAP_FROM_U64(0x3333333311111111ULL),
+	BITMAP_FROM_U64(0xffffffff77777777ULL)
+};
+
+static const struct test_bitmap_parselist parselist_tests[] __initconst = {
+#define step (sizeof(u64) / sizeof(unsigned long))
+
+	{0, "0",			&exp[0], 8, 0},
+	{0, "1",			&exp[1 * step], 8, 0},
+	{0, "0-15",			&exp[2 * step], 32, 0},
+	{0, "16-31",			&exp[3 * step], 32, 0},
+	{0, "0-31:1/2",			&exp[4 * step], 32, 0},
+	{0, "1-31:1/2",			&exp[5 * step], 32, 0},
+	{0, "0-31:1/4",			&exp[6 * step], 32, 0},
+	{0, "1-31:1/4",			&exp[7 * step], 32, 0},
+	{0, "0-31:4/4",			&exp[8 * step], 32, 0},
+	{0, "1-31:4/4",			&exp[9 * step], 32, 0},
+	{0, "0-31:1/4,32-63:2/4",	&exp[10 * step], 64, 0},
+	{0, "0-31:3/4,32-63:4/4",	&exp[11 * step], 64, 0},
+
+	{0, "0-31:1/4,32-63:2/4,64-95:3/4,96-127:4/4",	exp2, 128, 0},
+
+	{0, "0-2047:128/256", NULL, 2048, PARSE_TIME},
+
+	{-EINVAL, "-1",	NULL, 8, 0},
+	{-EINVAL, "-0",	NULL, 8, 0},
+	{-EINVAL, "10-1", NULL, 8, 0},
+	{-EINVAL, "0-31:10/1", NULL, 8, 0},
+};
+
+static void __init test_bitmap_parselist(void)
+{
+	int i;
+	int err;
+	cycles_t cycles;
+	DECLARE_BITMAP(bmap, 2048);
+
+	for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) {
+#define ptest parselist_tests[i]
+
+		cycles = get_cycles();
+		err = bitmap_parselist(ptest.in, bmap, ptest.nbits);
+		cycles = get_cycles() - cycles;
+
+		if (err != ptest.errno) {
+			pr_err("test %d: input is %s, errno is %d, expected %d\n",
+					i, ptest.in, err, ptest.errno);
+			continue;
+		}
+
+		if (!err && ptest.expected
+			 && !__bitmap_equal(bmap, ptest.expected, ptest.nbits)) {
+			pr_err("test %d: input is %s, result is 0x%lx, expected 0x%lx\n",
+					i, ptest.in, bmap[0], *ptest.expected);
+			continue;
+		}
+
+		if (ptest.flags & PARSE_TIME)
+			pr_err("test %d: input is '%s' OK, Time: %llu\n",
+					i, ptest.in,
+					(unsigned long long)cycles);
+	}
+}
+
 static void __init test_bitmap_u32_array_conversions(void)
 {
 	DECLARE_BITMAP(bmap1, 1024);
@@ -365,6 +455,7 @@ static int __init test_bitmap_init(void)
 {
 	test_zero_fill_copy();
 	test_bitmap_u32_array_conversions();
+	test_bitmap_parselist();
 	test_mem_optimisations();
 
 	if (failed_tests == 0)
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index d9d5a41..aa8812a 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -952,6 +952,32 @@ static struct bpf_test tests[] = {
 		{ { 2, 0 }, { 3, 1 }, { 4, MAX_K } },
 	},
 	{
+		"JGE (jt 0), test 1",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_LEN, 0),
+			BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2),
+			BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1),
+			BPF_STMT(BPF_RET | BPF_K, 1),
+			BPF_STMT(BPF_RET | BPF_K, MAX_K)
+		},
+		CLASSIC,
+		{ 4, 4, 4, 3, 3 },
+		{ { 2, 0 }, { 3, 1 }, { 4, 1 } },
+	},
+	{
+		"JGE (jt 0), test 2",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_LEN, 0),
+			BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2),
+			BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1),
+			BPF_STMT(BPF_RET | BPF_K, 1),
+			BPF_STMT(BPF_RET | BPF_K, MAX_K)
+		},
+		CLASSIC,
+		{ 4, 4, 5, 3, 3 },
+		{ { 4, 1 }, { 5, 1 }, { 6, MAX_K } },
+	},
+	{
 		"JGE",
 		.u.insns = {
 			BPF_STMT(BPF_LDX | BPF_LEN, 0),
@@ -4492,6 +4518,35 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JSLT | BPF_K */
+	{
+		"JMP_JSLT_K: Signed jump: if (-2 < -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xfffffffffffffffeLL),
+			BPF_JMP_IMM(BPF_JSLT, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLT_K: Signed jump: if (-1 < -1) return 0",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_JMP_IMM(BPF_JSLT, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JSGT | BPF_K */
 	{
 		"JMP_JSGT_K: Signed jump: if (-1 > -2) return 1",
@@ -4521,6 +4576,73 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JSLE | BPF_K */
+	{
+		"JMP_JSLE_K: Signed jump: if (-2 <= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xfffffffffffffffeLL),
+			BPF_JMP_IMM(BPF_JSLE, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLE_K: Signed jump: if (-1 <= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_JMP_IMM(BPF_JSLE, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLE_K: Signed jump: value walk 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 6),
+			BPF_ALU64_IMM(BPF_SUB, R1, 1),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 4),
+			BPF_ALU64_IMM(BPF_SUB, R1, 1),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 2),
+			BPF_ALU64_IMM(BPF_SUB, R1, 1),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 1),
+			BPF_EXIT_INSN(),		/* bad exit */
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),	/* good exit */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLE_K: Signed jump: value walk 2",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 4),
+			BPF_ALU64_IMM(BPF_SUB, R1, 2),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 2),
+			BPF_ALU64_IMM(BPF_SUB, R1, 2),
+			BPF_JMP_IMM(BPF_JSLE, R1, 0, 1),
+			BPF_EXIT_INSN(),		/* bad exit */
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),	/* good exit */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JSGE | BPF_K */
 	{
 		"JMP_JSGE_K: Signed jump: if (-1 >= -2) return 1",
@@ -4617,6 +4739,35 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLT | BPF_K */
+	{
+		"JMP_JLT_K: if (2 < 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 2),
+			BPF_JMP_IMM(BPF_JLT, R1, 3, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JGT_K: Unsigned jump: if (1 < -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 1),
+			BPF_JMP_IMM(BPF_JLT, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JGE | BPF_K */
 	{
 		"JMP_JGE_K: if (3 >= 2) return 1",
@@ -4632,6 +4783,21 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLE | BPF_K */
+	{
+		"JMP_JLE_K: if (2 <= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 2),
+			BPF_JMP_IMM(BPF_JLE, R1, 3, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JGT | BPF_K jump backwards */
 	{
 		"JMP_JGT_K: if (3 > 2) return 1 (jump backwards)",
@@ -4662,6 +4828,36 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLT | BPF_K jump backwards */
+	{
+		"JMP_JGT_K: if (2 < 3) return 1 (jump backwards)",
+		.u.insns_int = {
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2), /* goto start */
+			BPF_ALU32_IMM(BPF_MOV, R0, 1), /* out: */
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0), /* start: */
+			BPF_LD_IMM64(R1, 2), /* note: this takes 2 insns */
+			BPF_JMP_IMM(BPF_JLT, R1, 3, -6), /* goto out */
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JLE_K: if (3 <= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JLE, R1, 3, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JNE | BPF_K */
 	{
 		"JMP_JNE_K: if (3 != 2) return 1",
@@ -4752,6 +4948,37 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JSLT | BPF_X */
+	{
+		"JMP_JSLT_X: Signed jump: if (-2 < -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -2),
+			BPF_JMP_REG(BPF_JSLT, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLT_X: Signed jump: if (-1 < -1) return 0",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -1),
+			BPF_JMP_REG(BPF_JSLT, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JSGE | BPF_X */
 	{
 		"JMP_JSGE_X: Signed jump: if (-1 >= -2) return 1",
@@ -4783,6 +5010,37 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JSLE | BPF_X */
+	{
+		"JMP_JSLE_X: Signed jump: if (-2 <= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -2),
+			BPF_JMP_REG(BPF_JSLE, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSLE_X: Signed jump: if (-1 <= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -1),
+			BPF_JMP_REG(BPF_JSLE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JGT | BPF_X */
 	{
 		"JMP_JGT_X: if (3 > 2) return 1",
@@ -4814,6 +5072,37 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLT | BPF_X */
+	{
+		"JMP_JLT_X: if (2 < 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLT, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JLT_X: Unsigned jump: if (1 < -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, 1),
+			BPF_JMP_REG(BPF_JLT, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JGE | BPF_X */
 	{
 		"JMP_JGE_X: if (3 >= 2) return 1",
@@ -4845,6 +5134,37 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	/* BPF_JMP | BPF_JLE | BPF_X */
+	{
+		"JMP_JLE_X: if (2 <= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLE, R2, R1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JLE_X: if (3 <= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 3),
+			BPF_JMP_REG(BPF_JLE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	{
 		/* Mainly testing JIT + imm64 here. */
 		"JMP_JGE_X: ldimm64 test 1",
@@ -4890,6 +5210,50 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	{
+		"JMP_JLE_X: ldimm64 test 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLE, R2, R1, 2),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
+			BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xeeeeeeeeU } },
+	},
+	{
+		"JMP_JLE_X: ldimm64 test 2",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLE, R2, R1, 0),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffffU } },
+	},
+	{
+		"JMP_JLE_X: ldimm64 test 3",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JLE, R2, R1, 4),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
+			BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 	/* BPF_JMP | BPF_JNE | BPF_X */
 	{
 		"JMP_JNE_X: if (3 != 2) return 1",
diff --git a/lib/test_debug_virtual.c b/lib/test_debug_virtual.c
new file mode 100644
index 0000000..b9cdeec
--- /dev/null
+++ b/lib/test_debug_virtual.c
@@ -0,0 +1,49 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/sizes.h>
+
+#include <asm/page.h>
+#ifdef CONFIG_MIPS
+#include <asm/bootinfo.h>
+#endif
+
+struct foo {
+	unsigned int bar;
+};
+
+struct foo *foo;
+
+static int __init test_debug_virtual_init(void)
+{
+	phys_addr_t pa;
+	void *va;
+
+	va = (void *)VMALLOC_START;
+	pa = virt_to_phys(va);
+
+	pr_info("PA: %pa for VA: 0x%lx\n", &pa, (unsigned long)va);
+
+	foo = kzalloc(sizeof(*foo), GFP_KERNEL);
+	if (!foo)
+		return -ENOMEM;
+
+	pa = virt_to_phys(foo);
+	va = foo;
+	pr_info("PA: %pa for VA: 0x%lx\n", &pa, (unsigned long)va);
+
+	return 0;
+}
+module_init(test_debug_virtual_init);
+
+static void __exit test_debug_virtual_exit(void)
+{
+	kfree(foo);
+}
+module_exit(test_debug_virtual_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Test module for CONFIG_DEBUG_VIRTUAL");
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index 09371b0..64a4c76 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -19,10 +19,85 @@
 #include <linux/miscdevice.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+
+#define TEST_FIRMWARE_NAME	"test-firmware.bin"
+#define TEST_FIRMWARE_NUM_REQS	4
 
 static DEFINE_MUTEX(test_fw_mutex);
 static const struct firmware *test_firmware;
 
+struct test_batched_req {
+	u8 idx;
+	int rc;
+	bool sent;
+	const struct firmware *fw;
+	const char *name;
+	struct completion completion;
+	struct task_struct *task;
+	struct device *dev;
+};
+
+/**
+ * test_config - represents configuration for the test for different triggers
+ *
+ * @name: the name of the firmware file to look for
+ * @sync_direct: when the sync trigger is used if this is true
+ *	request_firmware_direct() will be used instead.
+ * @send_uevent: whether or not to send a uevent for async requests
+ * @num_requests: number of requests to try per test case. This is trigger
+ *	specific.
+ * @reqs: stores all requests information
+ * @read_fw_idx: index of thread from which we want to read firmware results
+ *	from through the read_fw trigger.
+ * @test_result: a test may use this to collect the result from the call
+ *	of the request_firmware*() calls used in their tests. In order of
+ *	priority we always keep first any setup error. If no setup errors were
+ *	found then we move on to the first error encountered while running the
+ *	API. Note that for async calls this typically will be a successful
+ *	result (0) unless of course you've used bogus parameters, or the system
+ *	is out of memory.  In the async case the callback is expected to do a
+ *	bit more homework to figure out what happened, unfortunately the only
+ *	information passed today on error is the fact that no firmware was
+ *	found so we can only assume -ENOENT on async calls if the firmware is
+ *	NULL.
+ *
+ *	Errors you can expect:
+ *
+ *	API specific:
+ *
+ *	0:		success for sync, for async it means request was sent
+ *	-EINVAL:	invalid parameters or request
+ *	-ENOENT:	files not found
+ *
+ *	System environment:
+ *
+ *	-ENOMEM:	memory pressure on system
+ *	-ENODEV:	out of number of devices to test
+ *	-EINVAL:	an unexpected error has occurred
+ * @req_firmware: if @sync_direct is true this is set to
+ *	request_firmware_direct(), otherwise request_firmware()
+ */
+struct test_config {
+	char *name;
+	bool sync_direct;
+	bool send_uevent;
+	u8 num_requests;
+	u8 read_fw_idx;
+
+	/*
+	 * These below don't belong her but we'll move them once we create
+	 * a struct fw_test_device and stuff the misc_dev under there later.
+	 */
+	struct test_batched_req *reqs;
+	int test_result;
+	int (*req_firmware)(const struct firmware **fw, const char *name,
+			    struct device *device);
+};
+
+struct test_config *test_fw_config;
+
 static ssize_t test_fw_misc_read(struct file *f, char __user *buf,
 				 size_t size, loff_t *offset)
 {
@@ -42,6 +117,338 @@ static const struct file_operations test_fw_fops = {
 	.read           = test_fw_misc_read,
 };
 
+static void __test_release_all_firmware(void)
+{
+	struct test_batched_req *req;
+	u8 i;
+
+	if (!test_fw_config->reqs)
+		return;
+
+	for (i = 0; i < test_fw_config->num_requests; i++) {
+		req = &test_fw_config->reqs[i];
+		if (req->fw)
+			release_firmware(req->fw);
+	}
+
+	vfree(test_fw_config->reqs);
+	test_fw_config->reqs = NULL;
+}
+
+static void test_release_all_firmware(void)
+{
+	mutex_lock(&test_fw_mutex);
+	__test_release_all_firmware();
+	mutex_unlock(&test_fw_mutex);
+}
+
+
+static void __test_firmware_config_free(void)
+{
+	__test_release_all_firmware();
+	kfree_const(test_fw_config->name);
+	test_fw_config->name = NULL;
+}
+
+/*
+ * XXX: move to kstrncpy() once merged.
+ *
+ * Users should use kfree_const() when freeing these.
+ */
+static int __kstrncpy(char **dst, const char *name, size_t count, gfp_t gfp)
+{
+	*dst = kstrndup(name, count, gfp);
+	if (!*dst)
+		return -ENOSPC;
+	return count;
+}
+
+static int __test_firmware_config_init(void)
+{
+	int ret;
+
+	ret = __kstrncpy(&test_fw_config->name, TEST_FIRMWARE_NAME,
+			 strlen(TEST_FIRMWARE_NAME), GFP_KERNEL);
+	if (ret < 0)
+		goto out;
+
+	test_fw_config->num_requests = TEST_FIRMWARE_NUM_REQS;
+	test_fw_config->send_uevent = true;
+	test_fw_config->sync_direct = false;
+	test_fw_config->req_firmware = request_firmware;
+	test_fw_config->test_result = 0;
+	test_fw_config->reqs = NULL;
+
+	return 0;
+
+out:
+	__test_firmware_config_free();
+	return ret;
+}
+
+static ssize_t reset_store(struct device *dev,
+			   struct device_attribute *attr,
+			   const char *buf, size_t count)
+{
+	int ret;
+
+	mutex_lock(&test_fw_mutex);
+
+	__test_firmware_config_free();
+
+	ret = __test_firmware_config_init();
+	if (ret < 0) {
+		ret = -ENOMEM;
+		pr_err("could not alloc settings for config trigger: %d\n",
+		       ret);
+		goto out;
+	}
+
+	pr_info("reset\n");
+	ret = count;
+
+out:
+	mutex_unlock(&test_fw_mutex);
+
+	return ret;
+}
+static DEVICE_ATTR_WO(reset);
+
+static ssize_t config_show(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	int len = 0;
+
+	mutex_lock(&test_fw_mutex);
+
+	len += snprintf(buf, PAGE_SIZE,
+			"Custom trigger configuration for: %s\n",
+			dev_name(dev));
+
+	if (test_fw_config->name)
+		len += snprintf(buf+len, PAGE_SIZE,
+				"name:\t%s\n",
+				test_fw_config->name);
+	else
+		len += snprintf(buf+len, PAGE_SIZE,
+				"name:\tEMTPY\n");
+
+	len += snprintf(buf+len, PAGE_SIZE,
+			"num_requests:\t%u\n", test_fw_config->num_requests);
+
+	len += snprintf(buf+len, PAGE_SIZE,
+			"send_uevent:\t\t%s\n",
+			test_fw_config->send_uevent ?
+			"FW_ACTION_HOTPLUG" :
+			"FW_ACTION_NOHOTPLUG");
+	len += snprintf(buf+len, PAGE_SIZE,
+			"sync_direct:\t\t%s\n",
+			test_fw_config->sync_direct ? "true" : "false");
+	len += snprintf(buf+len, PAGE_SIZE,
+			"read_fw_idx:\t%u\n", test_fw_config->read_fw_idx);
+
+	mutex_unlock(&test_fw_mutex);
+
+	return len;
+}
+static DEVICE_ATTR_RO(config);
+
+static ssize_t config_name_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	int ret;
+
+	mutex_lock(&test_fw_mutex);
+	kfree_const(test_fw_config->name);
+	ret = __kstrncpy(&test_fw_config->name, buf, count, GFP_KERNEL);
+	mutex_unlock(&test_fw_mutex);
+
+	return ret;
+}
+
+/*
+ * As per sysfs_kf_seq_show() the buf is max PAGE_SIZE.
+ */
+static ssize_t config_test_show_str(char *dst,
+				    char *src)
+{
+	int len;
+
+	mutex_lock(&test_fw_mutex);
+	len = snprintf(dst, PAGE_SIZE, "%s\n", src);
+	mutex_unlock(&test_fw_mutex);
+
+	return len;
+}
+
+static int test_dev_config_update_bool(const char *buf, size_t size,
+				       bool *cfg)
+{
+	int ret;
+
+	mutex_lock(&test_fw_mutex);
+	if (strtobool(buf, cfg) < 0)
+		ret = -EINVAL;
+	else
+		ret = size;
+	mutex_unlock(&test_fw_mutex);
+
+	return ret;
+}
+
+static ssize_t
+test_dev_config_show_bool(char *buf,
+			  bool config)
+{
+	bool val;
+
+	mutex_lock(&test_fw_mutex);
+	val = config;
+	mutex_unlock(&test_fw_mutex);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t test_dev_config_show_int(char *buf, int cfg)
+{
+	int val;
+
+	mutex_lock(&test_fw_mutex);
+	val = cfg;
+	mutex_unlock(&test_fw_mutex);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg)
+{
+	int ret;
+	long new;
+
+	ret = kstrtol(buf, 10, &new);
+	if (ret)
+		return ret;
+
+	if (new > U8_MAX)
+		return -EINVAL;
+
+	mutex_lock(&test_fw_mutex);
+	*(u8 *)cfg = new;
+	mutex_unlock(&test_fw_mutex);
+
+	/* Always return full write size even if we didn't consume all */
+	return size;
+}
+
+static ssize_t test_dev_config_show_u8(char *buf, u8 cfg)
+{
+	u8 val;
+
+	mutex_lock(&test_fw_mutex);
+	val = cfg;
+	mutex_unlock(&test_fw_mutex);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t config_name_show(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	return config_test_show_str(buf, test_fw_config->name);
+}
+static DEVICE_ATTR(config_name, 0644, config_name_show, config_name_store);
+
+static ssize_t config_num_requests_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	int rc;
+
+	mutex_lock(&test_fw_mutex);
+	if (test_fw_config->reqs) {
+		pr_err("Must call release_all_firmware prior to changing config\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	mutex_unlock(&test_fw_mutex);
+
+	rc = test_dev_config_update_u8(buf, count,
+				       &test_fw_config->num_requests);
+
+out:
+	return rc;
+}
+
+static ssize_t config_num_requests_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	return test_dev_config_show_u8(buf, test_fw_config->num_requests);
+}
+static DEVICE_ATTR(config_num_requests, 0644, config_num_requests_show,
+		   config_num_requests_store);
+
+static ssize_t config_sync_direct_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	int rc = test_dev_config_update_bool(buf, count,
+					     &test_fw_config->sync_direct);
+
+	if (rc == count)
+		test_fw_config->req_firmware = test_fw_config->sync_direct ?
+				       request_firmware_direct :
+				       request_firmware;
+	return rc;
+}
+
+static ssize_t config_sync_direct_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	return test_dev_config_show_bool(buf, test_fw_config->sync_direct);
+}
+static DEVICE_ATTR(config_sync_direct, 0644, config_sync_direct_show,
+		   config_sync_direct_store);
+
+static ssize_t config_send_uevent_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	return test_dev_config_update_bool(buf, count,
+					   &test_fw_config->send_uevent);
+}
+
+static ssize_t config_send_uevent_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	return test_dev_config_show_bool(buf, test_fw_config->send_uevent);
+}
+static DEVICE_ATTR(config_send_uevent, 0644, config_send_uevent_show,
+		   config_send_uevent_store);
+
+static ssize_t config_read_fw_idx_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	return test_dev_config_update_u8(buf, count,
+					 &test_fw_config->read_fw_idx);
+}
+
+static ssize_t config_read_fw_idx_show(struct device *dev,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	return test_dev_config_show_u8(buf, test_fw_config->read_fw_idx);
+}
+static DEVICE_ATTR(config_read_fw_idx, 0644, config_read_fw_idx_show,
+		   config_read_fw_idx_store);
+
+
 static ssize_t trigger_request_store(struct device *dev,
 				     struct device_attribute *attr,
 				     const char *buf, size_t count)
@@ -170,12 +577,301 @@ out:
 }
 static DEVICE_ATTR_WO(trigger_custom_fallback);
 
+static int test_fw_run_batch_request(void *data)
+{
+	struct test_batched_req *req = data;
+
+	if (!req) {
+		test_fw_config->test_result = -EINVAL;
+		return -EINVAL;
+	}
+
+	req->rc = test_fw_config->req_firmware(&req->fw, req->name, req->dev);
+	if (req->rc) {
+		pr_info("#%u: batched sync load failed: %d\n",
+			req->idx, req->rc);
+		if (!test_fw_config->test_result)
+			test_fw_config->test_result = req->rc;
+	} else if (req->fw) {
+		req->sent = true;
+		pr_info("#%u: batched sync loaded %zu\n",
+			req->idx, req->fw->size);
+	}
+	complete(&req->completion);
+
+	req->task = NULL;
+
+	return 0;
+}
+
+/*
+ * We use a kthread as otherwise the kernel serializes all our sync requests
+ * and we would not be able to mimic batched requests on a sync call. Batched
+ * requests on a sync call can for instance happen on a device driver when
+ * multiple cards are used and firmware loading happens outside of probe.
+ */
+static ssize_t trigger_batched_requests_store(struct device *dev,
+					      struct device_attribute *attr,
+					      const char *buf, size_t count)
+{
+	struct test_batched_req *req;
+	int rc;
+	u8 i;
+
+	mutex_lock(&test_fw_mutex);
+
+	test_fw_config->reqs = vzalloc(sizeof(struct test_batched_req) *
+				       test_fw_config->num_requests * 2);
+	if (!test_fw_config->reqs) {
+		rc = -ENOMEM;
+		goto out_unlock;
+	}
+
+	pr_info("batched sync firmware loading '%s' %u times\n",
+		test_fw_config->name, test_fw_config->num_requests);
+
+	for (i = 0; i < test_fw_config->num_requests; i++) {
+		req = &test_fw_config->reqs[i];
+		if (!req) {
+			WARN_ON(1);
+			rc = -ENOMEM;
+			goto out_bail;
+		}
+		req->fw = NULL;
+		req->idx = i;
+		req->name = test_fw_config->name;
+		req->dev = dev;
+		init_completion(&req->completion);
+		req->task = kthread_run(test_fw_run_batch_request, req,
+					     "%s-%u", KBUILD_MODNAME, req->idx);
+		if (!req->task || IS_ERR(req->task)) {
+			pr_err("Setting up thread %u failed\n", req->idx);
+			req->task = NULL;
+			rc = -ENOMEM;
+			goto out_bail;
+		}
+	}
+
+	rc = count;
+
+	/*
+	 * We require an explicit release to enable more time and delay of
+	 * calling release_firmware() to improve our chances of forcing a
+	 * batched request. If we instead called release_firmware() right away
+	 * then we might miss on an opportunity of having a successful firmware
+	 * request pass on the opportunity to be come a batched request.
+	 */
+
+out_bail:
+	for (i = 0; i < test_fw_config->num_requests; i++) {
+		req = &test_fw_config->reqs[i];
+		if (req->task || req->sent)
+			wait_for_completion(&req->completion);
+	}
+
+	/* Override any worker error if we had a general setup error */
+	if (rc < 0)
+		test_fw_config->test_result = rc;
+
+out_unlock:
+	mutex_unlock(&test_fw_mutex);
+
+	return rc;
+}
+static DEVICE_ATTR_WO(trigger_batched_requests);
+
+/*
+ * We wait for each callback to return with the lock held, no need to lock here
+ */
+static void trigger_batched_cb(const struct firmware *fw, void *context)
+{
+	struct test_batched_req *req = context;
+
+	if (!req) {
+		test_fw_config->test_result = -EINVAL;
+		return;
+	}
+
+	/* forces *some* batched requests to queue up */
+	if (!req->idx)
+		ssleep(2);
+
+	req->fw = fw;
+
+	/*
+	 * Unfortunately the firmware API gives us nothing other than a null FW
+	 * if the firmware was not found on async requests.  Best we can do is
+	 * just assume -ENOENT. A better API would pass the actual return
+	 * value to the callback.
+	 */
+	if (!fw && !test_fw_config->test_result)
+		test_fw_config->test_result = -ENOENT;
+
+	complete(&req->completion);
+}
+
+static
+ssize_t trigger_batched_requests_async_store(struct device *dev,
+					     struct device_attribute *attr,
+					     const char *buf, size_t count)
+{
+	struct test_batched_req *req;
+	bool send_uevent;
+	int rc;
+	u8 i;
+
+	mutex_lock(&test_fw_mutex);
+
+	test_fw_config->reqs = vzalloc(sizeof(struct test_batched_req) *
+				       test_fw_config->num_requests * 2);
+	if (!test_fw_config->reqs) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	pr_info("batched loading '%s' custom fallback mechanism %u times\n",
+		test_fw_config->name, test_fw_config->num_requests);
+
+	send_uevent = test_fw_config->send_uevent ? FW_ACTION_HOTPLUG :
+		FW_ACTION_NOHOTPLUG;
+
+	for (i = 0; i < test_fw_config->num_requests; i++) {
+		req = &test_fw_config->reqs[i];
+		if (!req) {
+			WARN_ON(1);
+			goto out_bail;
+		}
+		req->name = test_fw_config->name;
+		req->fw = NULL;
+		req->idx = i;
+		init_completion(&req->completion);
+		rc = request_firmware_nowait(THIS_MODULE, send_uevent,
+					     req->name,
+					     dev, GFP_KERNEL, req,
+					     trigger_batched_cb);
+		if (rc) {
+			pr_info("#%u: batched async load failed setup: %d\n",
+				i, rc);
+			req->rc = rc;
+			goto out_bail;
+		} else
+			req->sent = true;
+	}
+
+	rc = count;
+
+out_bail:
+
+	/*
+	 * We require an explicit release to enable more time and delay of
+	 * calling release_firmware() to improve our chances of forcing a
+	 * batched request. If we instead called release_firmware() right away
+	 * then we might miss on an opportunity of having a successful firmware
+	 * request pass on the opportunity to be come a batched request.
+	 */
+
+	for (i = 0; i < test_fw_config->num_requests; i++) {
+		req = &test_fw_config->reqs[i];
+		if (req->sent)
+			wait_for_completion(&req->completion);
+	}
+
+	/* Override any worker error if we had a general setup error */
+	if (rc < 0)
+		test_fw_config->test_result = rc;
+
+out:
+	mutex_unlock(&test_fw_mutex);
+
+	return rc;
+}
+static DEVICE_ATTR_WO(trigger_batched_requests_async);
+
+static ssize_t test_result_show(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	return test_dev_config_show_int(buf, test_fw_config->test_result);
+}
+static DEVICE_ATTR_RO(test_result);
+
+static ssize_t release_all_firmware_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	test_release_all_firmware();
+	return count;
+}
+static DEVICE_ATTR_WO(release_all_firmware);
+
+static ssize_t read_firmware_show(struct device *dev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	struct test_batched_req *req;
+	u8 idx;
+	ssize_t rc = 0;
+
+	mutex_lock(&test_fw_mutex);
+
+	idx = test_fw_config->read_fw_idx;
+	if (idx >= test_fw_config->num_requests) {
+		rc = -ERANGE;
+		goto out;
+	}
+
+	if (!test_fw_config->reqs) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	req = &test_fw_config->reqs[idx];
+	if (!req->fw) {
+		pr_err("#%u: failed to async load firmware\n", idx);
+		rc = -ENOENT;
+		goto out;
+	}
+
+	pr_info("#%u: loaded %zu\n", idx, req->fw->size);
+
+	if (req->fw->size > PAGE_SIZE) {
+		pr_err("Testing interface must use PAGE_SIZE firmware for now\n");
+		rc = -EINVAL;
+	}
+	memcpy(buf, req->fw->data, req->fw->size);
+
+	rc = req->fw->size;
+out:
+	mutex_unlock(&test_fw_mutex);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(read_firmware);
+
 #define TEST_FW_DEV_ATTR(name)          &dev_attr_##name.attr
 
 static struct attribute *test_dev_attrs[] = {
+	TEST_FW_DEV_ATTR(reset),
+
+	TEST_FW_DEV_ATTR(config),
+	TEST_FW_DEV_ATTR(config_name),
+	TEST_FW_DEV_ATTR(config_num_requests),
+	TEST_FW_DEV_ATTR(config_sync_direct),
+	TEST_FW_DEV_ATTR(config_send_uevent),
+	TEST_FW_DEV_ATTR(config_read_fw_idx),
+
+	/* These don't use the config at all - they could be ported! */
 	TEST_FW_DEV_ATTR(trigger_request),
 	TEST_FW_DEV_ATTR(trigger_async_request),
 	TEST_FW_DEV_ATTR(trigger_custom_fallback),
+
+	/* These use the config and can use the test_result */
+	TEST_FW_DEV_ATTR(trigger_batched_requests),
+	TEST_FW_DEV_ATTR(trigger_batched_requests_async),
+
+	TEST_FW_DEV_ATTR(release_all_firmware),
+	TEST_FW_DEV_ATTR(test_result),
+	TEST_FW_DEV_ATTR(read_firmware),
 	NULL,
 };
 
@@ -192,8 +888,17 @@ static int __init test_firmware_init(void)
 {
 	int rc;
 
+	test_fw_config = kzalloc(sizeof(struct test_config), GFP_KERNEL);
+	if (!test_fw_config)
+		return -ENOMEM;
+
+	rc = __test_firmware_config_init();
+	if (rc)
+		return rc;
+
 	rc = misc_register(&test_fw_misc_device);
 	if (rc) {
+		kfree(test_fw_config);
 		pr_err("could not register misc device: %d\n", rc);
 		return rc;
 	}
@@ -207,8 +912,13 @@ module_init(test_firmware_init);
 
 static void __exit test_firmware_exit(void)
 {
+	mutex_lock(&test_fw_mutex);
 	release_firmware(test_firmware);
 	misc_deregister(&test_fw_misc_device);
+	__test_firmware_config_free();
+	kfree(test_fw_config);
+	mutex_unlock(&test_fw_mutex);
+
 	pr_warn("removed interface\n");
 }
 
diff --git a/lib/test_kmod.c b/lib/test_kmod.c
index ff91489..fba78d25 100644
--- a/lib/test_kmod.c
+++ b/lib/test_kmod.c
@@ -924,7 +924,7 @@ static int test_dev_config_update_uint_range(struct kmod_test_device *test_dev,
 	if (ret)
 		return ret;
 
-	if (new < min || new >  max || new > UINT_MAX)
+	if (new < min || new > max)
 		return -EINVAL;
 
 	mutex_lock(&test_dev->config_mutex);
@@ -946,7 +946,7 @@ static int test_dev_config_update_int(struct kmod_test_device *test_dev,
 	if (ret)
 		return ret;
 
-	if (new > INT_MAX || new < INT_MIN)
+	if (new < INT_MIN || new > INT_MAX)
 		return -EINVAL;
 
 	mutex_lock(&test_dev->config_mutex);