23 files changed, 955 insertions, 315 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 55d2cff..1b1c86c 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -4235,6 +4235,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 	pagecopy((void *)src, (void *)dst);
 }
 
+int unmapped_buf_allowed = 1;
+
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
index 0083f29..9d16509 100644
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -3312,6 +3312,8 @@ pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst)
 	mtx_unlock(&cmtx);
 }
 
+int unmapped_buf_allowed = 1;
+
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
diff --git a/sys/arm/arm/pmap.c b/sys/arm/arm/pmap.c
index c18783b..0875f83 100644
--- a/sys/arm/arm/pmap.c
+++ b/sys/arm/arm/pmap.c
@@ -4428,6 +4428,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
 #endif
 }
 
+int unmapped_buf_allowed = 1;
+
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
diff --git a/sys/geom/geom.h b/sys/geom/geom.h
index 351b05d..660bf6e 100644
--- a/sys/geom/geom.h
+++ b/sys/geom/geom.h
@@ -205,6 +205,7 @@ struct g_provider {
 	u_int			flags;
 #define G_PF_WITHER		0x2
 #define G_PF_ORPHAN		0x4
+#define	G_PF_ACCEPT_UNMAPPED	0x8
 
 	/* Two fields for the implementing class to use */
 	void			*private;
diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c
index c6a5da8..6ffc06e 100644
--- a/sys/geom/geom_io.c
+++ b/sys/geom/geom_io.c
@@ -1,6 +1,7 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
+ * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
@@ -8,6 +9,9 @@
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -44,6 +48,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/ktr.h>
 #include <sys/proc.h>
 #include <sys/stack.h>
+#include <sys/sysctl.h>
 
 #include <sys/errno.h>
 #include <geom/geom.h>
@@ -51,6 +56,13 @@ __FBSDID("$FreeBSD$");
 #include <sys/devicestat.h>
 
 #include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
 
 static struct g_bioq g_bio_run_down;
 static struct g_bioq g_bio_run_up;
@@ -180,12 +192,17 @@ g_clone_bio(struct bio *bp)
 		/*
 		 *  BIO_ORDERED flag may be used by disk drivers to enforce
 		 *  ordering restrictions, so this flag needs to be cloned.
+		 *  BIO_UNMAPPED should be inherited, to properly indicate
+		 *  which way the buffer is passed.
 		 *  Other bio flags are not suitable for cloning.
 		 */
-		bp2->bio_flags = bp->bio_flags & BIO_ORDERED;
+		bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED);
 		bp2->bio_length = bp->bio_length;
 		bp2->bio_offset = bp->bio_offset;
 		bp2->bio_data = bp->bio_data;
+		bp2->bio_ma = bp->bio_ma;
+		bp2->bio_ma_n = bp->bio_ma_n;
+		bp2->bio_ma_offset = bp->bio_ma_offset;
 		bp2->bio_attribute = bp->bio_attribute;
 		/* Inherit classification info from the parent */
 		bp2->bio_classifier1 = bp->bio_classifier1;
@@ -210,11 +227,15 @@ g_duplicate_bio(struct bio *bp)
 	struct bio *bp2;
 
 	bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
+	bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED;
 	bp2->bio_parent = bp;
 	bp2->bio_cmd = bp->bio_cmd;
 	bp2->bio_length = bp->bio_length;
 	bp2->bio_offset = bp->bio_offset;
 	bp2->bio_data = bp->bio_data;
+	bp2->bio_ma = bp->bio_ma;
+	bp2->bio_ma_n = bp->bio_ma_n;
+	bp2->bio_ma_offset = bp->bio_ma_offset;
 	bp2->bio_attribute = bp->bio_attribute;
 	bp->bio_children++;
 #ifdef KTR
@@ -575,6 +596,83 @@ g_io_deliver(struct bio *bp, int error)
 	return;
 }
 
+SYSCTL_DECL(_kern_geom);
+
+static long transient_maps;
+SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
+    &transient_maps, 0,
+    "Total count of the transient mapping requests");
+u_int transient_map_retries = 10;
+SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
+    &transient_map_retries, 0,
+    "Max count of retries used before giving up on creating transient map");
+int transient_map_hard_failures;
+SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
+    &transient_map_hard_failures, 0,
+    "Failures to establish the transient mapping due to retry attempts "
+    "exhausted");
+int transient_map_soft_failures;
+SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
+    &transient_map_soft_failures, 0,
+    "Count of retried failures to establish the transient mapping");
+int inflight_transient_maps;
+SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
+    &inflight_transient_maps, 0,
+    "Current count of the active transient maps");
+
+static int
+g_io_transient_map_bio(struct bio *bp)
+{
+	vm_offset_t addr;
+	long size;
+	u_int retried;
+	int rv;
+
+	size = round_page(bp->bio_ma_offset + bp->bio_length);
+	KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
+	addr = 0;
+	retried = 0;
+	atomic_add_long(&transient_maps, 1);
+retry:
+	vm_map_lock(bio_transient_map);
+	if (vm_map_findspace(bio_transient_map, vm_map_min(bio_transient_map),
+	    size, &addr)) {
+		vm_map_unlock(bio_transient_map);
+		if (transient_map_retries != 0 &&
+		    retried >= transient_map_retries) {
+			g_io_deliver(bp, EDEADLK/* XXXKIB */);
+			CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
+			    bp, bp->bio_to->name);
+			atomic_add_int(&transient_map_hard_failures, 1);
+			return (1);
+		} else {
+			/*
+			 * Naive attempt to quisce the I/O to get more
+			 * in-flight requests completed and defragment
+			 * the bio_transient_map.
+			 */
+			CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
+			    bp, bp->bio_to->name, retried);
+			pause("g_d_tra", hz / 10);
+			retried++;
+			atomic_add_int(&transient_map_soft_failures, 1);
+			goto retry;
+		}
+	}
+	rv = vm_map_insert(bio_transient_map, NULL, 0, addr, addr + size,
+	    VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+	KASSERT(rv == KERN_SUCCESS,
+	    ("vm_map_insert(bio_transient_map) rv %d %jx %lx",
+	    rv, (uintmax_t)addr, size));
+	vm_map_unlock(bio_transient_map);
+	atomic_add_int(&inflight_transient_maps, 1);
+	pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
+	bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
+	bp->bio_flags |= BIO_TRANSIENT_MAPPING;
+	bp->bio_flags &= ~BIO_UNMAPPED;
+	return (0);
+}
+
 void
 g_io_schedule_down(struct thread *tp __unused)
 {
@@ -636,6 +734,12 @@ g_io_schedule_down(struct thread *tp __unused)
 		default:
 			break;
 		}
+		if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
+		    (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
+		    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
+			if (g_io_transient_map_bio(bp))
+				continue;
+		}
 		THREAD_NO_SLEEPING();
 		CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
 		    "len %ld", bp, bp->bio_to->name, bp->bio_offset,
diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c
index bbed550..92f1ad2 100644
--- a/sys/geom/geom_vfs.c
+++ b/sys/geom/geom_vfs.c
@@ -188,14 +188,14 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp)
 	bip = g_alloc_bio();
 	bip->bio_cmd = bp->b_iocmd;
 	bip->bio_offset = bp->b_iooffset;
-	bip->bio_data = bp->b_data;
-	bip->bio_done = g_vfs_done;
-	bip->bio_caller2 = bp;
 	bip->bio_length = bp->b_bcount;
-	if (bp->b_flags & B_BARRIER) {
+	bdata2bio(bp, bip);
+	if ((bp->b_flags & B_BARRIER) != 0) {
 		bip->bio_flags |= BIO_ORDERED;
 		bp->b_flags &= ~B_BARRIER;
 	}
+	bip->bio_done = g_vfs_done;
+	bip->bio_caller2 = bp;
 	g_io_request(bip, cp);
 }
 
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 6499986..f4681bf 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -4205,6 +4205,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
 	mtx_unlock(&sysmaps->lock);
 }
 
+int unmapped_buf_allowed = 1;
+
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c
index 0f7a80f..a9bc124 100644
--- a/sys/i386/xen/pmap.c
+++ b/sys/i386/xen/pmap.c
@@ -3448,6 +3448,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
 	mtx_unlock(&sysmaps->lock);
 }
 
+int unmapped_buf_allowed = 1;
+
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
diff --git a/sys/ia64/ia64/pmap.c b/sys/ia64/ia64/pmap.c
index 3256600..883f39c 100644
--- a/sys/ia64/ia64/pmap.c
+++ b/sys/ia64/ia64/pmap.c
@@ -2014,6 +2014,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 	bcopy(src, dst, PAGE_SIZE);
 }
 
+int unmapped_buf_allowed;
+
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c
index 773d01a..4d344b4 100644
--- a/sys/kern/subr_bus_dma.c
+++ b/sys/kern/subr_bus_dma.c
@@ -126,11 +126,28 @@ static int
 _bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
     int *nsegs, int flags)
 {
-	int error;
+	vm_paddr_t paddr;
+	bus_size_t len, tlen;
+	int error, i, ma_offs;
 
-	error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
-	    bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
+	if ((bio->bio_flags & BIO_UNMAPPED) == 0) {
+		error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
+		    bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
+		return (error);
+	}
 
+	error = 0;
+	tlen = bio->bio_bcount;
+	ma_offs = bio->bio_ma_offset;
+	for (i = 0; tlen > 0; i++, tlen -= len) {
+		len = min(PAGE_SIZE - ma_offs, tlen);
+		paddr = VM_PAGE_TO_PHYS(bio->bio_ma[i]) + ma_offs;
+		error = _bus_dmamap_load_phys(dmat, map, paddr, len,
+		    flags, NULL, nsegs);
+		if (error != 0)
+			break;
+		ma_offs = 0;
+	}
 	return (error);
 }
 
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
index 825a3a0..a2e822c 100644
--- a/sys/kern/subr_param.c
+++ b/sys/kern/subr_param.c
@@ -92,6 +92,7 @@ int	maxfiles;			/* sys. wide open files limit */
 int	maxfilesperproc;		/* per-proc open files limit */
 int	msgbufsize;			/* size of kernel message buffer */
 int	nbuf;
+int	bio_transient_maxcnt;
 int	ngroups_max;			/* max # groups per process */
 int	nswbuf;
 pid_t	pid_max = PID_MAX;
@@ -118,6 +119,9 @@ SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN, &maxswzone, 0,
     "Maximum memory for swap metadata");
 SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN, &maxbcache, 0,
     "Maximum value of vfs.maxbufspace");
+SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN,
+    &bio_transient_maxcnt, 0,
+    "Maximum number of transient BIOs mappings");
 SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxtsiz, 0,
     "Maximum text size");
 SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RW | CTLFLAG_TUN, &dfldsiz, 0,
@@ -266,6 +270,8 @@ init_param1(void)
 		pid_max = PID_MAX;
 	else if (pid_max < 300)
 		pid_max = 300;
+
+	TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed);
 }
 
 /*
@@ -322,6 +328,7 @@ init_param2(long physpages)
 	 */
 	nbuf = NBUF;
 	TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
+	TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt);
 
 	/*
 	 * The default for maxpipekva is min(1/64 of the kernel address space,
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index d20c829..cded596 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,8 +1,12 @@
 /*-
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
+ * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -92,6 +96,7 @@ struct	buf_ops buf_ops_bio = {
  * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
  */
 struct buf *buf;		/* buffer header pool */
+caddr_t unmapped_buf;
 
 static struct proc *bufdaemonproc;
 
@@ -132,6 +137,10 @@ SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
     "Virtual memory used for buffers");
 #endif
+static long unmapped_bufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
+    &unmapped_bufspace, 0,
+    "Amount of unmapped buffers, inclusive in the bufspace");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
     "Maximum allowed value of bufspace (including buf_daemon)");
@@ -201,6 +210,10 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
     "Number of times getnewbuf has had to restart a buffer aquisition");
+static int mappingrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+    "Number of times getblk has had to restart a buffer mapping for "
+    "unmapped buffer");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
@@ -210,6 +223,9 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
     "Number of barrier writes");
+SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
+    &unmapped_buf_allowed, 0,
+    "Permit the use of the unmapped i/o");
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
@@ -281,6 +297,9 @@ static struct mtx nblock;
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+#ifdef INVARIANTS
+static int bq_len[BUFFER_QUEUES];
+#endif
 
 /* Lock for the bufqueues */
 static struct mtx bqlock;
@@ -511,7 +530,7 @@ caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int tuned_nbuf;
-	long maxbuf;
+	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
@@ -555,6 +574,52 @@ kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 	}
 
 	/*
+	 * Ideal allocation size for the transient bio submap if 10%
+	 * of the maximal space buffer map.  This roughly corresponds
+	 * to the amount of the buffer mapped for typical UFS load.
+	 *
+	 * Clip the buffer map to reserve space for the transient
+	 * BIOs, if its extent is bigger than 90% of the maximum
+	 * buffer map extent on the platform.
+	 *
+	 * The fall-back to the maxbuf in case of maxbcache unset,
+	 * allows to not trim the buffer KVA for the architectures
+	 * with ample KVA space.
+	 */
+	if (bio_transient_maxcnt == 0) {
+		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
+		buf_sz = (long)nbuf * BKVASIZE;
+		if (buf_sz < maxbuf_sz / 10 * 9) {
+			/*
+			 * There is more KVA than memory.  Do not
+			 * adjust buffer map size, and assign the rest
+			 * of maxbuf to transient map.
+			 */
+			biotmap_sz = maxbuf_sz - buf_sz;
+		} else {
+			/*
+			 * Buffer map spans all KVA we could afford on
+			 * this platform.  Give 10% of the buffer map
+			 * to the transient bio map.
+			 */
+ 			biotmap_sz = buf_sz / 10;
+			buf_sz -= biotmap_sz;
+		}
+		if (biotmap_sz / INT_MAX > MAXPHYS)
+			bio_transient_maxcnt = INT_MAX;
+		else
+			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+		/*
+		 * Artifically limit to 1024 simultaneous in-flight I/Os
+		 * using the transient mapping.
+		 */
+		if (bio_transient_maxcnt > 1024)
+			bio_transient_maxcnt = 1024;
+		if (tuned_nbuf)
+			nbuf = buf_sz / BKVASIZE;
+	}
+
+	/*
 	 * swbufs are used as temporary holders for I/O, such as paging I/O.
 	 * We have no less then 16 and no more then 256.
 	 */
@@ -607,6 +672,9 @@ bufinit(void)
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef INVARIANTS
+		bq_len[QUEUE_EMPTY]++;
+#endif
 	}
 
 	/*
@@ -675,6 +743,55 @@ bufinit(void)
 
 	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+	unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS);
+}
+
+#ifdef INVARIANTS
+static inline void
+vfs_buf_check_mapped(struct buf *bp)
+{
+
+	KASSERT((bp->b_flags & B_UNMAPPED) == 0,
+	    ("mapped buf %p %x", bp, bp->b_flags));
+	KASSERT(bp->b_kvabase != unmapped_buf,
+	    ("mapped buf: b_kvabase was not updated %p", bp));
+	KASSERT(bp->b_data != unmapped_buf,
+	    ("mapped buf: b_data was not updated %p", bp));
+}
+
+static inline void
+vfs_buf_check_unmapped(struct buf *bp)
+{
+
+	KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
+	    ("unmapped buf %p %x", bp, bp->b_flags));
+	KASSERT(bp->b_kvabase == unmapped_buf,
+	    ("unmapped buf: corrupted b_kvabase %p", bp));
+	KASSERT(bp->b_data == unmapped_buf,
+	    ("unmapped buf: corrupted b_data %p", bp));
+}
+
+#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
+#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
+#else
+#define	BUF_CHECK_MAPPED(bp) do {} while (0)
+#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
+#endif
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+	BUF_CHECK_MAPPED(bp);
+
+	/*
+	 * bp->b_data is relative to bp->b_offset, but
+	 * bp->b_offset may be offset into the first page.
+	 */
+	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
 /*
@@ -686,14 +803,26 @@ static void
 bfreekva(struct buf *bp)
 {
 
-	if (bp->b_kvasize) {
-		atomic_add_int(&buffreekvacnt, 1);
-		atomic_subtract_long(&bufspace, bp->b_kvasize);
-		vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
-		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
-		bp->b_kvasize = 0;
-		bufspacewakeup();
+	if (bp->b_kvasize == 0)
+		return;
+
+	atomic_add_int(&buffreekvacnt, 1);
+	atomic_subtract_long(&bufspace, bp->b_kvasize);
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		BUF_CHECK_MAPPED(bp);
+		vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvabase,
+		    (vm_offset_t)bp->b_kvabase + bp->b_kvasize);
+	} else {
+		BUF_CHECK_UNMAPPED(bp);
+		if ((bp->b_flags & B_KVAALLOC) != 0) {
+			vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvaalloc,
+			    (vm_offset_t)bp->b_kvaalloc + bp->b_kvasize);
+		}
+		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+		bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
 	}
+	bp->b_kvasize = 0;
+	bufspacewakeup();
 }
 
 /*
@@ -760,6 +889,11 @@ bremfreel(struct buf *bp)
 	mtx_assert(&bqlock, MA_OWNED);
 
 	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
+	    bp->b_qindex));
+	bq_len[bp->b_qindex]--;
+#endif
 	bp->b_qindex = QUEUE_NONE;
 	/*
 	 * If this was a delayed bremfree() we only need to remove the buffer
@@ -1414,7 +1548,8 @@ brelse(struct buf *bp)
 					}
 				}
 
-				if ((bp->b_flags & B_INVAL) == 0) {
+				if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
+					BUF_CHECK_MAPPED(bp);
 					pmap_qenter(
 					    trunc_page((vm_offset_t)bp->b_data),
 					    bp->b_pages, bp->b_npages);
@@ -1509,11 +1644,17 @@ brelse(struct buf *bp)
 			bp->b_qindex = QUEUE_DIRTY;
 		else
 			bp->b_qindex = QUEUE_CLEAN;
-		if (bp->b_flags & B_AGE)
-			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
-		else
-			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+		if (bp->b_flags & B_AGE) {
+			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp,
+			    b_freelist);
+		} else {
+			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp,
+			    b_freelist);
+		}
 	}
+#ifdef INVARIANTS
+	bq_len[bp->b_qindex]++;
+#endif
 	mtx_unlock(&bqlock);
 
 	/*
@@ -1604,6 +1745,9 @@ bqrelse(struct buf *bp)
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_qindex = QUEUE_DIRTY;
 		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+		bq_len[bp->b_qindex]++;
+#endif
 	} else {
 		/*
 		 * The locking of the BO_LOCK for checking of the
@@ -1616,6 +1760,9 @@ bqrelse(struct buf *bp)
 			bp->b_qindex = QUEUE_CLEAN;
 			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
 			    b_freelist);
+#ifdef INVARIANTS
+			bq_len[QUEUE_CLEAN]++;
+#endif
 		} else {
 			/*
 			 * We are too low on memory, we have to try to free
@@ -1657,7 +1804,11 @@ vfs_vmio_release(struct buf *bp)
 	int i;
 	vm_page_t m;
 
-	pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		BUF_CHECK_MAPPED(bp);
+		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+	} else
+		BUF_CHECK_UNMAPPED(bp);
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
@@ -1761,8 +1912,10 @@ vfs_bio_awrite(struct buf *bp)
 	int nwritten;
 	int size;
 	int maxcl;
+	int gbflags;
 
 	bo = &vp->v_bufobj;
+	gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
@@ -1794,7 +1947,7 @@ vfs_bio_awrite(struct buf *bp)
 		if (ncl != 1) {
 			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
-			    0);
+			    gbflags);
 			return (nwritten);
 		}
 	}
@@ -1811,46 +1964,207 @@ vfs_bio_awrite(struct buf *bp)
 	return (nwritten);
 }
 
+static void
+setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
+{
+
+	KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+	    bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
+	if ((gbflags & GB_UNMAPPED) == 0) {
+		bp->b_kvabase = (caddr_t)addr;
+	} else if ((gbflags & GB_KVAALLOC) != 0) {
+		KASSERT((gbflags & GB_UNMAPPED) != 0,
+		    ("GB_KVAALLOC without GB_UNMAPPED"));
+		bp->b_kvaalloc = (caddr_t)addr;
+		bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+	}
+	bp->b_kvasize = maxsize;
+}
+
 /*
- *	getnewbuf:
- *
- *	Find and initialize a new buffer header, freeing up existing buffers 
- *	in the bufqueues as necessary.  The new buffer is returned locked.
- *
- *	Important:  B_INVAL is not set.  If the caller wishes to throw the
- *	buffer away, the caller must set B_INVAL prior to calling brelse().
- *
- *	We block if:
- *		We have insufficient buffer headers
- *		We have insufficient buffer space
- *		buffer_map is too fragmented ( space reservation fails )
- *		If we have to flush dirty buffers ( but we try to avoid this )
- *
- *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- *	Instead we ask the buf daemon to do it for us.  We attempt to
- *	avoid piecemeal wakeups of the pageout daemon.
+ * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
+ * needed.
  */
+static int
+allocbufkva(struct buf *bp, int maxsize, int gbflags)
+{
+	vm_offset_t addr;
+	int rv;
 
-static struct buf *
-getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
-    int gbflags)
+	bfreekva(bp);
+	addr = 0;
+
+	vm_map_lock(buffer_map);
+	if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize,
+	    &addr)) {
+		vm_map_unlock(buffer_map);
+		/*
+		 * Buffer map is too fragmented.  Request the caller
+		 * to defragment the map.
+		 */
+		atomic_add_int(&bufdefragcnt, 1);
+		return (1);
+	}
+	rv = vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize,
+	    VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+	KASSERT(rv == KERN_SUCCESS, ("vm_map_insert(buffer_map) rv %d", rv));
+	vm_map_unlock(buffer_map);
+	setbufkva(bp, addr, maxsize, gbflags);
+	atomic_add_long(&bufspace, bp->b_kvasize);
+	return (0);
+}
+
+/*
+ * Ask the bufdaemon for help, or act as bufdaemon itself, when a
+ * locked vnode is supplied.
+ */
+static void
+getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
+    int defrag)
 {
 	struct thread *td;
-	struct buf *bp;
-	struct buf *nbp;
-	int defrag = 0;
-	int nqindex;
-	static int flushingbufs;
+	char *waitmsg;
+	int fl, flags, norunbuf;
+
+	mtx_assert(&bqlock, MA_OWNED);
+
+	if (defrag) {
+		flags = VFS_BIO_NEED_BUFSPACE;
+		waitmsg = "nbufkv";
+	} else if (bufspace >= hibufspace) {
+		waitmsg = "nbufbs";
+		flags = VFS_BIO_NEED_BUFSPACE;
+	} else {
+		waitmsg = "newbuf";
+		flags = VFS_BIO_NEED_ANY;
+	}
+	mtx_lock(&nblock);
+	needsbuffer |= flags;
+	mtx_unlock(&nblock);
+	mtx_unlock(&bqlock);
+
+	bd_speedup();	/* heeeelp */
+	if ((gbflags & GB_NOWAIT_BD) != 0)
+		return;
 
 	td = curthread;
+	mtx_lock(&nblock);
+	while (needsbuffer & flags) {
+		if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+			mtx_unlock(&nblock);
+			/*
+			 * getblk() is called with a vnode locked, and
+			 * some majority of the dirty buffers may as
+			 * well belong to the vnode.  Flushing the
+			 * buffers there would make a progress that
+			 * cannot be achieved by the buf_daemon, that
+			 * cannot lock the vnode.
+			 */
+			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+			    (td->td_pflags & TDP_NORUNNINGBUF);
+			/* play bufdaemon */
+			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+			fl = buf_do_flush(vp);
+			td->td_pflags &= norunbuf;
+			mtx_lock(&nblock);
+			if (fl != 0)
+				continue;
+			if ((needsbuffer & flags) == 0)
+				break;
+		}
+		if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
+		    waitmsg, slptimeo))
+			break;
+	}
+	mtx_unlock(&nblock);
+}
+
+static void
+getnewbuf_reuse_bp(struct buf *bp, int qindex)
+{
+
+	CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
+	    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
+	     bp->b_kvasize, bp->b_bufsize, qindex);
+	mtx_assert(&bqlock, MA_NOTOWNED);
+
 	/*
-	 * We can't afford to block since we might be holding a vnode lock,
-	 * which may prevent system daemons from running.  We deal with
-	 * low-memory situations by proactively returning memory and running
-	 * async I/O rather then sync I/O.
+	 * Note: we no longer distinguish between VMIO and non-VMIO
+	 * buffers.
 	 */
-	atomic_add_int(&getnewbufcalls, 1);
-	atomic_subtract_int(&getnewbufrestarts, 1);
+	KASSERT((bp->b_flags & B_DELWRI) == 0,
+	    ("delwri buffer %p found in queue %d", bp, qindex));
+
+	if (qindex == QUEUE_CLEAN) {
+		if (bp->b_flags & B_VMIO) {
+			bp->b_flags &= ~B_ASYNC;
+			vfs_vmio_release(bp);
+		}
+		if (bp->b_vp != NULL)
+			brelvp(bp);
+	}
+
+	/*
+	 * Get the rest of the buffer freed up.  b_kva* is still valid
+	 * after this operation.
+	 */
+
+	if (bp->b_rcred != NOCRED) {
+		crfree(bp->b_rcred);
+		bp->b_rcred = NOCRED;
+	}
+	if (bp->b_wcred != NOCRED) {
+		crfree(bp->b_wcred);
+		bp->b_wcred = NOCRED;
+	}
+	if (!LIST_EMPTY(&bp->b_dep))
+		buf_deallocate(bp);
+	if (bp->b_vflags & BV_BKGRDINPROG)
+		panic("losing buffer 3");
+	KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
+	    bp, bp->b_vp, qindex));
+	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+
+	if (bp->b_bufsize)
+		allocbuf(bp, 0);
+
+	bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
+	bp->b_ioflags = 0;
+	bp->b_xflags = 0;
+	KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
+	    ("buf %p still counted as free?", bp));
+	bp->b_vflags = 0;
+	bp->b_vp = NULL;
+	bp->b_blkno = bp->b_lblkno = 0;
+	bp->b_offset = NOOFFSET;
+	bp->b_iodone = 0;
+	bp->b_error = 0;
+	bp->b_resid = 0;
+	bp->b_bcount = 0;
+	bp->b_npages = 0;
+	bp->b_dirtyoff = bp->b_dirtyend = 0;
+	bp->b_bufobj = NULL;
+	bp->b_pin_count = 0;
+	bp->b_fsprivate1 = NULL;
+	bp->b_fsprivate2 = NULL;
+	bp->b_fsprivate3 = NULL;
+
+	LIST_INIT(&bp->b_dep);
+}
+
+static int flushingbufs;
+
+static struct buf *
+getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
+{
+	struct buf *bp, *nbp;
+	int nqindex, qindex, pass;
+
+	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
+
+	pass = 1;
 restart:
 	atomic_add_int(&getnewbufrestarts, 1);
 
@@ -1860,66 +2174,90 @@ restart:
 	 * that if we are specially marked process, we are allowed to
 	 * dip into our reserves.
 	 *
-	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
+	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+	 * for the allocation of the mapped buffer.  For unmapped, the
+	 * easiest is to start with EMPTY outright.
 	 *
 	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
 	 * However, there are a number of cases (defragging, reusing, ...)
 	 * where we cannot backup.
 	 */
+	nbp = NULL;
 	mtx_lock(&bqlock);
-	nqindex = QUEUE_EMPTYKVA;
-	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-
+	if (!defrag && unmapped) {
+		nqindex = QUEUE_EMPTY;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+	}
 	if (nbp == NULL) {
-		/*
-		 * If no EMPTYKVA buffers and we are either
-		 * defragging or reusing, locate a CLEAN buffer
-		 * to free or reuse.  If bufspace useage is low
-		 * skip this step so we can allocate a new buffer.
-		 */
-		if (defrag || bufspace >= lobufspace) {
-			nqindex = QUEUE_CLEAN;
-			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
-		}
+		nqindex = QUEUE_EMPTYKVA;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+	}
 
-		/*
-		 * If we could not find or were not allowed to reuse a
-		 * CLEAN buffer, check to see if it is ok to use an EMPTY
-		 * buffer.  We can only use an EMPTY buffer if allocating
-		 * its KVA would not otherwise run us out of buffer space.
-		 */
-		if (nbp == NULL && defrag == 0 &&
-		    bufspace + maxsize < hibufspace) {
-			nqindex = QUEUE_EMPTY;
-			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
-		}
+	/*
+	 * If no EMPTYKVA buffers and we are either defragging or
+	 * reusing, locate a CLEAN buffer to free or reuse.  If
+	 * bufspace useage is low skip this step so we can allocate a
+	 * new buffer.
+	 */
+	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
+		nqindex = QUEUE_CLEAN;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+	}
+
+	/*
+	 * If we could not find or were not allowed to reuse a CLEAN
+	 * buffer, check to see if it is ok to use an EMPTY buffer.
+	 * We can only use an EMPTY buffer if allocating its KVA would
+	 * not otherwise run us out of buffer space.  No KVA is needed
+	 * for the unmapped allocation.
+	 */
+	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
+	    metadata)) {
+		nqindex = QUEUE_EMPTY;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+	}
+
+	/*
+	 * All available buffers might be clean, retry ignoring the
+	 * lobufspace as the last resort.
+	 */
+	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
+		nqindex = QUEUE_CLEAN;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 	}
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
-
 	while ((bp = nbp) != NULL) {
-		int qindex = nqindex;
+		qindex = nqindex;
 
 		/*
-		 * Calculate next bp ( we can only use it if we do not block
-		 * or do other fancy things ).
+		 * Calculate next bp (we can only use it if we do not
+		 * block or do other fancy things).
 		 */
 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
-			switch(qindex) {
+			switch (qindex) {
 			case QUEUE_EMPTY:
 				nqindex = QUEUE_EMPTYKVA;
-				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
+				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+				if (nbp != NULL)
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_EMPTYKVA:
 				nqindex = QUEUE_CLEAN;
-				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
+				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+				if (nbp != NULL)
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_CLEAN:
+				if (metadata && pass == 1) {
+					pass = 2;
+					nqindex = QUEUE_EMPTY;
+					nbp = TAILQ_FIRST(
+					    &bufqueues[QUEUE_EMPTY]);
+				}
 				/*
 				 * nbp is NULL. 
 				 */
@@ -1952,22 +2290,9 @@ restart:
 			}
 			BO_UNLOCK(bp->b_bufobj);
 		}
-		CTR6(KTR_BUF,
-		    "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
-		    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
-		    bp->b_kvasize, bp->b_bufsize, qindex);
-
-		/*
-		 * Sanity Checks
-		 */
-		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
-
-		/*
-		 * Note: we no longer distinguish between VMIO and non-VMIO
-		 * buffers.
-		 */
 
-		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
+		KASSERT(bp->b_qindex == qindex,
+		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 
 		if (bp->b_bufobj != NULL)
 			BO_LOCK(bp->b_bufobj);
@@ -1975,68 +2300,13 @@ restart:
 		if (bp->b_bufobj != NULL)
 			BO_UNLOCK(bp->b_bufobj);
 		mtx_unlock(&bqlock);
-
-		if (qindex == QUEUE_CLEAN) {
-			if (bp->b_flags & B_VMIO) {
-				bp->b_flags &= ~B_ASYNC;
-				vfs_vmio_release(bp);
-			}
-			if (bp->b_vp)
-				brelvp(bp);
-		}
-
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
-		 *
-		 * Get the rest of the buffer freed up.  b_kva* is still
-		 * valid after this operation.
 		 */
 
-		if (bp->b_rcred != NOCRED) {
-			crfree(bp->b_rcred);
-			bp->b_rcred = NOCRED;
-		}
-		if (bp->b_wcred != NOCRED) {
-			crfree(bp->b_wcred);
-			bp->b_wcred = NOCRED;
-		}
-		if (!LIST_EMPTY(&bp->b_dep))
-			buf_deallocate(bp);
-		if (bp->b_vflags & BV_BKGRDINPROG)
-			panic("losing buffer 3");
-		KASSERT(bp->b_vp == NULL,
-		    ("bp: %p still has vnode %p.  qindex: %d",
-		    bp, bp->b_vp, qindex));
-		KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
-		   ("bp: %p still on a buffer list. xflags %X",
-		    bp, bp->b_xflags));
-
-		if (bp->b_bufsize)
-			allocbuf(bp, 0);
-
-		bp->b_flags = 0;
-		bp->b_ioflags = 0;
-		bp->b_xflags = 0;
-		KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
-		    ("buf %p still counted as free?", bp));
-		bp->b_vflags = 0;
-		bp->b_vp = NULL;
-		bp->b_blkno = bp->b_lblkno = 0;
-		bp->b_offset = NOOFFSET;
-		bp->b_iodone = 0;
-		bp->b_error = 0;
-		bp->b_resid = 0;
-		bp->b_bcount = 0;
-		bp->b_npages = 0;
-		bp->b_dirtyoff = bp->b_dirtyend = 0;
-		bp->b_bufobj = NULL;
-		bp->b_pin_count = 0;
-		bp->b_fsprivate1 = NULL;
-		bp->b_fsprivate2 = NULL;
-		bp->b_fsprivate3 = NULL;
-
-		LIST_INIT(&bp->b_dep);
+		getnewbuf_reuse_bp(bp, qindex);
+		mtx_assert(&bqlock, MA_NOTOWNED);
 
 		/*
 		 * If we are defragging then free the buffer.
@@ -2060,6 +2330,9 @@ restart:
 			goto restart;
 		}
 
+		if (metadata)
+			break;
+
 		/*
 		 * If we are overcomitted then recover the buffer and its
 		 * KVM space.  This occurs in rare situations when multiple
@@ -2077,6 +2350,59 @@ restart:
 			flushingbufs = 0;
 		break;
 	}
+	return (bp);
+}
+
+/*
+ *	getnewbuf:
+ *
+ *	Find and initialize a new buffer header, freeing up existing buffers
+ *	in the bufqueues as necessary.  The new buffer is returned locked.
+ *
+ *	Important:  B_INVAL is not set.  If the caller wishes to throw the
+ *	buffer away, the caller must set B_INVAL prior to calling brelse().
+ *
+ *	We block if:
+ *		We have insufficient buffer headers
+ *		We have insufficient buffer space
+ *		buffer_map is too fragmented ( space reservation fails )
+ *		If we have to flush dirty buffers ( but we try to avoid this )
+ *
+ *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
+ *	Instead we ask the buf daemon to do it for us.  We attempt to
+ *	avoid piecemeal wakeups of the pageout daemon.
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+    int gbflags)
+{
+	struct buf *bp;
+	int defrag, metadata;
+
+	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+	if (!unmapped_buf_allowed)
+		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+	defrag = 0;
+	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
+	    vp->v_type == VCHR)
+		metadata = 1;
+	else
+		metadata = 0;
+	/*
+	 * We can't afford to block since we might be holding a vnode lock,
+	 * which may prevent system daemons from running.  We deal with
+	 * low-memory situations by proactively returning memory and running
+	 * async I/O rather then sync I/O.
+	 */
+	atomic_add_int(&getnewbufcalls, 1);
+	atomic_subtract_int(&getnewbufrestarts, 1);
+restart:
+	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
+	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+	if (bp != NULL)
+		defrag = 0;
 
 	/*
 	 * If we exhausted our list, sleep as appropriate.  We may have to
@@ -2084,65 +2410,23 @@ restart:
 	 *
 	 * Generally we are sleeping due to insufficient buffer space.
 	 */
-
 	if (bp == NULL) {
-		int flags, norunbuf;
-		char *waitmsg;
-		int fl;
-
-		if (defrag) {
-			flags = VFS_BIO_NEED_BUFSPACE;
-			waitmsg = "nbufkv";
-		} else if (bufspace >= hibufspace) {
-			waitmsg = "nbufbs";
-			flags = VFS_BIO_NEED_BUFSPACE;
-		} else {
-			waitmsg = "newbuf";
-			flags = VFS_BIO_NEED_ANY;
-		}
-		mtx_lock(&nblock);
-		needsbuffer |= flags;
-		mtx_unlock(&nblock);
-		mtx_unlock(&bqlock);
-
-		bd_speedup();	/* heeeelp */
-		if (gbflags & GB_NOWAIT_BD)
-			return (NULL);
-
-		mtx_lock(&nblock);
-		while (needsbuffer & flags) {
-			if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
-				mtx_unlock(&nblock);
-				/*
-				 * getblk() is called with a vnode
-				 * locked, and some majority of the
-				 * dirty buffers may as well belong to
-				 * the vnode. Flushing the buffers
-				 * there would make a progress that
-				 * cannot be achieved by the
-				 * buf_daemon, that cannot lock the
-				 * vnode.
-				 */
-				norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
-				    (td->td_pflags & TDP_NORUNNINGBUF);
-				/* play bufdaemon */
-				td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
-				fl = buf_do_flush(vp);
-				td->td_pflags &= norunbuf;
-				mtx_lock(&nblock);
-				if (fl != 0)
-					continue;
-				if ((needsbuffer & flags) == 0)
-					break;
-			}
-			if (msleep(&needsbuffer, &nblock,
-			    (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
-				mtx_unlock(&nblock);
-				return (NULL);
-			}
-		}
-		mtx_unlock(&nblock);
+		mtx_assert(&bqlock, MA_OWNED);
+		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+		mtx_assert(&bqlock, MA_NOTOWNED);
+	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
+		mtx_assert(&bqlock, MA_NOTOWNED);
+
+		bfreekva(bp);
+		bp->b_flags |= B_UNMAPPED;
+		bp->b_kvabase = bp->b_data = unmapped_buf;
+		bp->b_kvasize = maxsize;
+		atomic_add_long(&bufspace, bp->b_kvasize);
+		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+		atomic_add_int(&bufreusecnt, 1);
 	} else {
+		mtx_assert(&bqlock, MA_NOTOWNED);
+
 		/*
 		 * We finally have a valid bp.  We aren't quite out of the
 		 * woods, we still have to reserve kva space.  In order
@@ -2151,39 +2435,47 @@ restart:
 		 */
 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
-		if (maxsize != bp->b_kvasize) {
-			vm_offset_t addr = 0;
-			int rv;
-
-			bfreekva(bp);
-
-			vm_map_lock(buffer_map);
-			if (vm_map_findspace(buffer_map,
-			    vm_map_min(buffer_map), maxsize, &addr)) {
-				/*
-				 * Buffer map is too fragmented.
-				 * We must defragment the map.
-				 */
-				atomic_add_int(&bufdefragcnt, 1);
-				vm_map_unlock(buffer_map);
+		if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
+		    B_KVAALLOC)) == B_UNMAPPED) {
+			if (allocbufkva(bp, maxsize, gbflags)) {
 				defrag = 1;
 				bp->b_flags |= B_INVAL;
 				brelse(bp);
 				goto restart;
 			}
-			rv = vm_map_insert(buffer_map, NULL, 0, addr,
-			    addr + maxsize, VM_PROT_ALL, VM_PROT_ALL,
-			    MAP_NOFAULT);
-			KASSERT(rv == KERN_SUCCESS,
-			    ("vm_map_insert(buffer_map) rv %d", rv));
-			vm_map_unlock(buffer_map);
-			bp->b_kvabase = (caddr_t)addr;
-			bp->b_kvasize = maxsize;
-			atomic_add_long(&bufspace, bp->b_kvasize);
+			atomic_add_int(&bufreusecnt, 1);
+		} else if ((bp->b_flags & B_KVAALLOC) != 0 &&
+		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
+			/*
+			 * If the reused buffer has KVA allocated,
+			 * reassign b_kvaalloc to b_kvabase.
+			 */
+			bp->b_kvabase = bp->b_kvaalloc;
+			bp->b_flags &= ~B_KVAALLOC;
+			atomic_subtract_long(&unmapped_bufspace,
+			    bp->b_kvasize);
+			atomic_add_int(&bufreusecnt, 1);
+		} else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
+		    GB_KVAALLOC)) {
+			/*
+			 * The case of reused buffer already have KVA
+			 * mapped, but the request is for unmapped
+			 * buffer with KVA allocated.
+			 */
+			bp->b_kvaalloc = bp->b_kvabase;
+			bp->b_data = bp->b_kvabase = unmapped_buf;
+			bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+			atomic_add_long(&unmapped_bufspace,
+			    bp->b_kvasize);
 			atomic_add_int(&bufreusecnt, 1);
 		}
-		bp->b_saveaddr = bp->b_kvabase;
-		bp->b_data = bp->b_saveaddr;
+		if ((gbflags & GB_UNMAPPED) == 0) {
+			bp->b_saveaddr = bp->b_kvabase;
+			bp->b_data = bp->b_saveaddr;
+			bp->b_flags &= ~B_UNMAPPED;
+			BUF_CHECK_MAPPED(bp);
+		}
 	}
 	return (bp);
 }
@@ -2594,6 +2886,90 @@ vfs_setdirty_locked_object(struct buf *bp)
 }
 
 /*
+ * Allocate the KVA mapping for an existing buffer. It handles the
+ * cases of both B_UNMAPPED buffer, and buffer with the preallocated
+ * KVA which is not mapped (B_KVAALLOC).
+ */
+static void
+bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
+{
+	struct buf *scratch_bp;
+	int bsize, maxsize, need_mapping, need_kva;
+	off_t offset;
+
+	need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
+	    (gbflags & GB_UNMAPPED) == 0;
+	need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
+	    (gbflags & GB_KVAALLOC) != 0;
+	if (!need_mapping && !need_kva)
+		return;
+
+	BUF_CHECK_UNMAPPED(bp);
+
+	if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
+		/*
+		 * Buffer is not mapped, but the KVA was already
+		 * reserved at the time of the instantiation.  Use the
+		 * allocated space.
+		 */
+		bp->b_flags &= ~B_KVAALLOC;
+		KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
+		bp->b_kvabase = bp->b_kvaalloc;
+		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+		goto has_addr;
+	}
+
+	/*
+	 * Calculate the amount of the address space we would reserve
+	 * if the buffer was mapped.
+	 */
+	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
+	offset = blkno * bsize;
+	maxsize = size + (offset & PAGE_MASK);
+	maxsize = imax(maxsize, bsize);
+
+mapping_loop:
+	if (allocbufkva(bp, maxsize, gbflags)) {
+		/*
+		 * Request defragmentation. getnewbuf() returns us the
+		 * allocated space by the scratch buffer KVA.
+		 */
+		scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
+		    (GB_UNMAPPED | GB_KVAALLOC));
+		if (scratch_bp == NULL) {
+			if ((gbflags & GB_NOWAIT_BD) != 0) {
+				/*
+				 * XXXKIB: defragmentation cannot
+				 * succeed, not sure what else to do.
+				 */
+				panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
+			}
+			atomic_add_int(&mappingrestarts, 1);
+			goto mapping_loop;
+		}
+		KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
+		    ("scratch bp !B_KVAALLOC %p", scratch_bp));
+		setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
+		    scratch_bp->b_kvasize, gbflags);
+
+		/* Get rid of the scratch buffer. */
+		scratch_bp->b_kvasize = 0;
+		scratch_bp->b_flags |= B_INVAL;
+		scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+		brelse(scratch_bp);
+	}
+	if (!need_mapping)
+		return;
+
+has_addr:
+	bp->b_saveaddr = bp->b_kvabase;
+	bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
+	bp->b_flags &= ~B_UNMAPPED;
+	BUF_CHECK_MAPPED(bp);
+	bpmap_qenter(bp);
+}
+
+/*
  *	getblk:
  *
  *	Get a block given a specified block and offset into a file/device.
@@ -2635,12 +3011,17 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
 {
 	struct buf *bp;
 	struct bufobj *bo;
-	int error;
+	int bsize, error, maxsize, vmio;
+	off_t offset;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
+	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	ASSERT_VOP_LOCKED(vp, "getblk");
 	if (size > MAXBSIZE)
 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+	if (!unmapped_buf_allowed)
+		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
 loop:
@@ -2743,12 +3124,18 @@ loop:
 		}
 
 		/*
+		 * Handle the case of unmapped buffer which should
+		 * become mapped, or the buffer for which KVA
+		 * reservation is requested.
+		 */
+		bp_unmapped_get_kva(bp, blkno, size, flags);
+
+		/*
 		 * If the size is inconsistant in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
-
 		if (bp->b_bcount != size)
 			allocbuf(bp, size);
 
@@ -2789,9 +3176,6 @@ loop:
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
-		int bsize, maxsize, vmio;
-		off_t offset;
-
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
@@ -2807,7 +3191,13 @@ loop:
 		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
-		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
+		if (vmio) {
+			maxsize = size + (offset & PAGE_MASK);
+		} else {
+			maxsize = size;
+			/* Do not allow non-VMIO notmapped buffers. */
+			flags &= ~GB_UNMAPPED;
+		}
 		maxsize = imax(maxsize, bsize);
 
 		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
@@ -2863,6 +3253,7 @@ loop:
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
+			BUF_CHECK_MAPPED(bp);
 		}
 
 		allocbuf(bp, size);
@@ -3038,10 +3429,14 @@ allocbuf(struct buf *bp, int size)
 			if (desiredpages < bp->b_npages) {
 				vm_page_t m;
 
-				pmap_qremove((vm_offset_t)trunc_page(
-				    (vm_offset_t)bp->b_data) +
-				    (desiredpages << PAGE_SHIFT),
-				    (bp->b_npages - desiredpages));
+				if ((bp->b_flags & B_UNMAPPED) == 0) {
+					BUF_CHECK_MAPPED(bp);
+					pmap_qremove((vm_offset_t)trunc_page(
+					    (vm_offset_t)bp->b_data) +
+					    (desiredpages << PAGE_SHIFT),
+					    (bp->b_npages - desiredpages));
+				} else
+					BUF_CHECK_UNMAPPED(bp);
 				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 				for (i = desiredpages; i < bp->b_npages; i++) {
 					/*
@@ -3147,21 +3542,12 @@ allocbuf(struct buf *bp, int size)
 			VM_OBJECT_WUNLOCK(obj);
 
 			/*
-			 * Step 3, fixup the KVM pmap.  Remember that
-			 * bp->b_data is relative to bp->b_offset, but 
-			 * bp->b_offset may be offset into the first page.
+			 * Step 3, fixup the KVM pmap.
 			 */
-
-			bp->b_data = (caddr_t)
-			    trunc_page((vm_offset_t)bp->b_data);
-			pmap_qenter(
-			    (vm_offset_t)bp->b_data,
-			    bp->b_pages, 
-			    bp->b_npages
-			);
-			
-			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
-			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
+			if ((bp->b_flags & B_UNMAPPED) == 0)
+				bpmap_qenter(bp);
+			else
+				BUF_CHECK_UNMAPPED(bp);
 		}
 	}
 	if (newbsize < bp->b_bufsize)
@@ -3171,21 +3557,38 @@ allocbuf(struct buf *bp, int size)
 	return 1;
 }
 
+extern int inflight_transient_maps;
+
 void
 biodone(struct bio *bp)
 {
 	struct mtx *mtxp;
 	void (*done)(struct bio *);
+	vm_offset_t start, end;
+	int transient;
 
 	mtxp = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(mtxp);
 	bp->bio_flags |= BIO_DONE;
+	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
+		start = trunc_page((vm_offset_t)bp->bio_data);
+		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
+		transient = 1;
+	} else {
+		transient = 0;
+		start = end = 0;
+	}
 	done = bp->bio_done;
 	if (done == NULL)
 		wakeup(bp);
 	mtx_unlock(mtxp);
 	if (done != NULL)
 		done(bp);
+	if (transient) {
+		pmap_qremove(start, OFF_TO_IDX(end - start));
+		vm_map_remove(bio_transient_map, start, end);
+		atomic_add_int(&inflight_transient_maps, -1);
+	}
 }
 
 /*
@@ -3288,7 +3691,7 @@ dev_strategy(struct cdev *dev, struct buf *bp)
 	bip->bio_offset = bp->b_iooffset;
 	bip->bio_length = bp->b_bcount;
 	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
-	bip->bio_data = bp->b_data;
+	bdata2bio(bp, bip);
 	bip->bio_done = bufdonebio;
 	bip->bio_caller2 = bp;
 	bip->bio_dev = dev;
@@ -3442,9 +3845,11 @@ bufdone_finish(struct buf *bp)
 		}
 		vm_object_pip_wakeupn(obj, 0);
 		VM_OBJECT_WUNLOCK(obj);
-		if (bogus)
+		if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+			BUF_CHECK_MAPPED(bp);
 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 			    bp->b_pages, bp->b_npages);
+		}
 	}
 
 	/*
@@ -3487,8 +3892,12 @@ vfs_unbusy_pages(struct buf *bp)
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
-			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
-			    bp->b_pages, bp->b_npages);
+			if ((bp->b_flags & B_UNMAPPED) == 0) {
+				BUF_CHECK_MAPPED(bp);
+				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+				    bp->b_pages, bp->b_npages);
+			} else
+				BUF_CHECK_UNMAPPED(bp);
 		}
 		vm_object_pip_subtract(obj, 1);
 		vm_page_io_finish(m);
@@ -3653,9 +4062,11 @@ vfs_busy_pages(struct buf *bp, int clear_modify)
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	VM_OBJECT_WUNLOCK(obj);
-	if (bogus)
+	if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
+	}
 }
 
 /*
@@ -3777,6 +4188,8 @@ vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 	vm_page_t p;
 	int index;
 
+	BUF_CHECK_MAPPED(bp);
+
 	to = round_page(to);
 	from = round_page(from);
 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
@@ -3808,6 +4221,8 @@ vm_hold_free_pages(struct buf *bp, int newbsize)
 	vm_page_t p;
 	int index, newnpages;
 
+	BUF_CHECK_MAPPED(bp);
+
 	from = round_page((vm_offset_t)bp->b_data + newbsize);
 	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 	if (bp->b_npages > newnpages)
@@ -4009,6 +4424,30 @@ bunpin_wait(struct buf *bp)
 	mtx_unlock(mtxp);
 }
 
+/*
+ * Set bio_data or bio_ma for struct bio from the struct buf.
+ */
+void
+bdata2bio(struct buf *bp, struct bio *bip)
+{
+
+	if ((bp->b_flags & B_UNMAPPED) != 0) {
+		KASSERT(unmapped_buf_allowed, ("unmapped"));
+		bip->bio_ma = bp->b_pages;
+		bip->bio_ma_n = bp->b_npages;
+		bip->bio_data = unmapped_buf;
+		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+		bip->bio_flags |= BIO_UNMAPPED;
+		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
+		    PAGE_SIZE == bp->b_npages,
+		    ("Buffer %p too short: %d %d %d", bp, bip->bio_ma_offset,
+		    bip->bio_length, bip->bio_ma_n));
+	} else {
+		bip->bio_data = bp->b_data;
+		bip->bio_ma = NULL;
+	}
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 28aa4ff..91a0443 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -61,11 +61,11 @@ SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
 
 static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
 
-static struct cluster_save *
-	cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
-static struct buf *
-	cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
-			 daddr_t blkno, long size, int run, struct buf *fbp);
+static struct cluster_save *cluster_collectbufs(struct vnode *vp,
+	    struct buf *last_bp, int gbflags);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+	    daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+	    struct buf *fbp);
 static void cluster_callback(struct buf *);
 
 static int write_behind = 1;
@@ -97,6 +97,8 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
 
 	error = 0;
 	bo = &vp->v_bufobj;
+	if (!unmapped_buf_allowed)
+		gbflags &= ~GB_UNMAPPED;
 
 	/*
 	 * Try to limit the amount of read-ahead by a few
@@ -112,7 +114,7 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
 	/*
 	 * get the requested block
 	 */
-	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0);
+	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
 	origblkno = lblkno;
 
 	/*
@@ -203,7 +205,7 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
 			if (ncontig < nblks)
 				nblks = ncontig;
 			bp = cluster_rbuild(vp, filesize, lblkno,
-				blkno, size, nblks, bp);
+			    blkno, size, nblks, gbflags, bp);
 			lblkno += (bp->b_bufsize / size);
 		} else {
 			bp->b_flags |= B_RAM;
@@ -247,14 +249,14 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
 		if (ncontig) {
 			ncontig = min(ncontig + 1, racluster);
 			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
-				size, ncontig, NULL);
+			    size, ncontig, gbflags, NULL);
 			lblkno += (rbp->b_bufsize / size);
 			if (rbp->b_flags & B_DELWRI) {
 				bqrelse(rbp);
 				continue;
 			}
 		} else {
-			rbp = getblk(vp, lblkno, size, 0, 0, 0);
+			rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
 			lblkno += 1;
 			if (rbp->b_flags & B_DELWRI) {
 				bqrelse(rbp);
@@ -293,14 +295,8 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
  * and then parcel them up into logical blocks in the buffer hash table.
  */
 static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
-	struct vnode *vp;
-	u_quad_t filesize;
-	daddr_t lbn;
-	daddr_t blkno;
-	long size;
-	int run;
-	struct buf *fbp;
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+    daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
 {
 	struct bufobj *bo;
 	struct buf *bp, *tbp;
@@ -324,7 +320,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
 		tbp = fbp;
 		tbp->b_iocmd = BIO_READ; 
 	} else {
-		tbp = getblk(vp, lbn, size, 0, 0, 0);
+		tbp = getblk(vp, lbn, size, 0, 0, gbflags);
 		if (tbp->b_flags & B_CACHE)
 			return tbp;
 		tbp->b_flags |= B_ASYNC | B_RAM;
@@ -345,9 +341,14 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
 	 * address may not be either.  Inherit the b_data offset
 	 * from the original buffer.
 	 */
-	bp->b_data = (char *)((vm_offset_t)bp->b_data |
-	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
 	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+	if ((gbflags & GB_UNMAPPED) != 0) {
+		bp->b_flags |= B_UNMAPPED;
+		bp->b_data = unmapped_buf;
+	} else {
+		bp->b_data = (char *)((vm_offset_t)bp->b_data |
+		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
+	}
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = cluster_callback;
 	bp->b_blkno = blkno;
@@ -371,7 +372,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
 				break;
 			}
 
-			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT);
+			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
+			    (gbflags & GB_UNMAPPED));
 
 			/* Don't wait around for locked bufs. */
 			if (tbp == NULL)
@@ -493,8 +495,10 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
 		    bp->b_bufsize, bp->b_kvasize);
 	bp->b_kvasize = bp->b_bufsize;
 
-	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
-		(vm_page_t *)bp->b_pages, bp->b_npages);
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+		    (vm_page_t *)bp->b_pages, bp->b_npages);
+	}
 	return (bp);
 }
 
@@ -517,7 +521,10 @@ cluster_callback(bp)
 	if (bp->b_ioflags & BIO_ERROR)
 		error = bp->b_error;
 
-	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+		    bp->b_npages);
+	}
 	/*
 	 * Move memory from the large cluster buffer into the component
 	 * buffers and mark IO as done on these.
@@ -559,7 +566,8 @@ cluster_callback(bp)
  */
 
 static __inline int
-cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+    int gbflags)
 {
 	int r = 0;
 
@@ -570,7 +578,7 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
 		start_lbn -= len;
 		/* FALLTHROUGH */
 	case 1:
-		r = cluster_wbuild(vp, size, start_lbn, len, 0);
+		r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
 		/* FALLTHROUGH */
 	default:
 		/* FALLTHROUGH */
@@ -598,6 +606,9 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
 	int lblocksize;
 	int async;
 
+	if (!unmapped_buf_allowed)
+		gbflags &= ~GB_UNMAPPED;
+
 	if (vp->v_type == VREG) {
 		async = DOINGASYNC(vp);
 		lblocksize = vp->v_mount->mnt_stat.f_iosize;
@@ -637,13 +648,13 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
 				if (!async && seqcount > 0) {
 					cluster_wbuild_wb(vp, lblocksize,
-						vp->v_cstart, cursize);
+					    vp->v_cstart, cursize, gbflags);
 				}
 			} else {
 				struct buf **bpp, **endbp;
 				struct cluster_save *buflist;
 
-				buflist = cluster_collectbufs(vp, bp);
+				buflist = cluster_collectbufs(vp, bp, gbflags);
 				endbp = &buflist->bs_children
 				    [buflist->bs_nchildren - 1];
 				if (VOP_REALLOCBLKS(vp, buflist)) {
@@ -662,7 +673,7 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
 					if (seqcount > 1) {
 						cluster_wbuild_wb(vp, 
 						    lblocksize, vp->v_cstart, 
-						    cursize);
+						    cursize, gbflags);
 					}
 				} else {
 					/*
@@ -710,8 +721,10 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
 		 * update daemon handle it.
 		 */
 		bdwrite(bp);
-		if (seqcount > 1)
-			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+		if (seqcount > 1) {
+			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+			    vp->v_clen + 1, gbflags);
+		}
 		vp->v_clen = 0;
 		vp->v_cstart = lbn + 1;
 	} else if (vm_page_count_severe()) {
@@ -746,6 +759,9 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
 	int totalwritten = 0;
 	int dbsize = btodb(size);
 
+	if (!unmapped_buf_allowed)
+		gbflags &= ~GB_UNMAPPED;
+
 	bo = &vp->v_bufobj;
 	while (len > 0) {
 		/*
@@ -824,10 +840,16 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
 		 * address may not be either.  Inherit the b_data offset
 		 * from the original buffer.
 		 */
-		bp->b_data = (char *)((vm_offset_t)bp->b_data |
-		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
-		bp->b_flags |= B_CLUSTER |
-				(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+		if ((gbflags & GB_UNMAPPED) == 0 ||
+		    (tbp->b_flags & B_VMIO) == 0) {
+			bp->b_data = (char *)((vm_offset_t)bp->b_data |
+			    ((vm_offset_t)tbp->b_data & PAGE_MASK));
+		} else {
+			bp->b_flags |= B_UNMAPPED;
+			bp->b_data = unmapped_buf;
+		}
+		bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+		    B_NEEDCOMMIT));
 		bp->b_iodone = cluster_callback;
 		pbgetvp(vp, bp);
 		/*
@@ -954,8 +976,10 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
 				tbp, b_cluster.cluster_entry);
 		}
 	finishcluster:
-		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
-			(vm_page_t *) bp->b_pages, bp->b_npages);
+		if ((bp->b_flags & B_UNMAPPED) == 0) {
+			pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+			    (vm_page_t *)bp->b_pages, bp->b_npages);
+		}
 		if (bp->b_bufsize > bp->b_kvasize)
 			panic(
 			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
@@ -976,9 +1000,7 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
  * Plus add one additional buffer.
  */
 static struct cluster_save *
-cluster_collectbufs(vp, last_bp)
-	struct vnode *vp;
-	struct buf *last_bp;
+cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
 {
 	struct cluster_save *buflist;
 	struct buf *bp;
@@ -991,7 +1013,8 @@ cluster_collectbufs(vp, last_bp)
 	buflist->bs_nchildren = 0;
 	buflist->bs_children = (struct buf **) (buflist + 1);
 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
-		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
+		(void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
+		    gbflags, &bp);
 		buflist->bs_children[i] = bp;
 		if (bp->b_blkno == bp->b_lblkno)
 			VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c
index 4fe6ebe..5c94a39 100644
--- a/sys/mips/mips/pmap.c
+++ b/sys/mips/mips/pmap.c
@@ -2576,6 +2576,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
 	}
 }
 
+int unmapped_buf_allowed;
+
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index 90466e8a..66194df 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -648,6 +648,14 @@ moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart,
 			moea64_kenter(mmup, pa, pa);
 	}
 	ENABLE_TRANS(msr);
+
+	/*
+	 * Allow user to override unmapped_buf_allowed for testing.
+	 * XXXKIB Only direct map implementation was tested.
+	 */
+	if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
+	    &unmapped_buf_allowed))
+		unmapped_buf_allowed = hw_direct_map;
 }
 
 void
diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c
index 42f1a39..7fd98f4 100644
--- a/sys/powerpc/powerpc/pmap_dispatch.c
+++ b/sys/powerpc/powerpc/pmap_dispatch.c
@@ -574,3 +574,5 @@ pmap_mmu_install(char *name, int prio)
 
 	return (FALSE);
 }
+
+int unmapped_buf_allowed;
diff --git a/sys/sparc64/sparc64/pmap.c b/sys/sparc64/sparc64/pmap.c
index 97a085a..2a24eb5 100644
--- a/sys/sparc64/sparc64/pmap.c
+++ b/sys/sparc64/sparc64/pmap.c
@@ -1918,6 +1918,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 	}
 }
 
+int unmapped_buf_allowed;
+
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index c016ee6..7678f5a 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -55,10 +55,13 @@
 #define BIO_DONE	0x02
 #define BIO_ONQUEUE	0x04
 #define BIO_ORDERED	0x08
+#define	BIO_UNMAPPED	0x10
+#define	BIO_TRANSIENT_MAPPING	0x20
 
 #ifdef _KERNEL
 struct disk;
 struct bio;
+struct vm_map;
 
 /* Empty classifier tag, to prevent further classification. */
 #define	BIO_NOTCLASSIFIED		(void *)(~0UL)
@@ -78,6 +81,9 @@ struct bio {
 	off_t	bio_offset;		/* Offset into file. */
 	long	bio_bcount;		/* Valid bytes in buffer. */
 	caddr_t	bio_data;		/* Memory, superblocks, indirect etc. */
+	struct vm_page **bio_ma;	/* Or unmapped. */
+	int	bio_ma_offset;		/* Offset in the first page of bio_ma. */
+	int	bio_ma_n;		/* Number of pages in bio_ma. */
 	int	bio_error;		/* Errno for BIO_ERROR. */
 	long	bio_resid;		/* Remaining I/O in bytes. */
 	void	(*bio_done)(struct bio *);
@@ -121,6 +127,9 @@ struct bio_queue_head {
 	struct	bio *insert_point;
 };
 
+extern struct vm_map *bio_transient_map;
+extern int bio_transient_maxcnt;
+
 void biodone(struct bio *bp);
 void biofinish(struct bio *bp, struct devstat *stat, int error);
 int biowait(struct bio *bp, const char *wchan);
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 87f2b76..cc8029e 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -117,6 +117,7 @@ struct buf {
 	long	b_bufsize;		/* Allocated buffer size. */
 	long	b_runningbufspace;	/* when I/O is running, pipelining */
 	caddr_t	b_kvabase;		/* base kva for buffer */
+	caddr_t	b_kvaalloc;		/* allocated kva for B_KVAALLOC */
 	int	b_kvasize;		/* size of kva for buffer */
 	daddr_t b_lblkno;		/* Logical block number. */
 	struct	vnode *b_vp;		/* Device vnode. */
@@ -202,8 +203,8 @@ struct buf {
 #define	B_PERSISTENT	0x00000100	/* Perm. ref'ed while EXT2FS mounted. */
 #define	B_DONE		0x00000200	/* I/O completed. */
 #define	B_EINTR		0x00000400	/* I/O was interrupted */
-#define	B_00000800	0x00000800	/* Available flag. */
-#define	B_00001000	0x00001000	/* Available flag. */
+#define	B_UNMAPPED	0x00000800	/* KVA is not mapped. */
+#define	B_KVAALLOC	0x00001000	/* But allocated. */
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
 #define	B_BARRIER	0x00004000	/* Write this and all preceeding first. */
 #define	B_NOCACHE	0x00008000	/* Do not cache block after use. */
@@ -453,7 +454,9 @@ buf_countdeps(struct buf *bp, int i)
  */
 #define	GB_LOCK_NOWAIT	0x0001		/* Fail if we block on a buf lock. */
 #define	GB_NOCREAT	0x0002		/* Don't create a buf if not found. */
-#define	GB_NOWAIT_BD	0x0004		/* Do not wait for bufdaemon */
+#define	GB_NOWAIT_BD	0x0004		/* Do not wait for bufdaemon. */
+#define	GB_UNMAPPED	0x0008		/* Do not mmap buffer pages. */
+#define	GB_KVAALLOC	0x0010		/* But allocate KVA. */
 
 #ifdef _KERNEL
 extern int	nbuf;			/* The number of buffer headers */
@@ -470,11 +473,13 @@ extern struct	buf *swbuf;		/* Swap I/O buffer headers. */
 extern int	nswbuf;			/* Number of swap I/O buffer headers. */
 extern int	cluster_pbuf_freecnt;	/* Number of pbufs for clusters */
 extern int	vnode_pbuf_freecnt;	/* Number of pbufs for vnode pager */
+extern caddr_t	unmapped_buf;
 
 void	runningbufwakeup(struct buf *);
 void	waitrunningbufspace(void);
 caddr_t	kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est);
 void	bufinit(void);
+void	bdata2bio(struct buf *bp, struct bio *bip);
 void	bwillwrite(void);
 int	buf_dirty_count_severe(void);
 void	bremfree(struct buf *);
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index 12337de..61b4679 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -138,6 +138,7 @@ extern char **kenvp;
 
 extern const void *zero_region;	/* address space maps to a zeroed page	*/
 
+extern int unmapped_buf_allowed;
 extern int iosize_max_clamp;
 #define	IOSIZE_MAX	(iosize_max_clamp ? INT_MAX : SSIZE_MAX)
 
diff --git a/sys/vm/vm.h b/sys/vm/vm.h
index 132c10e..106c510 100644
--- a/sys/vm/vm.h
+++ b/sys/vm/vm.h
@@ -136,6 +136,8 @@ struct kva_md_info {
 	vm_offset_t	clean_eva;
 	vm_offset_t	pager_sva;
 	vm_offset_t	pager_eva;
+	vm_offset_t	bio_transient_sva;
+	vm_offset_t	bio_transient_eva;
 };
 
 extern struct kva_md_info	kmi;
diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c
index 08c9b03..b350e96 100644
--- a/sys/vm/vm_init.c
+++ b/sys/vm/vm_init.c
@@ -184,10 +184,15 @@ again:
 		panic("startup: table size inconsistency");
 
 	clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva,
-	    (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS, TRUE);
+	    (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS +
+	    (long)bio_transient_maxcnt * MAXPHYS, TRUE);
 	buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva,
 	    &kmi->buffer_eva, (long)nbuf * BKVASIZE, FALSE);
 	buffer_map->system_map = 1;
+	bio_transient_map = kmem_suballoc(clean_map, &kmi->bio_transient_sva,
+	    &kmi->bio_transient_eva, (long)bio_transient_maxcnt * MAXPHYS,
+	    FALSE);
+	bio_transient_map->system_map = 1;
 	pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva,
 	    (long)nswbuf * MAXPHYS, FALSE);
 	pager_map->system_map = 1;
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 64a2ebb..7c7ccc1 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -90,6 +90,7 @@ vm_map_t kmem_map;
 vm_map_t exec_map;
 vm_map_t pipe_map;
 vm_map_t buffer_map;
+vm_map_t bio_transient_map;
 
 const void *zero_region;
 CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);