author     kib <kib@FreeBSD.org>  2013-03-19 14:13:12 +0000
committer  kib <kib@FreeBSD.org>  2013-03-19 14:13:12 +0000
commit     7c26a038f99b336029be9c909af59ab894214591 (patch)
tree       dccc6b57fecce89556fcde1d5370a34e16178ff5 /sys
parent     878ef603e27acf1ab599ceece679d5f8bfda9f42 (diff)
Implement the concept of unmapped VMIO buffers, i.e. buffers which do not map
the b_pages pages into buffer_map KVA.  The use of unmapped buffers eliminates
the need to perform TLB shootdowns for the mapping on buffer creation and
reuse, greatly reducing the number of shootdown IPIs on big SMP machines and
eliminating up to 25-30% of system time on I/O-intensive workloads.

An unmapped buffer must be explicitly requested by the consumer with the
GB_UNMAPPED flag.  For an unmapped buffer, no KVA reservation is performed at
all.  With the GB_KVAALLOC flag, the consumer may request an unmapped buffer
that still has a KVA reservation, so it can be mapped manually without
recursing into the buffer cache and blocking.  When a mapped buffer is
requested and an unmapped buffer already exists, the cache performs an
upgrade, possibly reusing the KVA reservation.

An unmapped buffer is translated into an unmapped bio in g_vfs_strategy().
An unmapped bio carries a pointer to the vm_page_t array, an offset and a
length instead of a data pointer.  A provider that processes bios must
explicitly declare its readiness to accept unmapped bios; otherwise the
g_down geom thread performs a transient upgrade of the request by mapping
the pages into the new bio_transient_map KVA submap.

The bio_transient_map submap claims up to 10% of the buffer map, so the total
buffer_map + bio_transient_map KVA usage stays the same.  It can still be
tuned manually with the kern.bio_transient_maxcnt tunable, expressed in units
of transient mappings.  Eventually, bio_transient_map could be removed once
all geom classes and drivers accept unmapped I/O requests.

Unmapped support can be turned off with the vfs.unmapped_buf_allowed tunable;
disabling it makes buffer (and cluster) creation requests ignore the
GB_UNMAPPED and GB_KVAALLOC flags.  Unmapped buffers are enabled by default
only on architectures where pmap_copy_pages() has been implemented and tested.

In the rework, filesystem metadata is no longer subject to the maxbufspace
limit.  Since metadata buffers are always mapped, they still have to fit into
the buffer map, which provides a reasonable (but practically unreachable)
upper bound.  Non-metadata buffer allocations, both mapped and unmapped, are
accounted against maxbufspace as before.  Effectively, this means that
maxbufspace is enforced on mapped and unmapped buffers separately.  The
pre-patch bufspace limiting code did not work, because buffer_map
fragmentation does not allow the limit to be reached.

At Jeff Roberson's request, the getnewbuf() function was split into smaller
single-purpose functions.

Sponsored by:   The FreeBSD Foundation
Discussed with: jeff (previous version)
Tested by:      pho, scottl (previous version), jhb, bf
MFC after:      2 weeks
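As a consumer-side illustration of the GB_UNMAPPED interface described above,
the sketch below (not part of the patch) requests a data buffer without a KVA
mapping.  The vnode vp, the logical block lbn, the size and the wrapper
function name are placeholder inputs; getblk(), bqrelse(), GB_UNMAPPED,
b_pages and unmapped_buf are part of this change or of the existing buffer
cache API.

static struct buf *
example_get_unmapped(struct vnode *vp, daddr_t lbn, int size)
{
	struct buf *bp;

	/* No KVA is reserved or mapped for this buffer. */
	bp = getblk(vp, lbn, size, 0, 0, GB_UNMAPPED);
	if (bp == NULL)
		return (NULL);

	/*
	 * The data is reachable only through bp->b_pages; bp->b_data
	 * points at the unmapped_buf marker and must not be dereferenced.
	 * A later getblk() without GB_UNMAPPED upgrades the buffer to a
	 * mapped one, reusing a GB_KVAALLOC reservation if one exists.
	 */
	return (bp);
}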
Diffstat (limited to 'sys')
-rw-r--r--sys/amd64/amd64/pmap.c2
-rw-r--r--sys/arm/arm/pmap-v6.c2
-rw-r--r--sys/arm/arm/pmap.c2
-rw-r--r--sys/geom/geom.h1
-rw-r--r--sys/geom/geom_io.c106
-rw-r--r--sys/geom/geom_vfs.c8
-rw-r--r--sys/i386/i386/pmap.c2
-rw-r--r--sys/i386/xen/pmap.c2
-rw-r--r--sys/ia64/ia64/pmap.c2
-rw-r--r--sys/kern/subr_bus_dma.c23
-rw-r--r--sys/kern/subr_param.c7
-rw-r--r--sys/kern/vfs_bio.c963
-rw-r--r--sys/kern/vfs_cluster.c105
-rw-r--r--sys/mips/mips/pmap.c2
-rw-r--r--sys/powerpc/aim/mmu_oea64.c8
-rw-r--r--sys/powerpc/powerpc/pmap_dispatch.c2
-rw-r--r--sys/sparc64/sparc64/pmap.c2
-rw-r--r--sys/sys/bio.h9
-rw-r--r--sys/sys/buf.h11
-rw-r--r--sys/sys/systm.h1
-rw-r--r--sys/vm/vm.h2
-rw-r--r--sys/vm/vm_init.c7
-rw-r--r--sys/vm/vm_kern.c1
23 files changed, 955 insertions, 315 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 55d2cff..1b1c86c 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -4235,6 +4235,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
pagecopy((void *)src, (void *)dst);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c
index 0083f29..9d16509 100644
--- a/sys/arm/arm/pmap-v6.c
+++ b/sys/arm/arm/pmap-v6.c
@@ -3312,6 +3312,8 @@ pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst)
mtx_unlock(&cmtx);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/arm/arm/pmap.c b/sys/arm/arm/pmap.c
index c18783b..0875f83 100644
--- a/sys/arm/arm/pmap.c
+++ b/sys/arm/arm/pmap.c
@@ -4428,6 +4428,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
#endif
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/geom/geom.h b/sys/geom/geom.h
index 351b05d..660bf6e 100644
--- a/sys/geom/geom.h
+++ b/sys/geom/geom.h
@@ -205,6 +205,7 @@ struct g_provider {
u_int flags;
#define G_PF_WITHER 0x2
#define G_PF_ORPHAN 0x4
+#define G_PF_ACCEPT_UNMAPPED 0x8
/* Two fields for the implementing class to use */
void *private;
diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c
index c6a5da8..6ffc06e 100644
--- a/sys/geom/geom_io.c
+++ b/sys/geom/geom_io.c
@@ -1,6 +1,7 @@
/*-
* Copyright (c) 2002 Poul-Henning Kamp
* Copyright (c) 2002 Networks Associates Technology, Inc.
+ * Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Poul-Henning Kamp
@@ -8,6 +9,9 @@
* under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
* DARPA CHATS research program.
*
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -44,6 +48,7 @@ __FBSDID("$FreeBSD$");
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/stack.h>
+#include <sys/sysctl.h>
#include <sys/errno.h>
#include <geom/geom.h>
@@ -51,6 +56,13 @@ __FBSDID("$FreeBSD$");
#include <sys/devicestat.h>
#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
static struct g_bioq g_bio_run_down;
static struct g_bioq g_bio_run_up;
@@ -180,12 +192,17 @@ g_clone_bio(struct bio *bp)
/*
* BIO_ORDERED flag may be used by disk drivers to enforce
* ordering restrictions, so this flag needs to be cloned.
+ * BIO_UNMAPPED should be inherited, to properly indicate
+ * which way the buffer is passed.
* Other bio flags are not suitable for cloning.
*/
- bp2->bio_flags = bp->bio_flags & BIO_ORDERED;
+ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED);
bp2->bio_length = bp->bio_length;
bp2->bio_offset = bp->bio_offset;
bp2->bio_data = bp->bio_data;
+ bp2->bio_ma = bp->bio_ma;
+ bp2->bio_ma_n = bp->bio_ma_n;
+ bp2->bio_ma_offset = bp->bio_ma_offset;
bp2->bio_attribute = bp->bio_attribute;
/* Inherit classification info from the parent */
bp2->bio_classifier1 = bp->bio_classifier1;
@@ -210,11 +227,15 @@ g_duplicate_bio(struct bio *bp)
struct bio *bp2;
bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
+ bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED;
bp2->bio_parent = bp;
bp2->bio_cmd = bp->bio_cmd;
bp2->bio_length = bp->bio_length;
bp2->bio_offset = bp->bio_offset;
bp2->bio_data = bp->bio_data;
+ bp2->bio_ma = bp->bio_ma;
+ bp2->bio_ma_n = bp->bio_ma_n;
+ bp2->bio_ma_offset = bp->bio_ma_offset;
bp2->bio_attribute = bp->bio_attribute;
bp->bio_children++;
#ifdef KTR
@@ -575,6 +596,83 @@ g_io_deliver(struct bio *bp, int error)
return;
}
+SYSCTL_DECL(_kern_geom);
+
+static long transient_maps;
+SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
+ &transient_maps, 0,
+ "Total count of the transient mapping requests");
+u_int transient_map_retries = 10;
+SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
+ &transient_map_retries, 0,
+ "Max count of retries used before giving up on creating transient map");
+int transient_map_hard_failures;
+SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
+ &transient_map_hard_failures, 0,
+ "Failures to establish the transient mapping due to retry attempts "
+ "exhausted");
+int transient_map_soft_failures;
+SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
+ &transient_map_soft_failures, 0,
+ "Count of retried failures to establish the transient mapping");
+int inflight_transient_maps;
+SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
+ &inflight_transient_maps, 0,
+ "Current count of the active transient maps");
+
+static int
+g_io_transient_map_bio(struct bio *bp)
+{
+ vm_offset_t addr;
+ long size;
+ u_int retried;
+ int rv;
+
+ size = round_page(bp->bio_ma_offset + bp->bio_length);
+ KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
+ addr = 0;
+ retried = 0;
+ atomic_add_long(&transient_maps, 1);
+retry:
+ vm_map_lock(bio_transient_map);
+ if (vm_map_findspace(bio_transient_map, vm_map_min(bio_transient_map),
+ size, &addr)) {
+ vm_map_unlock(bio_transient_map);
+ if (transient_map_retries != 0 &&
+ retried >= transient_map_retries) {
+ g_io_deliver(bp, EDEADLK/* XXXKIB */);
+ CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
+ bp, bp->bio_to->name);
+ atomic_add_int(&transient_map_hard_failures, 1);
+ return (1);
+ } else {
+ /*
+ * Naive attempt to quiesce the I/O to get more
+ * in-flight requests completed and defragment
+ * the bio_transient_map.
+ */
+ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
+ bp, bp->bio_to->name, retried);
+ pause("g_d_tra", hz / 10);
+ retried++;
+ atomic_add_int(&transient_map_soft_failures, 1);
+ goto retry;
+ }
+ }
+ rv = vm_map_insert(bio_transient_map, NULL, 0, addr, addr + size,
+ VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+ KASSERT(rv == KERN_SUCCESS,
+ ("vm_map_insert(bio_transient_map) rv %d %jx %lx",
+ rv, (uintmax_t)addr, size));
+ vm_map_unlock(bio_transient_map);
+ atomic_add_int(&inflight_transient_maps, 1);
+ pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
+ bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
+ bp->bio_flags |= BIO_TRANSIENT_MAPPING;
+ bp->bio_flags &= ~BIO_UNMAPPED;
+ return (0);
+}
+
void
g_io_schedule_down(struct thread *tp __unused)
{
@@ -636,6 +734,12 @@ g_io_schedule_down(struct thread *tp __unused)
default:
break;
}
+ if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
+ (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
+ (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
+ if (g_io_transient_map_bio(bp))
+ continue;
+ }
THREAD_NO_SLEEPING();
CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
"len %ld", bp, bp->bio_to->name, bp->bio_offset,
diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c
index bbed550..92f1ad2 100644
--- a/sys/geom/geom_vfs.c
+++ b/sys/geom/geom_vfs.c
@@ -188,14 +188,14 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp)
bip = g_alloc_bio();
bip->bio_cmd = bp->b_iocmd;
bip->bio_offset = bp->b_iooffset;
- bip->bio_data = bp->b_data;
- bip->bio_done = g_vfs_done;
- bip->bio_caller2 = bp;
bip->bio_length = bp->b_bcount;
- if (bp->b_flags & B_BARRIER) {
+ bdata2bio(bp, bip);
+ if ((bp->b_flags & B_BARRIER) != 0) {
bip->bio_flags |= BIO_ORDERED;
bp->b_flags &= ~B_BARRIER;
}
+ bip->bio_done = g_vfs_done;
+ bip->bio_caller2 = bp;
g_io_request(bip, cp);
}
diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c
index 6499986..f4681bf 100644
--- a/sys/i386/i386/pmap.c
+++ b/sys/i386/i386/pmap.c
@@ -4205,6 +4205,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
mtx_unlock(&sysmaps->lock);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c
index 0f7a80f..a9bc124 100644
--- a/sys/i386/xen/pmap.c
+++ b/sys/i386/xen/pmap.c
@@ -3448,6 +3448,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
mtx_unlock(&sysmaps->lock);
}
+int unmapped_buf_allowed = 1;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/ia64/ia64/pmap.c b/sys/ia64/ia64/pmap.c
index 3256600..883f39c 100644
--- a/sys/ia64/ia64/pmap.c
+++ b/sys/ia64/ia64/pmap.c
@@ -2014,6 +2014,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
bcopy(src, dst, PAGE_SIZE);
}
+int unmapped_buf_allowed;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c
index 773d01a..4d344b4 100644
--- a/sys/kern/subr_bus_dma.c
+++ b/sys/kern/subr_bus_dma.c
@@ -126,11 +126,28 @@ static int
_bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
int *nsegs, int flags)
{
- int error;
+ vm_paddr_t paddr;
+ bus_size_t len, tlen;
+ int error, i, ma_offs;
- error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
- bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
+ if ((bio->bio_flags & BIO_UNMAPPED) == 0) {
+ error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
+ bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
+ return (error);
+ }
+ error = 0;
+ tlen = bio->bio_bcount;
+ ma_offs = bio->bio_ma_offset;
+ for (i = 0; tlen > 0; i++, tlen -= len) {
+ len = min(PAGE_SIZE - ma_offs, tlen);
+ paddr = VM_PAGE_TO_PHYS(bio->bio_ma[i]) + ma_offs;
+ error = _bus_dmamap_load_phys(dmat, map, paddr, len,
+ flags, NULL, nsegs);
+ if (error != 0)
+ break;
+ ma_offs = 0;
+ }
return (error);
}
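For drivers that already use busdma, the helper above makes unmapped bios
transparent: when BIO_UNMAPPED is set, the map load walks bio_ma page by page
through _bus_dmamap_load_phys() instead of taking the mapped bio_data path.
A driver therefore only needs to advertise G_PF_ACCEPT_UNMAPPED and keep
calling the bus_dmamap_load_bio() front end that sits above this helper
(declared elsewhere in the tree, not shown here).  In the sketch below,
struct example_softc, its sc_dmatag/sc_dmamap fields and example_dma_cb()
are all hypothetical.

static void
example_strategy(struct example_softc *sc, struct bio *bp)
{
	int error;

	/* Works for both mapped and unmapped bios. */
	error = bus_dmamap_load_bio(sc->sc_dmatag, sc->sc_dmamap, bp,
	    example_dma_cb, sc, BUS_DMA_NOWAIT);
	if (error == EINPROGRESS)
		return;	/* example_dma_cb() fires when resources free up. */
	if (error != 0)
		g_io_deliver(bp, error);
}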
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
index 825a3a0..a2e822c 100644
--- a/sys/kern/subr_param.c
+++ b/sys/kern/subr_param.c
@@ -92,6 +92,7 @@ int maxfiles; /* sys. wide open files limit */
int maxfilesperproc; /* per-proc open files limit */
int msgbufsize; /* size of kernel message buffer */
int nbuf;
+int bio_transient_maxcnt;
int ngroups_max; /* max # groups per process */
int nswbuf;
pid_t pid_max = PID_MAX;
@@ -118,6 +119,9 @@ SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN, &maxswzone, 0,
"Maximum memory for swap metadata");
SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN, &maxbcache, 0,
"Maximum value of vfs.maxbufspace");
+SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN,
+ &bio_transient_maxcnt, 0,
+ "Maximum number of transient BIOs mappings");
SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxtsiz, 0,
"Maximum text size");
SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RW | CTLFLAG_TUN, &dfldsiz, 0,
@@ -266,6 +270,8 @@ init_param1(void)
pid_max = PID_MAX;
else if (pid_max < 300)
pid_max = 300;
+
+ TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed);
}
/*
@@ -322,6 +328,7 @@ init_param2(long physpages)
*/
nbuf = NBUF;
TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
+ TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt);
/*
* The default for maxpipekva is min(1/64 of the kernel address space,
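Both knobs added here are boot-time tunables fetched from the kernel
environment, so the usual place to override them is /boot/loader.conf.  For
example (values purely illustrative), setting vfs.unmapped_buf_allowed=0
there disables unmapped buffers entirely, while kern.bio_transient_maxcnt=512
would cap the transient submap at 512 concurrent mappings instead of the
auto-computed value.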
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index d20c829..cded596 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,8 +1,12 @@
/*-
* Copyright (c) 2004 Poul-Henning Kamp
* Copyright (c) 1994,1997 John S. Dyson
+ * Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -92,6 +96,7 @@ struct buf_ops buf_ops_bio = {
* carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c.
*/
struct buf *buf; /* buffer header pool */
+caddr_t unmapped_buf;
static struct proc *bufdaemonproc;
@@ -132,6 +137,10 @@ SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
"Virtual memory used for buffers");
#endif
+static long unmapped_bufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
+ &unmapped_bufspace, 0,
+ "Amount of unmapped buffers, inclusive in the bufspace");
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
"Maximum allowed value of bufspace (including buf_daemon)");
@@ -201,6 +210,10 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
"Number of times getnewbuf has had to restart a buffer aquisition");
+static int mappingrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+ "Number of times getblk has had to restart a buffer mapping for "
+ "unmapped buffer");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
@@ -210,6 +223,9 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
"Number of barrier writes");
+SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
+ &unmapped_buf_allowed, 0,
+ "Permit the use of the unmapped i/o");
/*
* Wakeup point for bufdaemon, as well as indicator of whether it is already
@@ -281,6 +297,9 @@ static struct mtx nblock;
/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+#ifdef INVARIANTS
+static int bq_len[BUFFER_QUEUES];
+#endif
/* Lock for the bufqueues */
static struct mtx bqlock;
@@ -511,7 +530,7 @@ caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
int tuned_nbuf;
- long maxbuf;
+ long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;
/*
* physmem_est is in pages. Convert it to kilobytes (assumes
@@ -555,6 +574,52 @@ kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
}
/*
+ * Ideal allocation size for the transient bio submap is 10%
+ * of the maximal space buffer map. This roughly corresponds
+ * to the amount of the buffer mapped for typical UFS load.
+ *
+ * Clip the buffer map to reserve space for the transient
+ * BIOs, if its extent is bigger than 90% of the maximum
+ * buffer map extent on the platform.
+ *
+ * The fall-back to the maxbuf in case of maxbcache unset,
+ * allows to not trim the buffer KVA for the architectures
+ * with ample KVA space.
+ */
+ if (bio_transient_maxcnt == 0) {
+ maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
+ buf_sz = (long)nbuf * BKVASIZE;
+ if (buf_sz < maxbuf_sz / 10 * 9) {
+ /*
+ * There is more KVA than memory. Do not
+ * adjust buffer map size, and assign the rest
+ * of maxbuf to transient map.
+ */
+ biotmap_sz = maxbuf_sz - buf_sz;
+ } else {
+ /*
+ * Buffer map spans all KVA we could afford on
+ * this platform. Give 10% of the buffer map
+ * to the transient bio map.
+ */
+ biotmap_sz = buf_sz / 10;
+ buf_sz -= biotmap_sz;
+ }
+ if (biotmap_sz / INT_MAX > MAXPHYS)
+ bio_transient_maxcnt = INT_MAX;
+ else
+ bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+ /*
+ * Artificially limit to 1024 simultaneous in-flight I/Os
+ * using the transient mapping.
+ */
+ if (bio_transient_maxcnt > 1024)
+ bio_transient_maxcnt = 1024;
+ if (tuned_nbuf)
+ nbuf = buf_sz / BKVASIZE;
+ }
+
+ /*
* swbufs are used as temporary holders for I/O, such as paging I/O.
* We have no less then 16 and no more then 256.
*/
@@ -607,6 +672,9 @@ bufinit(void)
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[QUEUE_EMPTY]++;
+#endif
}
/*
@@ -675,6 +743,55 @@ bufinit(void)
bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+ unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS);
+}
+
+#ifdef INVARIANTS
+static inline void
+vfs_buf_check_mapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == 0,
+ ("mapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase != unmapped_buf,
+ ("mapped buf: b_kvabase was not updated %p", bp));
+ KASSERT(bp->b_data != unmapped_buf,
+ ("mapped buf: b_data was not updated %p", bp));
+}
+
+static inline void
+vfs_buf_check_unmapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
+ ("unmapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase == unmapped_buf,
+ ("unmapped buf: corrupted b_kvabase %p", bp));
+ KASSERT(bp->b_data == unmapped_buf,
+ ("unmapped buf: corrupted b_data %p", bp));
+}
+
+#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
+#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
+#else
+#define BUF_CHECK_MAPPED(bp) do {} while (0)
+#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
+#endif
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+ BUF_CHECK_MAPPED(bp);
+
+ /*
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
}
/*
@@ -686,14 +803,26 @@ static void
bfreekva(struct buf *bp)
{
- if (bp->b_kvasize) {
- atomic_add_int(&buffreekvacnt, 1);
- atomic_subtract_long(&bufspace, bp->b_kvasize);
- vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
- (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
- bp->b_kvasize = 0;
- bufspacewakeup();
+ if (bp->b_kvasize == 0)
+ return;
+
+ atomic_add_int(&buffreekvacnt, 1);
+ atomic_subtract_long(&bufspace, bp->b_kvasize);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvabase,
+ (vm_offset_t)bp->b_kvabase + bp->b_kvasize);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ if ((bp->b_flags & B_KVAALLOC) != 0) {
+ vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvaalloc,
+ (vm_offset_t)bp->b_kvaalloc + bp->b_kvasize);
+ }
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
}
+ bp->b_kvasize = 0;
+ bufspacewakeup();
}
/*
@@ -760,6 +889,11 @@ bremfreel(struct buf *bp)
mtx_assert(&bqlock, MA_OWNED);
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
+ bp->b_qindex));
+ bq_len[bp->b_qindex]--;
+#endif
bp->b_qindex = QUEUE_NONE;
/*
* If this was a delayed bremfree() we only need to remove the buffer
@@ -1414,7 +1548,8 @@ brelse(struct buf *bp)
}
}
- if ((bp->b_flags & B_INVAL) == 0) {
+ if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(
trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
@@ -1509,11 +1644,17 @@ brelse(struct buf *bp)
bp->b_qindex = QUEUE_DIRTY;
else
bp->b_qindex = QUEUE_CLEAN;
- if (bp->b_flags & B_AGE)
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
- else
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+ if (bp->b_flags & B_AGE) {
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp,
+ b_freelist);
+ } else {
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp,
+ b_freelist);
+ }
}
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
mtx_unlock(&bqlock);
/*
@@ -1604,6 +1745,9 @@ bqrelse(struct buf *bp)
if (bp->b_flags & B_DELWRI) {
bp->b_qindex = QUEUE_DIRTY;
TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
} else {
/*
* The locking of the BO_LOCK for checking of the
@@ -1616,6 +1760,9 @@ bqrelse(struct buf *bp)
bp->b_qindex = QUEUE_CLEAN;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
b_freelist);
+#ifdef INVARIANTS
+ bq_len[QUEUE_CLEAN]++;
+#endif
} else {
/*
* We are too low on memory, we have to try to free
@@ -1657,7 +1804,11 @@ vfs_vmio_release(struct buf *bp)
int i;
vm_page_t m;
- pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
@@ -1761,8 +1912,10 @@ vfs_bio_awrite(struct buf *bp)
int nwritten;
int size;
int maxcl;
+ int gbflags;
bo = &vp->v_bufobj;
+ gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
/*
* right now we support clustered writing only to regular files. If
* we find a clusterable block we could be in the middle of a cluster
@@ -1794,7 +1947,7 @@ vfs_bio_awrite(struct buf *bp)
if (ncl != 1) {
BUF_UNLOCK(bp);
nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
- 0);
+ gbflags);
return (nwritten);
}
}
@@ -1811,46 +1964,207 @@ vfs_bio_awrite(struct buf *bp)
return (nwritten);
}
+static void
+setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
+{
+
+ KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_kvabase = (caddr_t)addr;
+ } else if ((gbflags & GB_KVAALLOC) != 0) {
+ KASSERT((gbflags & GB_UNMAPPED) != 0,
+ ("GB_KVAALLOC without GB_UNMAPPED"));
+ bp->b_kvaalloc = (caddr_t)addr;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ }
+ bp->b_kvasize = maxsize;
+}
+
/*
- * getnewbuf:
- *
- * Find and initialize a new buffer header, freeing up existing buffers
- * in the bufqueues as necessary. The new buffer is returned locked.
- *
- * Important: B_INVAL is not set. If the caller wishes to throw the
- * buffer away, the caller must set B_INVAL prior to calling brelse().
- *
- * We block if:
- * We have insufficient buffer headers
- * We have insufficient buffer space
- * buffer_map is too fragmented ( space reservation fails )
- * If we have to flush dirty buffers ( but we try to avoid this )
- *
- * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- * Instead we ask the buf daemon to do it for us. We attempt to
- * avoid piecemeal wakeups of the pageout daemon.
+ * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
+ * needed.
*/
+static int
+allocbufkva(struct buf *bp, int maxsize, int gbflags)
+{
+ vm_offset_t addr;
+ int rv;
-static struct buf *
-getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
- int gbflags)
+ bfreekva(bp);
+ addr = 0;
+
+ vm_map_lock(buffer_map);
+ if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize,
+ &addr)) {
+ vm_map_unlock(buffer_map);
+ /*
+ * Buffer map is too fragmented. Request the caller
+ * to defragment the map.
+ */
+ atomic_add_int(&bufdefragcnt, 1);
+ return (1);
+ }
+ rv = vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize,
+ VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+ KASSERT(rv == KERN_SUCCESS, ("vm_map_insert(buffer_map) rv %d", rv));
+ vm_map_unlock(buffer_map);
+ setbufkva(bp, addr, maxsize, gbflags);
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ return (0);
+}
+
+/*
+ * Ask the bufdaemon for help, or act as bufdaemon itself, when a
+ * locked vnode is supplied.
+ */
+static void
+getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
+ int defrag)
{
struct thread *td;
- struct buf *bp;
- struct buf *nbp;
- int defrag = 0;
- int nqindex;
- static int flushingbufs;
+ char *waitmsg;
+ int fl, flags, norunbuf;
+
+ mtx_assert(&bqlock, MA_OWNED);
+
+ if (defrag) {
+ flags = VFS_BIO_NEED_BUFSPACE;
+ waitmsg = "nbufkv";
+ } else if (bufspace >= hibufspace) {
+ waitmsg = "nbufbs";
+ flags = VFS_BIO_NEED_BUFSPACE;
+ } else {
+ waitmsg = "newbuf";
+ flags = VFS_BIO_NEED_ANY;
+ }
+ mtx_lock(&nblock);
+ needsbuffer |= flags;
+ mtx_unlock(&nblock);
+ mtx_unlock(&bqlock);
+
+ bd_speedup(); /* heeeelp */
+ if ((gbflags & GB_NOWAIT_BD) != 0)
+ return;
td = curthread;
+ mtx_lock(&nblock);
+ while (needsbuffer & flags) {
+ if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+ mtx_unlock(&nblock);
+ /*
+ * getblk() is called with a vnode locked, and
+ * some majority of the dirty buffers may as
+ * well belong to the vnode. Flushing the
+ * buffers there would make a progress that
+ * cannot be achieved by the buf_daemon, that
+ * cannot lock the vnode.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+ /* play bufdaemon */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_do_flush(vp);
+ td->td_pflags &= norunbuf;
+ mtx_lock(&nblock);
+ if (fl != 0)
+ continue;
+ if ((needsbuffer & flags) == 0)
+ break;
+ }
+ if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
+ waitmsg, slptimeo))
+ break;
+ }
+ mtx_unlock(&nblock);
+}
+
+static void
+getnewbuf_reuse_bp(struct buf *bp, int qindex)
+{
+
+ CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
+ "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
+ bp->b_kvasize, bp->b_bufsize, qindex);
+ mtx_assert(&bqlock, MA_NOTOWNED);
+
/*
- * We can't afford to block since we might be holding a vnode lock,
- * which may prevent system daemons from running. We deal with
- * low-memory situations by proactively returning memory and running
- * async I/O rather then sync I/O.
+ * Note: we no longer distinguish between VMIO and non-VMIO
+ * buffers.
*/
- atomic_add_int(&getnewbufcalls, 1);
- atomic_subtract_int(&getnewbufrestarts, 1);
+ KASSERT((bp->b_flags & B_DELWRI) == 0,
+ ("delwri buffer %p found in queue %d", bp, qindex));
+
+ if (qindex == QUEUE_CLEAN) {
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
+ }
+ if (bp->b_vp != NULL)
+ brelvp(bp);
+ }
+
+ /*
+ * Get the rest of the buffer freed up. b_kva* is still valid
+ * after this operation.
+ */
+
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 3");
+ KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d",
+ bp, bp->b_vp, qindex));
+ KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+ ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+
+ bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
+ ("buf %p still counted as free?", bp));
+ bp->b_vflags = 0;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
+
+ LIST_INIT(&bp->b_dep);
+}
+
+static int flushingbufs;
+
+static struct buf *
+getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
+{
+ struct buf *bp, *nbp;
+ int nqindex, qindex, pass;
+
+ KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
+
+ pass = 1;
restart:
atomic_add_int(&getnewbufrestarts, 1);
@@ -1860,66 +2174,90 @@ restart:
* that if we are specially marked process, we are allowed to
* dip into our reserves.
*
- * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ * for the allocation of the mapped buffer. For unmapped, the
+ * easiest is to start with EMPTY outright.
*
* We start with EMPTYKVA. If the list is empty we backup to EMPTY.
* However, there are a number of cases (defragging, reusing, ...)
* where we cannot backup.
*/
+ nbp = NULL;
mtx_lock(&bqlock);
- nqindex = QUEUE_EMPTYKVA;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-
+ if (!defrag && unmapped) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
if (nbp == NULL) {
- /*
- * If no EMPTYKVA buffers and we are either
- * defragging or reusing, locate a CLEAN buffer
- * to free or reuse. If bufspace useage is low
- * skip this step so we can allocate a new buffer.
- */
- if (defrag || bufspace >= lobufspace) {
- nqindex = QUEUE_CLEAN;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
- }
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ }
- /*
- * If we could not find or were not allowed to reuse a
- * CLEAN buffer, check to see if it is ok to use an EMPTY
- * buffer. We can only use an EMPTY buffer if allocating
- * its KVA would not otherwise run us out of buffer space.
- */
- if (nbp == NULL && defrag == 0 &&
- bufspace + maxsize < hibufspace) {
- nqindex = QUEUE_EMPTY;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
- }
+ /*
+ * If no EMPTYKVA buffers and we are either defragging or
+ * reusing, locate a CLEAN buffer to free or reuse. If
+ * bufspace usage is low, skip this step so we can allocate a
+ * new buffer.
+ */
+ if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
+ /*
+ * If we could not find or were not allowed to reuse a CLEAN
+ * buffer, check to see if it is ok to use an EMPTY buffer.
+ * We can only use an EMPTY buffer if allocating its KVA would
+ * not otherwise run us out of buffer space. No KVA is needed
+ * for the unmapped allocation.
+ */
+ if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
+ metadata)) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
+
+ /*
+ * All available buffers might be clean, retry ignoring the
+ * lobufspace as the last resort.
+ */
+ if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
}
/*
* Run scan, possibly freeing data and/or kva mappings on the fly
* depending.
*/
-
while ((bp = nbp) != NULL) {
- int qindex = nqindex;
+ qindex = nqindex;
/*
- * Calculate next bp ( we can only use it if we do not block
- * or do other fancy things ).
+ * Calculate next bp (we can only use it if we do not
+ * block or do other fancy things).
*/
if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
- switch(qindex) {
+ switch (qindex) {
case QUEUE_EMPTY:
nqindex = QUEUE_EMPTYKVA;
- if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ if (nbp != NULL)
break;
/* FALLTHROUGH */
case QUEUE_EMPTYKVA:
nqindex = QUEUE_CLEAN;
- if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ if (nbp != NULL)
break;
/* FALLTHROUGH */
case QUEUE_CLEAN:
+ if (metadata && pass == 1) {
+ pass = 2;
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(
+ &bufqueues[QUEUE_EMPTY]);
+ }
/*
* nbp is NULL.
*/
@@ -1952,22 +2290,9 @@ restart:
}
BO_UNLOCK(bp->b_bufobj);
}
- CTR6(KTR_BUF,
- "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
- "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
- bp->b_kvasize, bp->b_bufsize, qindex);
-
- /*
- * Sanity Checks
- */
- KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
-
- /*
- * Note: we no longer distinguish between VMIO and non-VMIO
- * buffers.
- */
- KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
+ KASSERT(bp->b_qindex == qindex,
+ ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
if (bp->b_bufobj != NULL)
BO_LOCK(bp->b_bufobj);
@@ -1975,68 +2300,13 @@ restart:
if (bp->b_bufobj != NULL)
BO_UNLOCK(bp->b_bufobj);
mtx_unlock(&bqlock);
-
- if (qindex == QUEUE_CLEAN) {
- if (bp->b_flags & B_VMIO) {
- bp->b_flags &= ~B_ASYNC;
- vfs_vmio_release(bp);
- }
- if (bp->b_vp)
- brelvp(bp);
- }
-
/*
* NOTE: nbp is now entirely invalid. We can only restart
* the scan from this point on.
- *
- * Get the rest of the buffer freed up. b_kva* is still
- * valid after this operation.
*/
- if (bp->b_rcred != NOCRED) {
- crfree(bp->b_rcred);
- bp->b_rcred = NOCRED;
- }
- if (bp->b_wcred != NOCRED) {
- crfree(bp->b_wcred);
- bp->b_wcred = NOCRED;
- }
- if (!LIST_EMPTY(&bp->b_dep))
- buf_deallocate(bp);
- if (bp->b_vflags & BV_BKGRDINPROG)
- panic("losing buffer 3");
- KASSERT(bp->b_vp == NULL,
- ("bp: %p still has vnode %p. qindex: %d",
- bp, bp->b_vp, qindex));
- KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
- ("bp: %p still on a buffer list. xflags %X",
- bp, bp->b_xflags));
-
- if (bp->b_bufsize)
- allocbuf(bp, 0);
-
- bp->b_flags = 0;
- bp->b_ioflags = 0;
- bp->b_xflags = 0;
- KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
- ("buf %p still counted as free?", bp));
- bp->b_vflags = 0;
- bp->b_vp = NULL;
- bp->b_blkno = bp->b_lblkno = 0;
- bp->b_offset = NOOFFSET;
- bp->b_iodone = 0;
- bp->b_error = 0;
- bp->b_resid = 0;
- bp->b_bcount = 0;
- bp->b_npages = 0;
- bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_bufobj = NULL;
- bp->b_pin_count = 0;
- bp->b_fsprivate1 = NULL;
- bp->b_fsprivate2 = NULL;
- bp->b_fsprivate3 = NULL;
-
- LIST_INIT(&bp->b_dep);
+ getnewbuf_reuse_bp(bp, qindex);
+ mtx_assert(&bqlock, MA_NOTOWNED);
/*
* If we are defragging then free the buffer.
@@ -2060,6 +2330,9 @@ restart:
goto restart;
}
+ if (metadata)
+ break;
+
/*
* If we are overcomitted then recover the buffer and its
* KVM space. This occurs in rare situations when multiple
@@ -2077,6 +2350,59 @@ restart:
flushingbufs = 0;
break;
}
+ return (bp);
+}
+
+/*
+ * getnewbuf:
+ *
+ * Find and initialize a new buffer header, freeing up existing buffers
+ * in the bufqueues as necessary. The new buffer is returned locked.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
+ *
+ * We block if:
+ * We have insufficient buffer headers
+ * We have insufficient buffer space
+ * buffer_map is too fragmented ( space reservation fails )
+ * If we have to flush dirty buffers ( but we try to avoid this )
+ *
+ * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
+ * Instead we ask the buf daemon to do it for us. We attempt to
+ * avoid piecemeal wakeups of the pageout daemon.
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+ int gbflags)
+{
+ struct buf *bp;
+ int defrag, metadata;
+
+ KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ if (!unmapped_buf_allowed)
+ gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ defrag = 0;
+ if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
+ vp->v_type == VCHR)
+ metadata = 1;
+ else
+ metadata = 0;
+ /*
+ * We can't afford to block since we might be holding a vnode lock,
+ * which may prevent system daemons from running. We deal with
+ * low-memory situations by proactively returning memory and running
+ * async I/O rather than sync I/O.
+ */
+ atomic_add_int(&getnewbufcalls, 1);
+ atomic_subtract_int(&getnewbufrestarts, 1);
+restart:
+ bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
+ GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+ if (bp != NULL)
+ defrag = 0;
/*
* If we exhausted our list, sleep as appropriate. We may have to
@@ -2084,65 +2410,23 @@ restart:
*
* Generally we are sleeping due to insufficient buffer space.
*/
-
if (bp == NULL) {
- int flags, norunbuf;
- char *waitmsg;
- int fl;
-
- if (defrag) {
- flags = VFS_BIO_NEED_BUFSPACE;
- waitmsg = "nbufkv";
- } else if (bufspace >= hibufspace) {
- waitmsg = "nbufbs";
- flags = VFS_BIO_NEED_BUFSPACE;
- } else {
- waitmsg = "newbuf";
- flags = VFS_BIO_NEED_ANY;
- }
- mtx_lock(&nblock);
- needsbuffer |= flags;
- mtx_unlock(&nblock);
- mtx_unlock(&bqlock);
-
- bd_speedup(); /* heeeelp */
- if (gbflags & GB_NOWAIT_BD)
- return (NULL);
-
- mtx_lock(&nblock);
- while (needsbuffer & flags) {
- if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
- mtx_unlock(&nblock);
- /*
- * getblk() is called with a vnode
- * locked, and some majority of the
- * dirty buffers may as well belong to
- * the vnode. Flushing the buffers
- * there would make a progress that
- * cannot be achieved by the
- * buf_daemon, that cannot lock the
- * vnode.
- */
- norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
- (td->td_pflags & TDP_NORUNNINGBUF);
- /* play bufdaemon */
- td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
- fl = buf_do_flush(vp);
- td->td_pflags &= norunbuf;
- mtx_lock(&nblock);
- if (fl != 0)
- continue;
- if ((needsbuffer & flags) == 0)
- break;
- }
- if (msleep(&needsbuffer, &nblock,
- (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
- mtx_unlock(&nblock);
- return (NULL);
- }
- }
- mtx_unlock(&nblock);
+ mtx_assert(&bqlock, MA_OWNED);
+ getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+ mtx_assert(&bqlock, MA_NOTOWNED);
+ } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
+ mtx_assert(&bqlock, MA_NOTOWNED);
+
+ bfreekva(bp);
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_kvabase = bp->b_data = unmapped_buf;
+ bp->b_kvasize = maxsize;
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
} else {
+ mtx_assert(&bqlock, MA_NOTOWNED);
+
/*
* We finally have a valid bp. We aren't quite out of the
* woods, we still have to reserve kva space. In order
@@ -2151,39 +2435,47 @@ restart:
*/
maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
- if (maxsize != bp->b_kvasize) {
- vm_offset_t addr = 0;
- int rv;
-
- bfreekva(bp);
-
- vm_map_lock(buffer_map);
- if (vm_map_findspace(buffer_map,
- vm_map_min(buffer_map), maxsize, &addr)) {
- /*
- * Buffer map is too fragmented.
- * We must defragment the map.
- */
- atomic_add_int(&bufdefragcnt, 1);
- vm_map_unlock(buffer_map);
+ if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
+ B_KVAALLOC)) == B_UNMAPPED) {
+ if (allocbufkva(bp, maxsize, gbflags)) {
defrag = 1;
bp->b_flags |= B_INVAL;
brelse(bp);
goto restart;
}
- rv = vm_map_insert(buffer_map, NULL, 0, addr,
- addr + maxsize, VM_PROT_ALL, VM_PROT_ALL,
- MAP_NOFAULT);
- KASSERT(rv == KERN_SUCCESS,
- ("vm_map_insert(buffer_map) rv %d", rv));
- vm_map_unlock(buffer_map);
- bp->b_kvabase = (caddr_t)addr;
- bp->b_kvasize = maxsize;
- atomic_add_long(&bufspace, bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & B_KVAALLOC) != 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
+ /*
+ * If the reused buffer has KVA allocated,
+ * reassign b_kvaalloc to b_kvabase.
+ */
+ bp->b_kvabase = bp->b_kvaalloc;
+ bp->b_flags &= ~B_KVAALLOC;
+ atomic_subtract_long(&unmapped_bufspace,
+ bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
+ GB_KVAALLOC)) {
+ /*
+ * Handle the case where the reused buffer already
+ * has KVA mapped, but the request is for an unmapped
+ * buffer with KVA allocated.
+ */
+ bp->b_kvaalloc = bp->b_kvabase;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace,
+ bp->b_kvasize);
atomic_add_int(&bufreusecnt, 1);
}
- bp->b_saveaddr = bp->b_kvabase;
- bp->b_data = bp->b_saveaddr;
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr;
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ }
}
return (bp);
}
@@ -2594,6 +2886,90 @@ vfs_setdirty_locked_object(struct buf *bp)
}
/*
+ * Allocate the KVA mapping for an existing buffer. It handles the
+ * cases of both B_UNMAPPED buffer, and buffer with the preallocated
+ * KVA which is not mapped (B_KVAALLOC).
+ */
+static void
+bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
+{
+ struct buf *scratch_bp;
+ int bsize, maxsize, need_mapping, need_kva;
+ off_t offset;
+
+ need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
+ (gbflags & GB_UNMAPPED) == 0;
+ need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
+ (gbflags & GB_KVAALLOC) != 0;
+ if (!need_mapping && !need_kva)
+ return;
+
+ BUF_CHECK_UNMAPPED(bp);
+
+ if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
+ /*
+ * Buffer is not mapped, but the KVA was already
+ * reserved at the time of the instantiation. Use the
+ * allocated space.
+ */
+ bp->b_flags &= ~B_KVAALLOC;
+ KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
+ bp->b_kvabase = bp->b_kvaalloc;
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ goto has_addr;
+ }
+
+ /*
+ * Calculate the amount of the address space we would reserve
+ * if the buffer was mapped.
+ */
+ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
+ offset = blkno * bsize;
+ maxsize = size + (offset & PAGE_MASK);
+ maxsize = imax(maxsize, bsize);
+
+mapping_loop:
+ if (allocbufkva(bp, maxsize, gbflags)) {
+ /*
+ * Request defragmentation. getnewbuf() returns us the
+ * allocated space by the scratch buffer KVA.
+ */
+ scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
+ (GB_UNMAPPED | GB_KVAALLOC));
+ if (scratch_bp == NULL) {
+ if ((gbflags & GB_NOWAIT_BD) != 0) {
+ /*
+ * XXXKIB: defragmentation cannot
+ * succeed, not sure what else to do.
+ */
+ panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
+ }
+ atomic_add_int(&mappingrestarts, 1);
+ goto mapping_loop;
+ }
+ KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
+ ("scratch bp !B_KVAALLOC %p", scratch_bp));
+ setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
+ scratch_bp->b_kvasize, gbflags);
+
+ /* Get rid of the scratch buffer. */
+ scratch_bp->b_kvasize = 0;
+ scratch_bp->b_flags |= B_INVAL;
+ scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+ brelse(scratch_bp);
+ }
+ if (!need_mapping)
+ return;
+
+has_addr:
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ bpmap_qenter(bp);
+}
+
+/*
* getblk:
*
* Get a block given a specified block and offset into a file/device.
@@ -2635,12 +3011,17 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
{
struct buf *bp;
struct bufobj *bo;
- int error;
+ int bsize, error, maxsize, vmio;
+ off_t offset;
CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
+ KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
ASSERT_VOP_LOCKED(vp, "getblk");
if (size > MAXBSIZE)
panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+ if (!unmapped_buf_allowed)
+ flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
bo = &vp->v_bufobj;
loop:
@@ -2743,12 +3124,18 @@ loop:
}
/*
+ * Handle the case of unmapped buffer which should
+ * become mapped, or the buffer for which KVA
+ * reservation is requested.
+ */
+ bp_unmapped_get_kva(bp, blkno, size, flags);
+
+ /*
* If the size is inconsistant in the VMIO case, we can resize
* the buffer. This might lead to B_CACHE getting set or
* cleared. If the size has not changed, B_CACHE remains
* unchanged from its previous state.
*/
-
if (bp->b_bcount != size)
allocbuf(bp, size);
@@ -2789,9 +3176,6 @@ loop:
}
bp->b_flags &= ~B_DONE;
} else {
- int bsize, maxsize, vmio;
- off_t offset;
-
/*
* Buffer is not in-core, create new buffer. The buffer
* returned by getnewbuf() is locked. Note that the returned
@@ -2807,7 +3191,13 @@ loop:
bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
offset = blkno * bsize;
vmio = vp->v_object != NULL;
- maxsize = vmio ? size + (offset & PAGE_MASK) : size;
+ if (vmio) {
+ maxsize = size + (offset & PAGE_MASK);
+ } else {
+ maxsize = size;
+ /* Do not allow non-VMIO unmapped buffers. */
+ flags &= ~GB_UNMAPPED;
+ }
maxsize = imax(maxsize, bsize);
bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
@@ -2863,6 +3253,7 @@ loop:
KASSERT(bp->b_bufobj->bo_object == NULL,
("ARGH! has b_bufobj->bo_object %p %p\n",
bp, bp->b_bufobj->bo_object));
+ BUF_CHECK_MAPPED(bp);
}
allocbuf(bp, size);
@@ -3038,10 +3429,14 @@ allocbuf(struct buf *bp, int size)
if (desiredpages < bp->b_npages) {
vm_page_t m;
- pmap_qremove((vm_offset_t)trunc_page(
- (vm_offset_t)bp->b_data) +
- (desiredpages << PAGE_SHIFT),
- (bp->b_npages - desiredpages));
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove((vm_offset_t)trunc_page(
+ (vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT),
+ (bp->b_npages - desiredpages));
+ } else
+ BUF_CHECK_UNMAPPED(bp);
VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
for (i = desiredpages; i < bp->b_npages; i++) {
/*
@@ -3147,21 +3542,12 @@ allocbuf(struct buf *bp, int size)
VM_OBJECT_WUNLOCK(obj);
/*
- * Step 3, fixup the KVM pmap. Remember that
- * bp->b_data is relative to bp->b_offset, but
- * bp->b_offset may be offset into the first page.
+ * Step 3, fixup the KVM pmap.
*/
-
- bp->b_data = (caddr_t)
- trunc_page((vm_offset_t)bp->b_data);
- pmap_qenter(
- (vm_offset_t)bp->b_data,
- bp->b_pages,
- bp->b_npages
- );
-
- bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
- (vm_offset_t)(bp->b_offset & PAGE_MASK));
+ if ((bp->b_flags & B_UNMAPPED) == 0)
+ bpmap_qenter(bp);
+ else
+ BUF_CHECK_UNMAPPED(bp);
}
}
if (newbsize < bp->b_bufsize)
@@ -3171,21 +3557,38 @@ allocbuf(struct buf *bp, int size)
return 1;
}
+extern int inflight_transient_maps;
+
void
biodone(struct bio *bp)
{
struct mtx *mtxp;
void (*done)(struct bio *);
+ vm_offset_t start, end;
+ int transient;
mtxp = mtx_pool_find(mtxpool_sleep, bp);
mtx_lock(mtxp);
bp->bio_flags |= BIO_DONE;
+ if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
+ start = trunc_page((vm_offset_t)bp->bio_data);
+ end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
+ transient = 1;
+ } else {
+ transient = 0;
+ start = end = 0;
+ }
done = bp->bio_done;
if (done == NULL)
wakeup(bp);
mtx_unlock(mtxp);
if (done != NULL)
done(bp);
+ if (transient) {
+ pmap_qremove(start, OFF_TO_IDX(end - start));
+ vm_map_remove(bio_transient_map, start, end);
+ atomic_add_int(&inflight_transient_maps, -1);
+ }
}
/*
@@ -3288,7 +3691,7 @@ dev_strategy(struct cdev *dev, struct buf *bp)
bip->bio_offset = bp->b_iooffset;
bip->bio_length = bp->b_bcount;
bip->bio_bcount = bp->b_bcount; /* XXX: remove */
- bip->bio_data = bp->b_data;
+ bdata2bio(bp, bip);
bip->bio_done = bufdonebio;
bip->bio_caller2 = bp;
bip->bio_dev = dev;
@@ -3442,9 +3845,11 @@ bufdone_finish(struct buf *bp)
}
vm_object_pip_wakeupn(obj, 0);
VM_OBJECT_WUNLOCK(obj);
- if (bogus)
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
+ }
}
/*
@@ -3487,8 +3892,12 @@ vfs_unbusy_pages(struct buf *bp)
if (!m)
panic("vfs_unbusy_pages: page missing\n");
bp->b_pages[i] = m;
- pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
- bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
}
vm_object_pip_subtract(obj, 1);
vm_page_io_finish(m);
@@ -3653,9 +4062,11 @@ vfs_busy_pages(struct buf *bp, int clear_modify)
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
VM_OBJECT_WUNLOCK(obj);
- if (bogus)
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
+ }
}
/*
@@ -3777,6 +4188,8 @@ vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
vm_page_t p;
int index;
+ BUF_CHECK_MAPPED(bp);
+
to = round_page(to);
from = round_page(from);
index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
@@ -3808,6 +4221,8 @@ vm_hold_free_pages(struct buf *bp, int newbsize)
vm_page_t p;
int index, newnpages;
+ BUF_CHECK_MAPPED(bp);
+
from = round_page((vm_offset_t)bp->b_data + newbsize);
newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
if (bp->b_npages > newnpages)
@@ -4009,6 +4424,30 @@ bunpin_wait(struct buf *bp)
mtx_unlock(mtxp);
}
+/*
+ * Set bio_data or bio_ma for struct bio from the struct buf.
+ */
+void
+bdata2bio(struct buf *bp, struct bio *bip)
+{
+
+ if ((bp->b_flags & B_UNMAPPED) != 0) {
+ KASSERT(unmapped_buf_allowed, ("unmapped"));
+ bip->bio_ma = bp->b_pages;
+ bip->bio_ma_n = bp->b_npages;
+ bip->bio_data = unmapped_buf;
+ bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+ bip->bio_flags |= BIO_UNMAPPED;
+ KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
+ PAGE_SIZE == bp->b_npages,
+ ("Buffer %p too short: %d %d %d", bp, bip->bio_ma_offset,
+ bip->bio_length, bip->bio_ma_n));
+ } else {
+ bip->bio_data = bp->b_data;
+ bip->bio_ma = NULL;
+ }
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 28aa4ff..91a0443 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -61,11 +61,11 @@ SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
-static struct cluster_save *
- cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
-static struct buf *
- cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
- daddr_t blkno, long size, int run, struct buf *fbp);
+static struct cluster_save *cluster_collectbufs(struct vnode *vp,
+ struct buf *last_bp, int gbflags);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+ daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+ struct buf *fbp);
static void cluster_callback(struct buf *);
static int write_behind = 1;
@@ -97,6 +97,8 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
error = 0;
bo = &vp->v_bufobj;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
/*
* Try to limit the amount of read-ahead by a few
@@ -112,7 +114,7 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
/*
* get the requested block
*/
- *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0);
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
origblkno = lblkno;
/*
@@ -203,7 +205,7 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
if (ncontig < nblks)
nblks = ncontig;
bp = cluster_rbuild(vp, filesize, lblkno,
- blkno, size, nblks, bp);
+ blkno, size, nblks, gbflags, bp);
lblkno += (bp->b_bufsize / size);
} else {
bp->b_flags |= B_RAM;
@@ -247,14 +249,14 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
if (ncontig) {
ncontig = min(ncontig + 1, racluster);
rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
- size, ncontig, NULL);
+ size, ncontig, gbflags, NULL);
lblkno += (rbp->b_bufsize / size);
if (rbp->b_flags & B_DELWRI) {
bqrelse(rbp);
continue;
}
} else {
- rbp = getblk(vp, lblkno, size, 0, 0, 0);
+ rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
lblkno += 1;
if (rbp->b_flags & B_DELWRI) {
bqrelse(rbp);
@@ -293,14 +295,8 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
* and then parcel them up into logical blocks in the buffer hash table.
*/
static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
- struct vnode *vp;
- u_quad_t filesize;
- daddr_t lbn;
- daddr_t blkno;
- long size;
- int run;
- struct buf *fbp;
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
{
struct bufobj *bo;
struct buf *bp, *tbp;
@@ -324,7 +320,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
tbp = fbp;
tbp->b_iocmd = BIO_READ;
} else {
- tbp = getblk(vp, lbn, size, 0, 0, 0);
+ tbp = getblk(vp, lbn, size, 0, 0, gbflags);
if (tbp->b_flags & B_CACHE)
return tbp;
tbp->b_flags |= B_ASYNC | B_RAM;
@@ -345,9 +341,14 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
* address may not be either. Inherit the b_data offset
* from the original buffer.
*/
- bp->b_data = (char *)((vm_offset_t)bp->b_data |
- ((vm_offset_t)tbp->b_data & PAGE_MASK));
bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+ if ((gbflags & GB_UNMAPPED) != 0) {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ } else {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ }
bp->b_iocmd = BIO_READ;
bp->b_iodone = cluster_callback;
bp->b_blkno = blkno;
@@ -371,7 +372,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
break;
}
- tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT);
+ tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
+ (gbflags & GB_UNMAPPED));
/* Don't wait around for locked bufs. */
if (tbp == NULL)
@@ -493,8 +495,10 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
bp->b_bufsize, bp->b_kvasize);
bp->b_kvasize = bp->b_bufsize;
- pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
- (vm_page_t *)bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
return (bp);
}
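
The B_UNMAPPED test above, and its mirror image in the cluster_callback() hunk that follows, are where the savings come from: the pbuf's pages are entered into and removed from KVA only for mapped clusters, so unmapped I/O does no pmap_qenter()/pmap_qremove() and therefore no TLB shootdown. A hedged sketch of the pair (helper names invented):

    static void
    cluster_kva_enter_sketch(struct buf *bp)
    {

            if ((bp->b_flags & B_UNMAPPED) == 0)
                    pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
                        bp->b_pages, bp->b_npages);
    }

    static void
    cluster_kva_remove_sketch(struct buf *bp)
    {

            if ((bp->b_flags & B_UNMAPPED) == 0)
                    pmap_qremove(trunc_page((vm_offset_t)bp->b_data),
                        bp->b_npages);
    }
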
@@ -517,7 +521,10 @@ cluster_callback(bp)
if (bp->b_ioflags & BIO_ERROR)
error = bp->b_error;
- pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+ bp->b_npages);
+ }
/*
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
@@ -559,7 +566,8 @@ cluster_callback(bp)
*/
static __inline int
-cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
{
int r = 0;
@@ -570,7 +578,7 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
start_lbn -= len;
/* FALLTHROUGH */
case 1:
- r = cluster_wbuild(vp, size, start_lbn, len, 0);
+ r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
/* FALLTHROUGH */
default:
/* FALLTHROUGH */
@@ -598,6 +606,9 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
int lblocksize;
int async;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
if (vp->v_type == VREG) {
async = DOINGASYNC(vp);
lblocksize = vp->v_mount->mnt_stat.f_iosize;
@@ -637,13 +648,13 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
if (!async && seqcount > 0) {
cluster_wbuild_wb(vp, lblocksize,
- vp->v_cstart, cursize);
+ vp->v_cstart, cursize, gbflags);
}
} else {
struct buf **bpp, **endbp;
struct cluster_save *buflist;
- buflist = cluster_collectbufs(vp, bp);
+ buflist = cluster_collectbufs(vp, bp, gbflags);
endbp = &buflist->bs_children
[buflist->bs_nchildren - 1];
if (VOP_REALLOCBLKS(vp, buflist)) {
@@ -662,7 +673,7 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
if (seqcount > 1) {
cluster_wbuild_wb(vp,
lblocksize, vp->v_cstart,
- cursize);
+ cursize, gbflags);
}
} else {
/*
@@ -710,8 +721,10 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
* update daemon handle it.
*/
bdwrite(bp);
- if (seqcount > 1)
- cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+ vp->v_clen + 1, gbflags);
+ }
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
} else if (vm_page_count_severe()) {
@@ -746,6 +759,9 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
int totalwritten = 0;
int dbsize = btodb(size);
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
bo = &vp->v_bufobj;
while (len > 0) {
/*
@@ -824,10 +840,16 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
* address may not be either. Inherit the b_data offset
* from the original buffer.
*/
- bp->b_data = (char *)((vm_offset_t)bp->b_data |
- ((vm_offset_t)tbp->b_data & PAGE_MASK));
- bp->b_flags |= B_CLUSTER |
- (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+ if ((gbflags & GB_UNMAPPED) == 0 ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ } else {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ }
+ bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+ B_NEEDCOMMIT));
bp->b_iodone = cluster_callback;
pbgetvp(vp, bp);
/*
@@ -954,8 +976,10 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
tbp, b_cluster.cluster_entry);
}
finishcluster:
- pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
- (vm_page_t *) bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
if (bp->b_bufsize > bp->b_kvasize)
panic(
"cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
@@ -976,9 +1000,7 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
* Plus add one additional buffer.
*/
static struct cluster_save *
-cluster_collectbufs(vp, last_bp)
- struct vnode *vp;
- struct buf *last_bp;
+cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
{
struct cluster_save *buflist;
struct buf *bp;
@@ -991,7 +1013,8 @@ cluster_collectbufs(vp, last_bp)
buflist->bs_nchildren = 0;
buflist->bs_children = (struct buf **) (buflist + 1);
for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
- (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
+ (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
+ gbflags, &bp);
buflist->bs_children[i] = bp;
if (bp->b_blkno == bp->b_lblkno)
VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
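
Closing out the vfs_cluster.c changes, a hedged consumer-side example (simplified; whether a particular filesystem actually opts in depends on parts of the patch outside this excerpt): bread_gb() behaves like bread() but forwards getblk flags, so a caller that can work from b_pages may ask for GB_UNMAPPED and falls back transparently when the request is ignored.

    struct buf *bp;
    int error;

    error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp);
    if (error != 0)
            return (error);
    /* Use bp->b_pages if B_UNMAPPED is set, bp->b_data otherwise. */
    brelse(bp);
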
diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c
index 4fe6ebe..5c94a39 100644
--- a/sys/mips/mips/pmap.c
+++ b/sys/mips/mips/pmap.c
@@ -2576,6 +2576,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst)
}
}
+int unmapped_buf_allowed;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c
index 90466e8a..66194df 100644
--- a/sys/powerpc/aim/mmu_oea64.c
+++ b/sys/powerpc/aim/mmu_oea64.c
@@ -648,6 +648,14 @@ moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart,
moea64_kenter(mmup, pa, pa);
}
ENABLE_TRANS(msr);
+
+ /*
+ * Allow user to override unmapped_buf_allowed for testing.
+ * XXXKIB Only direct map implementation was tested.
+ */
+ if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
+ &unmapped_buf_allowed))
+ unmapped_buf_allowed = hw_direct_map;
}
void
diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c
index 42f1a39..7fd98f4 100644
--- a/sys/powerpc/powerpc/pmap_dispatch.c
+++ b/sys/powerpc/powerpc/pmap_dispatch.c
@@ -574,3 +574,5 @@ pmap_mmu_install(char *name, int prio)
return (FALSE);
}
+
+int unmapped_buf_allowed;
diff --git a/sys/sparc64/sparc64/pmap.c b/sys/sparc64/sparc64/pmap.c
index 97a085a..2a24eb5 100644
--- a/sys/sparc64/sparc64/pmap.c
+++ b/sys/sparc64/sparc64/pmap.c
@@ -1918,6 +1918,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
}
}
+int unmapped_buf_allowed;
+
void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
vm_offset_t b_offset, int xfersize)
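
The mips, powerpc (pmap_dispatch.c) and sparc64 hunks only provide the unmapped_buf_allowed symbol: a file-scope int without an initializer is zero, so GB_UNMAPPED stays a no-op on these architectures until their page-copy paths are validated. The hedged expectation, following the commit message, is that a pmap with a tested pmap_copy_page()/pmap_copy_pages() defines the knob enabled instead, roughly:

    /* Illustrative only; the actual per-architecture defaults are set in
     * the pmap files elsewhere in this commit. */
    int unmapped_buf_allowed = 1;
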
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index c016ee6..7678f5a 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -55,10 +55,13 @@
#define BIO_DONE 0x02
#define BIO_ONQUEUE 0x04
#define BIO_ORDERED 0x08
+#define BIO_UNMAPPED 0x10
+#define BIO_TRANSIENT_MAPPING 0x20
#ifdef _KERNEL
struct disk;
struct bio;
+struct vm_map;
/* Empty classifier tag, to prevent further classification. */
#define BIO_NOTCLASSIFIED (void *)(~0UL)
@@ -78,6 +81,9 @@ struct bio {
off_t bio_offset; /* Offset into file. */
long bio_bcount; /* Valid bytes in buffer. */
caddr_t bio_data; /* Memory, superblocks, indirect etc. */
+ struct vm_page **bio_ma; /* Or unmapped. */
+ int bio_ma_offset; /* Offset in the first page of bio_ma. */
+ int bio_ma_n; /* Number of pages in bio_ma. */
int bio_error; /* Errno for BIO_ERROR. */
long bio_resid; /* Remaining I/O in bytes. */
void (*bio_done)(struct bio *);
@@ -121,6 +127,9 @@ struct bio_queue_head {
struct bio *insert_point;
};
+extern struct vm_map *bio_transient_map;
+extern int bio_transient_maxcnt;
+
void biodone(struct bio *bp);
void biofinish(struct bio *bp, struct devstat *stat, int error);
int biowait(struct bio *bp, const char *wchan);
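
Taken together, the new fields describe an I/O purely by its pages. A hedged sketch of filling them in (the helper is invented; the real buf-to-bio translation is bdata2bio(), declared in the sys/sys/buf.h hunk below): bio_ma_offset is the byte offset within the first page and bio_ma_n the number of pages needed to cover that offset plus the transfer length.

    static void
    bio_describe_pages_sketch(struct bio *bip, vm_page_t *ma, int offset,
        long length)
    {

            bip->bio_ma = ma;
            bip->bio_ma_offset = offset;
            bip->bio_ma_n = howmany(offset + length, PAGE_SIZE);
            bip->bio_flags |= BIO_UNMAPPED;
    }
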
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 87f2b76..cc8029e 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -117,6 +117,7 @@ struct buf {
long b_bufsize; /* Allocated buffer size. */
long b_runningbufspace; /* when I/O is running, pipelining */
caddr_t b_kvabase; /* base kva for buffer */
+ caddr_t b_kvaalloc; /* allocated kva for B_KVAALLOC */
int b_kvasize; /* size of kva for buffer */
daddr_t b_lblkno; /* Logical block number. */
struct vnode *b_vp; /* Device vnode. */
@@ -202,8 +203,8 @@ struct buf {
#define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */
#define B_DONE 0x00000200 /* I/O completed. */
#define B_EINTR 0x00000400 /* I/O was interrupted */
-#define B_00000800 0x00000800 /* Available flag. */
-#define B_00001000 0x00001000 /* Available flag. */
+#define B_UNMAPPED 0x00000800 /* KVA is not mapped. */
+#define B_KVAALLOC 0x00001000 /* But allocated. */
#define B_INVAL 0x00002000 /* Does not contain valid info. */
#define B_BARRIER 0x00004000 /* Write this and all preceeding first. */
#define B_NOCACHE 0x00008000 /* Do not cache block after use. */
@@ -453,7 +454,9 @@ buf_countdeps(struct buf *bp, int i)
*/
#define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */
#define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */
-#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */
+#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */
+#define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */
+#define GB_KVAALLOC 0x0010 /* But allocate KVA. */
#ifdef _KERNEL
extern int nbuf; /* The number of buffer headers */
@@ -470,11 +473,13 @@ extern struct buf *swbuf; /* Swap I/O buffer headers. */
extern int nswbuf; /* Number of swap I/O buffer headers. */
extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */
extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */
+extern caddr_t unmapped_buf;
void runningbufwakeup(struct buf *);
void waitrunningbufspace(void);
caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est);
void bufinit(void);
+void bdata2bio(struct buf *bp, struct bio *bip);
void bwillwrite(void);
int buf_dirty_count_severe(void);
void bremfree(struct buf *);
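
bdata2bio(), declared above, bridges the buffer cache and GEOM; its body lives in vfs_bio.c, outside this excerpt, but a plausible shape consistent with the fields added in this patch would be (hedged sketch, not the committed code):

    static void
    bdata2bio_sketch(struct buf *bp, struct bio *bip)
    {

            if ((bp->b_flags & B_UNMAPPED) != 0) {
                    /* Hand GEOM the page array instead of a mapping. */
                    bip->bio_ma = bp->b_pages;
                    bip->bio_ma_n = bp->b_npages;
                    bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
                    bip->bio_data = unmapped_buf;
                    bip->bio_flags |= BIO_UNMAPPED;
            } else {
                    bip->bio_data = bp->b_data;
                    bip->bio_ma = NULL;
            }
    }
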
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index 12337de..61b4679 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -138,6 +138,7 @@ extern char **kenvp;
extern const void *zero_region; /* address space maps to a zeroed page */
+extern int unmapped_buf_allowed;
extern int iosize_max_clamp;
#define IOSIZE_MAX (iosize_max_clamp ? INT_MAX : SSIZE_MAX)
diff --git a/sys/vm/vm.h b/sys/vm/vm.h
index 132c10e..106c510 100644
--- a/sys/vm/vm.h
+++ b/sys/vm/vm.h
@@ -136,6 +136,8 @@ struct kva_md_info {
vm_offset_t clean_eva;
vm_offset_t pager_sva;
vm_offset_t pager_eva;
+ vm_offset_t bio_transient_sva;
+ vm_offset_t bio_transient_eva;
};
extern struct kva_md_info kmi;
diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c
index 08c9b03..b350e96 100644
--- a/sys/vm/vm_init.c
+++ b/sys/vm/vm_init.c
@@ -184,10 +184,15 @@ again:
panic("startup: table size inconsistency");
clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva,
- (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS, TRUE);
+ (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS +
+ (long)bio_transient_maxcnt * MAXPHYS, TRUE);
buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva,
&kmi->buffer_eva, (long)nbuf * BKVASIZE, FALSE);
buffer_map->system_map = 1;
+ bio_transient_map = kmem_suballoc(clean_map, &kmi->bio_transient_sva,
+ &kmi->bio_transient_eva, (long)bio_transient_maxcnt * MAXPHYS,
+ FALSE);
+ bio_transient_map->system_map = 1;
pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva,
(long)nswbuf * MAXPHYS, FALSE);
pager_map->system_map = 1;
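
For scale, a hedged back-of-the-envelope on the KVA cost of the new submap (bio_transient_maxcnt is computed elsewhere in this commit and, per the commit message, is capped at roughly 10% of the buffer map): clean_map now has to cover all three children.

    /* clean_map children after this change (KVA sizes):
     *   buffer_map        : nbuf * BKVASIZE
     *   bio_transient_map : bio_transient_maxcnt * MAXPHYS   (new)
     *   pager_map         : nswbuf * MAXPHYS
     */
    static vm_size_t
    bio_transient_kva_sketch(void)
    {

            /* Illustrative numbers: with MAXPHYS = 128 KB, a
             * bio_transient_maxcnt of 1024 transient mappings reserves
             * 128 MB of KVA for on-the-fly remapping in g_down. */
            return ((vm_size_t)bio_transient_maxcnt * MAXPHYS);
    }
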
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 64a2ebb..7c7ccc1 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -90,6 +90,7 @@ vm_map_t kmem_map;
vm_map_t exec_map;
vm_map_t pipe_map;
vm_map_t buffer_map;
+vm_map_t bio_transient_map;
const void *zero_region;
CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);