diff options
-rw-r--r-- | sys/amd64/amd64/pmap.c | 2 | ||||
-rw-r--r-- | sys/arm/arm/pmap-v6.c | 2 | ||||
-rw-r--r-- | sys/arm/arm/pmap.c | 2 | ||||
-rw-r--r-- | sys/geom/geom.h | 1 | ||||
-rw-r--r-- | sys/geom/geom_io.c | 106 | ||||
-rw-r--r-- | sys/geom/geom_vfs.c | 8 | ||||
-rw-r--r-- | sys/i386/i386/pmap.c | 2 | ||||
-rw-r--r-- | sys/i386/xen/pmap.c | 2 | ||||
-rw-r--r-- | sys/ia64/ia64/pmap.c | 2 | ||||
-rw-r--r-- | sys/kern/subr_bus_dma.c | 23 | ||||
-rw-r--r-- | sys/kern/subr_param.c | 7 | ||||
-rw-r--r-- | sys/kern/vfs_bio.c | 963 | ||||
-rw-r--r-- | sys/kern/vfs_cluster.c | 105 | ||||
-rw-r--r-- | sys/mips/mips/pmap.c | 2 | ||||
-rw-r--r-- | sys/powerpc/aim/mmu_oea64.c | 8 | ||||
-rw-r--r-- | sys/powerpc/powerpc/pmap_dispatch.c | 2 | ||||
-rw-r--r-- | sys/sparc64/sparc64/pmap.c | 2 | ||||
-rw-r--r-- | sys/sys/bio.h | 9 | ||||
-rw-r--r-- | sys/sys/buf.h | 11 | ||||
-rw-r--r-- | sys/sys/systm.h | 1 | ||||
-rw-r--r-- | sys/vm/vm.h | 2 | ||||
-rw-r--r-- | sys/vm/vm_init.c | 7 | ||||
-rw-r--r-- | sys/vm/vm_kern.c | 1 |
23 files changed, 955 insertions, 315 deletions
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 55d2cff..1b1c86c 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -4235,6 +4235,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) pagecopy((void *)src, (void *)dst); } +int unmapped_buf_allowed = 1; + void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c index 0083f29..9d16509 100644 --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -3312,6 +3312,8 @@ pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst) mtx_unlock(&cmtx); } +int unmapped_buf_allowed = 1; + void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) diff --git a/sys/arm/arm/pmap.c b/sys/arm/arm/pmap.c index c18783b..0875f83 100644 --- a/sys/arm/arm/pmap.c +++ b/sys/arm/arm/pmap.c @@ -4428,6 +4428,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) #endif } +int unmapped_buf_allowed = 1; + void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) diff --git a/sys/geom/geom.h b/sys/geom/geom.h index 351b05d..660bf6e 100644 --- a/sys/geom/geom.h +++ b/sys/geom/geom.h @@ -205,6 +205,7 @@ struct g_provider { u_int flags; #define G_PF_WITHER 0x2 #define G_PF_ORPHAN 0x4 +#define G_PF_ACCEPT_UNMAPPED 0x8 /* Two fields for the implementing class to use */ void *private; diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c index c6a5da8..6ffc06e 100644 --- a/sys/geom/geom_io.c +++ b/sys/geom/geom_io.c @@ -1,6 +1,7 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. + * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp @@ -8,6 +9,9 @@ * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -44,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <sys/ktr.h> #include <sys/proc.h> #include <sys/stack.h> +#include <sys/sysctl.h> #include <sys/errno.h> #include <geom/geom.h> @@ -51,6 +56,13 @@ __FBSDID("$FreeBSD$"); #include <sys/devicestat.h> #include <vm/uma.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; @@ -180,12 +192,17 @@ g_clone_bio(struct bio *bp) /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. + * BIO_UNMAPPED should be inherited, to properly indicate + * which way the buffer is passed. * Other bio flags are not suitable for cloning. */ - bp2->bio_flags = bp->bio_flags & BIO_ORDERED; + bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; + bp2->bio_ma = bp->bio_ma; + bp2->bio_ma_n = bp->bio_ma_n; + bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; @@ -210,11 +227,15 @@ g_duplicate_bio(struct bio *bp) struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); + bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED; bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; + bp2->bio_ma = bp->bio_ma; + bp2->bio_ma_n = bp->bio_ma_n; + bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR @@ -575,6 +596,83 @@ g_io_deliver(struct bio *bp, int error) return; } +SYSCTL_DECL(_kern_geom); + +static long transient_maps; +SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, + &transient_maps, 0, + "Total count of the transient mapping requests"); +u_int transient_map_retries = 10; +SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, + &transient_map_retries, 0, + "Max count of retries used before giving up on creating transient map"); +int transient_map_hard_failures; +SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, + &transient_map_hard_failures, 0, + "Failures to establish the transient mapping due to retry attempts " + "exhausted"); +int transient_map_soft_failures; +SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, + &transient_map_soft_failures, 0, + "Count of retried failures to establish the transient mapping"); +int inflight_transient_maps; +SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, + &inflight_transient_maps, 0, + "Current count of the active transient maps"); + +static int +g_io_transient_map_bio(struct bio *bp) +{ + vm_offset_t addr; + long size; + u_int retried; + int rv; + + size = round_page(bp->bio_ma_offset + bp->bio_length); + KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); + addr = 0; + retried = 0; + atomic_add_long(&transient_maps, 1); +retry: + vm_map_lock(bio_transient_map); + if (vm_map_findspace(bio_transient_map, vm_map_min(bio_transient_map), + size, &addr)) { + vm_map_unlock(bio_transient_map); + if (transient_map_retries != 0 && + retried >= transient_map_retries) { + g_io_deliver(bp, EDEADLK/* XXXKIB */); + CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", + bp, bp->bio_to->name); + atomic_add_int(&transient_map_hard_failures, 1); + return (1); + } else { + /* + * Naive attempt to quisce the I/O to get more + * in-flight requests completed and defragment + * the bio_transient_map. + */ + CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", + bp, bp->bio_to->name, retried); + pause("g_d_tra", hz / 10); + retried++; + atomic_add_int(&transient_map_soft_failures, 1); + goto retry; + } + } + rv = vm_map_insert(bio_transient_map, NULL, 0, addr, addr + size, + VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); + KASSERT(rv == KERN_SUCCESS, + ("vm_map_insert(bio_transient_map) rv %d %jx %lx", + rv, (uintmax_t)addr, size)); + vm_map_unlock(bio_transient_map); + atomic_add_int(&inflight_transient_maps, 1); + pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); + bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; + bp->bio_flags |= BIO_TRANSIENT_MAPPING; + bp->bio_flags &= ~BIO_UNMAPPED; + return (0); +} + void g_io_schedule_down(struct thread *tp __unused) { @@ -636,6 +734,12 @@ g_io_schedule_down(struct thread *tp __unused) default: break; } + if ((bp->bio_flags & BIO_UNMAPPED) != 0 && + (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && + (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { + if (g_io_transient_map_bio(bp)) + continue; + } THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c index bbed550..92f1ad2 100644 --- a/sys/geom/geom_vfs.c +++ b/sys/geom/geom_vfs.c @@ -188,14 +188,14 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp) bip = g_alloc_bio(); bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; - bip->bio_data = bp->b_data; - bip->bio_done = g_vfs_done; - bip->bio_caller2 = bp; bip->bio_length = bp->b_bcount; - if (bp->b_flags & B_BARRIER) { + bdata2bio(bp, bip); + if ((bp->b_flags & B_BARRIER) != 0) { bip->bio_flags |= BIO_ORDERED; bp->b_flags &= ~B_BARRIER; } + bip->bio_done = g_vfs_done; + bip->bio_caller2 = bp; g_io_request(bip, cp); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 6499986..f4681bf 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -4205,6 +4205,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) mtx_unlock(&sysmaps->lock); } +int unmapped_buf_allowed = 1; + void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c index 0f7a80f..a9bc124 100644 --- a/sys/i386/xen/pmap.c +++ b/sys/i386/xen/pmap.c @@ -3448,6 +3448,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) mtx_unlock(&sysmaps->lock); } +int unmapped_buf_allowed = 1; + void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) diff --git a/sys/ia64/ia64/pmap.c b/sys/ia64/ia64/pmap.c index 3256600..883f39c 100644 --- a/sys/ia64/ia64/pmap.c +++ b/sys/ia64/ia64/pmap.c @@ -2014,6 +2014,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) bcopy(src, dst, PAGE_SIZE); } +int unmapped_buf_allowed; + void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c index 773d01a..4d344b4 100644 --- a/sys/kern/subr_bus_dma.c +++ b/sys/kern/subr_bus_dma.c @@ -126,11 +126,28 @@ static int _bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio, int *nsegs, int flags) { - int error; + vm_paddr_t paddr; + bus_size_t len, tlen; + int error, i, ma_offs; - error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data, - bio->bio_bcount, kernel_pmap, flags, NULL, nsegs); + if ((bio->bio_flags & BIO_UNMAPPED) == 0) { + error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data, + bio->bio_bcount, kernel_pmap, flags, NULL, nsegs); + return (error); + } + error = 0; + tlen = bio->bio_bcount; + ma_offs = bio->bio_ma_offset; + for (i = 0; tlen > 0; i++, tlen -= len) { + len = min(PAGE_SIZE - ma_offs, tlen); + paddr = VM_PAGE_TO_PHYS(bio->bio_ma[i]) + ma_offs; + error = _bus_dmamap_load_phys(dmat, map, paddr, len, + flags, NULL, nsegs); + if (error != 0) + break; + ma_offs = 0; + } return (error); } diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index 825a3a0..a2e822c 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -92,6 +92,7 @@ int maxfiles; /* sys. wide open files limit */ int maxfilesperproc; /* per-proc open files limit */ int msgbufsize; /* size of kernel message buffer */ int nbuf; +int bio_transient_maxcnt; int ngroups_max; /* max # groups per process */ int nswbuf; pid_t pid_max = PID_MAX; @@ -118,6 +119,9 @@ SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN, &maxswzone, 0, "Maximum memory for swap metadata"); SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN, &maxbcache, 0, "Maximum value of vfs.maxbufspace"); +SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN, + &bio_transient_maxcnt, 0, + "Maximum number of transient BIOs mappings"); SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxtsiz, 0, "Maximum text size"); SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RW | CTLFLAG_TUN, &dfldsiz, 0, @@ -266,6 +270,8 @@ init_param1(void) pid_max = PID_MAX; else if (pid_max < 300) pid_max = 300; + + TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed); } /* @@ -322,6 +328,7 @@ init_param2(long physpages) */ nbuf = NBUF; TUNABLE_INT_FETCH("kern.nbuf", &nbuf); + TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt); /* * The default for maxpipekva is min(1/64 of the kernel address space, diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index d20c829..cded596 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,8 +1,12 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson + * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -92,6 +96,7 @@ struct buf_ops buf_ops_bio = { * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. */ struct buf *buf; /* buffer header pool */ +caddr_t unmapped_buf; static struct proc *bufdaemonproc; @@ -132,6 +137,10 @@ SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Virtual memory used for buffers"); #endif +static long unmapped_bufspace; +SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD, + &unmapped_bufspace, 0, + "Amount of unmapped buffers, inclusive in the bufspace"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); @@ -201,6 +210,10 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); +static int mappingrestarts; +SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, + "Number of times getblk has had to restart a buffer mapping for " + "unmapped buffer"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); @@ -210,6 +223,9 @@ SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, ¬bufdflashes, 0, static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); +SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, + &unmapped_buf_allowed, 0, + "Permit the use of the unmapped i/o"); /* * Wakeup point for bufdaemon, as well as indicator of whether it is already @@ -281,6 +297,9 @@ static struct mtx nblock; /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; +#ifdef INVARIANTS +static int bq_len[BUFFER_QUEUES]; +#endif /* Lock for the bufqueues */ static struct mtx bqlock; @@ -511,7 +530,7 @@ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; - long maxbuf; + long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes @@ -555,6 +574,52 @@ kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) } /* + * Ideal allocation size for the transient bio submap if 10% + * of the maximal space buffer map. This roughly corresponds + * to the amount of the buffer mapped for typical UFS load. + * + * Clip the buffer map to reserve space for the transient + * BIOs, if its extent is bigger than 90% of the maximum + * buffer map extent on the platform. + * + * The fall-back to the maxbuf in case of maxbcache unset, + * allows to not trim the buffer KVA for the architectures + * with ample KVA space. + */ + if (bio_transient_maxcnt == 0) { + maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; + buf_sz = (long)nbuf * BKVASIZE; + if (buf_sz < maxbuf_sz / 10 * 9) { + /* + * There is more KVA than memory. Do not + * adjust buffer map size, and assign the rest + * of maxbuf to transient map. + */ + biotmap_sz = maxbuf_sz - buf_sz; + } else { + /* + * Buffer map spans all KVA we could afford on + * this platform. Give 10% of the buffer map + * to the transient bio map. + */ + biotmap_sz = buf_sz / 10; + buf_sz -= biotmap_sz; + } + if (biotmap_sz / INT_MAX > MAXPHYS) + bio_transient_maxcnt = INT_MAX; + else + bio_transient_maxcnt = biotmap_sz / MAXPHYS; + /* + * Artifically limit to 1024 simultaneous in-flight I/Os + * using the transient mapping. + */ + if (bio_transient_maxcnt > 1024) + bio_transient_maxcnt = 1024; + if (tuned_nbuf) + nbuf = buf_sz / BKVASIZE; + } + + /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ @@ -607,6 +672,9 @@ bufinit(void) LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); +#ifdef INVARIANTS + bq_len[QUEUE_EMPTY]++; +#endif } /* @@ -675,6 +743,55 @@ bufinit(void) bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); + unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS); +} + +#ifdef INVARIANTS +static inline void +vfs_buf_check_mapped(struct buf *bp) +{ + + KASSERT((bp->b_flags & B_UNMAPPED) == 0, + ("mapped buf %p %x", bp, bp->b_flags)); + KASSERT(bp->b_kvabase != unmapped_buf, + ("mapped buf: b_kvabase was not updated %p", bp)); + KASSERT(bp->b_data != unmapped_buf, + ("mapped buf: b_data was not updated %p", bp)); +} + +static inline void +vfs_buf_check_unmapped(struct buf *bp) +{ + + KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED, + ("unmapped buf %p %x", bp, bp->b_flags)); + KASSERT(bp->b_kvabase == unmapped_buf, + ("unmapped buf: corrupted b_kvabase %p", bp)); + KASSERT(bp->b_data == unmapped_buf, + ("unmapped buf: corrupted b_data %p", bp)); +} + +#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) +#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) +#else +#define BUF_CHECK_MAPPED(bp) do {} while (0) +#define BUF_CHECK_UNMAPPED(bp) do {} while (0) +#endif + +static void +bpmap_qenter(struct buf *bp) +{ + + BUF_CHECK_MAPPED(bp); + + /* + * bp->b_data is relative to bp->b_offset, but + * bp->b_offset may be offset into the first page. + */ + bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); + pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); + bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | + (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* @@ -686,14 +803,26 @@ static void bfreekva(struct buf *bp) { - if (bp->b_kvasize) { - atomic_add_int(&buffreekvacnt, 1); - atomic_subtract_long(&bufspace, bp->b_kvasize); - vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase, - (vm_offset_t) bp->b_kvabase + bp->b_kvasize); - bp->b_kvasize = 0; - bufspacewakeup(); + if (bp->b_kvasize == 0) + return; + + atomic_add_int(&buffreekvacnt, 1); + atomic_subtract_long(&bufspace, bp->b_kvasize); + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvabase, + (vm_offset_t)bp->b_kvabase + bp->b_kvasize); + } else { + BUF_CHECK_UNMAPPED(bp); + if ((bp->b_flags & B_KVAALLOC) != 0) { + vm_map_remove(buffer_map, (vm_offset_t)bp->b_kvaalloc, + (vm_offset_t)bp->b_kvaalloc + bp->b_kvasize); + } + atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); + bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC); } + bp->b_kvasize = 0; + bufspacewakeup(); } /* @@ -760,6 +889,11 @@ bremfreel(struct buf *bp) mtx_assert(&bqlock, MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); +#ifdef INVARIANTS + KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", + bp->b_qindex)); + bq_len[bp->b_qindex]--; +#endif bp->b_qindex = QUEUE_NONE; /* * If this was a delayed bremfree() we only need to remove the buffer @@ -1414,7 +1548,8 @@ brelse(struct buf *bp) } } - if ((bp->b_flags & B_INVAL) == 0) { + if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); @@ -1509,11 +1644,17 @@ brelse(struct buf *bp) bp->b_qindex = QUEUE_DIRTY; else bp->b_qindex = QUEUE_CLEAN; - if (bp->b_flags & B_AGE) - TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); - else - TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); + if (bp->b_flags & B_AGE) { + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, + b_freelist); + } else { + TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, + b_freelist); + } } +#ifdef INVARIANTS + bq_len[bp->b_qindex]++; +#endif mtx_unlock(&bqlock); /* @@ -1604,6 +1745,9 @@ bqrelse(struct buf *bp) if (bp->b_flags & B_DELWRI) { bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); +#ifdef INVARIANTS + bq_len[bp->b_qindex]++; +#endif } else { /* * The locking of the BO_LOCK for checking of the @@ -1616,6 +1760,9 @@ bqrelse(struct buf *bp) bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); +#ifdef INVARIANTS + bq_len[QUEUE_CLEAN]++; +#endif } else { /* * We are too low on memory, we have to try to free @@ -1657,7 +1804,11 @@ vfs_vmio_release(struct buf *bp) int i; vm_page_t m; - pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; @@ -1761,8 +1912,10 @@ vfs_bio_awrite(struct buf *bp) int nwritten; int size; int maxcl; + int gbflags; bo = &vp->v_bufobj; + gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster @@ -1794,7 +1947,7 @@ vfs_bio_awrite(struct buf *bp) if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, - 0); + gbflags); return (nwritten); } } @@ -1811,46 +1964,207 @@ vfs_bio_awrite(struct buf *bp) return (nwritten); } +static void +setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags) +{ + + KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 && + bp->b_kvasize == 0, ("call bfreekva(%p)", bp)); + if ((gbflags & GB_UNMAPPED) == 0) { + bp->b_kvabase = (caddr_t)addr; + } else if ((gbflags & GB_KVAALLOC) != 0) { + KASSERT((gbflags & GB_UNMAPPED) != 0, + ("GB_KVAALLOC without GB_UNMAPPED")); + bp->b_kvaalloc = (caddr_t)addr; + bp->b_flags |= B_UNMAPPED | B_KVAALLOC; + atomic_add_long(&unmapped_bufspace, bp->b_kvasize); + } + bp->b_kvasize = maxsize; +} + /* - * getnewbuf: - * - * Find and initialize a new buffer header, freeing up existing buffers - * in the bufqueues as necessary. The new buffer is returned locked. - * - * Important: B_INVAL is not set. If the caller wishes to throw the - * buffer away, the caller must set B_INVAL prior to calling brelse(). - * - * We block if: - * We have insufficient buffer headers - * We have insufficient buffer space - * buffer_map is too fragmented ( space reservation fails ) - * If we have to flush dirty buffers ( but we try to avoid this ) - * - * To avoid VFS layer recursion we do not flush dirty buffers ourselves. - * Instead we ask the buf daemon to do it for us. We attempt to - * avoid piecemeal wakeups of the pageout daemon. + * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if + * needed. */ +static int +allocbufkva(struct buf *bp, int maxsize, int gbflags) +{ + vm_offset_t addr; + int rv; -static struct buf * -getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, - int gbflags) + bfreekva(bp); + addr = 0; + + vm_map_lock(buffer_map); + if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, + &addr)) { + vm_map_unlock(buffer_map); + /* + * Buffer map is too fragmented. Request the caller + * to defragment the map. + */ + atomic_add_int(&bufdefragcnt, 1); + return (1); + } + rv = vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, + VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); + KASSERT(rv == KERN_SUCCESS, ("vm_map_insert(buffer_map) rv %d", rv)); + vm_map_unlock(buffer_map); + setbufkva(bp, addr, maxsize, gbflags); + atomic_add_long(&bufspace, bp->b_kvasize); + return (0); +} + +/* + * Ask the bufdaemon for help, or act as bufdaemon itself, when a + * locked vnode is supplied. + */ +static void +getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, + int defrag) { struct thread *td; - struct buf *bp; - struct buf *nbp; - int defrag = 0; - int nqindex; - static int flushingbufs; + char *waitmsg; + int fl, flags, norunbuf; + + mtx_assert(&bqlock, MA_OWNED); + + if (defrag) { + flags = VFS_BIO_NEED_BUFSPACE; + waitmsg = "nbufkv"; + } else if (bufspace >= hibufspace) { + waitmsg = "nbufbs"; + flags = VFS_BIO_NEED_BUFSPACE; + } else { + waitmsg = "newbuf"; + flags = VFS_BIO_NEED_ANY; + } + mtx_lock(&nblock); + needsbuffer |= flags; + mtx_unlock(&nblock); + mtx_unlock(&bqlock); + + bd_speedup(); /* heeeelp */ + if ((gbflags & GB_NOWAIT_BD) != 0) + return; td = curthread; + mtx_lock(&nblock); + while (needsbuffer & flags) { + if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) { + mtx_unlock(&nblock); + /* + * getblk() is called with a vnode locked, and + * some majority of the dirty buffers may as + * well belong to the vnode. Flushing the + * buffers there would make a progress that + * cannot be achieved by the buf_daemon, that + * cannot lock the vnode. + */ + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + /* play bufdaemon */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_do_flush(vp); + td->td_pflags &= norunbuf; + mtx_lock(&nblock); + if (fl != 0) + continue; + if ((needsbuffer & flags) == 0) + break; + } + if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, + waitmsg, slptimeo)) + break; + } + mtx_unlock(&nblock); +} + +static void +getnewbuf_reuse_bp(struct buf *bp, int qindex) +{ + + CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " + "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, + bp->b_kvasize, bp->b_bufsize, qindex); + mtx_assert(&bqlock, MA_NOTOWNED); + /* - * We can't afford to block since we might be holding a vnode lock, - * which may prevent system daemons from running. We deal with - * low-memory situations by proactively returning memory and running - * async I/O rather then sync I/O. + * Note: we no longer distinguish between VMIO and non-VMIO + * buffers. */ - atomic_add_int(&getnewbufcalls, 1); - atomic_subtract_int(&getnewbufrestarts, 1); + KASSERT((bp->b_flags & B_DELWRI) == 0, + ("delwri buffer %p found in queue %d", bp, qindex)); + + if (qindex == QUEUE_CLEAN) { + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + if (bp->b_vp != NULL) + brelvp(bp); + } + + /* + * Get the rest of the buffer freed up. b_kva* is still valid + * after this operation. + */ + + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (!LIST_EMPTY(&bp->b_dep)) + buf_deallocate(bp); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("losing buffer 3"); + KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", + bp, bp->b_vp, qindex)); + KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, + ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); + + if (bp->b_bufsize) + allocbuf(bp, 0); + + bp->b_flags &= B_UNMAPPED | B_KVAALLOC; + bp->b_ioflags = 0; + bp->b_xflags = 0; + KASSERT((bp->b_vflags & BV_INFREECNT) == 0, + ("buf %p still counted as free?", bp)); + bp->b_vflags = 0; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_bufobj = NULL; + bp->b_pin_count = 0; + bp->b_fsprivate1 = NULL; + bp->b_fsprivate2 = NULL; + bp->b_fsprivate3 = NULL; + + LIST_INIT(&bp->b_dep); +} + +static int flushingbufs; + +static struct buf * +getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata) +{ + struct buf *bp, *nbp; + int nqindex, qindex, pass; + + KASSERT(!unmapped || !defrag, ("both unmapped and defrag")); + + pass = 1; restart: atomic_add_int(&getnewbufrestarts, 1); @@ -1860,66 +2174,90 @@ restart: * that if we are specially marked process, we are allowed to * dip into our reserves. * - * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN + * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN + * for the allocation of the mapped buffer. For unmapped, the + * easiest is to start with EMPTY outright. * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ + nbp = NULL; mtx_lock(&bqlock); - nqindex = QUEUE_EMPTYKVA; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); - + if (!defrag && unmapped) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + } if (nbp == NULL) { - /* - * If no EMPTYKVA buffers and we are either - * defragging or reusing, locate a CLEAN buffer - * to free or reuse. If bufspace useage is low - * skip this step so we can allocate a new buffer. - */ - if (defrag || bufspace >= lobufspace) { - nqindex = QUEUE_CLEAN; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); - } + nqindex = QUEUE_EMPTYKVA; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); + } - /* - * If we could not find or were not allowed to reuse a - * CLEAN buffer, check to see if it is ok to use an EMPTY - * buffer. We can only use an EMPTY buffer if allocating - * its KVA would not otherwise run us out of buffer space. - */ - if (nbp == NULL && defrag == 0 && - bufspace + maxsize < hibufspace) { - nqindex = QUEUE_EMPTY; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); - } + /* + * If no EMPTYKVA buffers and we are either defragging or + * reusing, locate a CLEAN buffer to free or reuse. If + * bufspace useage is low skip this step so we can allocate a + * new buffer. + */ + if (nbp == NULL && (defrag || bufspace >= lobufspace)) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + } + + /* + * If we could not find or were not allowed to reuse a CLEAN + * buffer, check to see if it is ok to use an EMPTY buffer. + * We can only use an EMPTY buffer if allocating its KVA would + * not otherwise run us out of buffer space. No KVA is needed + * for the unmapped allocation. + */ + if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace || + metadata)) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + } + + /* + * All available buffers might be clean, retry ignoring the + * lobufspace as the last resort. + */ + if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ - while ((bp = nbp) != NULL) { - int qindex = nqindex; + qindex = nqindex; /* - * Calculate next bp ( we can only use it if we do not block - * or do other fancy things ). + * Calculate next bp (we can only use it if we do not + * block or do other fancy things). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { - switch(qindex) { + switch (qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; - if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); + if (nbp != NULL) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; - if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + if (nbp != NULL) break; /* FALLTHROUGH */ case QUEUE_CLEAN: + if (metadata && pass == 1) { + pass = 2; + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST( + &bufqueues[QUEUE_EMPTY]); + } /* * nbp is NULL. */ @@ -1952,22 +2290,9 @@ restart: } BO_UNLOCK(bp->b_bufobj); } - CTR6(KTR_BUF, - "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " - "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, - bp->b_kvasize, bp->b_bufsize, qindex); - - /* - * Sanity Checks - */ - KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); - - /* - * Note: we no longer distinguish between VMIO and non-VMIO - * buffers. - */ - KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); + KASSERT(bp->b_qindex == qindex, + ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); if (bp->b_bufobj != NULL) BO_LOCK(bp->b_bufobj); @@ -1975,68 +2300,13 @@ restart: if (bp->b_bufobj != NULL) BO_UNLOCK(bp->b_bufobj); mtx_unlock(&bqlock); - - if (qindex == QUEUE_CLEAN) { - if (bp->b_flags & B_VMIO) { - bp->b_flags &= ~B_ASYNC; - vfs_vmio_release(bp); - } - if (bp->b_vp) - brelvp(bp); - } - /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. - * - * Get the rest of the buffer freed up. b_kva* is still - * valid after this operation. */ - if (bp->b_rcred != NOCRED) { - crfree(bp->b_rcred); - bp->b_rcred = NOCRED; - } - if (bp->b_wcred != NOCRED) { - crfree(bp->b_wcred); - bp->b_wcred = NOCRED; - } - if (!LIST_EMPTY(&bp->b_dep)) - buf_deallocate(bp); - if (bp->b_vflags & BV_BKGRDINPROG) - panic("losing buffer 3"); - KASSERT(bp->b_vp == NULL, - ("bp: %p still has vnode %p. qindex: %d", - bp, bp->b_vp, qindex)); - KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, - ("bp: %p still on a buffer list. xflags %X", - bp, bp->b_xflags)); - - if (bp->b_bufsize) - allocbuf(bp, 0); - - bp->b_flags = 0; - bp->b_ioflags = 0; - bp->b_xflags = 0; - KASSERT((bp->b_vflags & BV_INFREECNT) == 0, - ("buf %p still counted as free?", bp)); - bp->b_vflags = 0; - bp->b_vp = NULL; - bp->b_blkno = bp->b_lblkno = 0; - bp->b_offset = NOOFFSET; - bp->b_iodone = 0; - bp->b_error = 0; - bp->b_resid = 0; - bp->b_bcount = 0; - bp->b_npages = 0; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_bufobj = NULL; - bp->b_pin_count = 0; - bp->b_fsprivate1 = NULL; - bp->b_fsprivate2 = NULL; - bp->b_fsprivate3 = NULL; - - LIST_INIT(&bp->b_dep); + getnewbuf_reuse_bp(bp, qindex); + mtx_assert(&bqlock, MA_NOTOWNED); /* * If we are defragging then free the buffer. @@ -2060,6 +2330,9 @@ restart: goto restart; } + if (metadata) + break; + /* * If we are overcomitted then recover the buffer and its * KVM space. This occurs in rare situations when multiple @@ -2077,6 +2350,59 @@ restart: flushingbufs = 0; break; } + return (bp); +} + +/* + * getnewbuf: + * + * Find and initialize a new buffer header, freeing up existing buffers + * in the bufqueues as necessary. The new buffer is returned locked. + * + * Important: B_INVAL is not set. If the caller wishes to throw the + * buffer away, the caller must set B_INVAL prior to calling brelse(). + * + * We block if: + * We have insufficient buffer headers + * We have insufficient buffer space + * buffer_map is too fragmented ( space reservation fails ) + * If we have to flush dirty buffers ( but we try to avoid this ) + * + * To avoid VFS layer recursion we do not flush dirty buffers ourselves. + * Instead we ask the buf daemon to do it for us. We attempt to + * avoid piecemeal wakeups of the pageout daemon. + */ +static struct buf * +getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, + int gbflags) +{ + struct buf *bp; + int defrag, metadata; + + KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); + if (!unmapped_buf_allowed) + gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); + + defrag = 0; + if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || + vp->v_type == VCHR) + metadata = 1; + else + metadata = 0; + /* + * We can't afford to block since we might be holding a vnode lock, + * which may prevent system daemons from running. We deal with + * low-memory situations by proactively returning memory and running + * async I/O rather then sync I/O. + */ + atomic_add_int(&getnewbufcalls, 1); + atomic_subtract_int(&getnewbufrestarts, 1); +restart: + bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED | + GB_KVAALLOC)) == GB_UNMAPPED, metadata); + if (bp != NULL) + defrag = 0; /* * If we exhausted our list, sleep as appropriate. We may have to @@ -2084,65 +2410,23 @@ restart: * * Generally we are sleeping due to insufficient buffer space. */ - if (bp == NULL) { - int flags, norunbuf; - char *waitmsg; - int fl; - - if (defrag) { - flags = VFS_BIO_NEED_BUFSPACE; - waitmsg = "nbufkv"; - } else if (bufspace >= hibufspace) { - waitmsg = "nbufbs"; - flags = VFS_BIO_NEED_BUFSPACE; - } else { - waitmsg = "newbuf"; - flags = VFS_BIO_NEED_ANY; - } - mtx_lock(&nblock); - needsbuffer |= flags; - mtx_unlock(&nblock); - mtx_unlock(&bqlock); - - bd_speedup(); /* heeeelp */ - if (gbflags & GB_NOWAIT_BD) - return (NULL); - - mtx_lock(&nblock); - while (needsbuffer & flags) { - if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) { - mtx_unlock(&nblock); - /* - * getblk() is called with a vnode - * locked, and some majority of the - * dirty buffers may as well belong to - * the vnode. Flushing the buffers - * there would make a progress that - * cannot be achieved by the - * buf_daemon, that cannot lock the - * vnode. - */ - norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | - (td->td_pflags & TDP_NORUNNINGBUF); - /* play bufdaemon */ - td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; - fl = buf_do_flush(vp); - td->td_pflags &= norunbuf; - mtx_lock(&nblock); - if (fl != 0) - continue; - if ((needsbuffer & flags) == 0) - break; - } - if (msleep(&needsbuffer, &nblock, - (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { - mtx_unlock(&nblock); - return (NULL); - } - } - mtx_unlock(&nblock); + mtx_assert(&bqlock, MA_OWNED); + getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag); + mtx_assert(&bqlock, MA_NOTOWNED); + } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) { + mtx_assert(&bqlock, MA_NOTOWNED); + + bfreekva(bp); + bp->b_flags |= B_UNMAPPED; + bp->b_kvabase = bp->b_data = unmapped_buf; + bp->b_kvasize = maxsize; + atomic_add_long(&bufspace, bp->b_kvasize); + atomic_add_long(&unmapped_bufspace, bp->b_kvasize); + atomic_add_int(&bufreusecnt, 1); } else { + mtx_assert(&bqlock, MA_NOTOWNED); + /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order @@ -2151,39 +2435,47 @@ restart: */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; - if (maxsize != bp->b_kvasize) { - vm_offset_t addr = 0; - int rv; - - bfreekva(bp); - - vm_map_lock(buffer_map); - if (vm_map_findspace(buffer_map, - vm_map_min(buffer_map), maxsize, &addr)) { - /* - * Buffer map is too fragmented. - * We must defragment the map. - */ - atomic_add_int(&bufdefragcnt, 1); - vm_map_unlock(buffer_map); + if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED | + B_KVAALLOC)) == B_UNMAPPED) { + if (allocbufkva(bp, maxsize, gbflags)) { defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } - rv = vm_map_insert(buffer_map, NULL, 0, addr, - addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, - MAP_NOFAULT); - KASSERT(rv == KERN_SUCCESS, - ("vm_map_insert(buffer_map) rv %d", rv)); - vm_map_unlock(buffer_map); - bp->b_kvabase = (caddr_t)addr; - bp->b_kvasize = maxsize; - atomic_add_long(&bufspace, bp->b_kvasize); + atomic_add_int(&bufreusecnt, 1); + } else if ((bp->b_flags & B_KVAALLOC) != 0 && + (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) { + /* + * If the reused buffer has KVA allocated, + * reassign b_kvaalloc to b_kvabase. + */ + bp->b_kvabase = bp->b_kvaalloc; + bp->b_flags &= ~B_KVAALLOC; + atomic_subtract_long(&unmapped_bufspace, + bp->b_kvasize); + atomic_add_int(&bufreusecnt, 1); + } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 && + (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED | + GB_KVAALLOC)) { + /* + * The case of reused buffer already have KVA + * mapped, but the request is for unmapped + * buffer with KVA allocated. + */ + bp->b_kvaalloc = bp->b_kvabase; + bp->b_data = bp->b_kvabase = unmapped_buf; + bp->b_flags |= B_UNMAPPED | B_KVAALLOC; + atomic_add_long(&unmapped_bufspace, + bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } - bp->b_saveaddr = bp->b_kvabase; - bp->b_data = bp->b_saveaddr; + if ((gbflags & GB_UNMAPPED) == 0) { + bp->b_saveaddr = bp->b_kvabase; + bp->b_data = bp->b_saveaddr; + bp->b_flags &= ~B_UNMAPPED; + BUF_CHECK_MAPPED(bp); + } } return (bp); } @@ -2594,6 +2886,90 @@ vfs_setdirty_locked_object(struct buf *bp) } /* + * Allocate the KVA mapping for an existing buffer. It handles the + * cases of both B_UNMAPPED buffer, and buffer with the preallocated + * KVA which is not mapped (B_KVAALLOC). + */ +static void +bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) +{ + struct buf *scratch_bp; + int bsize, maxsize, need_mapping, need_kva; + off_t offset; + + need_mapping = (bp->b_flags & B_UNMAPPED) != 0 && + (gbflags & GB_UNMAPPED) == 0; + need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED && + (gbflags & GB_KVAALLOC) != 0; + if (!need_mapping && !need_kva) + return; + + BUF_CHECK_UNMAPPED(bp); + + if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) { + /* + * Buffer is not mapped, but the KVA was already + * reserved at the time of the instantiation. Use the + * allocated space. + */ + bp->b_flags &= ~B_KVAALLOC; + KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0")); + bp->b_kvabase = bp->b_kvaalloc; + atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); + goto has_addr; + } + + /* + * Calculate the amount of the address space we would reserve + * if the buffer was mapped. + */ + bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; + offset = blkno * bsize; + maxsize = size + (offset & PAGE_MASK); + maxsize = imax(maxsize, bsize); + +mapping_loop: + if (allocbufkva(bp, maxsize, gbflags)) { + /* + * Request defragmentation. getnewbuf() returns us the + * allocated space by the scratch buffer KVA. + */ + scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags | + (GB_UNMAPPED | GB_KVAALLOC)); + if (scratch_bp == NULL) { + if ((gbflags & GB_NOWAIT_BD) != 0) { + /* + * XXXKIB: defragmentation cannot + * succeed, not sure what else to do. + */ + panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp); + } + atomic_add_int(&mappingrestarts, 1); + goto mapping_loop; + } + KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0, + ("scratch bp !B_KVAALLOC %p", scratch_bp)); + setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc, + scratch_bp->b_kvasize, gbflags); + + /* Get rid of the scratch buffer. */ + scratch_bp->b_kvasize = 0; + scratch_bp->b_flags |= B_INVAL; + scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC); + brelse(scratch_bp); + } + if (!need_mapping) + return; + +has_addr: + bp->b_saveaddr = bp->b_kvabase; + bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */ + bp->b_flags &= ~B_UNMAPPED; + BUF_CHECK_MAPPED(bp); + bpmap_qenter(bp); +} + +/* * getblk: * * Get a block given a specified block and offset into a file/device. @@ -2635,12 +3011,17 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, { struct buf *bp; struct bufobj *bo; - int error; + int bsize, error, maxsize, vmio; + off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); + KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); + if (!unmapped_buf_allowed) + flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: @@ -2743,12 +3124,18 @@ loop: } /* + * Handle the case of unmapped buffer which should + * become mapped, or the buffer for which KVA + * reservation is requested. + */ + bp_unmapped_get_kva(bp, blkno, size, flags); + + /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ - if (bp->b_bcount != size) allocbuf(bp, size); @@ -2789,9 +3176,6 @@ loop: } bp->b_flags &= ~B_DONE; } else { - int bsize, maxsize, vmio; - off_t offset; - /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned @@ -2807,7 +3191,13 @@ loop: bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; offset = blkno * bsize; vmio = vp->v_object != NULL; - maxsize = vmio ? size + (offset & PAGE_MASK) : size; + if (vmio) { + maxsize = size + (offset & PAGE_MASK); + } else { + maxsize = size; + /* Do not allow non-VMIO notmapped buffers. */ + flags &= ~GB_UNMAPPED; + } maxsize = imax(maxsize, bsize); bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); @@ -2863,6 +3253,7 @@ loop: KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); + BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); @@ -3038,10 +3429,14 @@ allocbuf(struct buf *bp, int size) if (desiredpages < bp->b_npages) { vm_page_t m; - pmap_qremove((vm_offset_t)trunc_page( - (vm_offset_t)bp->b_data) + - (desiredpages << PAGE_SHIFT), - (bp->b_npages - desiredpages)); + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qremove((vm_offset_t)trunc_page( + (vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), + (bp->b_npages - desiredpages)); + } else + BUF_CHECK_UNMAPPED(bp); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = desiredpages; i < bp->b_npages; i++) { /* @@ -3147,21 +3542,12 @@ allocbuf(struct buf *bp, int size) VM_OBJECT_WUNLOCK(obj); /* - * Step 3, fixup the KVM pmap. Remember that - * bp->b_data is relative to bp->b_offset, but - * bp->b_offset may be offset into the first page. + * Step 3, fixup the KVM pmap. */ - - bp->b_data = (caddr_t) - trunc_page((vm_offset_t)bp->b_data); - pmap_qenter( - (vm_offset_t)bp->b_data, - bp->b_pages, - bp->b_npages - ); - - bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | - (vm_offset_t)(bp->b_offset & PAGE_MASK)); + if ((bp->b_flags & B_UNMAPPED) == 0) + bpmap_qenter(bp); + else + BUF_CHECK_UNMAPPED(bp); } } if (newbsize < bp->b_bufsize) @@ -3171,21 +3557,38 @@ allocbuf(struct buf *bp, int size) return 1; } +extern int inflight_transient_maps; + void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); + vm_offset_t start, end; + int transient; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; + if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { + start = trunc_page((vm_offset_t)bp->bio_data); + end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); + transient = 1; + } else { + transient = 0; + start = end = 0; + } done = bp->bio_done; if (done == NULL) wakeup(bp); mtx_unlock(mtxp); if (done != NULL) done(bp); + if (transient) { + pmap_qremove(start, OFF_TO_IDX(end - start)); + vm_map_remove(bio_transient_map, start, end); + atomic_add_int(&inflight_transient_maps, -1); + } } /* @@ -3288,7 +3691,7 @@ dev_strategy(struct cdev *dev, struct buf *bp) bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bip->bio_bcount = bp->b_bcount; /* XXX: remove */ - bip->bio_data = bp->b_data; + bdata2bio(bp, bip); bip->bio_done = bufdonebio; bip->bio_caller2 = bp; bip->bio_dev = dev; @@ -3442,9 +3845,11 @@ bufdone_finish(struct buf *bp) } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_WUNLOCK(obj); - if (bogus) + if (bogus && (bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } } /* @@ -3487,8 +3892,12 @@ vfs_unbusy_pages(struct buf *bp) if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; - pmap_qenter(trunc_page((vm_offset_t)bp->b_data), - bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); } vm_object_pip_subtract(obj, 1); vm_page_io_finish(m); @@ -3653,9 +4062,11 @@ vfs_busy_pages(struct buf *bp, int clear_modify) foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); - if (bogus) + if (bogus && (bp->b_flags & B_UNMAPPED) == 0) { + BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } } /* @@ -3777,6 +4188,8 @@ vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) vm_page_t p; int index; + BUF_CHECK_MAPPED(bp); + to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; @@ -3808,6 +4221,8 @@ vm_hold_free_pages(struct buf *bp, int newbsize) vm_page_t p; int index, newnpages; + BUF_CHECK_MAPPED(bp); + from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) @@ -4009,6 +4424,30 @@ bunpin_wait(struct buf *bp) mtx_unlock(mtxp); } +/* + * Set bio_data or bio_ma for struct bio from the struct buf. + */ +void +bdata2bio(struct buf *bp, struct bio *bip) +{ + + if ((bp->b_flags & B_UNMAPPED) != 0) { + KASSERT(unmapped_buf_allowed, ("unmapped")); + bip->bio_ma = bp->b_pages; + bip->bio_ma_n = bp->b_npages; + bip->bio_data = unmapped_buf; + bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; + bip->bio_flags |= BIO_UNMAPPED; + KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / + PAGE_SIZE == bp->b_npages, + ("Buffer %p too short: %d %d %d", bp, bip->bio_ma_offset, + bip->bio_length, bip->bio_ma_n)); + } else { + bip->bio_data = bp->b_data; + bip->bio_ma = NULL; + } +} + #include "opt_ddb.h" #ifdef DDB #include <ddb/ddb.h> diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 28aa4ff..91a0443 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -61,11 +61,11 @@ SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); -static struct cluster_save * - cluster_collectbufs(struct vnode *vp, struct buf *last_bp); -static struct buf * - cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, - daddr_t blkno, long size, int run, struct buf *fbp); +static struct cluster_save *cluster_collectbufs(struct vnode *vp, + struct buf *last_bp, int gbflags); +static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, + daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, + struct buf *fbp); static void cluster_callback(struct buf *); static int write_behind = 1; @@ -97,6 +97,8 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, error = 0; bo = &vp->v_bufobj; + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; /* * Try to limit the amount of read-ahead by a few @@ -112,7 +114,7 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, /* * get the requested block */ - *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0); + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags); origblkno = lblkno; /* @@ -203,7 +205,7 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, - blkno, size, nblks, bp); + blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; @@ -247,14 +249,14 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, - size, ncontig, NULL); + size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { - rbp = getblk(vp, lblkno, size, 0, 0, 0); + rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); @@ -293,14 +295,8 @@ cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * -cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) - struct vnode *vp; - u_quad_t filesize; - daddr_t lbn; - daddr_t blkno; - long size; - int run; - struct buf *fbp; +cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) { struct bufobj *bo; struct buf *bp, *tbp; @@ -324,7 +320,7 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) tbp = fbp; tbp->b_iocmd = BIO_READ; } else { - tbp = getblk(vp, lbn, size, 0, 0, 0); + tbp = getblk(vp, lbn, size, 0, 0, gbflags); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; @@ -345,9 +341,14 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) * address may not be either. Inherit the b_data offset * from the original buffer. */ - bp->b_data = (char *)((vm_offset_t)bp->b_data | - ((vm_offset_t)tbp->b_data & PAGE_MASK)); bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; + if ((gbflags & GB_UNMAPPED) != 0) { + bp->b_flags |= B_UNMAPPED; + bp->b_data = unmapped_buf; + } else { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; @@ -371,7 +372,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) break; } - tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT); + tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | + (gbflags & GB_UNMAPPED)); /* Don't wait around for locked bufs. */ if (tbp == NULL) @@ -493,8 +495,10 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) bp->b_bufsize, bp->b_kvasize); bp->b_kvasize = bp->b_bufsize; - pmap_qenter(trunc_page((vm_offset_t) bp->b_data), - (vm_page_t *)bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } return (bp); } @@ -517,7 +521,10 @@ cluster_callback(bp) if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; - pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), + bp->b_npages); + } /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. @@ -559,7 +566,8 @@ cluster_callback(bp) */ static __inline int -cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) +cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) { int r = 0; @@ -570,7 +578,7 @@ cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len) start_lbn -= len; /* FALLTHROUGH */ case 1: - r = cluster_wbuild(vp, size, start_lbn, len, 0); + r = cluster_wbuild(vp, size, start_lbn, len, gbflags); /* FALLTHROUGH */ default: /* FALLTHROUGH */ @@ -598,6 +606,9 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, int lblocksize; int async; + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; + if (vp->v_type == VREG) { async = DOINGASYNC(vp); lblocksize = vp->v_mount->mnt_stat.f_iosize; @@ -637,13 +648,13 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, - vp->v_cstart, cursize); + vp->v_cstart, cursize, gbflags); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; - buflist = cluster_collectbufs(vp, bp); + buflist = cluster_collectbufs(vp, bp, gbflags); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { @@ -662,7 +673,7 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, - cursize); + cursize, gbflags); } } else { /* @@ -710,8 +721,10 @@ cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, * update daemon handle it. */ bdwrite(bp); - if (seqcount > 1) - cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); + if (seqcount > 1) { + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, + vp->v_clen + 1, gbflags); + } vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { @@ -746,6 +759,9 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, int totalwritten = 0; int dbsize = btodb(size); + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; + bo = &vp->v_bufobj; while (len > 0) { /* @@ -824,10 +840,16 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, * address may not be either. Inherit the b_data offset * from the original buffer. */ - bp->b_data = (char *)((vm_offset_t)bp->b_data | - ((vm_offset_t)tbp->b_data & PAGE_MASK)); - bp->b_flags |= B_CLUSTER | - (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); + if ((gbflags & GB_UNMAPPED) == 0 || + (tbp->b_flags & B_VMIO) == 0) { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } else { + bp->b_flags |= B_UNMAPPED; + bp->b_data = unmapped_buf; + } + bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | + B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* @@ -954,8 +976,10 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, tbp, b_cluster.cluster_entry); } finishcluster: - pmap_qenter(trunc_page((vm_offset_t) bp->b_data), - (vm_page_t *) bp->b_pages, bp->b_npages); + if ((bp->b_flags & B_UNMAPPED) == 0) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", @@ -976,9 +1000,7 @@ cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, * Plus add one additional buffer. */ static struct cluster_save * -cluster_collectbufs(vp, last_bp) - struct vnode *vp; - struct buf *last_bp; +cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) { struct cluster_save *buflist; struct buf *bp; @@ -991,7 +1013,8 @@ cluster_collectbufs(vp, last_bp) buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { - (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); + (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, + gbflags, &bp); buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c index 4fe6ebe..5c94a39 100644 --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -2576,6 +2576,8 @@ pmap_copy_page(vm_page_t src, vm_page_t dst) } } +int unmapped_buf_allowed; + void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) diff --git a/sys/powerpc/aim/mmu_oea64.c b/sys/powerpc/aim/mmu_oea64.c index 90466e8a..66194df 100644 --- a/sys/powerpc/aim/mmu_oea64.c +++ b/sys/powerpc/aim/mmu_oea64.c @@ -648,6 +648,14 @@ moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart, moea64_kenter(mmup, pa, pa); } ENABLE_TRANS(msr); + + /* + * Allow user to override unmapped_buf_allowed for testing. + * XXXKIB Only direct map implementation was tested. + */ + if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", + &unmapped_buf_allowed)) + unmapped_buf_allowed = hw_direct_map; } void diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c index 42f1a39..7fd98f4 100644 --- a/sys/powerpc/powerpc/pmap_dispatch.c +++ b/sys/powerpc/powerpc/pmap_dispatch.c @@ -574,3 +574,5 @@ pmap_mmu_install(char *name, int prio) return (FALSE); } + +int unmapped_buf_allowed; diff --git a/sys/sparc64/sparc64/pmap.c b/sys/sparc64/sparc64/pmap.c index 97a085a..2a24eb5 100644 --- a/sys/sparc64/sparc64/pmap.c +++ b/sys/sparc64/sparc64/pmap.c @@ -1918,6 +1918,8 @@ pmap_copy_page(vm_page_t msrc, vm_page_t mdst) } } +int unmapped_buf_allowed; + void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) diff --git a/sys/sys/bio.h b/sys/sys/bio.h index c016ee6..7678f5a 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -55,10 +55,13 @@ #define BIO_DONE 0x02 #define BIO_ONQUEUE 0x04 #define BIO_ORDERED 0x08 +#define BIO_UNMAPPED 0x10 +#define BIO_TRANSIENT_MAPPING 0x20 #ifdef _KERNEL struct disk; struct bio; +struct vm_map; /* Empty classifier tag, to prevent further classification. */ #define BIO_NOTCLASSIFIED (void *)(~0UL) @@ -78,6 +81,9 @@ struct bio { off_t bio_offset; /* Offset into file. */ long bio_bcount; /* Valid bytes in buffer. */ caddr_t bio_data; /* Memory, superblocks, indirect etc. */ + struct vm_page **bio_ma; /* Or unmapped. */ + int bio_ma_offset; /* Offset in the first page of bio_ma. */ + int bio_ma_n; /* Number of pages in bio_ma. */ int bio_error; /* Errno for BIO_ERROR. */ long bio_resid; /* Remaining I/O in bytes. */ void (*bio_done)(struct bio *); @@ -121,6 +127,9 @@ struct bio_queue_head { struct bio *insert_point; }; +extern struct vm_map *bio_transient_map; +extern int bio_transient_maxcnt; + void biodone(struct bio *bp); void biofinish(struct bio *bp, struct devstat *stat, int error); int biowait(struct bio *bp, const char *wchan); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 87f2b76..cc8029e 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -117,6 +117,7 @@ struct buf { long b_bufsize; /* Allocated buffer size. */ long b_runningbufspace; /* when I/O is running, pipelining */ caddr_t b_kvabase; /* base kva for buffer */ + caddr_t b_kvaalloc; /* allocated kva for B_KVAALLOC */ int b_kvasize; /* size of kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ @@ -202,8 +203,8 @@ struct buf { #define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ -#define B_00000800 0x00000800 /* Available flag. */ -#define B_00001000 0x00001000 /* Available flag. */ +#define B_UNMAPPED 0x00000800 /* KVA is not mapped. */ +#define B_KVAALLOC 0x00001000 /* But allocated. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceeding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ @@ -453,7 +454,9 @@ buf_countdeps(struct buf *bp, int i) */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ -#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */ +#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ +#define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ +#define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ @@ -470,11 +473,13 @@ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ +extern caddr_t unmapped_buf; void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); +void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); diff --git a/sys/sys/systm.h b/sys/sys/systm.h index 12337de..61b4679 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -138,6 +138,7 @@ extern char **kenvp; extern const void *zero_region; /* address space maps to a zeroed page */ +extern int unmapped_buf_allowed; extern int iosize_max_clamp; #define IOSIZE_MAX (iosize_max_clamp ? INT_MAX : SSIZE_MAX) diff --git a/sys/vm/vm.h b/sys/vm/vm.h index 132c10e..106c510 100644 --- a/sys/vm/vm.h +++ b/sys/vm/vm.h @@ -136,6 +136,8 @@ struct kva_md_info { vm_offset_t clean_eva; vm_offset_t pager_sva; vm_offset_t pager_eva; + vm_offset_t bio_transient_sva; + vm_offset_t bio_transient_eva; }; extern struct kva_md_info kmi; diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c index 08c9b03..b350e96 100644 --- a/sys/vm/vm_init.c +++ b/sys/vm/vm_init.c @@ -184,10 +184,15 @@ again: panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva, - (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS, TRUE); + (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS + + (long)bio_transient_maxcnt * MAXPHYS, TRUE); buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva, &kmi->buffer_eva, (long)nbuf * BKVASIZE, FALSE); buffer_map->system_map = 1; + bio_transient_map = kmem_suballoc(clean_map, &kmi->bio_transient_sva, + &kmi->bio_transient_eva, (long)bio_transient_maxcnt * MAXPHYS, + FALSE); + bio_transient_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva, (long)nswbuf * MAXPHYS, FALSE); pager_map->system_map = 1; diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index 64a2ebb..7c7ccc1 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -90,6 +90,7 @@ vm_map_t kmem_map; vm_map_t exec_map; vm_map_t pipe_map; vm_map_t buffer_map; +vm_map_t bio_transient_map; const void *zero_region; CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0); |