summaryrefslogtreecommitdiffstats
path: root/sys/geom
diff options
context:
space:
mode:
authorkib <kib@FreeBSD.org>2013-03-19 14:13:12 +0000
committerkib <kib@FreeBSD.org>2013-03-19 14:13:12 +0000
commit7c26a038f99b336029be9c909af59ab894214591 (patch)
treedccc6b57fecce89556fcde1d5370a34e16178ff5 /sys/geom
parent878ef603e27acf1ab599ceece679d5f8bfda9f42 (diff)
downloadFreeBSD-src-7c26a038f99b336029be9c909af59ab894214591.zip
FreeBSD-src-7c26a038f99b336029be9c909af59ab894214591.tar.gz
Implement the concept of the unmapped VMIO buffers, i.e. buffers which
do not map the b_pages pages into buffer_map KVA. The use of the unmapped buffers eliminates the need to perform TLB shootdown for mapping on the buffer creation and reuse, greatly reducing the amount of IPIs for shootdown on big-SMP machines and eliminating up to 25-30% of the system time on i/o intensive workloads. The unmapped buffer should be explicitly requested by the GB_UNMAPPED flag by the consumer. For unmapped buffer, no KVA reservation is performed at all. The consumer might request unmapped buffer which does have a KVA reserve, to manually map it without recursing into buffer cache and blocking, with the GB_KVAALLOC flag. When the mapped buffer is requested and unmapped buffer already exists, the cache performs an upgrade, possibly reusing the KVA reservation. Unmapped buffer is translated into unmapped bio in g_vfs_strategy(). Unmapped bio carries a pointer to the vm_page_t array, offset and length instead of the data pointer. The provider which processes the bio should explicitly specify a readiness to accept unmapped bio, otherwise g_down geom thread performs the transient upgrade of the bio request by mapping the pages into the new bio_transient_map KVA submap. The bio_transient_map submap claims up to 10% of the buffer map, and the total buffer_map + bio_transient_map KVA usage stays the same. Still, it could be manually tuned by kern.bio_transient_maxcnt tunable, in the units of the transient mappings. Eventually, the bio_transient_map could be removed after all geom classes and drivers can accept unmapped i/o requests. Unmapped support can be turned off by the vfs.unmapped_buf_allowed tunable, disabling which makes the buffer (or cluster) creation requests ignore the GB_UNMAPPED and GB_KVAALLOC flags. Unmapped buffers are only enabled by default on the architectures where pmap_copy_page() was implemented and tested. In the rework, filesystem metadata is no longer subject to the maxbufspace limit. 
Since the metadata buffers are always mapped, the buffers still have to fit into the buffer map, which provides a reasonable (but practically unreachable) upper bound on it. The non-metadata buffer allocations, both mapped and unmapped, are accounted against maxbufspace, as before. Effectively, this means that the maxbufspace is forced on mapped and unmapped buffers separately. The pre-patch bufspace limiting code did not work, because buffer_map fragmentation does not allow the limit to be reached. By Jeff Roberson's request, the getnewbuf() function was split into smaller single-purpose functions. Sponsored by: The FreeBSD Foundation Discussed with: jeff (previous version) Tested by: pho, scottl (previous version), jhb, bf MFC after: 2 weeks
Diffstat (limited to 'sys/geom')
-rw-r--r--sys/geom/geom.h1
-rw-r--r--sys/geom/geom_io.c106
-rw-r--r--sys/geom/geom_vfs.c8
3 files changed, 110 insertions, 5 deletions
diff --git a/sys/geom/geom.h b/sys/geom/geom.h
index 351b05d..660bf6e 100644
--- a/sys/geom/geom.h
+++ b/sys/geom/geom.h
@@ -205,6 +205,7 @@ struct g_provider {
u_int flags;
#define G_PF_WITHER 0x2
#define G_PF_ORPHAN 0x4
+#define G_PF_ACCEPT_UNMAPPED 0x8
/* Two fields for the implementing class to use */
void *private;
diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c
index c6a5da8..6ffc06e 100644
--- a/sys/geom/geom_io.c
+++ b/sys/geom/geom_io.c
@@ -1,6 +1,7 @@
/*-
* Copyright (c) 2002 Poul-Henning Kamp
* Copyright (c) 2002 Networks Associates Technology, Inc.
+ * Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Poul-Henning Kamp
@@ -8,6 +9,9 @@
* under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
* DARPA CHATS research program.
*
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -44,6 +48,7 @@ __FBSDID("$FreeBSD$");
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/stack.h>
+#include <sys/sysctl.h>
#include <sys/errno.h>
#include <geom/geom.h>
@@ -51,6 +56,13 @@ __FBSDID("$FreeBSD$");
#include <sys/devicestat.h>
#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
static struct g_bioq g_bio_run_down;
static struct g_bioq g_bio_run_up;
@@ -180,12 +192,17 @@ g_clone_bio(struct bio *bp)
/*
* BIO_ORDERED flag may be used by disk drivers to enforce
* ordering restrictions, so this flag needs to be cloned.
+ * BIO_UNMAPPED should be inherited, to properly indicate
+ * which way the buffer is passed.
* Other bio flags are not suitable for cloning.
*/
- bp2->bio_flags = bp->bio_flags & BIO_ORDERED;
+ bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED);
bp2->bio_length = bp->bio_length;
bp2->bio_offset = bp->bio_offset;
bp2->bio_data = bp->bio_data;
+ bp2->bio_ma = bp->bio_ma;
+ bp2->bio_ma_n = bp->bio_ma_n;
+ bp2->bio_ma_offset = bp->bio_ma_offset;
bp2->bio_attribute = bp->bio_attribute;
/* Inherit classification info from the parent */
bp2->bio_classifier1 = bp->bio_classifier1;
@@ -210,11 +227,15 @@ g_duplicate_bio(struct bio *bp)
struct bio *bp2;
bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO);
+ bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED;
bp2->bio_parent = bp;
bp2->bio_cmd = bp->bio_cmd;
bp2->bio_length = bp->bio_length;
bp2->bio_offset = bp->bio_offset;
bp2->bio_data = bp->bio_data;
+ bp2->bio_ma = bp->bio_ma;
+ bp2->bio_ma_n = bp->bio_ma_n;
+ bp2->bio_ma_offset = bp->bio_ma_offset;
bp2->bio_attribute = bp->bio_attribute;
bp->bio_children++;
#ifdef KTR
@@ -575,6 +596,83 @@ g_io_deliver(struct bio *bp, int error)
return;
}
+SYSCTL_DECL(_kern_geom);
+
+static long transient_maps;
+SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD,
+ &transient_maps, 0,
+ "Total count of the transient mapping requests");
+u_int transient_map_retries = 10;
+SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW,
+ &transient_map_retries, 0,
+ "Max count of retries used before giving up on creating transient map");
+int transient_map_hard_failures;
+SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD,
+ &transient_map_hard_failures, 0,
+ "Failures to establish the transient mapping due to retry attempts "
+ "exhausted");
+int transient_map_soft_failures;
+SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD,
+ &transient_map_soft_failures, 0,
+ "Count of retried failures to establish the transient mapping");
+int inflight_transient_maps;
+SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD,
+ &inflight_transient_maps, 0,
+ "Current count of the active transient maps");
+
+static int
+g_io_transient_map_bio(struct bio *bp)
+{
+ vm_offset_t addr;
+ long size;
+ u_int retried;
+ int rv;
+
+ size = round_page(bp->bio_ma_offset + bp->bio_length);
+ KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp));
+ addr = 0;
+ retried = 0;
+ atomic_add_long(&transient_maps, 1);
+retry:
+ vm_map_lock(bio_transient_map);
+ if (vm_map_findspace(bio_transient_map, vm_map_min(bio_transient_map),
+ size, &addr)) {
+ vm_map_unlock(bio_transient_map);
+ if (transient_map_retries != 0 &&
+ retried >= transient_map_retries) {
+ g_io_deliver(bp, EDEADLK/* XXXKIB */);
+ CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s",
+ bp, bp->bio_to->name);
+ atomic_add_int(&transient_map_hard_failures, 1);
+ return (1);
+ } else {
+ /*
+ * Naive attempt to quisce the I/O to get more
+ * in-flight requests completed and defragment
+ * the bio_transient_map.
+ */
+ CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d",
+ bp, bp->bio_to->name, retried);
+ pause("g_d_tra", hz / 10);
+ retried++;
+ atomic_add_int(&transient_map_soft_failures, 1);
+ goto retry;
+ }
+ }
+ rv = vm_map_insert(bio_transient_map, NULL, 0, addr, addr + size,
+ VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
+ KASSERT(rv == KERN_SUCCESS,
+ ("vm_map_insert(bio_transient_map) rv %d %jx %lx",
+ rv, (uintmax_t)addr, size));
+ vm_map_unlock(bio_transient_map);
+ atomic_add_int(&inflight_transient_maps, 1);
+ pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size));
+ bp->bio_data = (caddr_t)addr + bp->bio_ma_offset;
+ bp->bio_flags |= BIO_TRANSIENT_MAPPING;
+ bp->bio_flags &= ~BIO_UNMAPPED;
+ return (0);
+}
+
void
g_io_schedule_down(struct thread *tp __unused)
{
@@ -636,6 +734,12 @@ g_io_schedule_down(struct thread *tp __unused)
default:
break;
}
+ if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
+ (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 &&
+ (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
+ if (g_io_transient_map_bio(bp))
+ continue;
+ }
THREAD_NO_SLEEPING();
CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld "
"len %ld", bp, bp->bio_to->name, bp->bio_offset,
diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c
index bbed550..92f1ad2 100644
--- a/sys/geom/geom_vfs.c
+++ b/sys/geom/geom_vfs.c
@@ -188,14 +188,14 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp)
bip = g_alloc_bio();
bip->bio_cmd = bp->b_iocmd;
bip->bio_offset = bp->b_iooffset;
- bip->bio_data = bp->b_data;
- bip->bio_done = g_vfs_done;
- bip->bio_caller2 = bp;
bip->bio_length = bp->b_bcount;
- if (bp->b_flags & B_BARRIER) {
+ bdata2bio(bp, bip);
+ if ((bp->b_flags & B_BARRIER) != 0) {
bip->bio_flags |= BIO_ORDERED;
bp->b_flags &= ~B_BARRIER;
}
+ bip->bio_done = g_vfs_done;
+ bip->bio_caller2 = bp;
g_io_request(bip, cp);
}
OpenPOWER on IntegriCloud