diff options
author | kib <kib@FreeBSD.org> | 2013-03-19 14:13:12 +0000 |
---|---|---|
committer | kib <kib@FreeBSD.org> | 2013-03-19 14:13:12 +0000 |
commit | 7c26a038f99b336029be9c909af59ab894214591 (patch) | |
tree | dccc6b57fecce89556fcde1d5370a34e16178ff5 /sys/geom | |
parent | 878ef603e27acf1ab599ceece679d5f8bfda9f42 (diff) | |
download | FreeBSD-src-7c26a038f99b336029be9c909af59ab894214591.zip FreeBSD-src-7c26a038f99b336029be9c909af59ab894214591.tar.gz |
Implement the concept of the unmapped VMIO buffers, i.e. buffers which
do not map the b_pages pages into buffer_map KVA. The use of the
unmapped buffers eliminates the need to perform TLB shootdown for
mapping on the buffer creation and reuse, greatly reducing the number
of IPIs for shootdown on big-SMP machines and eliminating up to 25-30%
of the system time on i/o intensive workloads.
An unmapped buffer must be explicitly requested by the consumer with
the GB_UNMAPPED flag. For an unmapped buffer, no KVA reservation is
performed at all. The consumer may request an unmapped buffer which
does have a KVA reservation, to manually map it without recursing into
the buffer cache and blocking, with the GB_KVAALLOC flag.
When a mapped buffer is requested and an unmapped buffer already
exists, the cache performs an upgrade, possibly reusing the KVA
reservation.
An unmapped buffer is translated into an unmapped bio in g_vfs_strategy().
An unmapped bio carries a pointer to the vm_page_t array, offset and length
instead of the data pointer. The provider which processes the bio
should explicitly declare its readiness to accept unmapped bios,
otherwise the g_down geom thread performs a transient upgrade of the bio
request by mapping the pages into the new bio_transient_map KVA
submap.
The bio_transient_map submap claims up to 10% of the buffer map, and
the total buffer_map + bio_transient_map KVA usage stays the
same. Still, it could be manually tuned by kern.bio_transient_maxcnt
tunable, in the units of the transient mappings. Eventually, the
bio_transient_map could be removed after all geom classes and drivers
can accept unmapped i/o requests.
Unmapped support can be turned off by the vfs.unmapped_buf_allowed
tunable; disabling it makes the buffer (or cluster) creation
requests ignore the GB_UNMAPPED and GB_KVAALLOC flags. Unmapped
buffers are only enabled by default on the architectures where
pmap_copy_page() was implemented and tested.
In the rework, filesystem metadata is no longer subject to the maxbufspace
limit. Since the metadata buffers are always mapped, the
buffers still have to fit into the buffer map, which provides a
reasonable (but practically unreachable) upper bound on it. The
non-metadata buffer allocations, both mapped and unmapped, are
accounted against maxbufspace, as before. Effectively, this means that
the maxbufspace is enforced on mapped and unmapped buffers separately.
The pre-patch bufspace limiting code did not work, because
buffer_map fragmentation does not allow the limit to be reached.
At Jeff Roberson's request, the getnewbuf() function was split into
smaller single-purpose functions.
Sponsored by: The FreeBSD Foundation
Discussed with: jeff (previous version)
Tested by: pho, scottl (previous version), jhb, bf
MFC after: 2 weeks
Diffstat (limited to 'sys/geom')
-rw-r--r-- | sys/geom/geom.h | 1 | ||||
-rw-r--r-- | sys/geom/geom_io.c | 106 | ||||
-rw-r--r-- | sys/geom/geom_vfs.c | 8 |
3 files changed, 110 insertions, 5 deletions
diff --git a/sys/geom/geom.h b/sys/geom/geom.h index 351b05d..660bf6e 100644 --- a/sys/geom/geom.h +++ b/sys/geom/geom.h @@ -205,6 +205,7 @@ struct g_provider { u_int flags; #define G_PF_WITHER 0x2 #define G_PF_ORPHAN 0x4 +#define G_PF_ACCEPT_UNMAPPED 0x8 /* Two fields for the implementing class to use */ void *private; diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c index c6a5da8..6ffc06e 100644 --- a/sys/geom/geom_io.c +++ b/sys/geom/geom_io.c @@ -1,6 +1,7 @@ /*- * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. + * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp @@ -8,6 +9,9 @@ * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -44,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <sys/ktr.h> #include <sys/proc.h> #include <sys/stack.h> +#include <sys/sysctl.h> #include <sys/errno.h> #include <geom/geom.h> @@ -51,6 +56,13 @@ __FBSDID("$FreeBSD$"); #include <sys/devicestat.h> #include <vm/uma.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> static struct g_bioq g_bio_run_down; static struct g_bioq g_bio_run_up; @@ -180,12 +192,17 @@ g_clone_bio(struct bio *bp) /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. + * BIO_UNMAPPED should be inherited, to properly indicate + * which way the buffer is passed. * Other bio flags are not suitable for cloning. 
*/ - bp2->bio_flags = bp->bio_flags & BIO_ORDERED; + bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; + bp2->bio_ma = bp->bio_ma; + bp2->bio_ma_n = bp->bio_ma_n; + bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; /* Inherit classification info from the parent */ bp2->bio_classifier1 = bp->bio_classifier1; @@ -210,11 +227,15 @@ g_duplicate_bio(struct bio *bp) struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); + bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED; bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; + bp2->bio_ma = bp->bio_ma; + bp2->bio_ma_n = bp->bio_ma_n; + bp2->bio_ma_offset = bp->bio_ma_offset; bp2->bio_attribute = bp->bio_attribute; bp->bio_children++; #ifdef KTR @@ -575,6 +596,83 @@ g_io_deliver(struct bio *bp, int error) return; } +SYSCTL_DECL(_kern_geom); + +static long transient_maps; +SYSCTL_LONG(_kern_geom, OID_AUTO, transient_maps, CTLFLAG_RD, + &transient_maps, 0, + "Total count of the transient mapping requests"); +u_int transient_map_retries = 10; +SYSCTL_UINT(_kern_geom, OID_AUTO, transient_map_retries, CTLFLAG_RW, + &transient_map_retries, 0, + "Max count of retries used before giving up on creating transient map"); +int transient_map_hard_failures; +SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_hard_failures, CTLFLAG_RD, + &transient_map_hard_failures, 0, + "Failures to establish the transient mapping due to retry attempts " + "exhausted"); +int transient_map_soft_failures; +SYSCTL_INT(_kern_geom, OID_AUTO, transient_map_soft_failures, CTLFLAG_RD, + &transient_map_soft_failures, 0, + "Count of retried failures to establish the transient mapping"); +int inflight_transient_maps; +SYSCTL_INT(_kern_geom, OID_AUTO, inflight_transient_maps, CTLFLAG_RD, + &inflight_transient_maps, 0, + "Current 
count of the active transient maps"); + +static int +g_io_transient_map_bio(struct bio *bp) +{ + vm_offset_t addr; + long size; + u_int retried; + int rv; + + size = round_page(bp->bio_ma_offset + bp->bio_length); + KASSERT(size / PAGE_SIZE == bp->bio_ma_n, ("Bio too short %p", bp)); + addr = 0; + retried = 0; + atomic_add_long(&transient_maps, 1); +retry: + vm_map_lock(bio_transient_map); + if (vm_map_findspace(bio_transient_map, vm_map_min(bio_transient_map), + size, &addr)) { + vm_map_unlock(bio_transient_map); + if (transient_map_retries != 0 && + retried >= transient_map_retries) { + g_io_deliver(bp, EDEADLK/* XXXKIB */); + CTR2(KTR_GEOM, "g_down cannot map bp %p provider %s", + bp, bp->bio_to->name); + atomic_add_int(&transient_map_hard_failures, 1); + return (1); + } else { + /* + * Naive attempt to quisce the I/O to get more + * in-flight requests completed and defragment + * the bio_transient_map. + */ + CTR3(KTR_GEOM, "g_down retrymap bp %p provider %s r %d", + bp, bp->bio_to->name, retried); + pause("g_d_tra", hz / 10); + retried++; + atomic_add_int(&transient_map_soft_failures, 1); + goto retry; + } + } + rv = vm_map_insert(bio_transient_map, NULL, 0, addr, addr + size, + VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); + KASSERT(rv == KERN_SUCCESS, + ("vm_map_insert(bio_transient_map) rv %d %jx %lx", + rv, (uintmax_t)addr, size)); + vm_map_unlock(bio_transient_map); + atomic_add_int(&inflight_transient_maps, 1); + pmap_qenter((vm_offset_t)addr, bp->bio_ma, OFF_TO_IDX(size)); + bp->bio_data = (caddr_t)addr + bp->bio_ma_offset; + bp->bio_flags |= BIO_TRANSIENT_MAPPING; + bp->bio_flags &= ~BIO_UNMAPPED; + return (0); +} + void g_io_schedule_down(struct thread *tp __unused) { @@ -636,6 +734,12 @@ g_io_schedule_down(struct thread *tp __unused) default: break; } + if ((bp->bio_flags & BIO_UNMAPPED) != 0 && + (bp->bio_to->flags & G_PF_ACCEPT_UNMAPPED) == 0 && + (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { + if (g_io_transient_map_bio(bp)) + continue; + } 
THREAD_NO_SLEEPING(); CTR4(KTR_GEOM, "g_down starting bp %p provider %s off %ld " "len %ld", bp, bp->bio_to->name, bp->bio_offset, diff --git a/sys/geom/geom_vfs.c b/sys/geom/geom_vfs.c index bbed550..92f1ad2 100644 --- a/sys/geom/geom_vfs.c +++ b/sys/geom/geom_vfs.c @@ -188,14 +188,14 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp) bip = g_alloc_bio(); bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; - bip->bio_data = bp->b_data; - bip->bio_done = g_vfs_done; - bip->bio_caller2 = bp; bip->bio_length = bp->b_bcount; - if (bp->b_flags & B_BARRIER) { + bdata2bio(bp, bip); + if ((bp->b_flags & B_BARRIER) != 0) { bip->bio_flags |= BIO_ORDERED; bp->b_flags &= ~B_BARRIER; } + bip->bio_done = g_vfs_done; + bip->bio_caller2 = bp; g_io_request(bip, cp); } |