Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r-- | sys/kern/vfs_bio.c | 240
1 file changed, 155 insertions, 85 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index d29c308..5f68bde 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -64,9 +64,11 @@ __FBSDID("$FreeBSD$");
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
+#include <sys/sysproto.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
+#include <sys/watchdog.h>
 #include <geom/geom.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -76,6 +78,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
+#include <vm/swap_pager.h>
 
 #include "opt_compat.h"
 #include "opt_swap.h"
@@ -91,11 +94,8 @@ struct buf_ops buf_ops_bio = {
 	.bop_bdflush	=	bufbdflush,
 };
 
-/*
- * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has
- * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
- */
-struct buf *buf;		/* buffer header pool */
+static struct buf *buf;		/* buffer header pool */
+extern struct buf *swbuf;	/* Swap buffer header pool. */
 caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
@@ -309,13 +309,12 @@ static int bdirtywait;
 /*
  * Definitions for the buffer free lists.
  */
-#define BUFFER_QUEUES	5	/* number of free buffer queues */
+#define BUFFER_QUEUES	4	/* number of free buffer queues */
 
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
-#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
-#define QUEUE_EMPTY	4	/* empty buffer headers */
+#define QUEUE_EMPTY	3	/* empty buffer headers */
 #define QUEUE_SENTINEL	1024	/* not an queue index, but mark for sentinel */
 
 /* Queues for free buffers with various properties */
@@ -956,7 +955,7 @@ vfs_buf_check_mapped(struct buf *bp)
 	    ("mapped buf: b_kvabase was not updated %p", bp));
 	KASSERT(bp->b_data != unmapped_buf,
 	    ("mapped buf: b_data was not updated %p", bp));
-	KASSERT(bp->b_data < unmapped_buf || bp->b_data > unmapped_buf +
+	KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
 	    MAXPHYS, ("b_data + b_offset unmapped %p", bp));
 }
 
@@ -975,6 +974,135 @@ vfs_buf_check_unmapped(struct buf *bp)
 #define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
 #endif
 
+static int
+isbufbusy(struct buf *bp)
+{
+	if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 &&
+	    BUF_ISLOCKED(bp)) ||
+	    ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
+		return (1);
+	return (0);
+}
+
+/*
+ * Shutdown the system cleanly to prepare for reboot, halt, or power off.
+ */
+void
+bufshutdown(int show_busybufs)
+{
+	static int first_buf_printf = 1;
+	struct buf *bp;
+	int iter, nbusy, pbusy;
+#ifndef PREEMPTION
+	int subiter;
+#endif
+
+	/*
+	 * Sync filesystems for shutdown
+	 */
+	wdog_kern_pat(WD_LASTVAL);
+	sys_sync(curthread, NULL);
+
+	/*
+	 * With soft updates, some buffers that are
+	 * written will be remarked as dirty until other
+	 * buffers are written.
+	 */
+	for (iter = pbusy = 0; iter < 20; iter++) {
+		nbusy = 0;
+		for (bp = &buf[nbuf]; --bp >= buf; )
+			if (isbufbusy(bp))
+				nbusy++;
+		if (nbusy == 0) {
+			if (first_buf_printf)
+				printf("All buffers synced.");
+			break;
+		}
+		if (first_buf_printf) {
+			printf("Syncing disks, buffers remaining... ");
+			first_buf_printf = 0;
+		}
+		printf("%d ", nbusy);
+		if (nbusy < pbusy)
+			iter = 0;
+		pbusy = nbusy;
+
+		wdog_kern_pat(WD_LASTVAL);
+		sys_sync(curthread, NULL);
+
+#ifdef PREEMPTION
+		/*
+		 * Drop Giant and spin for a while to allow
+		 * interrupt threads to run.
+		 */
+		DROP_GIANT();
+		DELAY(50000 * iter);
+		PICKUP_GIANT();
+#else
+		/*
+		 * Drop Giant and context switch several times to
+		 * allow interrupt threads to run.
+		 */
+		DROP_GIANT();
+		for (subiter = 0; subiter < 50 * iter; subiter++) {
+			thread_lock(curthread);
+			mi_switch(SW_VOL, NULL);
+			thread_unlock(curthread);
+			DELAY(1000);
+		}
+		PICKUP_GIANT();
+#endif
+	}
+	printf("\n");
+	/*
+	 * Count only busy local buffers to prevent forcing
+	 * a fsck if we're just a client of a wedged NFS server
+	 */
+	nbusy = 0;
+	for (bp = &buf[nbuf]; --bp >= buf; ) {
+		if (isbufbusy(bp)) {
+#if 0
+/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */
+			if (bp->b_dev == NULL) {
+				TAILQ_REMOVE(&mountlist,
+				    bp->b_vp->v_mount, mnt_list);
+				continue;
+			}
+#endif
+			nbusy++;
+			if (show_busybufs > 0) {
+				printf(
+	    "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
+				    nbusy, bp, bp->b_vp, bp->b_flags,
+				    (intmax_t)bp->b_blkno,
+				    (intmax_t)bp->b_lblkno);
+				BUF_LOCKPRINTINFO(bp);
+				if (show_busybufs > 1)
+					vn_printf(bp->b_vp,
+					    "vnode content: ");
+			}
+		}
+	}
+	if (nbusy) {
+		/*
+		 * Failed to sync all blocks. Indicate this and don't
+		 * unmount filesystems (thus forcing an fsck on reboot).
+		 */
+		printf("Giving up on %d buffers\n", nbusy);
+		DELAY(5000000);	/* 5 seconds */
+	} else {
+		if (!first_buf_printf)
+			printf("Final sync complete\n");
+		/*
+		 * Unmount filesystems
		 */
+		if (panicstr == 0)
+			vfs_unmountall();
+	}
+	swapoff_all();
+	DELAY(100000);	/* wait for console output to finish */
+}
+
 static void
 bpmap_qenter(struct buf *bp)
 {
@@ -1862,10 +1990,8 @@ brelse(struct buf *bp)
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 1");
-		if (bp->b_kvasize)
-			qindex = QUEUE_EMPTYKVA;
-		else
-			qindex = QUEUE_EMPTY;
+		bufkvafree(bp);
+		qindex = QUEUE_EMPTY;
 		bp->b_flags |= B_AGE;
 	/* buffers with junk contents */
 	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
@@ -2251,8 +2377,6 @@ getnewbuf_reuse_bp(struct buf *bp, int qindex)
 	LIST_INIT(&bp->b_dep);
 }
 
-static int flushingbufs;
-
 static struct buf *
 getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
 {
@@ -2261,64 +2385,25 @@ getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
 
 	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
 
-	pass = 1;
+	pass = 0;
 restart:
-	atomic_add_int(&getnewbufrestarts, 1);
+	if (pass != 0)
+		atomic_add_int(&getnewbufrestarts, 1);
 
-	/*
-	 * Setup for scan.  If we do not have enough free buffers,
-	 * we setup a degenerate case that immediately fails.  Note
-	 * that if we are specially marked process, we are allowed to
-	 * dip into our reserves.
-	 *
-	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
-	 * for the allocation of the mapped buffer.  For unmapped, the
-	 * easiest is to start with EMPTY outright.
-	 *
-	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
-	 * However, there are a number of cases (defragging, reusing, ...)
-	 * where we cannot backup.
-	 */
 	nbp = NULL;
 	mtx_lock(&bqclean);
-	if (!defrag && unmapped) {
-		nqindex = QUEUE_EMPTY;
-		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
-	}
-	if (nbp == NULL) {
-		nqindex = QUEUE_EMPTYKVA;
-		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-	}
-
-	/*
-	 * If no EMPTYKVA buffers and we are either defragging or
-	 * reusing, locate a CLEAN buffer to free or reuse.  If
-	 * bufspace useage is low skip this step so we can allocate a
-	 * new buffer.
-	 */
-	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
-		nqindex = QUEUE_CLEAN;
-		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
-	}
-
 	/*
-	 * If we could not find or were not allowed to reuse a CLEAN
-	 * buffer, check to see if it is ok to use an EMPTY buffer.
-	 * We can only use an EMPTY buffer if allocating its KVA would
-	 * not otherwise run us out of buffer space.  No KVA is needed
-	 * for the unmapped allocation.
+	 * If we're not defragging or low on bufspace attempt to make a new
+	 * buf from a header.
 	 */
-	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
-	    metadata)) {
+	if (defrag == 0 && bufspace + maxsize < hibufspace) {
 		nqindex = QUEUE_EMPTY;
-		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+		nbp = TAILQ_FIRST(&bufqueues[nqindex]);
 	}
-
 	/*
-	 * All available buffers might be clean, retry ignoring the
-	 * lobufspace as the last resort.
+	 * All available buffers might be clean or we need to start recycling.
 	 */
-	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
+	if (nbp == NULL) {
 		nqindex = QUEUE_CLEAN;
 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 	}
@@ -2332,28 +2417,21 @@ restart:
 
 		/*
 		 * Calculate next bp (we can only use it if we do not
-		 * block or do other fancy things).
+		 * release the bqlock)
 		 */
 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
 			switch (qindex) {
 			case QUEUE_EMPTY:
-				nqindex = QUEUE_EMPTYKVA;
-				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-				if (nbp != NULL)
-					break;
-				/* FALLTHROUGH */
-			case QUEUE_EMPTYKVA:
 				nqindex = QUEUE_CLEAN;
-				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+				nbp = TAILQ_FIRST(&bufqueues[nqindex]);
 				if (nbp != NULL)
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_CLEAN:
-				if (metadata && pass == 1) {
-					pass = 2;
+				if (metadata && pass == 0) {
+					pass = 1;
 					nqindex = QUEUE_EMPTY;
-					nbp = TAILQ_FIRST(
-					    &bufqueues[QUEUE_EMPTY]);
+					nbp = TAILQ_FIRST(&bufqueues[nqindex]);
 				}
 				/*
 				 * nbp is NULL.
@@ -2399,11 +2477,11 @@ restart:
 
 		bremfreel(bp);
 		mtx_unlock(&bqclean);
+
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
-
 		getnewbuf_reuse_bp(bp, qindex);
 		mtx_assert(&bqclean, MA_NOTOWNED);
 
@@ -2412,7 +2490,6 @@ restart:
 		 */
		if (defrag) {
 			bp->b_flags |= B_INVAL;
-			bufkvafree(bp);
 			brelse(bp);
 			defrag = 0;
 			goto restart;
 		}
@@ -2424,7 +2501,6 @@ restart:
 		 */
 		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
 			bp->b_flags |= B_INVAL;
-			bufkvafree(bp);
 			brelse(bp);
 			goto restart;
 		}
@@ -2437,16 +2513,11 @@ restart:
 		 * KVM space. This occurs in rare situations when multiple
 		 * processes are blocked in getnewbuf() or allocbuf().
 		 */
-		if (bufspace >= hibufspace)
-			flushingbufs = 1;
-		if (flushingbufs && bp->b_kvasize != 0) {
+		if (bufspace >= hibufspace && bp->b_kvasize != 0) {
 			bp->b_flags |= B_INVAL;
-			bufkvafree(bp);
 			brelse(bp);
 			goto restart;
 		}
-		if (bufspace < lobufspace)
-			flushingbufs = 0;
 		break;
 	}
 	return (bp);
@@ -2492,7 +2563,6 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
 	 * async I/O rather then sync I/O.
 	 */
 	atomic_add_int(&getnewbufcalls, 1);
-	atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
 	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
 	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
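Note on the shutdown scan added in the @@ -975,6 +974,135 @@ hunk: bufshutdown() counts busy buffers by walking the header array backwards and applying a small predicate, isbufbusy(). The sketch below is a minimal, standalone userspace illustration of that reverse-scan idiom, not FreeBSD kernel code; the names fake_buf, FB_INVAL, FB_DELWRI, fake_isbufbusy and fake_nbuf are hypothetical stand-ins for struct buf, B_INVAL, B_DELWRI, isbufbusy and nbuf.

/*
 * Standalone sketch of the busy-buffer counting idiom used by bufshutdown().
 * All identifiers here are hypothetical stand-ins for the kernel names.
 */
#include <stdio.h>

#define	FB_INVAL	0x01	/* contents invalid: never counted as busy */
#define	FB_DELWRI	0x02	/* delayed write still pending */

struct fake_buf {
	int	flags;
	int	locked;
};

static int
fake_isbufbusy(struct fake_buf *bp)
{
	/* Locked-and-valid or dirty buffers count as busy. */
	if ((!(bp->flags & FB_INVAL) && bp->locked) ||
	    ((bp->flags & (FB_DELWRI | FB_INVAL)) == FB_DELWRI))
		return (1);
	return (0);
}

int
main(void)
{
	struct fake_buf bufs[4] = {
		{ FB_DELWRI, 0 },	/* dirty: busy */
		{ 0, 1 },		/* locked and valid: busy */
		{ FB_INVAL, 1 },	/* locked but invalid: not busy */
		{ 0, 0 },		/* clean and unlocked: not busy */
	};
	struct fake_buf *bp;
	int fake_nbuf = 4, nbusy = 0;

	/* Same shape as the kernel loop: for (bp = &buf[nbuf]; --bp >= buf; ) */
	for (bp = &bufs[fake_nbuf]; --bp >= bufs; )
		if (fake_isbufbusy(bp))
			nbusy++;
	printf("%d busy buffers\n", nbusy);	/* prints 2 */
	return (0);
}

In the kernel version, this scan is repeated for up to 20 iterations, with the iteration counter reset whenever the busy count drops (if (nbusy < pbusy) iter = 0), so that soft-updates buffers which only become clean after other buffers are written still get a chance to flush before filesystems are unmounted.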