Diffstat (limited to 'sys')
-rw-r--r-- | sys/kern/vfs_bio.c | 215 |
1 files changed, 154 insertions, 61 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 0f8830a..baf1851 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -80,6 +80,8 @@ static void vfs_clean_pages(struct buf * bp);
 static void vfs_setdirty(struct buf *bp);
 static void vfs_vmio_release(struct buf *bp);
 static void vfs_backgroundwritedone(struct buf *bp);
+static int vfs_bio_clcheck(struct vnode *vp, int size,
+		daddr_t lblkno, daddr_t blkno);
 static int flushbufqueues(void);
 static void buf_daemon(void);
 
@@ -158,6 +160,11 @@ SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
 static int bd_request;
 
 /*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx bdlock;
+
+/*
  * bogus page -- for I/O to/from partially complete buffers
  * this is a temporary solution to the problem, but it is not
  * really that bad. it would be better to split the buffer
@@ -173,6 +180,12 @@ vm_page_t bogus_page;
  */
 static int runningbufreq;
 
+/*
+ * This lock protects the runningbufreq and synchronizes runningbufwakeup and
+ * waitrunningbufspace().
+ */
+static struct mtx rbreqlock;
+
 /*
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
@@ -182,6 +195,11 @@ static int runningbufreq;
  */
 static int needsbuffer;
 
+/*
+ * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ */
+static struct mtx nblock;
+
 #ifdef USE_BUFHASH
 /*
  * Mask for index into the buffer hash table, which needs to be power of 2 in
@@ -218,6 +236,10 @@ static struct bufhashhdr invalhash;
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+
+/* Lock for the bufqueues */
+static struct mtx bqlock;
+
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
@@ -255,10 +277,12 @@ static __inline void
 numdirtywakeup(int level)
 {
 	if (numdirtybuffers <= level) {
+		mtx_lock(&nblock);
 		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
 			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
 			wakeup(&needsbuffer);
 		}
+		mtx_unlock(&nblock);
 	}
 }
 
@@ -279,10 +303,12 @@ bufspacewakeup(void)
 	 * though we haven't freed the kva space yet, the waiting
 	 * process will be able to now.
 	 */
+	mtx_lock(&nblock);
 	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
 		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
 		wakeup(&needsbuffer);
 	}
+	mtx_unlock(&nblock);
 }
 
 /*
@@ -293,12 +319,14 @@ static __inline void
 runningbufwakeup(struct buf *bp)
 {
 	if (bp->b_runningbufspace) {
-		runningbufspace -= bp->b_runningbufspace;
+		atomic_subtract_int(&runningbufspace, bp->b_runningbufspace);
 		bp->b_runningbufspace = 0;
+		mtx_lock(&rbreqlock);
 		if (runningbufreq && runningbufspace <= lorunningspace) {
 			runningbufreq = 0;
 			wakeup(&runningbufreq);
 		}
+		mtx_unlock(&rbreqlock);
 	}
 }
 
@@ -314,13 +342,15 @@ runningbufwakeup(struct buf *bp)
 
 static __inline void
 bufcountwakeup(void)
 {
-	++numfreebuffers;
+	atomic_add_int(&numfreebuffers, 1);
+	mtx_lock(&nblock);
 	if (needsbuffer) {
 		needsbuffer &= ~VFS_BIO_NEED_ANY;
 		if (numfreebuffers >= hifreebuffers)
 			needsbuffer &= ~VFS_BIO_NEED_FREE;
 		wakeup(&needsbuffer);
 	}
+	mtx_unlock(&nblock);
 }
 
@@ -341,14 +371,12 @@ bufcountwakeup(void)
 
 static __inline void
 waitrunningbufspace(void)
 {
-	/*
-	 * XXX race against wakeup interrupt, currently
-	 * protected by Giant. FIXME!
-	 */
+	mtx_lock(&rbreqlock);
 	while (runningbufspace > hirunningspace) {
 		++runningbufreq;
-		tsleep(&runningbufreq, PVM, "wdrain", 0);
+		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
+	mtx_unlock(&rbreqlock);
 }
 
@@ -379,10 +407,12 @@ static __inline__
 void
 bd_wakeup(int dirtybuflevel)
 {
+	mtx_lock(&bdlock);
 	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
 		bd_request = 1;
 		wakeup(&bd_request);
 	}
+	mtx_unlock(&bdlock);
 }
 
 /*
@@ -489,6 +519,10 @@ bufinit(void)
 	LIST_INIT(&invalhash);
 #endif
 	mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF);
+	mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
+	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
+	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
+	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 
 #ifdef USE_BUFHASH
 	for (i = 0; i <= bufhashmask; i++)
@@ -600,8 +634,8 @@ bfreekva(struct buf * bp)
 	GIANT_REQUIRED;
 
 	if (bp->b_kvasize) {
-		++buffreekvacnt;
-		bufspace -= bp->b_kvasize;
+		atomic_add_int(&buffreekvacnt, 1);
+		atomic_subtract_int(&bufspace, bp->b_kvasize);
 		vm_map_delete(buffer_map,
 		    (vm_offset_t) bp->b_kvabase,
 		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
@@ -624,6 +658,7 @@ bremfree(struct buf * bp)
 
 	GIANT_REQUIRED;
 
+	mtx_lock(&bqlock);
 	if (bp->b_qindex != QUEUE_NONE) {
 		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
@@ -644,12 +679,13 @@ bremfree(struct buf * bp)
 		case QUEUE_CLEAN:
 		case QUEUE_EMPTY:
 		case QUEUE_EMPTYKVA:
-			--numfreebuffers;
+			atomic_subtract_int(&numfreebuffers, 1);
 			break;
 		default:
 			break;
 		}
 	}
+	mtx_unlock(&bqlock);
 	splx(s);
 }
 
@@ -848,7 +884,7 @@ bwrite(struct buf * bp)
 	 * Normal bwrites pipeline writes
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
-	runningbufspace += bp->b_runningbufspace;
+	atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 
 	if (curthread != PCPU_GET(idlethread))
 		curthread->td_proc->p_stats->p_ru.ru_oublock++;
@@ -892,10 +928,12 @@ vfs_backgroundwritedone(bp)
 	/*
 	 * Find the original buffer that we are writing.
	 */
+#ifdef INVARIANTS
 	VI_LOCK(bp->b_vp);
 	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
 		panic("backgroundwritedone: lost buffer");
 	VI_UNLOCK(bp->b_vp);
+#endif
 	/*
 	 * Process dependencies then return any unfinished ones.
 	 */
@@ -903,6 +941,8 @@ vfs_backgroundwritedone(bp)
 		buf_complete(bp);
 	if (LIST_FIRST(&bp->b_dep) != NULL)
 		buf_movedeps(bp, origbp);
+
+	/* XXX Find out if origbp can disappear or get inconsistent */
 	/*
 	 * Clear the BX_BKGRDINPROG flag in the original buffer
 	 * and awaken it if it is waiting for the write to complete.
@@ -1038,7 +1078,7 @@ bdirty(bp)
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= B_DONE | B_DELWRI;
 		reassignbuf(bp, bp->b_vp);
-		++numdirtybuffers;
+		atomic_add_int(&numdirtybuffers, 1);
 		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
 	}
 }
@@ -1065,7 +1105,7 @@ bundirty(bp)
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp, bp->b_vp);
-		--numdirtybuffers;
+		atomic_subtract_int(&numdirtybuffers, 1);
 		numdirtywakeup(lodirtybuffers);
 	}
 	/*
@@ -1108,12 +1148,15 @@ bwillwrite(void)
 
 		mtx_lock(&Giant);
 		s = splbio();
+		mtx_lock(&nblock);
 		while (numdirtybuffers >= hidirtybuffers) {
 			bd_wakeup(1);
 			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
-			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
+			msleep(&needsbuffer, &nblock,
+			    (PRIBIO + 4), "flswai", 0);
 		}
 		splx(s);
+		mtx_unlock(&nblock);
 		mtx_unlock(&Giant);
 	}
 }
@@ -1172,7 +1215,7 @@ brelse(struct buf * bp)
 		if (LIST_FIRST(&bp->b_dep) != NULL)
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI) {
-			--numdirtybuffers;
+			atomic_subtract_int(&numdirtybuffers, 1);
 			numdirtywakeup(lodirtybuffers);
 		}
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
@@ -1318,6 +1361,7 @@ brelse(struct buf * bp)
 	}
 
 	/* enqueue */
+	mtx_lock(&bqlock);
 
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
@@ -1367,6 +1411,7 @@ brelse(struct buf * bp)
 		else
 			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
 	}
+	mtx_unlock(&bqlock);
 
 	/*
 	 * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already
@@ -1397,12 +1442,12 @@ brelse(struct buf * bp)
 	if (bp->b_bufsize || bp->b_kvasize)
 		bufspacewakeup();
 
-	/* unlock */
-	BUF_UNLOCK(bp);
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
 			B_DIRECT | B_NOWDRAIN);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
+	/* unlock */
+	BUF_UNLOCK(bp);
 	splx(s);
 }
 
@@ -1434,6 +1479,7 @@ bqrelse(struct buf * bp)
 		splx(s);
 		return;
 	}
+	mtx_lock(&bqlock);
 	if (bp->b_flags & B_LOCKED) {
 		bp->b_ioflags &= ~BIO_ERROR;
 		bp->b_qindex = QUEUE_LOCKED;
@@ -1448,6 +1494,7 @@ bqrelse(struct buf * bp)
 			 * buffer (most importantly: the wired pages making up its
 			 * backing store) *now*.
 			 */
+			mtx_unlock(&bqlock);
 			splx(s);
 			brelse(bp);
 			return;
@@ -1455,6 +1502,7 @@ bqrelse(struct buf * bp)
 		bp->b_qindex = QUEUE_CLEAN;
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 	}
+	mtx_unlock(&bqlock);
 
 	if ((bp->b_flags & B_LOCKED) == 0 &&
 	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
@@ -1467,11 +1515,11 @@ bqrelse(struct buf * bp)
 	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
 		bufspacewakeup();
 
-	/* unlock */
-	BUF_UNLOCK(bp);
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("bqrelse: not dirty");
+	/* unlock */
+	BUF_UNLOCK(bp);
 	splx(s);
 }
 
@@ -1560,6 +1608,45 @@ gbincore(struct vnode * vp, daddr_t blkno)
 #endif
 
 /*
+ * Check to see if a block at a particular lbn is available for a clustered
+ * write.
+ */
+static int
+vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
+{
+	struct buf *bpa;
+	int match;
+
+	match = 0;
+
+	/* If the buf isn't in core skip it */
+	if ((bpa = gbincore(vp, lblkno)) == NULL)
+		return (0);
+
+	/* If the buf is busy we don't want to wait for it */
+	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT) != 0)
+		return (0);
+
+	/* Only cluster with valid clusterable delayed write buffers */
+	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
+	    (B_DELWRI | B_CLUSTEROK))
+		goto done;
+
+	if (bpa->b_bufsize != size)
+		goto done;
+
+	/*
+	 * Check to see if it is in the expected place on disk and that the
+	 * block has been mapped.
+	 */
+	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
+		match = 1;
+done:
+	BUF_UNLOCK(bpa);
+	return (match);
+}
+
+/*
  * vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
@@ -1576,7 +1663,6 @@ vfs_bio_awrite(struct buf * bp)
 	struct vnode *vp = bp->b_vp;
 	int s;
 	int ncl;
-	struct buf *bpa;
 	int nwritten;
 	int size;
 	int maxcl;
@@ -1595,34 +1681,16 @@ vfs_bio_awrite(struct buf * bp)
 		maxcl = MAXPHYS / size;
 
 		VI_LOCK(vp);
-		for (i = 1; i < maxcl; i++) {
-			if ((bpa = gbincore(vp, lblkno + i)) &&
-			    BUF_REFCNT(bpa) == 0 &&
-			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
-			    (B_DELWRI | B_CLUSTEROK)) &&
-			    (bpa->b_bufsize == size)) {
-				if ((bpa->b_blkno == bpa->b_lblkno) ||
-				    (bpa->b_blkno !=
-				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
-					break;
-			} else {
+		for (i = 1; i < maxcl; i++)
+			if (vfs_bio_clcheck(vp, size, lblkno + i,
+			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
 				break;
-			}
-		}
-		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
-			if ((bpa = gbincore(vp, lblkno - j)) &&
-			    BUF_REFCNT(bpa) == 0 &&
-			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
-			    (B_DELWRI | B_CLUSTEROK)) &&
-			    (bpa->b_bufsize == size)) {
-				if ((bpa->b_blkno == bpa->b_lblkno) ||
-				    (bpa->b_blkno !=
-				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
-					break;
-			} else {
+
+		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
+			if (vfs_bio_clcheck(vp, size, lblkno - j,
+			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
-			}
-		}
+
 		VI_UNLOCK(vp);
 		--j;
 		ncl = i + j;
@@ -1690,10 +1758,10 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
 	 * async I/O rather then sync I/O.
 	 */
 
-	++getnewbufcalls;
-	--getnewbufrestarts;
+	atomic_add_int(&getnewbufcalls, 1);
+	atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
-	++getnewbufrestarts;
+	atomic_add_int(&getnewbufrestarts, 1);
 
 	/*
 	 * Setup for scan. If we do not have enough free buffers,
@@ -1707,6 +1775,7 @@ restart:
 	 * However, there are a number of cases (defragging, reusing, ...)
 	 * where we cannot backup.
 	 */
+	mtx_lock(&bqlock);
 	nqindex = QUEUE_EMPTYKVA;
 	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
 
@@ -1797,6 +1866,7 @@ restart:
 
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
 			panic("getnewbuf: locked buf");
+		mtx_unlock(&bqlock);
 		bremfree(bp);
 
 		if (qindex == QUEUE_CLEAN) {
@@ -1908,12 +1978,16 @@ restart:
 
 		bd_speedup();	/* heeeelp */
 
+		mtx_lock(&nblock);
 		needsbuffer |= flags;
 		while (needsbuffer & flags) {
-			if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
-			    waitmsg, slptimeo))
+			if (msleep(&needsbuffer, &nblock,
+			    (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
+				mtx_unlock(&nblock);
 				return (NULL);
+			}
 		}
+		mtx_unlock(&nblock);
 	} else {
 		/*
 		 * We finally have a valid bp. We aren't quite out of the
@@ -1934,7 +2008,7 @@ restart:
 				 * Uh oh. Buffer map is to fragmented. We
 				 * must defragment the map.
 				 */
-				++bufdefragcnt;
+				atomic_add_int(&bufdefragcnt, 1);
 				defrag = 1;
 				bp->b_flags |= B_INVAL;
 				brelse(bp);
@@ -1947,8 +2021,8 @@ restart:
 			bp->b_kvabase = (caddr_t) addr;
 			bp->b_kvasize = maxsize;
-			bufspace += bp->b_kvasize;
-			++bufreusecnt;
+			atomic_add_int(&bufspace, bp->b_kvasize);
+			atomic_add_int(&bufreusecnt, 1);
 		}
 	}
 	bp->b_data = bp->b_kvabase;
@@ -1990,11 +2064,13 @@ buf_daemon()
 	 * This process is allowed to take the buffer cache to the limit
 	 */
 	s = splbio();
+	mtx_lock(&bdlock);
 	for (;;) {
-		kthread_suspend_check(bufdaemonproc);
-		bd_request = 0;
+		mtx_unlock(&bdlock);
+
+		kthread_suspend_check(bufdaemonproc);
 
 		/*
 		 * Do the flush. Limit the amount of in-transit I/O we
@@ -2019,6 +2095,7 @@ buf_daemon()
 		 * find any flushable buffers, we sleep half a second.
 		 * Otherwise we loop immediately.
 		 */
+		mtx_lock(&bdlock);
 		if (numdirtybuffers <= lodirtybuffers) {
 			/*
 			 * We reached our low water mark, reset the
@@ -2026,14 +2103,14 @@ buf_daemon()
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
-			tsleep(&bd_request, PVM, "psleep", hz);
+			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
 			 * We couldn't find any flushable dirty buffers but
 			 * still have too many dirty buffers, we
 			 * have to sleep and try again. (rare)
 			 */
-			tsleep(&bd_request, PVM, "qsleep", hz / 10);
+			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
 		}
 	}
 }
@@ -2055,6 +2132,7 @@ flushbufqueues(void)
 	struct vnode *vp;
 	struct buf *bp;
 
+	mtx_lock(&bqlock);
 	TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) {
 		KASSERT((bp->b_flags & B_DELWRI),
 		    ("unexpected clean buffer %p", bp));
@@ -2063,6 +2141,7 @@ flushbufqueues(void)
 		if (bp->b_flags & B_INVAL) {
 			if (BUF_LOCK(bp, LK_EXCLUSIVE) != 0)
 				panic("flushbufqueues: locked buf");
+			mtx_unlock(&bqlock);
 			bremfree(bp);
 			brelse(bp);
 			return (1);
@@ -2077,6 +2156,7 @@ flushbufqueues(void)
 		 */
 		if ((vp = bp->b_vp) == NULL ||
 		    vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) {
+			mtx_unlock(&bqlock);
 			vfs_bio_awrite(bp);
 			if (vp != NULL)
 				VOP_UNLOCK(vp, 0, td);
@@ -2096,6 +2176,7 @@ flushbufqueues(void)
 		if (bp->b_flags & B_INVAL) {
 			if (BUF_LOCK(bp, LK_EXCLUSIVE) != 0)
 				panic("flushbufqueues: locked buf");
+			mtx_unlock(&bqlock);
 			bremfree(bp);
 			brelse(bp);
 			return (1);
@@ -2108,6 +2189,7 @@ flushbufqueues(void)
 		 */
 		if ((vp = bp->b_vp) == NULL ||
 		    vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) {
+			mtx_unlock(&bqlock);
 			vfs_bio_awrite(bp);
 			if (vp != NULL)
 				VOP_UNLOCK(vp, 0, td);
@@ -2115,6 +2197,7 @@ flushbufqueues(void)
 			return (0);
 		}
 	}
+	mtx_unlock(&bqlock);
 	return (0);
 }
 
@@ -2339,7 +2422,9 @@ loop:
 	if (numfreebuffers == 0) {
 		if (curthread == PCPU_GET(idlethread))
 			return NULL;
+		mtx_lock(&nblock);
 		needsbuffer |= VFS_BIO_NEED_ANY;
+		mtx_unlock(&nblock);
 	}
 
 	VI_LOCK(vp);
@@ -2617,7 +2702,8 @@ allocbuf(struct buf *bp, int size)
 		} else {
 			free(bp->b_data, M_BIOBUF);
 			if (bp->b_bufsize) {
-				bufmallocspace -= bp->b_bufsize;
+				atomic_subtract_int(&bufmallocspace,
+				    bp->b_bufsize);
 				bufspacewakeup();
 				bp->b_bufsize = 0;
 			}
@@ -2637,6 +2723,12 @@ allocbuf(struct buf *bp, int size)
 			 * and revert to page-allocated memory when the buffer
 			 * grows.
 			 */
+			/*
+			 * There is a potential smp race here that could lead
+			 * to bufmallocspace slightly passing the max. It
+			 * is probably extremely rare and not worth worrying
+			 * over.
+			 */
 			if ( (bufmallocspace < maxbufmallocspace) &&
 				(bp->b_bufsize == 0) &&
 				(mbsize <= PAGE_SIZE/2)) {
@@ -2645,7 +2737,7 @@ allocbuf(struct buf *bp, int size)
 				bp->b_bufsize = mbsize;
 				bp->b_bcount = size;
 				bp->b_flags |= B_MALLOC;
-				bufmallocspace += mbsize;
+				atomic_add_int(&bufmallocspace, mbsize);
 				return 1;
 			}
 			origbuf = NULL;
@@ -2659,7 +2751,8 @@ allocbuf(struct buf *bp, int size)
 			origbufsize = bp->b_bufsize;
 			bp->b_data = bp->b_kvabase;
 			if (bp->b_bufsize) {
-				bufmallocspace -= bp->b_bufsize;
+				atomic_subtract_int(&bufmallocspace,
+				    bp->b_bufsize);
 				bufspacewakeup();
 				bp->b_bufsize = 0;
 			}
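The recurring pattern in this change is to pair each sleep/wakeup channel with its own mutex, so the old tsleep() calls that relied on Giant become msleep() calls that atomically release and reacquire that mutex (bd_request/bdlock, runningbufreq/rbreqlock, needsbuffer/nblock). The sketch below is a minimal userland analogue of the bd_request/bdlock handshake, written with POSIX threads rather than the kernel's mtx/msleep/wakeup primitives; the names bd_request, bdlock, bd_cv and buf_daemon_loop only echo the kernel code and are not a real API, and the real bd_wakeup() additionally checks numdirtybuffers against a threshold.

#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t bdlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t bd_cv = PTHREAD_COND_INITIALIZER;
static int bd_request;			/* protected by bdlock */

/* Analogue of bd_wakeup(): post a request unless one is already pending. */
static void
bd_wakeup(void)
{
	pthread_mutex_lock(&bdlock);
	if (bd_request == 0) {
		bd_request = 1;
		pthread_cond_signal(&bd_cv);
	}
	pthread_mutex_unlock(&bdlock);
}

/* Analogue of buf_daemon(): flush with the lock dropped, sleep with it held. */
static void *
buf_daemon_loop(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&bdlock);
	for (;;) {
		bd_request = 0;
		pthread_mutex_unlock(&bdlock);

		printf("flushing dirty buffers\n");	/* flushbufqueues() would run here */

		pthread_mutex_lock(&bdlock);
		if (bd_request == 0) {
			struct timespec ts;

			clock_gettime(CLOCK_REALTIME, &ts);
			ts.tv_sec += 1;			/* like msleep(..., hz) */
			pthread_cond_timedwait(&bd_cv, &bdlock, &ts);
		}
	}
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, buf_daemon_loop, NULL);
	for (int i = 0; i < 3; i++) {
		sleep(1);
		bd_wakeup();	/* a writer crossed the dirty-buffer threshold */
	}
	return (0);
}

As in buf_daemon() above, the daemon only holds the lock while it examines and clears the request flag; the actual flushing happens with the lock dropped, so wakers are never blocked behind I/O.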
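The counters themselves (numdirtybuffers, numfreebuffers, bufspace, bufmallocspace, the statistics) are not pulled under any of the new mutexes; they are converted to atomic_add_int()/atomic_subtract_int(), while the needsbuffer flag word and its wakeups stay under nblock. A rough userland equivalent of that split, using C11 atomics and a pthread mutex, might look like the following sketch; the variable and flag names merely mirror the kernel ones for readability, and the threshold value is invented.

#include <pthread.h>
#include <stdatomic.h>

#define	NEED_DIRTYFLUSH	0x02		/* flag bit; exact value is local to this sketch */

static atomic_int numdirtybuffers;	/* updated lock-free, like atomic_add_int() */
static pthread_mutex_t nblock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t need_cv = PTHREAD_COND_INITIALIZER;
static int needsbuffer;			/* protected by nblock */

static const int lodirtybuffers = 16;	/* invented threshold */

/* Analogue of bdirty(): the counter alone needs no lock. */
static void
buffer_dirtied(void)
{
	atomic_fetch_add(&numdirtybuffers, 1);
}

/* Analogue of bundirty() + numdirtywakeup(): count down, then wake waiters. */
static void
buffer_cleaned(void)
{
	atomic_fetch_sub(&numdirtybuffers, 1);
	if (atomic_load(&numdirtybuffers) <= lodirtybuffers) {
		pthread_mutex_lock(&nblock);
		if (needsbuffer & NEED_DIRTYFLUSH) {
			needsbuffer &= ~NEED_DIRTYFLUSH;
			pthread_cond_broadcast(&need_cv);
		}
		pthread_mutex_unlock(&nblock);
	}
}

/* Analogue of bwillwrite(): block while too much dirty data is outstanding. */
static void
wait_for_flush(int hidirtybuffers)
{
	pthread_mutex_lock(&nblock);
	while (atomic_load(&numdirtybuffers) >= hidirtybuffers) {
		needsbuffer |= NEED_DIRTYFLUSH;
		pthread_cond_wait(&need_cv, &nblock);
	}
	pthread_mutex_unlock(&nblock);
}

This is also why the diff adds the "potential smp race" comment in allocbuf(): a lock-free counter can briefly overshoot a limit that is only checked opportunistically, which the commit judges acceptable.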
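The new vfs_bio_clcheck() helper also illustrates a locking idiom worth noting: the cluster scan never sleeps on a neighbouring buffer. It takes the buffer lock with LK_NOWAIT and simply stops extending the cluster if the buffer is busy or unsuitable. Below is a hedged, self-contained sketch of the same idea using a hypothetical nbuf structure and pthread_mutex_trylock(); none of these names exist in the kernel, which uses BUF_LOCK() and the real b_flags bits instead.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

#define	SK_DELWRI	0x01	/* dirty, write is delayed */
#define	SK_CLUSTEROK	0x02	/* may take part in a cluster */
#define	SK_INVAL	0x04	/* contents are invalid */

struct nbuf {
	pthread_mutex_t	lock;
	int		flags;
	long		lblkno;		/* logical block number */
	long		blkno;		/* mapped disk block number */
	size_t		bufsize;
};

/*
 * Return true only if the buffer can be locked instantly, is a clusterable
 * delayed write of the right size, and sits at the expected disk address.
 * Never sleep on a busy buffer.
 */
static bool
cluster_check(struct nbuf *bpa, size_t size, long blkno)
{
	bool match = false;

	if (bpa == NULL)
		return (false);
	if (pthread_mutex_trylock(&bpa->lock) != 0)
		return (false);		/* busy: skip it, don't wait */
	if ((bpa->flags & (SK_DELWRI | SK_CLUSTEROK | SK_INVAL)) ==
	    (SK_DELWRI | SK_CLUSTEROK) &&
	    bpa->bufsize == size &&
	    bpa->blkno != bpa->lblkno &&	/* block has been mapped */
	    bpa->blkno == blkno)		/* contiguous on disk */
		match = true;
	pthread_mutex_unlock(&bpa->lock);
	return (match);
}

Giving up instead of blocking keeps vfs_bio_awrite() cheap: a busy neighbour only shortens the cluster, it never stalls the caller, which is why the scan loops above treat a zero return as "stop extending" rather than an error.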