Diffstat (limited to 'sys')
 sys/kern/vfs_bio.c          | 208
 sys/kern/vfs_cluster.c      |   6
 sys/kern/vfs_export.c       |  14
 sys/kern/vfs_subr.c         |  14
 sys/kern/vfs_vnops.c        |   4
 sys/sys/buf.h               |   1
 sys/sys/vnode.h             |   1
 sys/ufs/ffs/ffs_inode.c     |   3
 sys/ufs/ffs/ffs_softdep.c   |  36
 sys/ufs/ufs/ufs_readwrite.c |   4
 sys/vm/swap_pager.c         |   4
 sys/vm/vm_page.c            |  35
 sys/vm/vm_page.h            |   2
 sys/vm/vm_pageout.c         | 168
 14 files changed, 315 insertions(+), 185 deletions(-)
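
Most of the policy in this change hangs off cheap "are we short on memory?" predicates: the new buf_dirty_count_severe() added to vfs_bio.c and the vm_page_count_severe() checks it is paired with in the UFS write and VM paths below. The following is a minimal, self-contained userland model of that fallback pattern, not the kernel code; the counters and the severe threshold are invented stand-ins, and vm_page_count_severe() itself is not defined in this diff, so its body here is an assumption.

/*
 * Standalone model of the low-memory fallback pattern used below in
 * ffs_update() and the UFS WRITE path.  The counters and the "severe"
 * threshold are invented stand-ins; only buf_dirty_count_severe()
 * mirrors the helper added to vfs_bio.c.
 */
#include <stdio.h>

static int numdirtybuffers = 480;	/* stand-ins for kernel counters */
static int hidirtybuffers  = 256;
static int v_free_count    = 900;
static int v_cache_count   = 100;
static int v_free_severe   = 1200;	/* assumed severe-shortage mark */

/* Same test as the new helper in vfs_bio.c: too many dirty buffers. */
static int
buf_dirty_count_severe(void)
{
	return (numdirtybuffers >= hidirtybuffers);
}

/* Assumed shape of vm_page_count_severe(): free + cache pages too low. */
static int
vm_page_count_severe(void)
{
	return (v_free_count + v_cache_count < v_free_severe);
}

int
main(void)
{
	if (vm_page_count_severe() || buf_dirty_count_severe())
		printf("shortfall: write the buffer now (bwrite/bawrite)\n");
	else
		printf("normal: delay the write (bdwrite) for clustering\n");
	return (0);
}
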
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 17def1b..9a9aae7 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -597,8 +597,14 @@ bwrite(struct buf * bp) * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. + * + * This optimization eats a lot of memory. If we have a page + * or buffer shortfall we can't do it. */ - if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC)) { + if ((bp->b_xflags & BX_BKGRDWRITE) && + (bp->b_flags & B_ASYNC) && + !vm_page_count_severe() && + !buf_dirty_count_severe()) { if (bp->b_iodone != NULL) { printf("bp->b_iodone = %p\n", bp->b_iodone); panic("bwrite: need chained iodone"); @@ -682,7 +688,10 @@ vfs_backgroundwritedone(bp) /* * Clear the BX_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. + * If BX_BKGRDINPROG is not set in the original buffer it must + * have been released and re-instantiated - which is not legal. */ + KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_xflags &= ~BX_BKGRDINPROG; if (origbp->b_xflags & BX_BKGRDWAIT) { origbp->b_xflags &= ~BX_BKGRDWAIT; @@ -903,6 +912,15 @@ bwillwrite(void) } /* + * Return true if we have too many dirty buffers. + */ +int +buf_dirty_count_severe(void) +{ + return(numdirtybuffers >= hidirtybuffers); +} + +/* * brelse: * * Release a busy buffer and, if requested, free its resources. The @@ -964,10 +982,14 @@ brelse(struct buf * bp) * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. + * + * If B_DELWRI is not set we may have to set B_RELBUF if we are low + * on pages to return pages to the VM page queues. */ - if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; + else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG)) + bp->b_flags |= B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer @@ -989,8 +1011,7 @@ brelse(struct buf * bp) if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_tag == VT_NFS && !vn_isdisk(bp->b_vp, NULL) && - (bp->b_flags & B_DELWRI) && - (bp->b_xflags & BX_BKGRDINPROG)) + (bp->b_flags & B_DELWRI)) ) { int i, j, resid; @@ -1017,32 +1038,40 @@ brelse(struct buf * bp) * * See man buf(9) for more information */ - resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { + int had_bogus = 0; + m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); - if (m == bogus_page) { + /* + * If we hit a bogus page, fixup *all* the bogus pages + * now. 
+ */ + if (m == bogus_page) { VOP_GETVOBJECT(vp, &obj); poff = OFF_TO_IDX(bp->b_offset); + had_bogus = 1; for (j = i; j < bp->b_npages; j++) { - m = bp->b_pages[j]; - if (m == bogus_page) { - m = vm_page_lookup(obj, poff + j); - if (!m) { + vm_page_t mtmp; + mtmp = bp->b_pages[j]; + if (mtmp == bogus_page) { + mtmp = vm_page_lookup(obj, poff + j); + if (!mtmp) { panic("brelse: page missing\n"); } - bp->b_pages[j] = m; + bp->b_pages[j] = mtmp; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } + m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { int poffset = foff & PAGE_MASK; @@ -1051,9 +1080,11 @@ brelse(struct buf * bp) KASSERT(presid >= 0, ("brelse: extra page")); vm_page_set_invalid(m, poffset, presid); + if (had_bogus) + printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); - foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bp->b_flags & (B_INVAL | B_RELBUF)) @@ -1171,7 +1202,7 @@ brelse(struct buf * bp) /* * Release a buffer back to the appropriate queue but do not try to free - * it. + * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when @@ -1203,6 +1234,15 @@ bqrelse(struct buf * bp) } else if (bp->b_flags & B_DELWRI) { bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); + } else if (vm_page_count_severe()) { + /* + * We are too low on memory, we have to try to free the + * buffer (most importantly: the wired pages making up its + * backing store) *now*. + */ + splx(s); + brelse(bp); + return; } else { bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); @@ -1264,6 +1304,8 @@ vfs_vmio_release(bp) vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); + } else if (vm_page_count_severe()) { + vm_page_try_to_cache(m); } } } @@ -1419,15 +1461,15 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize) struct buf *nbp; int defrag = 0; int nqindex; - int isspecial; static int flushingbufs; - if (curproc != idleproc && - (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0) - isspecial = 0; - else - isspecial = 1; - + /* + * We can't afford to block since we might be holding a vnode lock, + * which may prevent system daemons from running. We deal with + * low-memory situations by proactively returning memory and running + * async I/O rather then sync I/O. + */ + ++getnewbufcalls; --getnewbufrestarts; restart: @@ -1445,42 +1487,28 @@ restart: * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ + nqindex = QUEUE_EMPTYKVA; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); - if (isspecial == 0 && numfreebuffers < lofreebuffers) { + if (nbp == NULL) { /* - * This will cause an immediate failure + * If no EMPTYKVA buffers and we are either + * defragging or reusing, locate a CLEAN buffer + * to free or reuse. If bufspace useage is low + * skip this step so we can allocate a new buffer. */ - nqindex = QUEUE_CLEAN; - nbp = NULL; - } else { + if (defrag || bufspace >= lobufspace) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + } + /* - * Locate a buffer which already has KVA assigned. First - * try EMPTYKVA buffers. + * Nada. If we are allowed to allocate an EMPTY + * buffer, go get one. 
*/ - nqindex = QUEUE_EMPTYKVA; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); - - if (nbp == NULL) { - /* - * If no EMPTYKVA buffers and we are either - * defragging or reusing, locate a CLEAN buffer - * to free or reuse. If bufspace useage is low - * skip this step so we can allocate a new buffer. - */ - if (defrag || bufspace >= lobufspace) { - nqindex = QUEUE_CLEAN; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); - } - - /* - * Nada. If we are allowed to allocate an EMPTY - * buffer, go get one. - */ - if (nbp == NULL && defrag == 0 && - (isspecial || bufspace < hibufspace)) { - nqindex = QUEUE_EMPTY; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); - } + if (nbp == NULL && defrag == 0 && bufspace < hibufspace) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } @@ -1610,26 +1638,16 @@ restart: goto restart; } - /* - * If we are a normal process then deal with bufspace - * hysteresis. A normal process tries to keep bufspace - * between lobufspace and hibufspace. Note: if we encounter - * a buffer with b_kvasize == 0 then it means we started - * our scan on the EMPTY list and should allocate a new - * buffer. - */ - if (isspecial == 0) { - if (bufspace > hibufspace) - flushingbufs = 1; - if (flushingbufs && bp->b_kvasize != 0) { - bp->b_flags |= B_INVAL; - bfreekva(bp); - brelse(bp); - goto restart; - } - if (bufspace < lobufspace) - flushingbufs = 0; + if (bufspace >= hibufspace) + flushingbufs = 1; + if (flushingbufs && bp->b_kvasize != 0) { + bp->b_flags |= B_INVAL; + bfreekva(bp); + brelse(bp); + goto restart; } + if (bufspace < lobufspace) + flushingbufs = 0; break; } @@ -1705,6 +1723,7 @@ restart: return(bp); } +#if 0 /* * waitfreebuffers: * @@ -1723,6 +1742,8 @@ waitfreebuffers(int slpflag, int slptimeo) } } +#endif + /* * buf_daemon: * @@ -2073,8 +2094,12 @@ loop: * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. + * + * XXX remove if 0 sections (clean this up after its proven) */ +#if 0 if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) { +#endif if (numfreebuffers == 0) { if (curproc == idleproc) return NULL; @@ -2082,9 +2107,11 @@ loop: tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); } +#if 0 } else if (numfreebuffers < lofreebuffers) { waitfreebuffers(slpflag, slptimeo); } +#endif if ((bp = gbincore(vp, blkno))) { /* @@ -2468,7 +2495,13 @@ allocbuf(struct buf *bp, int size) pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { - m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL); + /* + * note: must allocate system pages + * since blocking here could intefere + * with paging I/O, no matter which + * process we are. 
+ */ + m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM); if (m == NULL) { VM_WAIT; vm_pageout_deficit += desiredpages - bp->b_npages; @@ -2671,7 +2704,7 @@ bufdone(struct buf *bp) buf_complete(bp); if (bp->b_flags & B_VMIO) { - int i, resid; + int i; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; @@ -2722,16 +2755,29 @@ bufdone(struct buf *bp) for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; + int resid; + + resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; + if (resid > iosize) + resid = iosize; + + /* + * cleanup bogus pages, restoring the originals + */ m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (!m) { + panic("biodone: page disappeared!"); #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif vm_object_pip_subtract(obj, 1); bp->b_flags &= ~B_CACHE; + foff = (foff + PAGE_SIZE) & + ~(off_t)PAGE_MASK; + iosize -= resid; continue; } bp->b_pages[i] = m; @@ -2744,9 +2790,6 @@ bufdone(struct buf *bp) (unsigned long)foff, m->pindex); } #endif - resid = IDX_TO_OFF(m->pindex + 1) - foff; - if (resid > iosize) - resid = iosize; /* * In the write case, the valid and clean bits are @@ -2784,7 +2827,7 @@ bufdone(struct buf *bp) } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); - foff += resid; + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } if (obj) @@ -2862,7 +2905,7 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) * of the buffer. */ soff = off; - eoff = (off + PAGE_SIZE) & ~PAGE_MASK; + eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; @@ -2948,7 +2991,7 @@ retry: bp->b_pages[i] = bogus_page; bogus++; } - foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); @@ -2976,7 +3019,7 @@ vfs_clean_pages(struct buf * bp) ("vfs_clean_pages: no buffer offset")); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; - vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK; + vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; vm_ooffset_t eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) @@ -3104,9 +3147,14 @@ vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) tryagain: + /* + * note: must allocate system pages since blocking here + * could intefere with paging I/O, no matter which + * process we are. + */ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), - VM_ALLOC_NORMAL); + VM_ALLOC_SYSTEM); if (!p) { vm_pageout_deficit += (to - from) >> PAGE_SHIFT; VM_WAIT; diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 4f1aecf..29a1879 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -48,6 +48,7 @@ #include <sys/malloc.h> #include <sys/mount.h> #include <sys/resourcevar.h> +#include <sys/vmmeter.h> #include <vm/vm.h> #include <vm/vm_object.h> #include <vm/vm_page.h> @@ -665,6 +666,11 @@ cluster_write(bp, filesize, seqcount) cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; + } else if (vm_page_count_severe()) { + /* + * We are low on memory, get it going NOW + */ + bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. 
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index fd81bc8..cb46c34 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -1438,10 +1438,14 @@ vget(vp, flags, p) if ((flags & LK_INTERLOCK) == 0) mtx_enter(&vp->v_interlock, MTX_DEF); if (vp->v_flag & VXLOCK) { - vp->v_flag |= VXWANT; - mtx_exit(&vp->v_interlock, MTX_DEF); - tsleep((caddr_t)vp, PINOD, "vget", 0); - return (ENOENT); + if (vp->v_vxproc == curproc) { + printf("VXLOCK interlock avoided\n"); + } else { + vp->v_flag |= VXWANT; + mtx_exit(&vp->v_interlock, MTX_DEF); + tsleep((caddr_t)vp, PINOD, "vget", 0); + return (ENOENT); + } } vp->v_usecount++; @@ -1731,6 +1735,7 @@ vclean(vp, flags, p) if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; + vp->v_vxproc = curproc; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK @@ -1807,6 +1812,7 @@ vclean(vp, flags, p) vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; + vp->v_vxproc = NULL; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index fd81bc8..cb46c34 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1438,10 +1438,14 @@ vget(vp, flags, p) if ((flags & LK_INTERLOCK) == 0) mtx_enter(&vp->v_interlock, MTX_DEF); if (vp->v_flag & VXLOCK) { - vp->v_flag |= VXWANT; - mtx_exit(&vp->v_interlock, MTX_DEF); - tsleep((caddr_t)vp, PINOD, "vget", 0); - return (ENOENT); + if (vp->v_vxproc == curproc) { + printf("VXLOCK interlock avoided\n"); + } else { + vp->v_flag |= VXWANT; + mtx_exit(&vp->v_interlock, MTX_DEF); + tsleep((caddr_t)vp, PINOD, "vget", 0); + return (ENOENT); + } } vp->v_usecount++; @@ -1731,6 +1735,7 @@ vclean(vp, flags, p) if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; + vp->v_vxproc = curproc; /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. 
The VOP_LOCK @@ -1807,6 +1812,7 @@ vclean(vp, flags, p) vn_pollgone(vp); vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; + vp->v_vxproc = NULL; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 448a2a6..b7cea77 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -642,12 +642,14 @@ debug_vn_lock(vp, flags, p, filename, line) do { if ((flags & LK_INTERLOCK) == 0) mtx_enter(&vp->v_interlock, MTX_DEF); - if (vp->v_flag & VXLOCK) { + if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curproc) { vp->v_flag |= VXWANT; mtx_exit(&vp->v_interlock, MTX_DEF); tsleep((caddr_t)vp, PINOD, "vn_lock", 0); error = ENOENT; } else { + if (vp->v_vxproc != NULL) + printf("VXLOCK interlock avoided in vn_lock\n"); #ifdef DEBUG_LOCKS vp->filename = filename; vp->line = line; diff --git a/sys/sys/buf.h b/sys/sys/buf.h index d085de6..a10083f 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -494,6 +494,7 @@ struct uio; caddr_t bufhashinit __P((caddr_t)); void bufinit __P((void)); void bwillwrite __P((void)); +int buf_dirty_count_severe __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 582d00c..75462f6 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -129,6 +129,7 @@ struct vnode { short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ } v_pollinfo; + struct proc *v_vxproc; /* proc owning VXLOCK */ #ifdef DEBUG_LOCKS const char *filename; /* Source file doing locking */ int line; /* Line number doing locking */ diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 30f36ee7..a8ae464 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -45,6 +45,7 @@ #include <sys/vnode.h> #include <sys/malloc.h> #include <sys/resourcevar.h> +#include <sys/vmmeter.h> #include <sys/stat.h> #include <vm/vm.h> @@ -111,6 +112,8 @@ ffs_update(vp, waitfor) ino_to_fsbo(fs, ip->i_number)) = ip->i_din; if (waitfor && !DOINGASYNC(vp)) { return (bwrite(bp)); + } else if (vm_page_count_severe() || buf_dirty_count_severe()) { + return (bwrite(bp)); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 98ad959..c6ac0bd 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -91,6 +91,8 @@ MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); +#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) + #define D_PAGEDEP 0 #define D_INODEDEP 1 #define D_NEWBLK 2 @@ -802,7 +804,7 @@ top: goto top; } MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, - M_WAITOK); + M_SOFTDEP_FLAGS); bzero(pagedep, sizeof(struct pagedep)); pagedep->pd_list.wk_type = D_PAGEDEP; pagedep->pd_mnt = mp; @@ -879,7 +881,7 @@ top: } num_inodedep += 1; MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), - M_INODEDEP, M_WAITOK); + M_INODEDEP, M_SOFTDEP_FLAGS); inodedep->id_list.wk_type = D_INODEDEP; inodedep->id_fs = fs; inodedep->id_ino = inum; @@ -941,7 +943,7 @@ top: if (sema_get(&newblk_in_progress, 0) == 0) goto top; MALLOC(newblk, struct newblk *, sizeof(struct newblk), - M_NEWBLK, M_WAITOK); + M_NEWBLK, M_SOFTDEP_FLAGS); newblk->nb_state = 0; newblk->nb_fs = fs; newblk->nb_newblkno = newblkno; @@ -1127,7 +1129,7 @@ bmsafemap_lookup(bp) return 
(WK_BMSAFEMAP(wk)); FREE_LOCK(&lk); MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), - M_BMSAFEMAP, M_WAITOK); + M_BMSAFEMAP, M_SOFTDEP_FLAGS); bmsafemap->sm_list.wk_type = D_BMSAFEMAP; bmsafemap->sm_list.wk_state = 0; bmsafemap->sm_buf = bp; @@ -1187,7 +1189,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct newblk *newblk; MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), - M_ALLOCDIRECT, M_WAITOK); + M_ALLOCDIRECT, M_SOFTDEP_FLAGS); bzero(adp, sizeof(struct allocdirect)); adp->ad_list.wk_type = D_ALLOCDIRECT; adp->ad_lbn = lbn; @@ -1339,7 +1341,7 @@ newfreefrag(ip, blkno, size) if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), - M_FREEFRAG, M_WAITOK); + M_FREEFRAG, M_SOFTDEP_FLAGS); freefrag->ff_list.wk_type = D_FREEFRAG; freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ freefrag->ff_inum = ip->i_number; @@ -1408,7 +1410,7 @@ newallocindir(ip, ptrno, newblkno, oldblkno) struct allocindir *aip; MALLOC(aip, struct allocindir *, sizeof(struct allocindir), - M_ALLOCINDIR, M_WAITOK); + M_ALLOCINDIR, M_SOFTDEP_FLAGS); bzero(aip, sizeof(struct allocindir)); aip->ai_list.wk_type = D_ALLOCINDIR; aip->ai_state = ATTACHED; @@ -1561,7 +1563,7 @@ setup_allocindir_phase2(bp, ip, aip) if (indirdep) break; MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), - M_INDIRDEP, M_WAITOK); + M_INDIRDEP, M_SOFTDEP_FLAGS); newindirdep->ir_list.wk_type = D_INDIRDEP; newindirdep->ir_state = ATTACHED; LIST_INIT(&newindirdep->ir_deplisthd); @@ -1623,7 +1625,7 @@ softdep_setup_freeblocks(ip, length) if (length != 0) panic("softde_setup_freeblocks: non-zero length"); MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), - M_FREEBLKS, M_WAITOK); + M_FREEBLKS, M_SOFTDEP_FLAGS); bzero(freeblks, sizeof(struct freeblks)); freeblks->fb_list.wk_type = D_FREEBLKS; freeblks->fb_uid = ip->i_uid; @@ -1870,7 +1872,7 @@ softdep_freefile(pvp, ino, mode) * This sets up the inode de-allocation dependency. 
*/ MALLOC(freefile, struct freefile *, sizeof(struct freefile), - M_FREEFILE, M_WAITOK); + M_FREEFILE, M_SOFTDEP_FLAGS); freefile->fx_list.wk_type = D_FREEFILE; freefile->fx_list.wk_state = 0; freefile->fx_mode = mode; @@ -2186,7 +2188,7 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) fs = dp->i_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); - MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); + MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS); bzero(dap, sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_offset = offset; @@ -2198,12 +2200,12 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, - M_WAITOK); + M_SOFTDEP_FLAGS); mkdir1->md_list.wk_type = D_MKDIR; mkdir1->md_state = MKDIR_BODY; mkdir1->md_diradd = dap; MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, - M_WAITOK); + M_SOFTDEP_FLAGS); mkdir2->md_list.wk_type = D_MKDIR; mkdir2->md_state = MKDIR_PARENT; mkdir2->md_diradd = dap; @@ -2438,7 +2440,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) (void) request_cleanup(FLUSH_REMOVE, 0); num_dirrem += 1; MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), - M_DIRREM, M_WAITOK); + M_DIRREM, M_SOFTDEP_FLAGS); bzero(dirrem, sizeof(struct dirrem)); dirrem->dm_list.wk_type = D_DIRREM; dirrem->dm_state = isrmdir ? RMDIR : 0; @@ -2535,7 +2537,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) */ if (newinum != WINO) { MALLOC(dap, struct diradd *, sizeof(struct diradd), - M_DIRADD, M_WAITOK); + M_DIRADD, M_SOFTDEP_FLAGS); bzero(dap, sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; @@ -2841,7 +2843,7 @@ softdep_disk_io_initiation(bp) * Replace up-to-date version with safe version. 
*/ MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, - M_INDIRDEP, M_WAITOK); + M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; @@ -2942,7 +2944,7 @@ initiate_write_inodeblock(inodedep, bp) if (inodedep->id_savedino != NULL) panic("initiate_write_inodeblock: already doing I/O"); MALLOC(inodedep->id_savedino, struct dinode *, - sizeof(struct dinode), M_INODEDEP, M_WAITOK); + sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS); *inodedep->id_savedino = *dp; bzero((caddr_t)dp, sizeof(struct dinode)); return; diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index be43550..785219c 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -48,6 +48,7 @@ #include <vm/vm_map.h> #include <vm/vnode_pager.h> #include <sys/event.h> +#include <sys/vmmeter.h> #define VN_KNOTE(vp, b) \ KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b)) @@ -501,6 +502,9 @@ WRITE(ap) } else { bawrite(bp); } + } else if (vm_page_count_severe() || buf_dirty_count_severe()) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 6a427c9..a625bc8 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -80,6 +80,7 @@ #include <sys/sysctl.h> #include <sys/blist.h> #include <sys/lock.h> +#include <sys/vmmeter.h> #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 @@ -1619,10 +1620,11 @@ swp_pager_async_iodone(bp) * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). */ - vm_page_protect(m, VM_PROT_READ); pmap_clear_modify(m); vm_page_undirty(m); vm_page_io_finish(m); + if (!vm_page_count_severe() || !vm_page_try_to_cache(m)) + vm_page_protect(m, VM_PROT_READ); } } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 038a5ad..9c868fc 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -860,7 +860,7 @@ loop: * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ - if (vm_paging_needed() || cnt.v_free_count < cnt.v_pageout_free_min) + if (vm_paging_needed()) pagedaemon_wakeup(); splx(s); @@ -882,10 +882,10 @@ vm_wait() s = splvm(); if (curproc == pageproc) { vm_pageout_pages_needed = 1; - tsleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0); + tsleep(&vm_pageout_pages_needed, PSWP, "VMWait", 0); } else { if (!vm_pages_needed) { - vm_pages_needed++; + vm_pages_needed = 1; wakeup(&vm_pages_needed); } tsleep(&cnt.v_free_count, PVM, "vmwait", 0); @@ -1030,7 +1030,8 @@ vm_page_free_wakeup() * if pageout daemon needs pages, then tell it that there are * some free. */ - if (vm_pageout_pages_needed) { + if (vm_pageout_pages_needed && + cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } @@ -1039,9 +1040,9 @@ vm_page_free_wakeup() * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ - if (vm_pages_needed && vm_page_count_min()) { - wakeup(&cnt.v_free_count); + if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = 0; + wakeup(&cnt.v_free_count); } } @@ -1240,6 +1241,9 @@ vm_page_wire(m) * processes. This optimization causes one-time-use metadata to be * reused more quickly. * + * BUT, if we are in a low-memory situation we have no choice but to + * put clean pages on the cache queue. 
+ * * A number of routines use vm_page_unwire() to guarantee that the page * will go into either the inactive or active queues, and will NEVER * be placed in the cache - for example, just after dirtying a page. @@ -1326,6 +1330,25 @@ vm_page_deactivate(vm_page_t m) } /* + * vm_page_try_to_cache: + * + * Returns 0 on failure, 1 on success + */ +int +vm_page_try_to_cache(vm_page_t m) +{ + if (m->dirty || m->hold_count || m->busy || m->wire_count || + (m->flags & (PG_BUSY|PG_UNMANAGED))) { + return(0); + } + vm_page_test_dirty(m); + if (m->dirty) + return(0); + vm_page_cache(m); + return(1); +} + +/* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index cf58985..4c31df9 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -251,6 +251,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT]; #define PG_SWAPINPROG 0x0200 /* swap I/O in progress on page */ #define PG_NOSYNC 0x0400 /* do not collect for syncer */ #define PG_UNMANAGED 0x0800 /* No PV management for page */ +#define PG_MARKER 0x1000 /* special queue marker page */ /* * Misc constants. @@ -403,6 +404,7 @@ void vm_page_activate __P((vm_page_t)); vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int)); vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); +int vm_page_try_to_cache __P((vm_page_t)); void vm_page_dontneed __P((register vm_page_t)); static __inline void vm_page_copy __P((vm_page_t, vm_page_t)); static __inline void vm_page_free __P((vm_page_t)); diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index d12ecac..4ab3930 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -146,6 +146,7 @@ static int defer_swap_pageouts=0; static int disable_swap_pageouts=0; static int max_page_launder=100; +static int vm_pageout_actcmp=0; #if defined(NO_SWAPPING) static int vm_swap_enabled=0; static int vm_swap_idle_enabled=0; @@ -189,6 +190,8 @@ SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, SYSCTL_INT(_vm, OID_AUTO, max_page_launder, CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass"); +SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp, + CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon agressiveness"); #define VM_PAGEOUT_PAGE_COUNT 16 @@ -372,6 +375,7 @@ vm_pageout_flush(mc, count, flags) */ for (i = 0; i < count; i++) { + KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL && mc[i]->dirty == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially dirty page", mc[i], i, count)); vm_page_io_start(mc[i]); vm_page_protect(mc[i], VM_PROT_READ); } @@ -424,6 +428,8 @@ vm_pageout_flush(mc, count, flags) if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_io_finish(mt); + if (!vm_page_count_severe() || !vm_page_try_to_cache(mt)) + vm_page_protect(mt, VM_PROT_READ); } } return numpagedout; @@ -621,10 +627,10 @@ static int vm_pageout_scan() { vm_page_t m, next; + struct vm_page marker; int page_shortage, maxscan, pcount; int addl_page_shortage, addl_page_shortage_init; int maxlaunder; - int launder_loop = 0; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; @@ -646,33 +652,37 @@ vm_pageout_scan() /* * Calculate the number of pages we want to either free or move - * to the cache. + * to the cache. Be more agressive if we aren't making our target. 
*/ - page_shortage = vm_paging_target() + addl_page_shortage_init; + page_shortage = vm_paging_target() + + addl_page_shortage_init + vm_pageout_actcmp; /* - * Figure out what to do with dirty pages when they are encountered. - * Assume that 1/3 of the pages on the inactive list are clean. If - * we think we can reach our target, disable laundering (do not - * clean any dirty pages). If we miss the target we will loop back - * up and do a laundering run. + * Figure out how agressively we should flush dirty pages. */ + { + int factor = vm_pageout_actcmp; - if (cnt.v_inactive_count / 3 > page_shortage) { - maxlaunder = 0; - launder_loop = 0; - } else { - maxlaunder = - (cnt.v_inactive_target > max_page_launder) ? - max_page_launder : cnt.v_inactive_target; - launder_loop = 1; + maxlaunder = cnt.v_inactive_target / 3 + factor; + if (maxlaunder > max_page_launder + factor) + maxlaunder = max_page_launder + factor; } /* + * Initialize our marker + */ + bzero(&marker, sizeof(marker)); + marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; + marker.queue = PQ_INACTIVE; + marker.wire_count = 1; + + /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or - * we have scanned the entire inactive queue. + * we have scanned the entire inactive queue. Note that m->act_count + * is not used to form decisions for the inactive queue, only for the + * active queue. */ rescan0: @@ -690,6 +700,12 @@ rescan0: next = TAILQ_NEXT(m, pageq); + /* + * skip marker pages + */ + if (m->flags & PG_MARKER) + continue; + if (m->hold_count) { s = splvm(); TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); @@ -766,7 +782,8 @@ rescan0: --page_shortage; /* - * Clean pages can be placed onto the cache queue. + * Clean pages can be placed onto the cache queue. This + * effectively frees them. */ } else if (m->dirty == 0) { vm_page_cache(m); @@ -777,7 +794,6 @@ rescan0: * only a limited number of pages per pagedaemon pass. */ } else if (maxlaunder > 0) { - int written; int swap_pageouts_ok; struct vnode *vp = NULL; struct mount *mp; @@ -806,29 +822,6 @@ rescan0: } /* - * For now we protect against potential memory - * deadlocks by requiring significant memory to be - * free if the object is not OBJT_DEFAULT or OBJT_SWAP. - * We do not 'trust' any other object type to operate - * with low memory, not even OBJT_DEVICE. The VM - * allocator will special case allocations done by - * the pageout daemon so the check below actually - * does have some hysteresis in it. It isn't the best - * solution, though. - */ - - if (object->type != OBJT_DEFAULT && - object->type != OBJT_SWAP && - cnt.v_free_count < cnt.v_free_reserved) { - s = splvm(); - TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); - TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, - pageq); - splx(s); - continue; - } - - /* * Presumably we have sufficient free memory to do * the more sophisticated checks and locking required * for vnodes. @@ -879,10 +872,15 @@ rescan0: } /* - * The page might have been moved to another queue - * during potential blocking in vget() above. + * The page might have been moved to another + * queue during potential blocking in vget() + * above. The page might have been freed and + * reused for another vnode. The object might + * have been reused for another vnode. 
*/ - if (m->queue != PQ_INACTIVE) { + if (m->queue != PQ_INACTIVE || + m->object != object || + object->handle != vp) { if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vput(vp); @@ -891,9 +889,10 @@ rescan0: } /* - * The page may have been busied during the blocking in - * vput(); We don't move the page back onto the end of - * the queue so that statistics are more correct if we don't. + * The page may have been busied during the + * blocking in vput(); We don't move the + * page back onto the end of the queue so that + * statistics are more correct if we don't. */ if (m->busy || (m->flags & PG_BUSY)) { vput(vp); @@ -921,42 +920,57 @@ rescan0: * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we - * start the cleaning operation. + * start the cleaning operation. maxlaunder nominally + * counts I/O cost (seeks) rather then bytes. + * + * This operation may cluster, invalidating the 'next' + * pointer. To prevent an inordinate number of + * restarts we use our marker to remember our place. */ - written = vm_pageout_clean(m); + s = splvm(); + TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq); + splx(s); + if (vm_pageout_clean(m) != 0) + --maxlaunder; + s = splvm(); + next = TAILQ_NEXT(&marker, pageq); + TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq); + splx(s); if (vp) { vput(vp); vn_finished_write(mp); } - - maxlaunder -= written; } } /* - * If we still have a page shortage and we didn't launder anything, - * run the inactive scan again and launder something this time. + * If we were not able to meet our target, increase actcmp */ - if (launder_loop == 0 && page_shortage > 0) { - launder_loop = 1; - maxlaunder = - (cnt.v_inactive_target > max_page_launder) ? - max_page_launder : cnt.v_inactive_target; - goto rescan0; + if (vm_page_count_min()) { + if (vm_pageout_actcmp < ACT_MAX / 2) + vm_pageout_actcmp += ACT_ADVANCE; + } else { + if (vm_pageout_actcmp < ACT_DECLINE) + vm_pageout_actcmp = 0; + else + vm_pageout_actcmp -= ACT_DECLINE; } /* - * Compute the page shortage from the point of view of having to - * move pages from the active queue to the inactive queue. + * Compute the number of pages we want to try to move from the + * active queue to the inactive queue. */ - page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) - - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); + page_shortage = vm_paging_target() + + cnt.v_inactive_target - cnt.v_inactive_count; page_shortage += addl_page_shortage; + page_shortage += vm_pageout_actcmp; /* - * Scan the active queue for things we can deactivate + * Scan the active queue for things we can deactivate. We nominally + * track the per-page activity counter and use it to locate + * deactivation candidates. */ pcount = cnt.v_active_count; @@ -1026,7 +1040,8 @@ rescan0: } else { m->act_count -= min(m->act_count, ACT_DECLINE); if (vm_pageout_algorithm_lru || - (m->object->ref_count == 0) || (m->act_count == 0)) { + (m->object->ref_count == 0) || + (m->act_count <= vm_pageout_actcmp)) { page_shortage--; if (m->object->ref_count == 0) { vm_page_protect(m, VM_PROT_NONE); @@ -1111,7 +1126,7 @@ rescan0: * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. 
*/ - if ((vm_swap_size == 0 || swap_pager_full) && vm_page_count_min()) { + if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) { bigproc = NULL; bigsize = 0; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { @@ -1349,20 +1364,31 @@ vm_pageout() int error; int s = splvm(); - if (vm_pages_needed && vm_page_count_min()) { + /* + * If we have enough free memory, wakeup waiters. Do + * not clear vm_pages_needed until we reach our target, + * otherwise we may be woken up over and over again and + * waste a lot of cpu. + */ + if (vm_pages_needed && !vm_page_count_min()) { + if (vm_paging_needed() <= 0) + vm_pages_needed = 0; + wakeup(&cnt.v_free_count); + } + if (vm_pages_needed) { /* * Still not done, sleep a bit and go again */ - vm_pages_needed = 0; tsleep(&vm_pages_needed, PVM, "psleep", hz/2); } else { /* * Good enough, sleep & handle stats */ - vm_pages_needed = 0; error = tsleep(&vm_pages_needed, PVM, "psleep", vm_pageout_stats_interval * hz); if (error && !vm_pages_needed) { + if (vm_pageout_actcmp > 0) + --vm_pageout_actcmp; splx(s); vm_pageout_page_stats(); continue; @@ -1371,11 +1397,9 @@ vm_pageout() if (vm_pages_needed) cnt.v_pdwakeups++; - vm_pages_needed = 0; splx(s); vm_pageout_scan(); vm_pageout_deficit = 0; - wakeup(&cnt.v_free_count); } } |
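
One notable technique in the vm_pageout.c hunks above: because vm_pageout_clean() can cluster and requeue pages, the plain 'next' pointer can be invalidated mid-scan, so the scan now parks a dummy PG_MARKER page after the current entry and resumes from it afterwards. Below is a self-contained sketch of that pattern using the same TAILQ idiom; the types, the requeue-at-tail clean routine, and the maxscan bound are invented stand-ins, and only the insert/skip/resume pattern follows the diff.

/*
 * Userland sketch of the marker technique vm_pageout_scan() adopts in
 * the last hunks: a dummy entry flagged PG_MARKER is parked after the
 * page being laundered so the scan can find its place again after
 * vm_pageout_clean() reshuffles the queue.
 */
#include <stdio.h>
#include <sys/queue.h>

#define PG_MARKER	0x1000

struct page {
	int flags;
	int id;
	TAILQ_ENTRY(page) pageq;
};
TAILQ_HEAD(pagelist, page);

/* Stand-in for vm_pageout_clean(): may reorder the queue under us. */
static void
clean_page(struct pagelist *q, struct page *m)
{
	printf("laundering page %d\n", m->id);
	TAILQ_REMOVE(q, m, pageq);
	TAILQ_INSERT_TAIL(q, m, pageq);
}

int
main(void)
{
	struct pagelist q;
	struct page pages[4], marker, *m, *next;
	int i, maxscan;

	TAILQ_INIT(&q);
	for (i = 0; i < 4; i++) {
		pages[i].flags = 0;
		pages[i].id = i;
		TAILQ_INSERT_TAIL(&q, &pages[i], pageq);
	}
	marker.flags = PG_MARKER;
	marker.id = -1;

	maxscan = 4;		/* bound the pass, as the kernel does */
	for (m = TAILQ_FIRST(&q); m != NULL && maxscan-- > 0; m = next) {
		next = TAILQ_NEXT(m, pageq);
		if (m->flags & PG_MARKER)	/* skip marker entries */
			continue;
		/* Remember our place, do work that may move 'm'... */
		TAILQ_INSERT_AFTER(&q, m, &marker, pageq);
		clean_page(&q, m);
		/* ...then resume from the marker and pull it back out. */
		next = TAILQ_NEXT(&marker, pageq);
		TAILQ_REMOVE(&q, &marker, pageq);
	}
	return (0);
}
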