path: root/sys/kern/vfs_bio.c
author:    dillon <dillon@FreeBSD.org>  2000-11-18 23:06:26 +0000
committer: dillon <dillon@FreeBSD.org>  2000-11-18 23:06:26 +0000
commit:    2ace35208525bb250b47fe7af60ec2ce681c6c92 (patch)
tree:      8b9f3edb21d176840f55c8efbf3c9ffe76fdabc6 /sys/kern/vfs_bio.c
parent:    59e131028ff3997be98ab838d5ab9f965b1589ca (diff)
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory situations prior to now.

The new code is based on the concept that I/O must be able to function in a low-memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low-memory situation, but rather than block they continue to operate and then return resources to the memory pool instead of caching them or leaving them wired.

Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist.

Implement a number of VFS/BIO fixes (found by Ian):

In biodone()'s bogus-page replacement code, the loop was not properly incrementing its loop variables prior to a continue statement. We do not believe this code can be hit anyway, but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled.

In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or a biodone() called without initiating I/O. The problem always caused a panic before; now it doesn't. The problem is mainly an issue with NFS.

Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations used signed arithmetic. It is better to properly extend PAGE_MASK first, before inverting it for the 64-bit masking op.

In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption.

There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this, but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather than continue to corrupt the filesystem. The problem does not occur very often; it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported.

Review by:	(VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by:	(VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
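To see concretely why the ~PAGE_MASK cast fix matters, here is a minimal standalone sketch, assuming a 32-bit int, a 4 KB page size, and a 64-bit file offset; the names are illustrative, not kernel code. The old expression worked only because sign extension of the negative int happened to set the upper bits; widening the mask before inverting is the robust form:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_MASK 0xfff                /* low 12 bits set, as a plain int */

    int
    main(void)
    {
        int64_t foff = 0x100000123LL;      /* a file offset above 4GB */

        /*
         * ~PAGE_MASK is computed in 32-bit int arithmetic, yielding the
         * negative value 0xfffff000.  Converting that to 64 bits
         * sign-extends, so this happens to work -- purely by signedness.
         */
        int64_t lucky  = foff & ~PAGE_MASK;

        /*
         * Had the mask been unsigned, the conversion would zero-extend
         * and silently strip the upper 32 bits of the offset.
         */
        int64_t broken = foff & (int64_t)~(uint32_t)PAGE_MASK;

        /* The fix: widen PAGE_MASK to the 64-bit type first, then invert. */
        int64_t fixed  = foff & ~(int64_t)PAGE_MASK;

        printf("lucky  = 0x%jx\n", (uintmax_t)lucky);   /* 0x100000000 */
        printf("broken = 0x%jx\n", (uintmax_t)broken);  /* 0x0 */
        printf("fixed  = 0x%jx\n", (uintmax_t)fixed);   /* 0x100000000 */
        return 0;
    }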
Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r--  sys/kern/vfs_bio.c | 208
1 file changed, 128 insertions(+), 80 deletions(-)
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 17def1b..9a9aae7 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -597,8 +597,14 @@ bwrite(struct buf * bp)
* If this buffer is marked for background writing and we
* do not have to wait for it, make a copy and write the
* copy so as to leave this buffer ready for further use.
+ *
+ * This optimization eats a lot of memory. If we have a page
+ * or buffer shortfall we can't do it.
*/
- if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC)) {
+ if ((bp->b_xflags & BX_BKGRDWRITE) &&
+ (bp->b_flags & B_ASYNC) &&
+ !vm_page_count_severe() &&
+ !buf_dirty_count_severe()) {
if (bp->b_iodone != NULL) {
printf("bp->b_iodone = %p\n", bp->b_iodone);
panic("bwrite: need chained iodone");
@@ -682,7 +688,10 @@ vfs_backgroundwritedone(bp)
/*
* Clear the BX_BKGRDINPROG flag in the original buffer
* and awaken it if it is waiting for the write to complete.
+ * If BX_BKGRDINPROG is not set in the original buffer it must
+ * have been released and re-instantiated - which is not legal.
*/
+ KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2"));
origbp->b_xflags &= ~BX_BKGRDINPROG;
if (origbp->b_xflags & BX_BKGRDWAIT) {
origbp->b_xflags &= ~BX_BKGRDWAIT;
@@ -903,6 +912,15 @@ bwillwrite(void)
}
/*
+ * Return true if we have too many dirty buffers.
+ */
+int
+buf_dirty_count_severe(void)
+{
+ return(numdirtybuffers >= hidirtybuffers);
+}
+
+/*
* brelse:
*
* Release a busy buffer and, if requested, free its resources. The
@@ -964,10 +982,14 @@ brelse(struct buf * bp)
*
* We still allow the B_INVAL case to call vfs_vmio_release(), even
* if B_DELWRI is set.
+ *
+ * If B_DELWRI is not set we may have to set B_RELBUF if we are low
+ * on pages to return pages to the VM page queues.
*/
-
if (bp->b_flags & B_DELWRI)
bp->b_flags &= ~B_RELBUF;
+ else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
+ bp->b_flags |= B_RELBUF;
/*
* VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
@@ -989,8 +1011,7 @@ brelse(struct buf * bp)
if ((bp->b_flags & B_VMIO)
&& !(bp->b_vp->v_tag == VT_NFS &&
!vn_isdisk(bp->b_vp, NULL) &&
- (bp->b_flags & B_DELWRI) &&
- (bp->b_xflags & BX_BKGRDINPROG))
+ (bp->b_flags & B_DELWRI))
) {
int i, j, resid;
@@ -1017,32 +1038,40 @@ brelse(struct buf * bp)
*
* See man buf(9) for more information
*/
-
resid = bp->b_bufsize;
foff = bp->b_offset;
for (i = 0; i < bp->b_npages; i++) {
+ int had_bogus = 0;
+
m = bp->b_pages[i];
vm_page_flag_clear(m, PG_ZERO);
- if (m == bogus_page) {
+ /*
+ * If we hit a bogus page, fixup *all* the bogus pages
+ * now.
+ */
+ if (m == bogus_page) {
VOP_GETVOBJECT(vp, &obj);
poff = OFF_TO_IDX(bp->b_offset);
+ had_bogus = 1;
for (j = i; j < bp->b_npages; j++) {
- m = bp->b_pages[j];
- if (m == bogus_page) {
- m = vm_page_lookup(obj, poff + j);
- if (!m) {
+ vm_page_t mtmp;
+ mtmp = bp->b_pages[j];
+ if (mtmp == bogus_page) {
+ mtmp = vm_page_lookup(obj, poff + j);
+ if (!mtmp) {
panic("brelse: page missing\n");
}
- bp->b_pages[j] = m;
+ bp->b_pages[j] = mtmp;
}
}
if ((bp->b_flags & B_INVAL) == 0) {
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
}
+ m = bp->b_pages[i];
}
if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
int poffset = foff & PAGE_MASK;
@@ -1051,9 +1080,11 @@ brelse(struct buf * bp)
KASSERT(presid >= 0, ("brelse: extra page"));
vm_page_set_invalid(m, poffset, presid);
+ if (had_bogus)
+ printf("avoided corruption bug in bogus_page/brelse code\n");
}
resid -= PAGE_SIZE - (foff & PAGE_MASK);
- foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bp->b_flags & (B_INVAL | B_RELBUF))
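The mtmp change above fixes an aliasing bug: the old inner j-loop reused 'm' as its scan variable, so after the fixup 'm' no longer pointed at page i, and the vm_page_set_invalid() call further down could hit the wrong page. A minimal userland sketch of the pattern, with hypothetical page structures standing in for vm_page_t and bogus_page:

    #include <stdio.h>

    typedef struct page { int id; } page_t;

    static page_t sentinel = { -1 };                /* plays bogus_page */
    static page_t real[4]  = { {10}, {11}, {12}, {13} };

    int
    main(void)
    {
        /* Pages 1 and 3 were swapped for the sentinel during the I/O. */
        page_t *pages[4] = { &real[0], &sentinel, &real[2], &sentinel };

        for (int i = 0; i < 4; i++) {
            page_t *m = pages[i];
            if (m == &sentinel) {
                /*
                 * The old code reused 'm' for this inner scan, leaving
                 * 'm' pointing at the last page it replaced, so the
                 * "invalidate" below hit the wrong page.  The fix uses
                 * a separate 'mtmp' and reloads 'm' afterwards.
                 */
                for (int j = i; j < 4; j++) {
                    page_t *mtmp = pages[j];
                    if (mtmp == &sentinel)
                        pages[j] = &real[j];   /* restore the original */
                }
                m = pages[i];                  /* reload, as the patch does */
            }
            printf("invalidating range on page %d\n", m->id);
        }
        return 0;
    }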
@@ -1171,7 +1202,7 @@ brelse(struct buf * bp)
/*
* Release a buffer back to the appropriate queue but do not try to free
- * it.
+ * it. The buffer is expected to be used again soon.
*
* bqrelse() is used by bdwrite() to requeue a delayed write, and used by
* biodone() to requeue an async I/O on completion. It is also used when
@@ -1203,6 +1234,15 @@ bqrelse(struct buf * bp)
} else if (bp->b_flags & B_DELWRI) {
bp->b_qindex = QUEUE_DIRTY;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
+ } else if (vm_page_count_severe()) {
+ /*
+ * We are too low on memory, we have to try to free the
+ * buffer (most importantly: the wired pages making up its
+ * backing store) *now*.
+ */
+ splx(s);
+ brelse(bp);
+ return;
} else {
bp->b_qindex = QUEUE_CLEAN;
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
@@ -1264,6 +1304,8 @@ vfs_vmio_release(bp)
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
+ } else if (vm_page_count_severe()) {
+ vm_page_try_to_cache(m);
}
}
}
@@ -1419,15 +1461,15 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
struct buf *nbp;
int defrag = 0;
int nqindex;
- int isspecial;
static int flushingbufs;
- if (curproc != idleproc &&
- (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0)
- isspecial = 0;
- else
- isspecial = 1;
-
+ /*
+ * We can't afford to block since we might be holding a vnode lock,
+ * which may prevent system daemons from running. We deal with
+ * low-memory situations by proactively returning memory and running
+	 * async I/O rather than sync I/O.
+ */
+
++getnewbufcalls;
--getnewbufrestarts;
restart:
@@ -1445,42 +1487,28 @@ restart:
* However, there are a number of cases (defragging, reusing, ...)
* where we cannot backup.
*/
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
- if (isspecial == 0 && numfreebuffers < lofreebuffers) {
+ if (nbp == NULL) {
/*
- * This will cause an immediate failure
+ * If no EMPTYKVA buffers and we are either
+ * defragging or reusing, locate a CLEAN buffer
+	 * to free or reuse. If bufspace usage is low
+ * skip this step so we can allocate a new buffer.
*/
- nqindex = QUEUE_CLEAN;
- nbp = NULL;
- } else {
+ if (defrag || bufspace >= lobufspace) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
/*
- * Locate a buffer which already has KVA assigned. First
- * try EMPTYKVA buffers.
+ * Nada. If we are allowed to allocate an EMPTY
+ * buffer, go get one.
*/
- nqindex = QUEUE_EMPTYKVA;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-
- if (nbp == NULL) {
- /*
- * If no EMPTYKVA buffers and we are either
- * defragging or reusing, locate a CLEAN buffer
- * to free or reuse. If bufspace useage is low
- * skip this step so we can allocate a new buffer.
- */
- if (defrag || bufspace >= lobufspace) {
- nqindex = QUEUE_CLEAN;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
- }
-
- /*
- * Nada. If we are allowed to allocate an EMPTY
- * buffer, go get one.
- */
- if (nbp == NULL && defrag == 0 &&
- (isspecial || bufspace < hibufspace)) {
- nqindex = QUEUE_EMPTY;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
- }
+ if (nbp == NULL && defrag == 0 && bufspace < hibufspace) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
}
}
@@ -1610,26 +1638,16 @@ restart:
goto restart;
}
- /*
- * If we are a normal process then deal with bufspace
- * hysteresis. A normal process tries to keep bufspace
- * between lobufspace and hibufspace. Note: if we encounter
- * a buffer with b_kvasize == 0 then it means we started
- * our scan on the EMPTY list and should allocate a new
- * buffer.
- */
- if (isspecial == 0) {
- if (bufspace > hibufspace)
- flushingbufs = 1;
- if (flushingbufs && bp->b_kvasize != 0) {
- bp->b_flags |= B_INVAL;
- bfreekva(bp);
- brelse(bp);
- goto restart;
- }
- if (bufspace < lobufspace)
- flushingbufs = 0;
+ if (bufspace >= hibufspace)
+ flushingbufs = 1;
+ if (flushingbufs && bp->b_kvasize != 0) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
}
+ if (bufspace < lobufspace)
+ flushingbufs = 0;
break;
}
@@ -1705,6 +1723,7 @@ restart:
return(bp);
}
+#if 0
/*
* waitfreebuffers:
*
@@ -1723,6 +1742,8 @@ waitfreebuffers(int slpflag, int slptimeo)
}
}
+#endif
+
/*
* buf_daemon:
*
@@ -2073,8 +2094,12 @@ loop:
* If this check ever becomes a bottleneck it may be better to
* move it into the else, when gbincore() fails. At the moment
* it isn't a problem.
+ *
+	 * XXX remove if 0 sections (clean this up after it's proven)
*/
+#if 0
if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
+#endif
if (numfreebuffers == 0) {
if (curproc == idleproc)
return NULL;
@@ -2082,9 +2107,11 @@ loop:
tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
slptimeo);
}
+#if 0
} else if (numfreebuffers < lofreebuffers) {
waitfreebuffers(slpflag, slptimeo);
}
+#endif
if ((bp = gbincore(vp, blkno))) {
/*
@@ -2468,7 +2495,13 @@ allocbuf(struct buf *bp, int size)
pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
if ((m = vm_page_lookup(obj, pi)) == NULL) {
- m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
+ /*
+ * note: must allocate system pages
+	 * since blocking here could interfere
+ * with paging I/O, no matter which
+ * process we are.
+ */
+ m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
if (m == NULL) {
VM_WAIT;
vm_pageout_deficit += desiredpages - bp->b_npages;
@@ -2671,7 +2704,7 @@ bufdone(struct buf *bp)
buf_complete(bp);
if (bp->b_flags & B_VMIO) {
- int i, resid;
+ int i;
vm_ooffset_t foff;
vm_page_t m;
vm_object_t obj;
@@ -2722,16 +2755,29 @@ bufdone(struct buf *bp)
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
+ int resid;
+
+ resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
+ if (resid > iosize)
+ resid = iosize;
+
+ /*
+ * cleanup bogus pages, restoring the originals
+ */
m = bp->b_pages[i];
if (m == bogus_page) {
bogusflag = 1;
m = vm_page_lookup(obj, OFF_TO_IDX(foff));
if (!m) {
+ panic("biodone: page disappeared!");
#if defined(VFS_BIO_DEBUG)
printf("biodone: page disappeared\n");
#endif
vm_object_pip_subtract(obj, 1);
bp->b_flags &= ~B_CACHE;
+ foff = (foff + PAGE_SIZE) &
+ ~(off_t)PAGE_MASK;
+ iosize -= resid;
continue;
}
bp->b_pages[i] = m;
@@ -2744,9 +2790,6 @@ bufdone(struct buf *bp)
(unsigned long)foff, m->pindex);
}
#endif
- resid = IDX_TO_OFF(m->pindex + 1) - foff;
- if (resid > iosize)
- resid = iosize;
/*
* In the write case, the valid and clean bits are
@@ -2784,7 +2827,7 @@ bufdone(struct buf *bp)
}
vm_page_io_finish(m);
vm_object_pip_subtract(obj, 1);
- foff += resid;
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
iosize -= resid;
}
if (obj)
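The stepping above is the heart of the biodone() fix: resid is measured from foff to the next page boundary and clamped to the remaining iosize, and foff is advanced to the boundary itself, including on the continue path that previously skipped the updates. A standalone sketch of the per-page arithmetic, assuming 4 KB pages (illustrative C, not the kernel loop):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define PAGE_MASK (PAGE_SIZE - 1)

    int
    main(void)
    {
        int64_t foff   = 0x1ecc;   /* a transfer may begin mid-page */
        int64_t iosize = 10000;    /* bytes covered by the I/O */

        while (iosize > 0) {
            /* Bytes from foff to the next page boundary, clamped. */
            int64_t resid = ((foff + PAGE_SIZE) & ~(int64_t)PAGE_MASK) - foff;
            if (resid > iosize)
                resid = iosize;

            printf("chunk at 0x%jx, %jd bytes\n",
                (uintmax_t)foff, (intmax_t)resid);

            /* Advance to the boundary, never by a clamped resid. */
            foff = (foff + PAGE_SIZE) & ~(int64_t)PAGE_MASK;
            iosize -= resid;
        }
        return 0;
    }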
@@ -2862,7 +2905,7 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
* of the buffer.
*/
soff = off;
- eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
+ eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
if (eoff > bp->b_offset + bp->b_bcount)
eoff = bp->b_offset + bp->b_bcount;
@@ -2948,7 +2991,7 @@ retry:
bp->b_pages[i] = bogus_page;
bogus++;
}
- foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
if (bogus)
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
@@ -2976,7 +3019,7 @@ vfs_clean_pages(struct buf * bp)
("vfs_clean_pages: no buffer offset"));
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
- vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
+ vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
vm_ooffset_t eoff = noff;
if (eoff > bp->b_offset + bp->b_bufsize)
@@ -3104,9 +3147,14 @@ vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
tryagain:
+ /*
+ * note: must allocate system pages since blocking here
+	 * could interfere with paging I/O, no matter which
+ * process we are.
+ */
p = vm_page_alloc(kernel_object,
((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
- VM_ALLOC_NORMAL);
+ VM_ALLOC_SYSTEM);
if (!p) {
vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
VM_WAIT;