Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r--  sys/kern/vfs_bio.c  105
1 file changed, 101 insertions, 4 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index c1b53d8..a980330 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -281,6 +281,8 @@ waitrunningbufspace(void)
* Called when a buffer is extended. This function clears the B_CACHE
* bit if the newly extended portion of the buffer does not contain
* valid data.
+ *
+ * must be called with vm_mtx held
*/
static __inline__
void
@@ -426,11 +428,13 @@ bufinit(void)
* from buf_daemon.
*/
+ mtx_lock(&vm_mtx);
bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
bogus_page = vm_page_alloc(kernel_object,
((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
VM_ALLOC_NORMAL);
cnt.v_wire_count++;
+ mtx_unlock(&vm_mtx);
}
@@ -441,17 +445,27 @@ bufinit(void)
* buffer_map.
*
* Since this call frees up buffer space, we call bufspacewakeup().
+ *
+ * Can be called with or without the vm_mtx.
*/
static void
bfreekva(struct buf * bp)
{
+
if (bp->b_kvasize) {
+ int hadvmlock;
+
++buffreekvacnt;
bufspace -= bp->b_kvasize;
+ hadvmlock = mtx_owned(&vm_mtx);
+ if (!hadvmlock)
+ mtx_lock(&vm_mtx);
vm_map_delete(buffer_map,
(vm_offset_t) bp->b_kvabase,
(vm_offset_t) bp->b_kvabase + bp->b_kvasize
);
+ if (!hadvmlock)
+ mtx_unlock(&vm_mtx);
bp->b_kvasize = 0;
bufspacewakeup();
}
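
[Editor's note: the bfreekva() hunk above introduces the conditional-locking idiom this patch uses for functions that may be entered with or without vm_mtx. A minimal, self-contained sketch of that shape follows; it relies only on the mutex(9) calls already visible in the diff (mtx_owned/mtx_lock/mtx_unlock), and do_vm_work() is a hypothetical placeholder for the locked work (vm_map_delete() in the real function).]

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>

extern struct mtx vm_mtx;		/* the global VM mutex used by this patch */

static void
do_vm_work(void)			/* hypothetical stand-in for vm_map_delete() */
{
}

static void
works_with_or_without_vm_mtx(void)
{
	int hadvmlock;

	hadvmlock = mtx_owned(&vm_mtx);	/* did the caller already take it? */
	if (!hadvmlock)
		mtx_lock(&vm_mtx);

	do_vm_work();			/* vm_mtx is held here either way */

	if (!hadvmlock)
		mtx_unlock(&vm_mtx);	/* leave the caller's lock state unchanged */
}
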
@@ -807,6 +821,7 @@ bdwrite(struct buf * bp)
VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
}
+ mtx_lock(&vm_mtx);
/*
* Set the *dirty* buffer range based upon the VM system dirty pages.
*/
@@ -820,6 +835,7 @@ bdwrite(struct buf * bp)
* out on the next sync, or perhaps the cluster will be completed.
*/
vfs_clean_pages(bp);
+ mtx_unlock(&vm_mtx);
bqrelse(bp);
/*
@@ -973,12 +989,15 @@ buf_dirty_count_severe(void)
* Release a busy buffer and, if requested, free its resources. The
* buffer will be stashed in the appropriate bufqueue[] allowing it
* to be accessed later as a cache entity or reused for other purposes.
+ *
+ * vm_mtx must not be held.
*/
void
brelse(struct buf * bp)
{
int s;
+ mtx_assert(&vm_mtx, MA_NOTOWNED);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
s = splbio();
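
[Editor's note: brelse() above documents its locking contract with mtx_assert(9) rather than with comments alone. Below is a small sketch of the two assertion forms used throughout this patch; under INVARIANTS a violated assertion panics, otherwise the checks compile away. Both functions are hypothetical illustrations, not kernel interfaces.]

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>

extern struct mtx vm_mtx;

static void
requires_vm_mtx(void)
{
	mtx_assert(&vm_mtx, MA_OWNED);		/* caller must hold vm_mtx */
	/* ... safe to touch vm_page/vm_object state ... */
}

static void
forbids_vm_mtx(void)
{
	mtx_assert(&vm_mtx, MA_NOTOWNED);	/* caller must not hold vm_mtx */
	/* ... free to sleep or to take vm_mtx itself ... */
}
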
@@ -1088,6 +1107,7 @@ brelse(struct buf * bp)
resid = bp->b_bufsize;
foff = bp->b_offset;
+ mtx_lock(&vm_mtx);
for (i = 0; i < bp->b_npages; i++) {
int had_bogus = 0;
@@ -1099,10 +1119,12 @@ brelse(struct buf * bp)
* now.
*/
if (m == bogus_page) {
+ mtx_unlock(&vm_mtx);
VOP_GETVOBJECT(vp, &obj);
poff = OFF_TO_IDX(bp->b_offset);
had_bogus = 1;
+ mtx_lock(&vm_mtx);
for (j = i; j < bp->b_npages; j++) {
vm_page_t mtmp;
mtmp = bp->b_pages[j];
@@ -1136,11 +1158,15 @@ brelse(struct buf * bp)
if (bp->b_flags & (B_INVAL | B_RELBUF))
vfs_vmio_release(bp);
+ mtx_unlock(&vm_mtx);
} else if (bp->b_flags & B_VMIO) {
- if (bp->b_flags & (B_INVAL | B_RELBUF))
+ if (bp->b_flags & (B_INVAL | B_RELBUF)) {
+ mtx_lock(&vm_mtx);
vfs_vmio_release(bp);
+ mtx_unlock(&vm_mtx);
+ }
}
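
[Editor's note: the two brelse() hunks above show the other recurring move in this patch: vm_mtx is held across the page walk but dropped around VOP_GETVOBJECT(), and taken only around vfs_vmio_release(). A sketch of that choreography, with hypothetical helpers standing in for the vnode and page operations:]

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>

extern struct mtx vm_mtx;

static void vnode_op(void) { }				/* hypothetical: must run without vm_mtx */
static int  page_needs_fixup(int i) { return (i == 0); }	/* hypothetical check */
static void fixup_page(int i) { (void)i; }		/* hypothetical: needs vm_mtx */

static void
walk_buffer_pages(int npages)
{
	int i;

	mtx_lock(&vm_mtx);
	for (i = 0; i < npages; i++) {
		if (page_needs_fixup(i)) {
			mtx_unlock(&vm_mtx);	/* drop around the vnode call */
			vnode_op();
			mtx_lock(&vm_mtx);	/* re-taken; page state must be re-derived */
		}
		fixup_page(i);
	}
	mtx_unlock(&vm_mtx);
}
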
@@ -1302,6 +1328,9 @@ bqrelse(struct buf * bp)
splx(s);
}
+/*
+ * Must be called with vm_mtx held.
+ */
static void
vfs_vmio_release(bp)
struct buf *bp;
@@ -1310,6 +1339,7 @@ vfs_vmio_release(bp)
vm_page_t m;
s = splvm();
+ mtx_assert(&vm_mtx, MA_OWNED);
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
bp->b_pages[i] = NULL;
@@ -1343,6 +1373,9 @@ vfs_vmio_release(bp)
}
splx(s);
pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+
+ /* could drop vm_mtx here */
+
if (bp->b_bufsize) {
bufspacewakeup();
bp->b_bufsize = 0;
@@ -1614,7 +1647,9 @@ restart:
if (qindex == QUEUE_CLEAN) {
if (bp->b_flags & B_VMIO) {
bp->b_flags &= ~B_ASYNC;
+ mtx_lock(&vm_mtx);
vfs_vmio_release(bp);
+ mtx_unlock(&vm_mtx);
}
if (bp->b_vp)
brelvp(bp);
@@ -1735,6 +1770,8 @@ restart:
if (maxsize != bp->b_kvasize) {
vm_offset_t addr = 0;
+ /* we'll hold the lock over some vm ops */
+ mtx_lock(&vm_mtx);
bfreekva(bp);
if (vm_map_findspace(buffer_map,
@@ -1743,6 +1780,7 @@ restart:
* Uh oh. Buffer map is too fragmented. We
* must defragment the map.
*/
+ mtx_unlock(&vm_mtx);
++bufdefragcnt;
defrag = 1;
bp->b_flags |= B_INVAL;
@@ -1759,6 +1797,7 @@ restart:
bufspace += bp->b_kvasize;
++bufreusecnt;
}
+ mtx_unlock(&vm_mtx);
}
bp->b_data = bp->b_kvabase;
}
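
[Editor's note: in the getnewbuf() hunks above, vm_mtx is held across the KVA reservation (bfreekva() plus the vm_map calls), released early on the fragmentation branch, and released at the end of the success path. A sketch of that two-exit locking shape; reserve_kva() is a hypothetical placeholder for the map operations:]

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>

extern struct mtx vm_mtx;

static int
reserve_kva(int size)			/* hypothetical stand-in for the vm_map calls */
{
	return (size > 0);		/* pretend any positive-sized request succeeds */
}

static int
grow_buffer_kva(int maxsize)
{
	mtx_lock(&vm_mtx);
	if (!reserve_kva(maxsize)) {
		mtx_unlock(&vm_mtx);	/* failure: unlock before the defrag/retry path */
		return (0);
	}
	/* ... record the new KVA base and size ... */
	mtx_unlock(&vm_mtx);		/* success: unlock after the bookkeeping */
	return (1);
}
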
@@ -1936,18 +1975,24 @@ inmem(struct vnode * vp, daddr_t blkno)
size = vp->v_mount->mnt_stat.f_iosize;
off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
+ mtx_lock(&vm_mtx);
for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
if (!m)
- return 0;
+ goto notinmem;
tinc = size;
if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
if (vm_page_is_valid(m,
(vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
- return 0;
+ goto notinmem;
}
+ mtx_unlock(&vm_mtx);
return 1;
+
+notinmem:
+ mtx_unlock(&vm_mtx);
+ return (0);
}
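
[Editor's note: inmem() above replaces its bare "return 0" statements with "goto notinmem" so that every exit path passes through a single mtx_unlock(). A compact sketch of that single-exit shape, with a hypothetical page_resident() check:]

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>

extern struct mtx vm_mtx;

static int
page_resident(int i)			/* hypothetical per-page check */
{
	return (i >= 0);
}

static int
all_pages_resident(int npages)
{
	int i;

	mtx_lock(&vm_mtx);
	for (i = 0; i < npages; i++) {
		if (!page_resident(i))
			goto notinmem;	/* bail out through the common unlock */
	}
	mtx_unlock(&vm_mtx);
	return (1);

notinmem:
	mtx_unlock(&vm_mtx);
	return (0);
}
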
/*
@@ -1960,11 +2005,14 @@ inmem(struct vnode * vp, daddr_t blkno)
*
* This routine is primarily used by NFS, but is generalized for the
* B_VMIO case.
+ *
+ * Can be called with or without vm_mtx
*/
static void
vfs_setdirty(struct buf *bp)
{
int i;
+ int hadvmlock;
vm_object_t object;
/*
@@ -1983,6 +2031,10 @@ vfs_setdirty(struct buf *bp)
if ((bp->b_flags & B_VMIO) == 0)
return;
+ hadvmlock = mtx_owned(&vm_mtx);
+ if (!hadvmlock)
+ mtx_lock(&vm_mtx);
+
object = bp->b_pages[0]->object;
if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
@@ -2040,6 +2092,8 @@ vfs_setdirty(struct buf *bp)
bp->b_dirtyend = eoffset;
}
}
+ if (!hadvmlock)
+ mtx_unlock(&vm_mtx);
}
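
[Editor's note: vfs_setdirty() above adopts the same "with or without vm_mtx" contract as bfreekva(). From the caller's side the point is that both of the following hypothetical call sites are legal; the callee checks mtx_owned() and only locks when the caller has not:]

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>

extern struct mtx vm_mtx;

static void
conditional_callee(void)		/* hypothetical vfs_setdirty()-style function */
{
	int hadvmlock;

	hadvmlock = mtx_owned(&vm_mtx);
	if (!hadvmlock)
		mtx_lock(&vm_mtx);
	/* ... work that needs vm_mtx ... */
	if (!hadvmlock)
		mtx_unlock(&vm_mtx);
}

static void
caller_without_lock(void)
{
	conditional_callee();		/* callee takes and drops vm_mtx itself */
}

static void
caller_with_lock(void)
{
	mtx_lock(&vm_mtx);
	conditional_callee();		/* callee sees mtx_owned() and leaves it alone */
	mtx_unlock(&vm_mtx);
}
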
/*
@@ -2441,6 +2495,7 @@ allocbuf(struct buf *bp, int size)
* DEV_BSIZE aligned existing buffer size. Figure out
* if we have to remove any pages.
*/
+ mtx_lock(&vm_mtx);
if (desiredpages < bp->b_npages) {
for (i = desiredpages; i < bp->b_npages; i++) {
/*
@@ -2461,6 +2516,7 @@ allocbuf(struct buf *bp, int size)
(desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
bp->b_npages = desiredpages;
}
+ mtx_unlock(&vm_mtx);
} else if (size > bp->b_bcount) {
/*
* We are growing the buffer, possibly in a
@@ -2481,6 +2537,7 @@ allocbuf(struct buf *bp, int size)
vp = bp->b_vp;
VOP_GETVOBJECT(vp, &obj);
+ mtx_lock(&vm_mtx);
while (bp->b_npages < desiredpages) {
vm_page_t m;
vm_pindex_t pi;
@@ -2589,6 +2646,9 @@ allocbuf(struct buf *bp, int size)
bp->b_pages,
bp->b_npages
);
+
+ mtx_unlock(&vm_mtx);
+
bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
(vm_offset_t)(bp->b_offset & PAGE_MASK));
}
@@ -2726,6 +2786,7 @@ bufdone(struct buf *bp)
if (error) {
panic("biodone: no object");
}
+ mtx_lock(&vm_mtx);
#if defined(VFS_BIO_DEBUG)
if (obj->paging_in_progress < bp->b_npages) {
printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
@@ -2814,6 +2875,7 @@ bufdone(struct buf *bp)
}
if (obj)
vm_object_pip_wakeupn(obj, 0);
+ mtx_unlock(&vm_mtx);
}
/*
@@ -2837,12 +2899,15 @@ bufdone(struct buf *bp)
* This routine is called in lieu of iodone in the case of
* incomplete I/O. This keeps the busy status for pages
* consistent.
+ *
+ * vm_mtx should not be held
*/
void
vfs_unbusy_pages(struct buf * bp)
{
int i;
+ mtx_assert(&vm_mtx, MA_NOTOWNED);
runningbufwakeup(bp);
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
@@ -2850,6 +2915,7 @@ vfs_unbusy_pages(struct buf * bp)
VOP_GETVOBJECT(vp, &obj);
+ mtx_lock(&vm_mtx);
for (i = 0; i < bp->b_npages; i++) {
vm_page_t m = bp->b_pages[i];
@@ -2866,6 +2932,7 @@ vfs_unbusy_pages(struct buf * bp)
vm_page_io_finish(m);
}
vm_object_pip_wakeupn(obj, 0);
+ mtx_unlock(&vm_mtx);
}
}
@@ -2876,12 +2943,15 @@ vfs_unbusy_pages(struct buf * bp)
* range is restricted to the buffer's size.
*
* This routine is typically called after a read completes.
+ *
+ * vm_mtx should be held
*/
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
vm_ooffset_t soff, eoff;
+ mtx_assert(&vm_mtx, MA_OWNED);
/*
* Start and end offsets in buffer. eoff - soff may not cross a
* page boundary or cross the end of the buffer. The end of the
@@ -2917,12 +2987,15 @@ vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
* Since I/O has not been initiated yet, certain buffer flags
* such as BIO_ERROR or B_INVAL may be in an inconsistent state
* and should be ignored.
+ *
+ * vm_mtx should not be held
*/
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
int i, bogus;
+ mtx_assert(&vm_mtx, MA_NOTOWNED);
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
@@ -2932,6 +3005,7 @@ vfs_busy_pages(struct buf * bp, int clear_modify)
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_busy_pages: no buffer offset"));
+ mtx_lock(&vm_mtx);
vfs_setdirty(bp);
retry:
@@ -2979,6 +3053,7 @@ retry:
}
if (bogus)
pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ mtx_unlock(&vm_mtx);
}
}
@@ -2989,12 +3064,15 @@ retry:
*
* Note that while we only really need to clean through to b_bcount, we
* just go ahead and clean through to b_bufsize.
+ *
+ * should be called with vm_mtx held
*/
static void
vfs_clean_pages(struct buf * bp)
{
int i;
+ mtx_assert(&vm_mtx, MA_OWNED);
if (bp->b_flags & B_VMIO) {
vm_ooffset_t foff;
@@ -3021,6 +3099,7 @@ vfs_clean_pages(struct buf * bp)
* Set the range within the buffer to valid and clean. The range is
* relative to the beginning of the buffer, b_offset. Note that b_offset
* itself may be offset from the beginning of the first page.
+ *
*/
void
@@ -3061,13 +3140,18 @@ vfs_bio_set_validclean(struct buf *bp, int base, int size)
*
* Note that while we only theoretically need to clear through b_bcount,
* we go ahead and clear through b_bufsize.
+ *
+ * We'll get vm_mtx here for safety if processing a VMIO buffer.
+ * I don't think vm_mtx is needed, but we're twiddling vm_page flags.
*/
void
vfs_bio_clrbuf(struct buf *bp) {
int i, mask = 0;
caddr_t sa, ea;
+
if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
+ mtx_lock(&vm_mtx);
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
@@ -3079,6 +3163,7 @@ vfs_bio_clrbuf(struct buf *bp) {
}
bp->b_pages[0]->valid |= mask;
bp->b_resid = 0;
+ mtx_unlock(&vm_mtx);
return;
}
ea = sa = bp->b_data;
@@ -3106,6 +3191,7 @@ vfs_bio_clrbuf(struct buf *bp) {
vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
}
bp->b_resid = 0;
+ mtx_unlock(&vm_mtx);
} else {
clrbuf(bp);
}
@@ -3115,18 +3201,22 @@ vfs_bio_clrbuf(struct buf *bp) {
* vm_hold_load_pages and vm_hold_unload pages get pages into
* a buffer's address space. The pages are anonymous and are
* not associated with a file object.
+ *
+ * vm_mtx should not be held
*/
-void
+static void
vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
vm_offset_t pg;
vm_page_t p;
int index;
+ mtx_assert(&vm_mtx, MA_NOTOWNED);
to = round_page(to);
from = round_page(from);
index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+ mtx_lock(&vm_mtx);
for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
tryagain:
@@ -3152,6 +3242,7 @@ tryagain:
vm_page_wakeup(p);
}
bp->b_npages = index;
+ mtx_unlock(&vm_mtx);
}
void
@@ -3160,11 +3251,15 @@ vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
vm_offset_t pg;
vm_page_t p;
int index, newnpages;
+ int hadvmlock;
from = round_page(from);
to = round_page(to);
newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+ hadvmlock = mtx_owned(&vm_mtx);
+ if (!hadvmlock)
+ mtx_lock(&vm_mtx);
for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
p = bp->b_pages[index];
if (p && (index < bp->b_npages)) {
@@ -3180,6 +3275,8 @@ vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
}
}
bp->b_npages = newnpages;
+ if (!hadvmlock)
+ mtx_unlock(&vm_mtx);
}