This is David Schultz's swapoff code which I am finally able to commit.

This should be considered highly experimental for the moment.

Submitted by: David Schultz <dschultz@uclink.Berkeley.EDU>
MFC after: 3 weeks
author: dillon <dillon@FreeBSD.org> 2002-12-15 19:17:57 +0000
committer: dillon <dillon@FreeBSD.org> 2002-12-15 19:17:57 +0000
commit: b43fb3e9200092f2885e909dc7ee85cb0871cfef (patch)
tree: fc6e3be9fa1b757f9ac0967a46494adcf0cc5682 /sys/vm
parent: 2925e70a14eb46bd10c8905fd619024bb19f7f9d (diff)
download: FreeBSD-src-b43fb3e9200092f2885e909dc7ee85cb0871cfef.zip
FreeBSD-src-b43fb3e9200092f2885e909dc7ee85cb0871cfef.tar.gz
5 files changed, 339 insertions, 8 deletions
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index de203e2..2f43bc4 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -206,6 +206,8 @@ static __inline daddr_t	swp_pager_getswapspace(int npages);
 /*
  * Metadata functions
  */
+static __inline struct swblock **
+    swp_pager_hash(vm_object_t object, vm_pindex_t index);
 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free_all(vm_object_t);
@@ -512,12 +514,22 @@ swp_pager_freeswapspace(blk, npages)
 	daddr_t blk;
 	int npages;
 {
+	struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];
+
 	GIANT_REQUIRED;
 
+	/* per-swap area stats */
+	sp->sw_used -= npages;
+
+	/*
+	 * If we are attempting to stop swapping on this device, we
+	 * don't want to mark any blocks free lest they be reused.
+	 */
+	if (sp->sw_flags & SW_CLOSING)
+		return;
+
 	blist_free(swapblist, blk, npages);
 	vm_swap_size += npages;
-	/* per-swap area stats */
-	swdevt[BLK2DEVIDX(blk)].sw_used -= npages;
 	swp_sizecheck();
 }
 
@@ -1624,6 +1636,149 @@ swp_pager_async_iodone(bp)
 	splx(s);
 }
 
+/*
+ *	swap_pager_isswapped:
+ *
+ *	Return 1 if at least one page in the given object is paged
+ *	out to the given swap device (index devidx), otherwise 0.
+ *	This routine may not block.
+ *	NOTE(review): every probe counts toward swp_bcount, hit or miss.
+ */
+int swap_pager_isswapped(vm_object_t object, int devidx) {
+	daddr_t index = 0;	/* page index passed to swp_pager_hash() for each probe */
+	int bcount;		/* number of probes made so far */
+	int i;
+
+	for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
+		struct swblock *swap;
+
+		if ((swap = *swp_pager_hash(object, index)) != NULL) {
+			for (i = 0; i < SWAP_META_PAGES; ++i) {
+				daddr_t v = swap->swb_pages[i];
+				if (v != SWAPBLK_NONE &&
+				    BLK2DEVIDX(v) == devidx)
+					return 1;
+			}
+		}
+
+		index += SWAP_META_PAGES;	/* advance to the next meta block's index range */
+		if (index > 0x20000000)		/* hard upper bound on the scan */
+			panic("swap_pager_isswapped: failed to locate all swap meta blocks");
+	}
+	return 0;
+}
+
+/*
+ * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
+ *
+ *	This routine dissociates the page at the given index within a
+ *	swap block from its backing store, paging it in if necessary.
+ *	If the page is paged in, it is placed in the inactive queue,
+ *	since it had its backing store ripped out from under it.
+ *	We would also like to swap in all other pages in the swap block,
+ *	but we only guarantee that the one at the specified index is
+ *	paged in.
+ *
+ *	XXX - The code to page the whole block in doesn't work, so we
+ *	      revert to the one-by-one behavior for now.  Sigh.
+ */
+static __inline void
+swp_pager_force_pagein(struct swblock *swap, int idx)
+{
+	vm_object_t object;
+	vm_page_t m;
+	vm_pindex_t pindex;
+
+	object = swap->swb_object;
+	pindex = swap->swb_index;
+
+	vm_object_pip_add(object, 1);	/* dropped again on both paths below */
+	m = vm_page_grab(object, pindex + idx, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
+	if (m->valid == VM_PAGE_BITS_ALL) {	/* already fully valid: no read from swap needed */
+		vm_object_pip_subtract(object, 1);
+		vm_page_lock_queues();
+		vm_page_activate(m);
+		vm_page_dirty(m);
+		vm_page_wakeup(m);
+		vm_page_unlock_queues();
+		vm_pager_page_unswapped(m);
+		return;
+	}
+
+	if (swap_pager_getpages(object, &m, 1, 0) !=
+	    VM_PAGER_OK)
+		panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
+	vm_object_pip_subtract(object, 1);
+
+	vm_page_lock_queues();
+	vm_page_dirty(m);	/* dirtied before vm_pager_page_unswapped() below */
+	vm_page_dontneed(m);
+	vm_page_wakeup(m);
+	vm_page_unlock_queues();
+	vm_pager_page_unswapped(m);
+}
+
+
+/*
+ *	swap_pager_swapoff:
+ *
+ *	Page in all of the pages that have been paged out to the
+ *	given device.  The corresponding blocks in the bitmap must be
+ *	marked as allocated and the device must be flagged SW_CLOSING.
+ *	There may be no processes swapped out to the device.
+ *
+ *	The sw_used parameter points to the field in the swdevt structure
+ *	that contains a count of the number of blocks still allocated
+ *	on the device.  If we encounter objects with a nonzero pip count
+ *	in our scan, we use this number to determine if we're really done.
+ *
+ *	This routine may block.
+ */
+void
+swap_pager_swapoff(int devidx, int *sw_used)
+{
+	struct swblock **pswap;
+	struct swblock *swap;
+	vm_object_t waitobj;
+	daddr_t v;
+	int i, j;
+
+	GIANT_REQUIRED;
+
+full_rescan:
+	waitobj = NULL;
+	for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
+restart:
+		pswap = &swhash[i];
+		while ((swap = *pswap) != NULL) {
+                        for (j = 0; j < SWAP_META_PAGES; ++j) {
+                                v = swap->swb_pages[j];
+                                if (v != SWAPBLK_NONE &&
+				    BLK2DEVIDX(v) == devidx)
+                                        break;
+                        }
+			if (j < SWAP_META_PAGES) {	/* slot j is backed by devidx */
+				swp_pager_force_pagein(swap, j);
+				goto restart;	/* rescan this hash chain from its head */
+			} else if (swap->swb_object->paging_in_progress) {
+				if (!waitobj)
+					waitobj = swap->swb_object;	/* keep the first busy object seen */
+			}
+			pswap = &swap->swb_hnext;
+		}
+	}
+	if (waitobj && *sw_used) {
+	    /*
+	     * We wait on an arbitrary object to clock our rescans
+	     * to the rate of paging completion.
+	     */
+	    vm_object_pip_wait(waitobj, "swpoff");
+	    goto full_rescan;
+	}
+	if (*sw_used)	/* blocks still counted, but nothing left to wait on */
+	    panic("swapoff: failed to locate %d swap blocks", *sw_used);
+}
+
 /************************************************************************
  *				SWAP META DATA 				*
  ************************************************************************
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 97d50d3..4402284 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -83,9 +83,11 @@ extern struct pagerlst swap_pager_un_object_list;
 extern int swap_pager_full;
 extern struct blist *swapblist;
 extern struct uma_zone *swap_zone;
+extern int nswap_lowat, nswap_hiwat;
 
 void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
+void swap_pager_swapoff(int devidx, int *sw_used);
 
 int swap_pager_swp_alloc(vm_object_t, int);
 void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int);
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 6ac6a96..e38b3d3 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -91,6 +91,7 @@
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
 
 #include <sys/user.h>
 
@@ -324,6 +325,45 @@ vm_proc_swapin(struct proc *p)
 	up = (vm_offset_t)p->p_uarea;
 	pmap_qenter(up, ma, UAREA_PAGES);
 }
+
+/*
+ * Swap in the UAREAs of all processes swapped out to the swap device devidx.
+ * The pages in the UAREA are marked dirty and their swap metadata is freed.
+ */
+void
+vm_proc_swapin_all(int devidx)
+{
+	struct proc *p;
+	vm_object_t object;
+	vm_page_t m;
+
+retry:
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		mtx_lock_spin(&sched_lock);
+
+		object = p->p_upages_obj;
+		if (object != NULL &&
+		    swap_pager_isswapped(p->p_upages_obj, devidx)) {
+			sx_sunlock(&allproc_lock);
+			faultin(p);
+			mtx_unlock_spin(&sched_lock);
+			PROC_UNLOCK(p);
+			vm_page_lock_queues();
+			TAILQ_FOREACH(m, &object->memq, listq)
+				vm_page_dirty(m);
+			vm_page_unlock_queues();
+			swap_pager_freespace(object, 0,
+			    object->un_pager.swp.swp_bcount);
+			goto retry;	/* allproc_lock was dropped above; restart the scan */
+		}
+
+		mtx_unlock_spin(&sched_lock);
+		PROC_UNLOCK(p);
+	}
+	sx_sunlock(&allproc_lock);
+}
 #endif
 
 /*
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index c909c68..d68ec79 100644
--- a/sys/vm/vm_pageout.h
+++ b/sys/vm/vm_pageout.h
@@ -104,6 +104,12 @@ extern void pagedaemon_wakeup(void);
 extern void vm_wait(void);
 extern void vm_waitpfault(void);
 
+/* XXX This is probably misplaced. */
+#ifndef NO_SWAPPING
+void vm_proc_swapin_all(int);
+int swap_pager_isswapped(vm_object_t, int);
+#endif	/* !NO_SWAPPING */
+
 #ifdef _KERNEL
 void vm_pageout_page(vm_page_t, vm_object_t);
 void vm_pageout_cluster(vm_page_t, vm_object_t);
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c
index 1781182..0ec5220 100644
--- a/sys/vm/vm_swap.c
+++ b/sys/vm/vm_swap.c
@@ -36,6 +36,7 @@
 
 #include "opt_mac.h"
 #include "opt_swap.h"
+#include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -58,6 +59,7 @@
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
@@ -73,6 +75,8 @@ struct swdevt *swdevt = should_be_malloced;
 static int nswap;		/* first block after the interleaved devs */
 int nswdev = NSWAPDEV;
 int vm_swap_size;
+static int swdev_syscall_active = 0; /* serialize swap(on|off) */
+
 
 static int swapdev_strategy(struct vop_strategy_args *ap);
 struct vnode *swapdev_vp;
@@ -165,11 +169,12 @@ swapdev_strategy(ap)
 
 /*
  * Create a special vnode op vector for swapdev_vp - we only use
- * VOP_STRATEGY(), everything else returns an error.
+ * VOP_STRATEGY() and reclaim; everything else returns an error.
  */
 vop_t **swapdev_vnodeop_p;
 static struct vnodeopv_entry_desc swapdev_vnodeop_entries[] = {  
 	{ &vop_default_desc,		(vop_t *) vop_defaultop },
+	{ &vop_reclaim_desc,		(vop_t *) vop_null },
 	{ &vop_strategy_desc,		(vop_t *) swapdev_strategy },
 	{ NULL, NULL }
 };
@@ -208,19 +213,23 @@ swapon(td, uap)
 	if (error)
 		goto done2;
 
+	while (swdev_syscall_active)
+	    tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
+	swdev_syscall_active = 1;
+
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
 	if (swap_zone == NULL) {
 		error = ENOMEM;
-		goto done2;
+		goto done;
 	}
 
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
 	error = namei(&nd);
 	if (error)
-		goto done2;
+		goto done;
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
@@ -239,6 +248,9 @@ swapon(td, uap)
 
 	if (error)
 		vrele(vp);
+done:
+	swdev_syscall_active = 0;
+	wakeup_one(&swdev_syscall_active);
 done2:
 	mtx_unlock(&Giant);
 	return (error);
@@ -252,8 +264,6 @@ done2:
  *
  * The new swap code uses page-sized blocks.  The old swap code used
  * DEV_BSIZE'd chunks.
- *
- * XXX locking when multiple swapon's run in parallel
  */
 int
 swaponvp(td, vp, dev, nblks)
@@ -330,7 +340,7 @@ swaponvp(td, vp, dev, nblks)
 	sp->sw_vp = vp;
 	sp->sw_dev = dev2udev(dev);
 	sp->sw_device = dev;
-	sp->sw_flags |= SW_FREED;
+	sp->sw_flags = SW_FREED;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 
@@ -356,9 +366,127 @@ swaponvp(td, vp, dev, nblks)
 		vm_swap_size += blk;
 	}
 
+	swap_pager_full = 0;
+
 	return (0);
 }
 
+/*
+ * SYSCALL: swapoff(devname)
+ *
+ * Disable swapping on the given device.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct swapoff_args {
+	char *name;	/* user-space path naming the swap device */
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+swapoff(td, uap)
+	struct thread *td;
+	struct swapoff_args *uap;
+{
+	struct vnode *vp;
+	struct nameidata nd;
+	struct swdevt *sp;
+	swblk_t dvbase, vsbase;
+	u_long nblks, aligned_nblks, blk;
+	int error, index;
+
+	mtx_lock(&Giant);
+
+	error = suser(td);
+	if (error)
+		goto done2;
+
+	while (swdev_syscall_active)	/* wait until no other swapon/swapoff is running */
+	    tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
+	swdev_syscall_active = 1;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
+	error = namei(&nd);
+	if (error)
+		goto done;
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	vp = nd.ni_vp;
+
+	for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {	/* find the swdevt[] slot using vp */
+		if (sp->sw_vp == vp)
+			goto found;
+	}
+	error = EINVAL;	/* no swdevt[] slot uses this vnode */
+	goto done;	/* NOTE(review): vp from namei() is not vrele'd on this path -- confirm */
+found:
+	nblks = sp->sw_nblks;
+
+	/*
+	 * We can turn off this swap device safely only if the
+	 * available virtual memory in the system will fit the amount
+	 * of data we will have to page back in, plus an epsilon so
+	 * the system doesn't become critically low on swap space.
+	 */
+	if (cnt.v_free_count + cnt.v_cache_count + vm_swap_size <
+	    nblks + nswap_lowat) {
+		error = ENOMEM;
+		goto done;	/* NOTE(review): vp is not vrele'd here either -- confirm */
+	}
+
+	/*
+	 * Prevent further allocations on this device.
+	 */
+	sp->sw_flags |= SW_CLOSING;
+	for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {	/* one dmmax-sized stripe per pass, starting at dmmax */
+		blk = min(nblks - dvbase, dmmax);
+		vsbase = index * dmmax + dvbase * nswdev;
+		vm_swap_size -= blist_fill(swapblist, vsbase, blk);
+	}
+
+	/*
+	 * Page in the contents of the device and close it.
+	 */
+#ifndef NO_SWAPPING
+       	vm_proc_swapin_all(index);
+#endif /* !NO_SWAPPING */
+	swap_pager_swapoff(index, &sp->sw_used);
+
+	VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+	vrele(vp);
+	sp->sw_vp = NULL;	/* slot is skipped by the size scan below */
+
+	/*
+	 * Resize the bitmap based on the new largest swap device,
+	 * or free the bitmap if there are no more devices.
+	 */
+	for (sp = swdevt, nblks = 0; sp < swdevt + nswdev; sp++) {
+		if (sp->sw_vp == NULL)
+			continue;
+		nblks = max(nblks, sp->sw_nblks);
+	}
+
+	aligned_nblks = (nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1);	/* round up to a multiple of dmmax (assumes dmmax is a power of two) */
+	nswap = aligned_nblks * nswdev;
+
+	if (nswap == 0) {	/* no swap devices remain */
+		blist_destroy(swapblist);
+		swapblist = NULL;
+		vrele(swapdev_vp);
+		swapdev_vp = NULL;
+	} else
+		blist_resize(&swapblist, nswap, 0);
+
+done:
+	swdev_syscall_active = 0;	/* let the next waiting swapon/swapoff proceed */
+	wakeup_one(&swdev_syscall_active);
+done2:
+	mtx_unlock(&Giant);
+	return (error);
+}
+
 static int
 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 {
author	dillon <dillon@FreeBSD.org>	2002-12-15 19:17:57 +0000
committer	dillon <dillon@FreeBSD.org>	2002-12-15 19:17:57 +0000
commit	b43fb3e9200092f2885e909dc7ee85cb0871cfef (patch)
tree	fc6e3be9fa1b757f9ac0967a46494adcf0cc5682 /sys/vm
parent	2925e70a14eb46bd10c8905fd619024bb19f7f9d (diff)
download	FreeBSD-src-b43fb3e9200092f2885e909dc7ee85cb0871cfef.zip FreeBSD-src-b43fb3e9200092f2885e909dc7ee85cb0871cfef.tar.gz