author	dillon <dillon@FreeBSD.org>	2002-12-15 19:17:57 +0000
committer	dillon <dillon@FreeBSD.org>	2002-12-15 19:17:57 +0000
commit	b43fb3e9200092f2885e909dc7ee85cb0871cfef (patch)
tree	fc6e3be9fa1b757f9ac0967a46494adcf0cc5682 /sys
parent	2925e70a14eb46bd10c8905fd619024bb19f7f9d (diff)
This is David Schultz's swapoff code which I am finally able to commit.
This should be considered highly experimental for the moment.

Submitted by:	David Schultz <dschultz@uclink.Berkeley.EDU>
MFC after:	3 weeks
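As context, a minimal user-space sketch of invoking the new syscall (not part of this commit). It assumes a kernel built with this change, root privileges, and a placeholder swap device path; since a libc wrapper may not exist yet, syscall(2) is called directly with number 424 from the tables below:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        /* 424 is the syscall number assigned in syscalls.master below. */
        if (syscall(424, "/dev/ad0s1b") == -1) {
                perror("swapoff");
                return (1);
        }
        return (0);
}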
Diffstat (limited to 'sys')
-rw-r--r--  sys/amd64/ia32/syscalls.master          1
-rw-r--r--  sys/compat/freebsd32/syscalls.master    1
-rw-r--r--  sys/ia64/ia32/syscalls.master           1
-rw-r--r--  sys/kern/subr_blist.c                 215
-rw-r--r--  sys/kern/syscalls.master                1
-rw-r--r--  sys/sys/blist.h                         2
-rw-r--r--  sys/sys/conf.h                          1
-rw-r--r--  sys/vm/swap_pager.c                   159
-rw-r--r--  sys/vm/swap_pager.h                     2
-rw-r--r--  sys/vm/vm_glue.c                       40
-rw-r--r--  sys/vm/vm_pageout.h                     6
-rw-r--r--  sys/vm/vm_swap.c                      140
12 files changed, 529 insertions, 40 deletions
diff --git a/sys/amd64/ia32/syscalls.master b/sys/amd64/ia32/syscalls.master
index 66cc75f..d30b591 100644
--- a/sys/amd64/ia32/syscalls.master
+++ b/sys/amd64/ia32/syscalls.master
@@ -594,3 +594,4 @@
421 UNIMPL BSD getcontext
422 UNIMPL BSD setcontext
423 UNIMPL BSD swapcontext
+424 MNOPROTO BSD swapoff
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index 66cc75f..d30b591 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -594,3 +594,4 @@
421 UNIMPL BSD getcontext
422 UNIMPL BSD setcontext
423 UNIMPL BSD swapcontext
+424 MNOPROTO BSD swapoff
diff --git a/sys/ia64/ia32/syscalls.master b/sys/ia64/ia32/syscalls.master
index 66cc75f..d30b591 100644
--- a/sys/ia64/ia32/syscalls.master
+++ b/sys/ia64/ia32/syscalls.master
@@ -594,3 +594,4 @@
421 UNIMPL BSD getcontext
422 UNIMPL BSD setcontext
423 UNIMPL BSD swapcontext
+424 MNOPROTO BSD swapoff
diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c
index eeeb7d9..1ae2ee2 100644
--- a/sys/kern/subr_blist.c
+++ b/sys/kern/subr_blist.c
@@ -93,7 +93,7 @@
#include <stdlib.h>
#include <stdarg.h>
-#define malloc(a,b,c) malloc(a)
+#define malloc(a,b,c) calloc(a, 1)
#define free(a,b) free(a)
typedef unsigned int u_daddr_t;
@@ -116,6 +116,9 @@ static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
daddr_t radix, int skip, daddr_t blk);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
daddr_t skip, blist_t dest, daddr_t count);
+static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
+static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
+ daddr_t radix, int skip, daddr_t blk);
static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix,
int skip, daddr_t count);
#ifndef _KERNEL
@@ -165,13 +168,14 @@ blist_create(daddr_t blocks)
#if defined(BLIST_DEBUG)
printf(
- "BLIST representing %d blocks (%d MB of swap)"
- ", requiring %dK of ram\n",
- bl->bl_blocks,
- bl->bl_blocks * 4 / 1024,
- (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+ "BLIST representing %lld blocks (%lld MB of swap)"
+ ", requiring %lldK of ram\n",
+ (long long)bl->bl_blocks,
+ (long long)bl->bl_blocks * 4 / 1024,
+ (long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
);
- printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks);
+ printf("BLIST raw radix tree contains %lld records\n",
+ (long long)bl->bl_rootblks);
#endif
blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
@@ -226,6 +230,30 @@ blist_free(blist_t bl, daddr_t blkno, daddr_t count)
}
/*
+ * blist_fill() - mark a region in the block bitmap as off-limits
+ * to the allocator (i.e. allocate it), ignoring any
+ * existing allocations. Return the number of blocks
+ * actually filled that were free before the call.
+ */
+
+int
+blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
+{
+ int filled;
+
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ filled = blst_leaf_fill(bl->bl_root, blkno, count);
+ else
+ filled = blst_meta_fill(bl->bl_root, blkno, count,
+ bl->bl_radix, bl->bl_skip, 0);
+ bl->bl_free -= filled;
+ return filled;
+ } else
+ return 0;
+}
+
+/*
* blist_resize() - resize an existing radix tree to handle the
* specified number of blocks. This will reallocate
* the tree and transfer the previous bitmap to the new
@@ -507,9 +535,9 @@ blst_meta_free(
int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
#if 0
- printf("FREE (%x,%d) FROM (%x,%d)\n",
- freeBlk, count,
- blk, radix
+ printf("FREE (%llx,%lld) FROM (%llx,%lld)\n",
+ (long long)freeBlk, (long long)count,
+ (long long)blk, (long long)radix
);
#endif
@@ -679,6 +707,117 @@ static void blst_copy(
}
/*
+ * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
+ *
+ * This routine allocates all blocks in the specified range
+ * regardless of any existing allocations in that range. Returns
+ * the number of blocks allocated by the call.
+ */
+
+static int
+blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
+{
+ int n = blk & (BLIST_BMAP_RADIX - 1);
+ int nblks;
+ u_daddr_t mask, bitmap;
+
+ mask = ((u_daddr_t)-1 << n) &
+ ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+ /* Count the number of blocks we're about to allocate */
+ bitmap = scan->u.bmu_bitmap & mask;
+ for (nblks = 0; bitmap != 0; nblks++)
+ bitmap &= bitmap - 1;
+
+ scan->u.bmu_bitmap &= ~mask;
+ return nblks;
+}
+
+/*
+ * BLIST_META_FILL() - allocate specific blocks at a meta node
+ *
+ * This routine allocates the specified range of blocks,
+ * regardless of any existing allocations in the range. The
+ * range must be within the extent of this node. Returns the
+ * number of blocks allocated by the call.
+ */
+static int
+blst_meta_fill(
+ blmeta_t *scan,
+ daddr_t allocBlk,
+ daddr_t count,
+ daddr_t radix,
+ int skip,
+ daddr_t blk
+) {
+ int i;
+ int next_skip = (skip >> BLIST_META_RADIX_SHIFT);
+ int nblks = 0;
+
+ if (count == radix || scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case
+ */
+ nblks = scan->u.bmu_avail;
+ scan->u.bmu_avail = 0;
+ scan->bm_bighint = count;
+ return nblks;
+ }
+
+ if (scan->u.bmu_avail == radix) {
+ radix >>= BLIST_META_RADIX_SHIFT;
+
+ /*
+ * ALL-FREE special case, initialize sublevel
+ */
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+ scan[i].bm_bighint = BLIST_BMAP_RADIX;
+ } else {
+ scan[i].bm_bighint = radix;
+ scan[i].u.bmu_avail = radix;
+ }
+ }
+ } else {
+ radix >>= BLIST_META_RADIX_SHIFT;
+ }
+
+ if (count > radix)
+ panic("blist_meta_fill: allocation too large");
+
+ i = (allocBlk - blk) / radix;
+ blk += i * radix;
+ i = i * next_skip + 1;
+
+ while (i <= skip && blk < allocBlk + count) {
+ daddr_t v;
+
+ v = blk + radix - allocBlk;
+ if (v > count)
+ v = count;
+
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("blst_meta_fill: filling unexpected range");
+
+ if (next_skip == 1) {
+ nblks += blst_leaf_fill(&scan[i], allocBlk, v);
+ } else {
+ nblks += blst_meta_fill(&scan[i], allocBlk, v,
+ radix, next_skip - 1, blk);
+ }
+ count -= v;
+ allocBlk += v;
+ blk += radix;
+ i += next_skip;
+ }
+ scan->u.bmu_avail -= nblks;
+ return nblks;
+}
+
+/*
* BLST_RADIX_INIT() - initialize radix tree
*
* Initialize our meta structures and bitmaps and calculate the exact
@@ -768,41 +907,41 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
if (radix == BLIST_BMAP_RADIX) {
printf(
- "%*.*s(%04x,%d): bitmap %08x big=%d\n",
+ "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n",
tab, tab, "",
- blk, radix,
- scan->u.bmu_bitmap,
- scan->bm_bighint
+ (long long)blk, (long long)radix,
+ (long long)scan->u.bmu_bitmap,
+ (long long)scan->bm_bighint
);
return;
}
if (scan->u.bmu_avail == 0) {
printf(
- "%*.*s(%04x,%d) ALL ALLOCATED\n",
+ "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
tab, tab, "",
- blk,
- radix
+ (long long)blk,
+ (long long)radix
);
return;
}
if (scan->u.bmu_avail == radix) {
printf(
- "%*.*s(%04x,%d) ALL FREE\n",
+ "%*.*s(%08llx,%lld) ALL FREE\n",
tab, tab, "",
- blk,
- radix
+ (long long)blk,
+ (long long)radix
);
return;
}
printf(
- "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n",
+ "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
tab, tab, "",
- blk, radix,
- scan->u.bmu_avail,
- radix,
- scan->bm_bighint
+ (long long)blk, (long long)radix,
+ (long long)scan->u.bmu_avail,
+ (long long)radix,
+ (long long)scan->bm_bighint
);
radix >>= BLIST_META_RADIX_SHIFT;
@@ -812,9 +951,9 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
for (i = 1; i <= skip; i += next_skip) {
if (scan[i].bm_bighint == (daddr_t)-1) {
printf(
- "%*.*s(%04x,%d): Terminator\n",
+ "%*.*s(%08llx,%lld): Terminator\n",
tab, tab, "",
- blk, radix
+ (long long)blk, (long long)radix
);
lastState = 0;
break;
@@ -866,13 +1005,14 @@ main(int ac, char **av)
daddr_t count = 0;
- printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix);
+ printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
+ (long long)size, (long long)bl->bl_radix);
fflush(stdout);
if (fgets(buf, sizeof(buf), stdin) == NULL)
break;
switch(buf[0]) {
case 'r':
- if (sscanf(buf + 1, "%d", &count) == 1) {
+ if (sscanf(buf + 1, "%lld", &count) == 1) {
blist_resize(&bl, count, 1);
} else {
printf("?\n");
@@ -881,26 +1021,37 @@ main(int ac, char **av)
blist_print(bl);
break;
case 'a':
- if (sscanf(buf + 1, "%d", &count) == 1) {
+ if (sscanf(buf + 1, "%lld", &count) == 1) {
daddr_t blk = blist_alloc(bl, count);
- printf(" R=%04x\n", blk);
+ printf(" R=%08llx\n", (long long)blk);
} else {
printf("?\n");
}
break;
case 'f':
- if (sscanf(buf + 1, "%x %d", &da, &count) == 2) {
+ if (sscanf(buf + 1, "%llx %lld",
+ (long long *)&da, (long long *)&count) == 2) {
blist_free(bl, da, count);
} else {
printf("?\n");
}
break;
+ case 'l':
+ if (sscanf(buf + 1, "%llx %lld",
+ (long long *)&da, (long long *)&count) == 2) {
+ printf(" n=%d\n",
+ blist_fill(bl, da, count));
+ } else {
+ printf("?\n");
+ }
+ break;
case '?':
case 'h':
puts(
"p -print\n"
"a %d -allocate\n"
"f %x %d -free\n"
+ "l %x %d -fill\n"
"r %d -resize\n"
"h/? -help"
);
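The new blst_leaf_fill() above relies on two bit tricks: a mask built to cover bit positions [n, n + count) within the leaf bitmap, and a popcount loop that clears the lowest set bit on each pass. A stand-alone sketch with example values (not from the commit):

#include <stdio.h>

typedef unsigned int u_daddr_t;
#define BLIST_BMAP_RADIX        (sizeof(u_daddr_t) * 8)

int
main(void)
{
        u_daddr_t bitmap = 0xF0F0F0F0;  /* 1 bits mark free blocks */
        int n = 4, count = 8;           /* fill blocks 4..11 of this leaf */
        u_daddr_t mask, tmp;
        int nblks;

        /* Mask covering bit positions [n, n + count). */
        mask = ((u_daddr_t)-1 << n) &
            ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

        /* Kernighan's trick: each pass clears the lowest set bit. */
        tmp = bitmap & mask;
        for (nblks = 0; tmp != 0; nblks++)
                tmp &= tmp - 1;

        bitmap &= ~mask;                /* mark the whole range allocated */
        printf("mask=%08x nblks=%d bitmap=%08x\n", mask, nblks, bitmap);
        return (0);
}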
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index a41eb12..0144274 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -612,6 +612,7 @@
422 MSTD BSD { int setcontext(const struct __ucontext *ucp); }
423 MSTD BSD { int swapcontext(struct __ucontext *oucp, \
const struct __ucontext *ucp); }
+424 MSTD BSD { int swapoff(const char *name); }
; Please copy any additions and changes to the following compatibility tables:
; sys/ia64/ia32/syscalls.master (take a best guess)
diff --git a/sys/sys/blist.h b/sys/sys/blist.h
index fa4be7e..d426e48 100644
--- a/sys/sys/blist.h
+++ b/sys/sys/blist.h
@@ -9,6 +9,7 @@
* (void) blist_destroy(blist)
* blkno = blist_alloc(blist, count)
* (void) blist_free(blist, blkno, count)
+ * nblks = blist_fill(blist, blkno, count)
* (void) blist_resize(&blist, count, freeextra)
*
*
@@ -78,6 +79,7 @@ extern blist_t blist_create(daddr_t blocks);
extern void blist_destroy(blist_t blist);
extern daddr_t blist_alloc(blist_t blist, daddr_t count);
extern void blist_free(blist_t blist, daddr_t blkno, daddr_t count);
+extern int blist_fill(blist_t bl, daddr_t blkno, daddr_t count);
extern void blist_print(blist_t blist);
extern void blist_resize(blist_t *pblist, daddr_t count, int freenew);
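A minimal sketch of the extended API, including the new blist_fill(). The function name blist_demo is hypothetical, and kernel context is assumed (blist_create() uses kernel malloc), so this is illustrative rather than buildable as-is:

#include <sys/types.h>
#include <sys/blist.h>

static void
blist_demo(void)
{
        blist_t bl;
        daddr_t blk;
        int nblks;

        bl = blist_create(1024);        /* radix tree managing 1024 blocks */
        blk = blist_alloc(bl, 16);      /* allocate a contiguous 16-block run */
        blist_free(bl, blk, 16);        /* and release it */

        /*
         * Force blocks 0..63 into the allocated state regardless of
         * any existing allocations; the return value is how many of
         * them were free beforehand.
         */
        nblks = blist_fill(bl, 0, 64);

        blist_resize(&bl, 2048, 0);     /* grow; new space stays allocated */
        blist_destroy(bl);
}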
diff --git a/sys/sys/conf.h b/sys/sys/conf.h
index 1be8506..15108c2 100644
--- a/sys/sys/conf.h
+++ b/sys/sys/conf.h
@@ -274,6 +274,7 @@ struct swdevt {
};
#define SW_FREED 0x01
#define SW_SEQUENTIAL 0x02
+#define SW_CLOSING 0x04
#define sw_freed sw_flags /* XXX compat */
#ifdef _KERNEL
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index de203e2..2f43bc4 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -206,6 +206,8 @@ static __inline daddr_t swp_pager_getswapspace(int npages);
/*
* Metadata functions
*/
+static __inline struct swblock **
+ swp_pager_hash(vm_object_t object, vm_pindex_t index);
static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free_all(vm_object_t);
@@ -512,12 +514,22 @@ swp_pager_freeswapspace(blk, npages)
daddr_t blk;
int npages;
{
+ struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];
+
GIANT_REQUIRED;
+ /* per-swap area stats */
+ sp->sw_used -= npages;
+
+ /*
+ * If we are attempting to stop swapping on this device, we
+ * don't want to mark any blocks free lest they be reused.
+ */
+ if (sp->sw_flags & SW_CLOSING)
+ return;
+
blist_free(swapblist, blk, npages);
vm_swap_size += npages;
- /* per-swap area stats */
- swdevt[BLK2DEVIDX(blk)].sw_used -= npages;
swp_sizecheck();
}
@@ -1624,6 +1636,149 @@ swp_pager_async_iodone(bp)
splx(s);
}
+/*
+ * swap_pager_isswapped:
+ *
+ * Return 1 if at least one page in the given object is paged
+ * out to the given swap device.
+ *
+ * This routine may not block.
+ */
+int swap_pager_isswapped(vm_object_t object, int devidx) {
+ daddr_t index = 0;
+ int bcount;
+ int i;
+
+ for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
+ struct swblock *swap;
+
+ if ((swap = *swp_pager_hash(object, index)) != NULL) {
+ for (i = 0; i < SWAP_META_PAGES; ++i) {
+ daddr_t v = swap->swb_pages[i];
+ if (v != SWAPBLK_NONE &&
+ BLK2DEVIDX(v) == devidx)
+ return 1;
+ }
+ }
+
+ index += SWAP_META_PAGES;
+ if (index > 0x20000000)
+ panic("swap_pager_isswapped: failed to locate all swap meta blocks");
+ }
+ return 0;
+}
+
+/*
+ * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
+ *
+ * This routine dissociates the page at the given index within a
+ * swap block from its backing store, paging it in if necessary.
+ * If the page is paged in, it is placed in the inactive queue,
+ * since it had its backing store ripped out from under it.
+ * We also attempt to swap in all other pages in the swap block,
+ * we only guarantee that the one at the specified index is
+ * paged in.
+ *
+ * XXX - The code to page the whole block in doesn't work, so we
+ * revert to the one-by-one behavior for now. Sigh.
+ */
+static __inline void
+swp_pager_force_pagein(struct swblock *swap, int idx)
+{
+ vm_object_t object;
+ vm_page_t m;
+ vm_pindex_t pindex;
+
+ object = swap->swb_object;
+ pindex = swap->swb_index;
+
+ vm_object_pip_add(object, 1);
+ m = vm_page_grab(object, pindex + idx, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
+ if (m->valid == VM_PAGE_BITS_ALL) {
+ vm_object_pip_subtract(object, 1);
+ vm_page_lock_queues();
+ vm_page_activate(m);
+ vm_page_dirty(m);
+ vm_page_wakeup(m);
+ vm_page_unlock_queues();
+ vm_pager_page_unswapped(m);
+ return;
+ }
+
+ if (swap_pager_getpages(object, &m, 1, 0) !=
+ VM_PAGER_OK)
+ panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
+ vm_object_pip_subtract(object, 1);
+
+ vm_page_lock_queues();
+ vm_page_dirty(m);
+ vm_page_dontneed(m);
+ vm_page_wakeup(m);
+ vm_page_unlock_queues();
+ vm_pager_page_unswapped(m);
+}
+
+
+/*
+ * swap_pager_swapoff:
+ *
+ * Page in all of the pages that have been paged out to the
+ * given device. The corresponding blocks in the bitmap must be
+ * marked as allocated and the device must be flagged SW_CLOSING.
+ * There may be no processes swapped out to the device.
+ *
+ * The sw_used parameter points to the field in the swdev structure
+ * that contains a count of the number of blocks still allocated
+ * on the device. If we encounter objects with a nonzero pip count
+ * in our scan, we use this number to determine if we're really done.
+ *
+ * This routine may block.
+ */
+void
+swap_pager_swapoff(int devidx, int *sw_used)
+{
+ struct swblock **pswap;
+ struct swblock *swap;
+ vm_object_t waitobj;
+ daddr_t v;
+ int i, j;
+
+ GIANT_REQUIRED;
+
+full_rescan:
+ waitobj = NULL;
+ for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
+restart:
+ pswap = &swhash[i];
+ while ((swap = *pswap) != NULL) {
+ for (j = 0; j < SWAP_META_PAGES; ++j) {
+ v = swap->swb_pages[j];
+ if (v != SWAPBLK_NONE &&
+ BLK2DEVIDX(v) == devidx)
+ break;
+ }
+ if (j < SWAP_META_PAGES) {
+ swp_pager_force_pagein(swap, j);
+ goto restart;
+ } else if (swap->swb_object->paging_in_progress) {
+ if (!waitobj)
+ waitobj = swap->swb_object;
+ }
+ pswap = &swap->swb_hnext;
+ }
+ }
+ if (waitobj && *sw_used) {
+ /*
+ * We wait on an arbitrary object to clock our rescans
+ * to the rate of paging completion.
+ */
+ vm_object_pip_wait(waitobj, "swpoff");
+ goto full_rescan;
+ }
+ if (*sw_used)
+ panic("swapoff: failed to locate %d swap blocks", *sw_used);
+}
+
/************************************************************************
* SWAP META DATA *
************************************************************************
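The scan loop in swap_pager_swapoff() above walks each hash chain through a pointer to the previous link field (pswap = &swap->swb_hnext). That pointer-to-pointer idiom is what lets a singly linked chain be edited, or safely re-walked after a node disappears, without losing the walker's position. A self-contained sketch of the idiom (not from the commit):

#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *next;
        int val;
};

static void
drop_matching(struct node **head, int val)
{
        struct node **pnp = head;
        struct node *np;

        while ((np = *pnp) != NULL) {
                if (np->val == val) {
                        *pnp = np->next;        /* unlink; *pnp now names the successor */
                        free(np);
                } else
                        pnp = &np->next;        /* advance to the next link field */
        }
}

int
main(void)
{
        struct node *head = NULL, *np;
        int i;

        for (i = 0; i < 6; i++) {       /* push 0,1,0,1,0,1 onto the front */
                np = malloc(sizeof(*np));
                np->val = i & 1;
                np->next = head;
                head = np;
        }
        drop_matching(&head, 0);
        for (np = head; np != NULL; np = np->next)
                printf("%d ", np->val); /* prints: 1 1 1 */
        printf("\n");
        return (0);
}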
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
index 97d50d3..4402284 100644
--- a/sys/vm/swap_pager.h
+++ b/sys/vm/swap_pager.h
@@ -83,9 +83,11 @@ extern struct pagerlst swap_pager_un_object_list;
extern int swap_pager_full;
extern struct blist *swapblist;
extern struct uma_zone *swap_zone;
+extern int nswap_lowat, nswap_hiwat;
void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
+void swap_pager_swapoff(int devidx, int *sw_used);
int swap_pager_swp_alloc(vm_object_t, int);
void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int);
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 6ac6a96..e38b3d3 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -91,6 +91,7 @@
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
#include <sys/user.h>
@@ -324,6 +325,45 @@ vm_proc_swapin(struct proc *p)
up = (vm_offset_t)p->p_uarea;
pmap_qenter(up, ma, UAREA_PAGES);
}
+
+/*
+ * Swap in the UAREAs of all processes swapped out to the given device.
+ * The pages in the UAREA are marked dirty and their swap metadata is freed.
+ */
+void
+vm_proc_swapin_all(int devidx)
+{
+ struct proc *p;
+ vm_object_t object;
+ vm_page_t m;
+
+retry:
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ mtx_lock_spin(&sched_lock);
+
+ object = p->p_upages_obj;
+ if (object != NULL &&
+ swap_pager_isswapped(p->p_upages_obj, devidx)) {
+ sx_sunlock(&allproc_lock);
+ faultin(p);
+ mtx_unlock_spin(&sched_lock);
+ PROC_UNLOCK(p);
+ vm_page_lock_queues();
+ TAILQ_FOREACH(m, &object->memq, listq)
+ vm_page_dirty(m);
+ vm_page_unlock_queues();
+ swap_pager_freespace(object, 0,
+ object->un_pager.swp.swp_bcount);
+ goto retry;
+ }
+
+ mtx_unlock_spin(&sched_lock);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+}
#endif
/*
diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h
index c909c68..d68ec79 100644
--- a/sys/vm/vm_pageout.h
+++ b/sys/vm/vm_pageout.h
@@ -104,6 +104,12 @@ extern void pagedaemon_wakeup(void);
extern void vm_wait(void);
extern void vm_waitpfault(void);
+/* XXX This is probably misplaced. */
+#ifndef NO_SWAPPING
+void vm_proc_swapin_all(int);
+int swap_pager_isswapped(vm_object_t, int);
+#endif /* !NO_SWAPPING */
+
#ifdef _KERNEL
void vm_pageout_page(vm_page_t, vm_object_t);
void vm_pageout_cluster(vm_page_t, vm_object_t);
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c
index 1781182..0ec5220 100644
--- a/sys/vm/vm_swap.c
+++ b/sys/vm/vm_swap.c
@@ -36,6 +36,7 @@
#include "opt_mac.h"
#include "opt_swap.h"
+#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -58,6 +59,7 @@
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>
@@ -73,6 +75,8 @@ struct swdevt *swdevt = should_be_malloced;
static int nswap; /* first block after the interleaved devs */
int nswdev = NSWAPDEV;
int vm_swap_size;
+static int swdev_syscall_active = 0; /* serialize swap(on|off) */
+
static int swapdev_strategy(struct vop_strategy_args *ap);
struct vnode *swapdev_vp;
@@ -165,11 +169,12 @@ swapdev_strategy(ap)
/*
* Create a special vnode op vector for swapdev_vp - we only use
- * VOP_STRATEGY(), everything else returns an error.
+ * VOP_STRATEGY() and reclaim; everything else returns an error.
*/
vop_t **swapdev_vnodeop_p;
static struct vnodeopv_entry_desc swapdev_vnodeop_entries[] = {
{ &vop_default_desc, (vop_t *) vop_defaultop },
+ { &vop_reclaim_desc, (vop_t *) vop_null },
{ &vop_strategy_desc, (vop_t *) swapdev_strategy },
{ NULL, NULL }
};
@@ -208,19 +213,23 @@ swapon(td, uap)
if (error)
goto done2;
+ while (swdev_syscall_active)
+ tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
+ swdev_syscall_active = 1;
+
/*
* Swap metadata may not fit in the KVM if we have physical
* memory of >1GB.
*/
if (swap_zone == NULL) {
error = ENOMEM;
- goto done2;
+ goto done;
}
NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
error = namei(&nd);
if (error)
- goto done2;
+ goto done;
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
@@ -239,6 +248,9 @@ swapon(td, uap)
if (error)
vrele(vp);
+done:
+ swdev_syscall_active = 0;
+ wakeup_one(&swdev_syscall_active);
done2:
mtx_unlock(&Giant);
return (error);
@@ -252,8 +264,6 @@ done2:
*
* The new swap code uses page-sized blocks. The old swap code used
* DEV_BSIZE'd chunks.
- *
- * XXX locking when multiple swapon's run in parallel
*/
int
swaponvp(td, vp, dev, nblks)
@@ -330,7 +340,7 @@ swaponvp(td, vp, dev, nblks)
sp->sw_vp = vp;
sp->sw_dev = dev2udev(dev);
sp->sw_device = dev;
- sp->sw_flags |= SW_FREED;
+ sp->sw_flags = SW_FREED;
sp->sw_nblks = nblks;
sp->sw_used = 0;
@@ -356,9 +366,127 @@ swaponvp(td, vp, dev, nblks)
vm_swap_size += blk;
}
+ swap_pager_full = 0;
+
return (0);
}
+/*
+ * SYSCALL: swapoff(devname)
+ *
+ * Disable swapping on the given device.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct swapoff_args {
+ char *name;
+};
+#endif
+
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+swapoff(td, uap)
+ struct thread *td;
+ struct swapoff_args *uap;
+{
+ struct vnode *vp;
+ struct nameidata nd;
+ struct swdevt *sp;
+ swblk_t dvbase, vsbase;
+ u_long nblks, aligned_nblks, blk;
+ int error, index;
+
+ mtx_lock(&Giant);
+
+ error = suser(td);
+ if (error)
+ goto done2;
+
+ while (swdev_syscall_active)
+ tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
+ swdev_syscall_active = 1;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td);
+ error = namei(&nd);
+ if (error)
+ goto done;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+
+ for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) {
+ if (sp->sw_vp == vp)
+ goto found;
+ }
+ error = EINVAL;
+ goto done;
+found:
+ nblks = sp->sw_nblks;
+
+ /*
+ * We can turn off this swap device safely only if the
+ * available virtual memory in the system will fit the amount
+ * of data we will have to page back in, plus an epsilon so
+ * the system doesn't become critically low on swap space.
+ */
+ if (cnt.v_free_count + cnt.v_cache_count + vm_swap_size <
+ nblks + nswap_lowat) {
+ error = ENOMEM;
+ goto done;
+ }
+
+ /*
+ * Prevent further allocations on this device.
+ */
+ sp->sw_flags |= SW_CLOSING;
+ for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
+ blk = min(nblks - dvbase, dmmax);
+ vsbase = index * dmmax + dvbase * nswdev;
+ vm_swap_size -= blist_fill(swapblist, vsbase, blk);
+ }
+
+ /*
+ * Page in the contents of the device and close it.
+ */
+#ifndef NO_SWAPPING
+ vm_proc_swapin_all(index);
+#endif /* !NO_SWAPPING */
+ swap_pager_swapoff(index, &sp->sw_used);
+
+ VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td);
+ vrele(vp);
+ sp->sw_vp = NULL;
+
+ /*
+ * Resize the bitmap based on the new largest swap device,
+ * or free the bitmap if there are no more devices.
+ */
+ for (sp = swdevt, nblks = 0; sp < swdevt + nswdev; sp++) {
+ if (sp->sw_vp == NULL)
+ continue;
+ nblks = max(nblks, sp->sw_nblks);
+ }
+
+ aligned_nblks = (nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1);
+ nswap = aligned_nblks * nswdev;
+
+ if (nswap == 0) {
+ blist_destroy(swapblist);
+ swapblist = NULL;
+ vrele(swapdev_vp);
+ swapdev_vp = NULL;
+ } else
+ blist_resize(&swapblist, nswap, 0);
+
+done:
+ swdev_syscall_active = 0;
+ wakeup_one(&swdev_syscall_active);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
static int
sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
{
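The blist_fill() loop in swapoff() above converts device-local block offsets to global interleaved addresses with vsbase = index * dmmax + dvbase * nswdev: swap is striped across the nswdev devices in dmmax-block chunks, and the loop starts at dvbase = dmmax, so the first stripe of the device is never touched. A stand-alone sketch with made-up sizes (not from the commit) that prints the mapping:

#include <stdio.h>

int
main(void)
{
        int dmmax = 32;                 /* example stripe size, in blocks */
        int nswdev = 2;                 /* two interleaved swap devices */
        int index = 1;                  /* device being swapped off */
        int nblks = 128;                /* its size in blocks */
        int dvbase, blk, vsbase;

        for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) {
                /* last stripe may be short */
                blk = nblks - dvbase < dmmax ? nblks - dvbase : dmmax;
                vsbase = index * dmmax + dvbase * nswdev;
                printf("local %3d..%3d -> global %3d..%3d\n",
                    dvbase, dvbase + blk - 1, vsbase, vsbase + blk - 1);
        }
        return (0);
}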