diff options
-rw-r--r-- | include/unistd.h | 1
-rw-r--r-- | lib/libc/sys/Makefile.inc | 1
-rw-r--r-- | lib/libc/sys/swapon.2 | 63
-rw-r--r-- | sbin/swapon/Makefile | 2
-rw-r--r-- | sbin/swapon/swapon.8 | 37
-rw-r--r-- | sbin/swapon/swapon.c | 43
-rw-r--r-- | sys/amd64/ia32/syscalls.master | 1
-rw-r--r-- | sys/compat/freebsd32/syscalls.master | 1
-rw-r--r-- | sys/ia64/ia32/syscalls.master | 1
-rw-r--r-- | sys/kern/subr_blist.c | 215
-rw-r--r-- | sys/kern/syscalls.master | 1
-rw-r--r-- | sys/sys/blist.h | 2
-rw-r--r-- | sys/sys/conf.h | 1
-rw-r--r-- | sys/sys/linedisc.h | 1
-rw-r--r-- | sys/vm/swap_pager.c | 159
-rw-r--r-- | sys/vm/swap_pager.h | 2
-rw-r--r-- | sys/vm/vm_glue.c | 40
-rw-r--r-- | sys/vm/vm_pageout.h | 6
-rw-r--r-- | sys/vm/vm_swap.c | 140
19 files changed, 633 insertions, 84 deletions
diff --git a/include/unistd.h b/include/unistd.h index 160dada..68e456f 100644 --- a/include/unistd.h +++ b/include/unistd.h @@ -522,6 +522,7 @@ int setruid(uid_t); void setusershell(void); int strtofflags(char **, u_long *, u_long *); int swapon(const char *); +int swapoff(const char *); int syscall(int, ...); off_t __syscall(quad_t, ...); int ttyslot(void); diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc index 4fe3d7e..257ef7a 100644 --- a/lib/libc/sys/Makefile.inc +++ b/lib/libc/sys/Makefile.inc @@ -131,6 +131,7 @@ MLINKS+=shmat.2 shmdt.2 MLINKS+=stat.2 fstat.2 stat.2 lstat.2 MLINKS+=statfs.2 fstatfs.2 MLINKS+=syscall.2 __syscall.2 +MLINKS+=swapon.2 swapoff.2 MLINKS+=truncate.2 ftruncate.2 MLINKS+=utimes.2 futimes.2 utimes.2 lutimes.2 MLINKS+=wait.2 wait3.2 wait.2 wait4.2 wait.2 waitpid.2 diff --git a/lib/libc/sys/swapon.2 b/lib/libc/sys/swapon.2 index a852ba0..f4f9092 100644 --- a/lib/libc/sys/swapon.2 +++ b/lib/libc/sys/swapon.2 @@ -36,14 +36,16 @@ .Dt SWAPON 2 .Os .Sh NAME -.Nm swapon -.Nd add a swap device for interleaved paging/swapping +.Nm swapon , swapoff +.Nd control devices for interleaved paging/swapping .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In unistd.h .Ft int .Fn swapon "const char *special" +.Ft int +.Fn swapoff "const char *special" .Sh DESCRIPTION .Fn Swapon makes the block device @@ -55,13 +57,22 @@ configuration time. The size of the swap area on .Fa special is calculated at the time the device is first made available for swapping. +.Pp +The +.Fn swapoff +system call disables paging and swapping on the given device. +All associated swap metadata are deallocated, and the device +is made available for other purposes. .Sh RETURN VALUES If an error has occurred, a value of -1 is returned and .Va errno is set to indicate the error. .Sh ERRORS -.Fn Swapon -succeeds unless: +Both +.Fn swapon +and +.Fn swapoff +can fail if: .Bl -tag -width Er .It Bq Er ENOTDIR A component of the path prefix is not a directory. 
@@ -76,6 +87,19 @@ Search permission is denied for a component of the path prefix. Too many symbolic links were encountered in translating the pathname. .It Bq Er EPERM The caller is not the super-user. +.It Bq Er EFAULT +.Fa Special +points outside the process's allocated address space. +.El +.Pp +Additionally, +.Fn swapon +can fail for the following reasons: +.Bl -tag -width Er +.It Bq Er EINVAL +The system has reached the boot-time limit on the number of +swap devices, +.Va vm.nswapdev . .It Bq Er ENOTBLK .Fa Special is not a block device. @@ -84,11 +108,6 @@ The device specified by .Fa special has already been made available for swapping -.It Bq Er EINVAL -The device configured by -.Fa special -was not -configured into the system as a swap device. .It Bq Er ENXIO The major device number of .Fa special @@ -96,20 +115,28 @@ is out of range (this indicates no device driver exists for the associated hardware). .It Bq Er EIO An I/O error occurred while opening the swap device. -.It Bq Er EFAULT -.Fa Special -points outside the process's allocated address space. +.El +.Pp +Lastly, +.Fn swapoff +can fail if: +.Bl -tag -width Er +.It Bq Er EINVAL +The system is not currently swapping to +.Fa special . +.It Bq Er ENOMEM +Not enough virtual memory is available to safely disable +paging and swapping to the given device. .El .Sh SEE ALSO .Xr config 8 , -.Xr swapon 8 -.Sh BUGS -There is no way to stop swapping on a disk so that the pack may be -dismounted. -.Pp -This call will be upgraded in future versions of the system. +.Xr swapon 8 , +.Xr sysctl 8 .Sh HISTORY The .Fn swapon function call appeared in .Bx 4.0 . +.Fn Swapoff +appeared in +.Fx 5.0 . 
diff --git a/sbin/swapon/Makefile b/sbin/swapon/Makefile index be803b2..f052567 100644 --- a/sbin/swapon/Makefile +++ b/sbin/swapon/Makefile @@ -3,5 +3,7 @@ PROG= swapon MAN= swapon.8 +LINKS= ${BINDIR}/swapon ${BINDIR}/swapoff +MLINKS= swapon.8 swapoff.8 .include <bsd.prog.mk> diff --git a/sbin/swapon/swapon.8 b/sbin/swapon/swapon.8 index ce23b38..edda998 100644 --- a/sbin/swapon/swapon.8 +++ b/sbin/swapon/swapon.8 @@ -36,39 +36,46 @@ .Dt SWAPON 8 .Os .Sh NAME -.Nm swapon -.Nd "specify additional device for paging and swapping" +.Nm swapon , swapoff +.Nd "specify devices for paging and swapping" .Sh SYNOPSIS -.Nm +.Nm swap[on|off] .Fl a -.Nm +.Nm swap[on|off] .Ar special_file ... .Sh DESCRIPTION The -.Nm +.Nm swapon utility is used to specify additional devices on which paging and swapping are to take place. The system begins by swapping and paging on only a single device so that only one disk is required at bootstrap time. Calls to -.Nm +.Nm swapon normally occur in the system multi-user initialization file .Pa /etc/rc making all swap devices available, so that the paging and swapping activity is interleaved across several devices. .Pp +The +.Nm swapoff +utility disables paging and swapping on a device. +Calls to +.Nm swapoff +succeed only if disabling the device would leave enough +remaining virtual memory to accommodate all running programs. +.Pp Normally, the first form is used: .Bl -tag -width indent .It Fl a All devices marked as ``sw'' swap devices in .Pa /etc/fstab -are made available unless their ``noauto'' option is also set. +are added to or removed from the pool of available swap +unless their ``noauto'' option is also set. .El .Pp -The second form gives individual block devices as given -in the system swap configuration table. The call makes only this space -available to the system for swap allocation. +The second form is used to configure or disable individual devices. 
.Sh SEE ALSO .Xr swapon 2 , .Xr fstab 5 , @@ -85,12 +92,12 @@ memory disk devices .It Pa /etc/fstab ASCII file system description table .El -.Sh BUGS -There is no way to stop paging and swapping on a device. -It is therefore not possible to dismount swap devices which are -mounted during system operation. .Sh HISTORY The -.Nm +.Nm swapon utility appeared in .Bx 4.0 . +The +.Nm swapoff +utility appeared in +.Fx 5.0 . diff --git a/sbin/swapon/swapon.c b/sbin/swapon/swapon.c index 69f4e73..51042bc 100644 --- a/sbin/swapon/swapon.c +++ b/sbin/swapon/swapon.c @@ -53,8 +53,9 @@ static const char rcsid[] = #include <string.h> #include <unistd.h> -static void usage(void); -int add(char *name, int ignoreebusy); +static void usage(const char *); +static int is_swapoff(const char *); +int swap_on_off(char *name, int ignoreebusy, int do_swapoff); int main(int argc, char **argv) @@ -62,6 +63,10 @@ main(int argc, char **argv) struct fstab *fsp; int stat; int ch, doall; + int do_swapoff; + char *pname = argv[0]; + + do_swapoff = is_swapoff(pname); doall = 0; while ((ch = getopt(argc, argv, "a")) != -1) @@ -71,7 +76,7 @@ main(int argc, char **argv) break; case '?': default: - usage(); + usage(pname); } argv += optind; @@ -82,23 +87,24 @@ main(int argc, char **argv) continue; if (strstr(fsp->fs_mntops, "noauto")) continue; - if (add(fsp->fs_spec, 1)) + if (swap_on_off(fsp->fs_spec, 1, do_swapoff)) stat = 1; else - printf("swapon: adding %s as swap device\n", + printf("%s: %sing %s as swap device\n", + pname, do_swapoff ? "remov" : "add", fsp->fs_spec); } else if (!*argv) - usage(); + usage(pname); for (; *argv; ++argv) - stat |= add(*argv, 0); + stat |= swap_on_off(*argv, 0, do_swapoff); exit(stat); } int -add(char *name, int ignoreebusy) +swap_on_off(char *name, int ignoreebusy, int do_swapoff) { - if (swapon(name) == -1) { + if ((do_swapoff ? 
swapoff(name) : swapon(name)) == -1) { switch (errno) { case EBUSY: if (!ignoreebusy) @@ -114,8 +120,23 @@ add(char *name, int ignoreebusy) } static void -usage() +usage(const char *pname) { - fprintf(stderr, "usage: swapon [-a] [special_file ...]\n"); + fprintf(stderr, "usage: %s [-a] [special_file ...]\n", pname); exit(1); } + +static int +is_swapoff(const char *s) +{ + const char *u; + + if ((u = strrchr(s, '/')) != NULL) + ++u; + else + u = s; + if (strcmp(u, "swapoff") == 0) + return 1; + else + return 0; +} diff --git a/sys/amd64/ia32/syscalls.master b/sys/amd64/ia32/syscalls.master index 66cc75f..d30b591 100644 --- a/sys/amd64/ia32/syscalls.master +++ b/sys/amd64/ia32/syscalls.master @@ -594,3 +594,4 @@ 421 UNIMPL BSD getcontext 422 UNIMPL BSD setcontext 423 UNIMPL BSD swapcontext +424 MNOPROTO BSD swapoff diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index 66cc75f..d30b591 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -594,3 +594,4 @@ 421 UNIMPL BSD getcontext 422 UNIMPL BSD setcontext 423 UNIMPL BSD swapcontext +424 MNOPROTO BSD swapoff diff --git a/sys/ia64/ia32/syscalls.master b/sys/ia64/ia32/syscalls.master index 66cc75f..d30b591 100644 --- a/sys/ia64/ia32/syscalls.master +++ b/sys/ia64/ia32/syscalls.master @@ -594,3 +594,4 @@ 421 UNIMPL BSD getcontext 422 UNIMPL BSD setcontext 423 UNIMPL BSD swapcontext +424 MNOPROTO BSD swapoff diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c index eeeb7d9..1ae2ee2 100644 --- a/sys/kern/subr_blist.c +++ b/sys/kern/subr_blist.c @@ -93,7 +93,7 @@ #include <stdlib.h> #include <stdarg.h> -#define malloc(a,b,c) malloc(a) +#define malloc(a,b,c) calloc(a, 1) #define free(a,b) free(a) typedef unsigned int u_daddr_t; @@ -116,6 +116,9 @@ static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix, int skip, daddr_t blk); static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t 
skip, blist_t dest, daddr_t count); +static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); +static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, + daddr_t radix, int skip, daddr_t blk); static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count); #ifndef _KERNEL @@ -165,13 +168,14 @@ blist_create(daddr_t blocks) #if defined(BLIST_DEBUG) printf( - "BLIST representing %d blocks (%d MB of swap)" - ", requiring %dK of ram\n", - bl->bl_blocks, - bl->bl_blocks * 4 / 1024, - (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 + "BLIST representing %lld blocks (%lld MB of swap)" + ", requiring %lldK of ram\n", + (long long)bl->bl_blocks, + (long long)bl->bl_blocks * 4 / 1024, + (long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 ); - printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks); + printf("BLIST raw radix tree contains %lld records\n", + (long long)bl->bl_rootblks); #endif blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks); @@ -226,6 +230,30 @@ blist_free(blist_t bl, daddr_t blkno, daddr_t count) } /* + * blist_fill() - mark a region in the block bitmap as off-limits + * to the allocator (i.e. allocate it), ignoring any + * existing allocations. Return the number of blocks + * actually filled that were free before the call. + */ + +int +blist_fill(blist_t bl, daddr_t blkno, daddr_t count) +{ + int filled; + + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + filled = blst_leaf_fill(bl->bl_root, blkno, count); + else + filled = blst_meta_fill(bl->bl_root, blkno, count, + bl->bl_radix, bl->bl_skip, 0); + bl->bl_free -= filled; + return filled; + } else + return 0; +} + +/* * blist_resize() - resize an existing radix tree to handle the * specified number of blocks. 
This will reallocate * the tree and transfer the previous bitmap to the new @@ -507,9 +535,9 @@ blst_meta_free( int next_skip = (skip >> BLIST_META_RADIX_SHIFT); #if 0 - printf("FREE (%x,%d) FROM (%x,%d)\n", - freeBlk, count, - blk, radix + printf("FREE (%llx,%lld) FROM (%llx,%lld)\n", + (long long)freeBlk, (long long)count, + (long long)blk, (long long)radix ); #endif @@ -679,6 +707,117 @@ static void blst_copy( } /* + * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap + * + * This routine allocates all blocks in the specified range + * regardless of any existing allocations in that range. Returns + * the number of blocks allocated by the call. + */ + +static int +blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) +{ + int n = blk & (BLIST_BMAP_RADIX - 1); + int nblks; + u_daddr_t mask, bitmap; + + mask = ((u_daddr_t)-1 << n) & + ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); + + /* Count the number of blocks we're about to allocate */ + bitmap = scan->u.bmu_bitmap & mask; + for (nblks = 0; bitmap != 0; nblks++) + bitmap &= bitmap - 1; + + scan->u.bmu_bitmap &= ~mask; + return nblks; +} + +/* + * BLIST_META_FILL() - allocate specific blocks at a meta node + * + * This routine allocates the specified range of blocks, + * regardless of any existing allocations in the range. The + * range must be within the extent of this node. Returns the + * number of blocks allocated by the call. 
+ */ +static int +blst_meta_fill( + blmeta_t *scan, + daddr_t allocBlk, + daddr_t count, + daddr_t radix, + int skip, + daddr_t blk +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + int nblks = 0; + + if (count == radix || scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case + */ + nblks = scan->u.bmu_avail; + scan->u.bmu_avail = 0; + scan->bm_bighint = count; + return nblks; + } + + if (scan->u.bmu_avail == radix) { + radix >>= BLIST_META_RADIX_SHIFT; + + /* + * ALL-FREE special case, initialize sublevel + */ + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = (u_daddr_t)-1; + scan[i].bm_bighint = BLIST_BMAP_RADIX; + } else { + scan[i].bm_bighint = radix; + scan[i].u.bmu_avail = radix; + } + } + } else { + radix >>= BLIST_META_RADIX_SHIFT; + } + + if (count > radix) + panic("blist_meta_fill: allocation too large"); + + i = (allocBlk - blk) / radix; + blk += i * radix; + i = i * next_skip + 1; + + while (i <= skip && blk < allocBlk + count) { + daddr_t v; + + v = blk + radix - allocBlk; + if (v > count) + v = count; + + if (scan->bm_bighint == (daddr_t)-1) + panic("blst_meta_fill: filling unexpected range"); + + if (next_skip == 1) { + nblks += blst_leaf_fill(&scan[i], allocBlk, v); + } else { + nblks += blst_meta_fill(&scan[i], allocBlk, v, + radix, next_skip - 1, blk); + } + count -= v; + allocBlk += v; + blk += radix; + i += next_skip; + } + scan->u.bmu_avail -= nblks; + return nblks; +} + +/* * BLST_RADIX_INIT() - initialize radix tree * * Initialize our meta structures and bitmaps and calculate the exact @@ -768,41 +907,41 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) if (radix == BLIST_BMAP_RADIX) { printf( - "%*.*s(%04x,%d): bitmap %08x big=%d\n", + "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n", tab, tab, "", - blk, radix, - scan->u.bmu_bitmap, - scan->bm_bighint + (long long)blk, (long long)radix, + (long 
long)scan->u.bmu_bitmap, + (long long)scan->bm_bighint ); return; } if (scan->u.bmu_avail == 0) { printf( - "%*.*s(%04x,%d) ALL ALLOCATED\n", + "%*.*s(%08llx,%lld) ALL ALLOCATED\n", tab, tab, "", - blk, - radix + (long long)blk, + (long long)radix ); return; } if (scan->u.bmu_avail == radix) { printf( - "%*.*s(%04x,%d) ALL FREE\n", + "%*.*s(%08llx,%lld) ALL FREE\n", tab, tab, "", - blk, - radix + (long long)blk, + (long long)radix ); return; } printf( - "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n", + "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n", tab, tab, "", - blk, radix, - scan->u.bmu_avail, - radix, - scan->bm_bighint + (long long)blk, (long long)radix, + (long long)scan->u.bmu_avail, + (long long)radix, + (long long)scan->bm_bighint ); radix >>= BLIST_META_RADIX_SHIFT; @@ -812,9 +951,9 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) for (i = 1; i <= skip; i += next_skip) { if (scan[i].bm_bighint == (daddr_t)-1) { printf( - "%*.*s(%04x,%d): Terminator\n", + "%*.*s(%08llx,%lld): Terminator\n", tab, tab, "", - blk, radix + (long long)blk, (long long)radix ); lastState = 0; break; @@ -866,13 +1005,14 @@ main(int ac, char **av) daddr_t count = 0; - printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix); + printf("%lld/%lld/%lld> ", (long long)bl->bl_free, + (long long)size, (long long)bl->bl_radix); fflush(stdout); if (fgets(buf, sizeof(buf), stdin) == NULL) break; switch(buf[0]) { case 'r': - if (sscanf(buf + 1, "%d", &count) == 1) { + if (sscanf(buf + 1, "%lld", &count) == 1) { blist_resize(&bl, count, 1); } else { printf("?\n"); @@ -881,26 +1021,37 @@ main(int ac, char **av) blist_print(bl); break; case 'a': - if (sscanf(buf + 1, "%d", &count) == 1) { + if (sscanf(buf + 1, "%lld", &count) == 1) { daddr_t blk = blist_alloc(bl, count); - printf(" R=%04x\n", blk); + printf(" R=%08llx\n", (long long)blk); } else { printf("?\n"); } break; case 'f': - if (sscanf(buf + 1, "%x %d", &da, &count) == 2) { + if (sscanf(buf + 1, 
"%llx %lld", + (long long *)&da, (long long *)&count) == 2) { blist_free(bl, da, count); } else { printf("?\n"); } break; + case 'l': + if (sscanf(buf + 1, "%llx %lld", + (long long *)&da, (long long *)&count) == 2) { + printf(" n=%d\n", + blist_fill(bl, da, count)); + } else { + printf("?\n"); + } + break; case '?': case 'h': puts( "p -print\n" "a %d -allocate\n" "f %x %d -free\n" + "l %x %d -fill\n" "r %d -resize\n" "h/? -help" ); diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index a41eb12..0144274 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -612,6 +612,7 @@ 422 MSTD BSD { int setcontext(const struct __ucontext *ucp); } 423 MSTD BSD { int swapcontext(struct __ucontext *oucp, \ const struct __ucontext *ucp); } +424 MSTD BSD { int swapoff(const char *name); } ; Please copy any additions and changes to the following compatability tables: ; sys/ia64/ia32/syscalls.master (take a best guess) diff --git a/sys/sys/blist.h b/sys/sys/blist.h index fa4be7e..d426e48 100644 --- a/sys/sys/blist.h +++ b/sys/sys/blist.h @@ -9,6 +9,7 @@ * (void) blist_destroy(blist) * blkno = blist_alloc(blist, count) * (void) blist_free(blist, blkno, count) + * nblks = blist_fill(blist, blkno, count) * (void) blist_resize(&blist, count, freeextra) * * @@ -78,6 +79,7 @@ extern blist_t blist_create(daddr_t blocks); extern void blist_destroy(blist_t blist); extern daddr_t blist_alloc(blist_t blist, daddr_t count); extern void blist_free(blist_t blist, daddr_t blkno, daddr_t count); +extern int blist_fill(blist_t bl, daddr_t blkno, daddr_t count); extern void blist_print(blist_t blist); extern void blist_resize(blist_t *pblist, daddr_t count, int freenew); diff --git a/sys/sys/conf.h b/sys/sys/conf.h index 1be8506..15108c2 100644 --- a/sys/sys/conf.h +++ b/sys/sys/conf.h @@ -274,6 +274,7 @@ struct swdevt { }; #define SW_FREED 0x01 #define SW_SEQUENTIAL 0x02 +#define SW_CLOSING 0x04 #define sw_freed sw_flags /* XXX compat */ #ifdef _KERNEL diff --git 
a/sys/sys/linedisc.h b/sys/sys/linedisc.h index 1be8506..15108c2 100644 --- a/sys/sys/linedisc.h +++ b/sys/sys/linedisc.h @@ -274,6 +274,7 @@ struct swdevt { }; #define SW_FREED 0x01 #define SW_SEQUENTIAL 0x02 +#define SW_CLOSING 0x04 #define sw_freed sw_flags /* XXX compat */ #ifdef _KERNEL diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index de203e2..2f43bc4 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -206,6 +206,8 @@ static __inline daddr_t swp_pager_getswapspace(int npages); /* * Metadata functions */ +static __inline struct swblock ** + swp_pager_hash(vm_object_t object, vm_pindex_t index); static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free_all(vm_object_t); @@ -512,12 +514,22 @@ swp_pager_freeswapspace(blk, npages) daddr_t blk; int npages; { + struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)]; + GIANT_REQUIRED; + /* per-swap area stats */ + sp->sw_used -= npages; + + /* + * If we are attempting to stop swapping on this device, we + * don't want to mark any blocks free lest they be reused. + */ + if (sp->sw_flags & SW_CLOSING) + return; + blist_free(swapblist, blk, npages); vm_swap_size += npages; - /* per-swap area stats */ - swdevt[BLK2DEVIDX(blk)].sw_used -= npages; swp_sizecheck(); } @@ -1624,6 +1636,149 @@ swp_pager_async_iodone(bp) splx(s); } +/* + * swap_pager_isswapped: + * + * Return 1 if at least one page in the given object is paged + * out to the given swap device. + * + * This routine may not block. 
+ */ +int swap_pager_isswapped(vm_object_t object, int devidx) { + daddr_t index = 0; + int bcount; + int i; + + for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { + struct swblock *swap; + + if ((swap = *swp_pager_hash(object, index)) != NULL) { + for (i = 0; i < SWAP_META_PAGES; ++i) { + daddr_t v = swap->swb_pages[i]; + if (v != SWAPBLK_NONE && + BLK2DEVIDX(v) == devidx) + return 1; + } + } + + index += SWAP_META_PAGES; + if (index > 0x20000000) + panic("swap_pager_isswapped: failed to locate all swap meta blocks"); + } + return 0; +} + +/* + * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in + * + * This routine dissociates the page at the given index within a + * swap block from its backing store, paging it in if necessary. + * If the page is paged in, it is placed in the inactive queue, + * since it had its backing store ripped out from under it. + * We also attempt to swap in all other pages in the swap block, + * but we only guarantee that the one at the specified index is + * paged in. + * + * XXX - The code to page the whole block in doesn't work, so we + * revert to the one-by-one behavior for now. Sigh. 
+ */ +static __inline void +swp_pager_force_pagein(struct swblock *swap, int idx) +{ + vm_object_t object; + vm_page_t m; + vm_pindex_t pindex; + + object = swap->swb_object; + pindex = swap->swb_index; + + vm_object_pip_add(object, 1); + m = vm_page_grab(object, pindex + idx, VM_ALLOC_NORMAL|VM_ALLOC_RETRY); + if (m->valid == VM_PAGE_BITS_ALL) { + vm_object_pip_subtract(object, 1); + vm_page_lock_queues(); + vm_page_activate(m); + vm_page_dirty(m); + vm_page_wakeup(m); + vm_page_unlock_queues(); + vm_pager_page_unswapped(m); + return; + } + + if (swap_pager_getpages(object, &m, 1, 0) != + VM_PAGER_OK) + panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ + vm_object_pip_subtract(object, 1); + + vm_page_lock_queues(); + vm_page_dirty(m); + vm_page_dontneed(m); + vm_page_wakeup(m); + vm_page_unlock_queues(); + vm_pager_page_unswapped(m); +} + + +/* + * swap_pager_swapoff: + * + * Page in all of the pages that have been paged out to the + * given device. The corresponding blocks in the bitmap must be + * marked as allocated and the device must be flagged SW_CLOSING. + * There may be no processes swapped out to the device. + * + * The sw_used parameter points to the field in the swdev structure + * that contains a count of the number of blocks still allocated + * on the device. If we encounter objects with a nonzero pip count + * in our scan, we use this number to determine if we're really done. + * + * This routine may block. 
+ */ +void +swap_pager_swapoff(int devidx, int *sw_used) +{ + struct swblock **pswap; + struct swblock *swap; + vm_object_t waitobj; + daddr_t v; + int i, j; + + GIANT_REQUIRED; + +full_rescan: + waitobj = NULL; + for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ +restart: + pswap = &swhash[i]; + while ((swap = *pswap) != NULL) { + for (j = 0; j < SWAP_META_PAGES; ++j) { + v = swap->swb_pages[j]; + if (v != SWAPBLK_NONE && + BLK2DEVIDX(v) == devidx) + break; + } + if (j < SWAP_META_PAGES) { + swp_pager_force_pagein(swap, j); + goto restart; + } else if (swap->swb_object->paging_in_progress) { + if (!waitobj) + waitobj = swap->swb_object; + } + pswap = &swap->swb_hnext; + } + } + if (waitobj && *sw_used) { + /* + * We wait on an arbitrary object to clock our rescans + * to the rate of paging completion. + */ + vm_object_pip_wait(waitobj, "swpoff"); + goto full_rescan; + } + if (*sw_used) + panic("swapoff: failed to locate %d swap blocks", *sw_used); +} + /************************************************************************ * SWAP META DATA * ************************************************************************ diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index 97d50d3..4402284 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -83,9 +83,11 @@ extern struct pagerlst swap_pager_un_object_list; extern int swap_pager_full; extern struct blist *swapblist; extern struct uma_zone *swap_zone; +extern int nswap_lowat, nswap_hiwat; void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); +void swap_pager_swapoff(int devidx, int *sw_used); int swap_pager_swp_alloc(vm_object_t, int); void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int); diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index 6ac6a96..e38b3d3 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -91,6 +91,7 @@ #include <vm/vm_kern.h> #include 
<vm/vm_extern.h> #include <vm/vm_pager.h> +#include <vm/swap_pager.h> #include <sys/user.h> @@ -324,6 +325,45 @@ vm_proc_swapin(struct proc *p) up = (vm_offset_t)p->p_uarea; pmap_qenter(up, ma, UAREA_PAGES); } + +/* + * Swap in the UAREAs of all processes swapped out to the given device. + * The pages in the UAREA are marked dirty and their swap metadata is freed. + */ +void +vm_proc_swapin_all(int devidx) +{ + struct proc *p; + vm_object_t object; + vm_page_t m; + +retry: + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + mtx_lock_spin(&sched_lock); + + object = p->p_upages_obj; + if (object != NULL && + swap_pager_isswapped(p->p_upages_obj, devidx)) { + sx_sunlock(&allproc_lock); + faultin(p); + mtx_unlock_spin(&sched_lock); + PROC_UNLOCK(p); + vm_page_lock_queues(); + TAILQ_FOREACH(m, &object->memq, listq) + vm_page_dirty(m); + vm_page_unlock_queues(); + swap_pager_freespace(object, 0, + object->un_pager.swp.swp_bcount); + goto retry; + } + + mtx_unlock_spin(&sched_lock); + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); +} #endif /* diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h index c909c68..d68ec79 100644 --- a/sys/vm/vm_pageout.h +++ b/sys/vm/vm_pageout.h @@ -104,6 +104,12 @@ extern void pagedaemon_wakeup(void); extern void vm_wait(void); extern void vm_waitpfault(void); +/* XXX This is probably misplaced. 
*/ +#ifndef NO_SWAPPING +void vm_proc_swapin_all(int); +int swap_pager_isswapped(vm_object_t, int); +#endif /* !NO_SWAPPING */ + #ifdef _KERNEL void vm_pageout_page(vm_page_t, vm_object_t); void vm_pageout_cluster(vm_page_t, vm_object_t); diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c index 1781182..0ec5220 100644 --- a/sys/vm/vm_swap.c +++ b/sys/vm/vm_swap.c @@ -36,6 +36,7 @@ #include "opt_mac.h" #include "opt_swap.h" +#include "opt_vm.h" #include <sys/param.h> #include <sys/systm.h> @@ -58,6 +59,7 @@ #include <vm/vm.h> #include <vm/vm_extern.h> #include <vm/vm_param.h> +#include <vm/vm_pageout.h> #include <vm/swap_pager.h> #include <vm/uma.h> @@ -73,6 +75,8 @@ struct swdevt *swdevt = should_be_malloced; static int nswap; /* first block after the interleaved devs */ int nswdev = NSWAPDEV; int vm_swap_size; +static int swdev_syscall_active = 0; /* serialize swap(on|off) */ + static int swapdev_strategy(struct vop_strategy_args *ap); struct vnode *swapdev_vp; @@ -165,11 +169,12 @@ swapdev_strategy(ap) /* * Create a special vnode op vector for swapdev_vp - we only use - * VOP_STRATEGY(), everything else returns an error. + * VOP_STRATEGY() and reclaim; everything else returns an error. */ vop_t **swapdev_vnodeop_p; static struct vnodeopv_entry_desc swapdev_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, + { &vop_reclaim_desc, (vop_t *) vop_null }, { &vop_strategy_desc, (vop_t *) swapdev_strategy }, { NULL, NULL } }; @@ -208,19 +213,23 @@ swapon(td, uap) if (error) goto done2; + while (swdev_syscall_active) + tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0); + swdev_syscall_active = 1; + /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. 
*/ if (swap_zone == NULL) { error = ENOMEM; - goto done2; + goto done; } NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) - goto done2; + goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; @@ -239,6 +248,9 @@ swapon(td, uap) if (error) vrele(vp); +done: + swdev_syscall_active = 0; + wakeup_one(&swdev_syscall_active); done2: mtx_unlock(&Giant); return (error); @@ -252,8 +264,6 @@ done2: * * The new swap code uses page-sized blocks. The old swap code used * DEV_BSIZE'd chunks. - * - * XXX locking when multiple swapon's run in parallel */ int swaponvp(td, vp, dev, nblks) @@ -330,7 +340,7 @@ swaponvp(td, vp, dev, nblks) sp->sw_vp = vp; sp->sw_dev = dev2udev(dev); sp->sw_device = dev; - sp->sw_flags |= SW_FREED; + sp->sw_flags = SW_FREED; sp->sw_nblks = nblks; sp->sw_used = 0; @@ -356,9 +366,127 @@ swaponvp(td, vp, dev, nblks) vm_swap_size += blk; } + swap_pager_full = 0; + return (0); } +/* + * SYSCALL: swapoff(devname) + * + * Disable swapping on the given device. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct swapoff_args { + char *name; +}; +#endif + +/* + * MPSAFE + */ +/* ARGSUSED */ +int +swapoff(td, uap) + struct thread *td; + struct swapoff_args *uap; +{ + struct vnode *vp; + struct nameidata nd; + struct swdevt *sp; + swblk_t dvbase, vsbase; + u_long nblks, aligned_nblks, blk; + int error, index; + + mtx_lock(&Giant); + + error = suser(td); + if (error) + goto done2; + + while (swdev_syscall_active) + tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); + swdev_syscall_active = 1; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, td); + error = namei(&nd); + if (error) + goto done; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) { + if (sp->sw_vp == vp) + goto found; + } + error = EINVAL; + goto done; +found: + nblks = sp->sw_nblks; + + /* + * We can turn off this swap device safely only if the + * available virtual memory in the system will fit the amount + * of data we will have to page back in, plus an epsilon so + * the system doesn't become critically low on swap space. + */ + if (cnt.v_free_count + cnt.v_cache_count + vm_swap_size < + nblks + nswap_lowat) { + error = ENOMEM; + goto done; + } + + /* + * Prevent further allocations on this device. + */ + sp->sw_flags |= SW_CLOSING; + for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) { + blk = min(nblks - dvbase, dmmax); + vsbase = index * dmmax + dvbase * nswdev; + vm_swap_size -= blist_fill(swapblist, vsbase, blk); + } + + /* + * Page in the contents of the device and close it. + */ +#ifndef NO_SWAPPING + vm_proc_swapin_all(index); +#endif /* !NO_SWAPPING */ + swap_pager_swapoff(index, &sp->sw_used); + + VOP_CLOSE(vp, FREAD | FWRITE, td->td_ucred, td); + vrele(vp); + sp->sw_vp = NULL; + + /* + * Resize the bitmap based on the new largest swap device, + * or free the bitmap if there are no more devices. 
+ */ + for (sp = swdevt, nblks = 0; sp < swdevt + nswdev; sp++) { + if (sp->sw_vp == NULL) + continue; + nblks = max(nblks, sp->sw_nblks); + } + + aligned_nblks = (nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1); + nswap = aligned_nblks * nswdev; + + if (nswap == 0) { + blist_destroy(swapblist); + swapblist = NULL; + vrele(swapdev_vp); + swapdev_vp = NULL; + } else + blist_resize(&swapblist, nswap, 0); + +done: + swdev_syscall_active = 0; + wakeup_one(&swdev_syscall_active); +done2: + mtx_unlock(&Giant); + return (error); +} + static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { |