diff options
author | mckusick <mckusick@FreeBSD.org> | 2010-04-28 05:33:59 +0000 |
---|---|---|
committer | mckusick <mckusick@FreeBSD.org> | 2010-04-28 05:33:59 +0000 |
commit | 3a0f5972a0de87aebef1af257922515700da4217 (patch) | |
tree | a65d36ab57a1e076de7e7a1d78add642fbd7062e /sbin | |
parent | f40c3a9dc50f808e512fcc9f9f738717013b483b (diff) | |
parent | a768cbcadec7189b9947e9f3cde39fe806bbc1d7 (diff) | |
download | FreeBSD-src-3a0f5972a0de87aebef1af257922515700da4217.zip FreeBSD-src-3a0f5972a0de87aebef1af257922515700da4217.tar.gz |
Update to current version of head.
Diffstat (limited to 'sbin')
54 files changed, 5243 insertions, 1151 deletions
diff --git a/sbin/ddb/Makefile b/sbin/ddb/Makefile index b9189c1..c556be1 100644 --- a/sbin/ddb/Makefile +++ b/sbin/ddb/Makefile @@ -3,7 +3,7 @@ PROG= ddb SRCS= ddb.c ddb_capture.c ddb_script.c MAN= ddb.8 -WARNS= 3 +WARNS?= 3 DPADD= ${LIBKVM} LDADD= -lkvm diff --git a/sbin/devd/devd.conf.5 b/sbin/devd/devd.conf.5 index f7efefb..17c5e09 100644 --- a/sbin/devd/devd.conf.5 +++ b/sbin/devd/devd.conf.5 @@ -250,18 +250,40 @@ CIS-vendor. Device class. .It Li device Device ID. +.It Li devclass +Device Class (USB) +.It Li devsubclass +Device Sub-class (USB) .It Li device-name Name of attached/detached device. +.It Li endpoints +Endpoint count (USB) .It Li function Card functions. +.It Li interface +Interface ID (USB) +.It Li intclass +Interface Class (USB) +.It Li intprotocol +Interface Protocol (USB) +.It Li intsubclass +Interface Sub-class (USB) .It Li manufacturer Manufacturer ID (pccard). +.It Li mode +Peripheral mode (USB) .It Li notify Match the value of the .Dq Li notify variable. +.It Li parent +Parent device +.It Li port +Hub port number (USB) .It Li product -Product ID (pccard). +Product ID (pccard/USB). +.It Li release +Hardware revision (USB) .It Li serial Serial Number (USB). .It Li slot @@ -342,6 +364,27 @@ The node is destroyed. .El .El +.It Li USB +Events related to the USB subsystem. +.Bl -tag -width ".Sy Subsystem" -compact +.It Sy Subsystem +.It Li DEVICE +.Bl -tag -width ".Li DETACH" -compact +.It Sy Type +.It Li ATTACH +USB device is attached to the system. +.It Li DETACH +USB device is detached from the system. +.El +.It Li INTERFACE +.Bl -tag -width ".Li DETACH" -compact +.It Sy Type +.It Li ATTACH +USB interface is attached from a device. +.It Li DETACH +USB interface is detached from a device. 
+.El +.El .It Li coretemp Events related to the .Xr coretemp 4 @@ -461,6 +504,17 @@ notify 0 { }; # +# Match a USB device type +# +notify 0 { + match "system" "USB"; + match "subsystem" "INTERFACE"; + match "type" "ATTACH"; + match "intclass" "0x0e"; + action "logger USB video device attached"; +}; + +# # Try to configure ath and wi devices with pccard_ether # as they are attached. # diff --git a/sbin/dumpfs/dumpfs.c b/sbin/dumpfs/dumpfs.c index e4b5995..38c05f6 100644 --- a/sbin/dumpfs/dumpfs.c +++ b/sbin/dumpfs/dumpfs.c @@ -238,7 +238,7 @@ dumpfs(const char *name) if (fsflags & FS_UNCLEAN) printf("unclean "); if (fsflags & FS_DOSOFTDEP) - printf("soft-updates "); + printf("soft-updates%s ", (fsflags & FS_SUJ) ? "+journal" : ""); if (fsflags & FS_NEEDSFSCK) printf("needs fsck run "); if (fsflags & FS_INDEXDIRS) @@ -255,7 +255,7 @@ dumpfs(const char *name) printf("nfsv4acls "); fsflags &= ~(FS_UNCLEAN | FS_DOSOFTDEP | FS_NEEDSFSCK | FS_INDEXDIRS | FS_ACLS | FS_MULTILABEL | FS_GJOURNAL | FS_FLAGS_UPDATED | - FS_NFS4ACLS); + FS_NFS4ACLS | FS_SUJ); if (fsflags != 0) printf("unknown flags (%#x)", fsflags); putchar('\n'); diff --git a/sbin/fsck_ffs/Makefile b/sbin/fsck_ffs/Makefile index aaae685..db2930b 100644 --- a/sbin/fsck_ffs/Makefile +++ b/sbin/fsck_ffs/Makefile @@ -7,8 +7,7 @@ LINKS+= ${BINDIR}/fsck_ffs ${BINDIR}/fsck_4.2bsd MAN= fsck_ffs.8 MLINKS= fsck_ffs.8 fsck_ufs.8 fsck_ffs.8 fsck_4.2bsd.8 SRCS= dir.c ea.c fsutil.c inode.c main.c pass1.c pass1b.c pass2.c pass3.c \ - pass4.c pass5.c setup.c utilities.c ffs_subr.c ffs_tables.c gjournal.c \ - getmntopts.c + pass4.c pass5.c setup.c suj.c utilities.c gjournal.c getmntopts.c DPADD= ${LIBUFS} LDADD= -lufs WARNS?= 2 diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h index ad7fe13..08f9ef5 100644 --- a/sbin/fsck_ffs/fsck.h +++ b/sbin/fsck_ffs/fsck.h @@ -60,6 +60,9 @@ * $FreeBSD$ */ +#ifndef _FSCK_H_ +#define _FSCK_H_ + #include <unistd.h> #include <stdlib.h> #include <stdio.h> @@ -347,10 +350,6 @@ void 
direrror(ino_t ino, const char *errmesg); int dirscan(struct inodesc *); int dofix(struct inodesc *, const char *msg); int eascan(struct inodesc *, struct ufs2_dinode *dp); -void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); -void ffs_fragacct(struct fs *, int, int32_t [], int); -int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); -void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); void fileerror(ino_t cwd, ino_t ino, const char *errmesg); int findino(struct inodesc *); int findname(struct inodesc *); @@ -392,3 +391,6 @@ void sblock_init(void); void setinodebuf(ino_t); int setup(char *dev); void gjournal_check(const char *filesys); +int suj_check(const char *filesys); + +#endif /* !_FSCK_H_ */ diff --git a/sbin/fsck_ffs/gjournal.c b/sbin/fsck_ffs/gjournal.c index bd887ca..10c32c0 100644 --- a/sbin/fsck_ffs/gjournal.c +++ b/sbin/fsck_ffs/gjournal.c @@ -96,27 +96,6 @@ struct ufs2_dinode ufs2_zino; static void putcgs(void); /* - * Write current block of inodes. - */ -static int -putino(struct uufsd *disk, ino_t inode) -{ - caddr_t inoblock; - struct fs *fs; - ssize_t ret; - - fs = &disk->d_fs; - inoblock = disk->d_inoblock; - - assert(inoblock != NULL); - assert(inode >= disk->d_inomin && inode <= disk->d_inomax); - ret = bwrite(disk, fsbtodb(fs, ino_to_fsba(fs, inode)), inoblock, - fs->fs_bsize); - - return (ret == -1 ? -1 : 0); -} - -/* * Return cylinder group from the cache or load it if it is not in the * cache yet. * Don't cache more than MAX_CACHED_CGS cylinder groups. @@ -242,13 +221,11 @@ cancelcgs(void) #endif /* - * Open the given provider, load statistics. + * Open the given provider, load superblock. 
*/ static void -getdisk(void) +opendisk(void) { - int i; - if (disk != NULL) return; disk = malloc(sizeof(*disk)); @@ -259,24 +236,6 @@ getdisk(void) disk->d_error); } fs = &disk->d_fs; - fs->fs_csp = malloc((size_t)fs->fs_cssize); - if (fs->fs_csp == NULL) - err(1, "malloc(%zu)", (size_t)fs->fs_cssize); - bzero(fs->fs_csp, (size_t)fs->fs_cssize); - for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) { - if (bread(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)), - (void *)(((char *)fs->fs_csp) + i), - (size_t)(fs->fs_cssize - i < fs->fs_bsize ? fs->fs_cssize - i : fs->fs_bsize)) == -1) { - err(1, "bread: %s", disk->d_error); - } - } - if (fs->fs_contigsumsize > 0) { - fs->fs_maxcluster = malloc(fs->fs_ncg * sizeof(int32_t)); - if (fs->fs_maxcluster == NULL) - err(1, "malloc(%zu)", fs->fs_ncg * sizeof(int32_t)); - for (i = 0; i < fs->fs_ncg; i++) - fs->fs_maxcluster[i] = fs->fs_contigsumsize; - } } /* @@ -286,11 +245,6 @@ static void closedisk(void) { - free(fs->fs_csp); - if (fs->fs_contigsumsize > 0) { - free(fs->fs_maxcluster); - fs->fs_maxcluster = NULL; - } fs->fs_clean = 1; if (sbwrite(disk, 0) == -1) err(1, "sbwrite(%s)", devnam); @@ -301,227 +255,6 @@ closedisk(void) fs = NULL; } -/* - * Write the statistics back, call closedisk(). - */ -static void -putdisk(void) -{ - int i; - - assert(disk != NULL && fs != NULL); - for (i = 0; i < fs->fs_cssize; i += fs->fs_bsize) { - if (bwrite(disk, fsbtodb(fs, fs->fs_csaddr + numfrags(fs, i)), - (void *)(((char *)fs->fs_csp) + i), - (size_t)(fs->fs_cssize - i < fs->fs_bsize ? fs->fs_cssize - i : fs->fs_bsize)) == -1) { - err(1, "bwrite: %s", disk->d_error); - } - } - closedisk(); -} - -#if 0 -/* - * Free memory, close the disk, but don't write anything back. 
- */ -static void -canceldisk(void) -{ - int i; - - assert(disk != NULL && fs != NULL); - free(fs->fs_csp); - if (fs->fs_contigsumsize > 0) - free(fs->fs_maxcluster); - if (ufs_disk_close(disk) == -1) - err(1, "ufs_disk_close(%s)", devnam); - free(disk); - disk = NULL; - fs = NULL; -} -#endif - -static int -isblock(unsigned char *cp, ufs1_daddr_t h) -{ - unsigned char mask; - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0xff); - case 4: - mask = 0x0f << ((h & 0x1) << 2); - return ((cp[h >> 1] & mask) == mask); - case 2: - mask = 0x03 << ((h & 0x3) << 1); - return ((cp[h >> 2] & mask) == mask); - case 1: - mask = 0x01 << (h & 0x7); - return ((cp[h >> 3] & mask) == mask); - default: - assert(!"isblock: invalid number of fragments"); - } - return (0); -} - -/* - * put a block into the map - */ -static void -setblock(unsigned char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - cp[h] = 0xff; - return; - case 4: - cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); - return; - case 2: - cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); - return; - case 1: - cp[h >> 3] |= (0x01 << (h & 0x7)); - return; - default: - assert(!"setblock: invalid number of fragments"); - } -} - -/* - * check if a block is free - */ -static int -isfreeblock(u_char *cp, ufs1_daddr_t h) -{ - - switch ((int)fs->fs_frag) { - case 8: - return (cp[h] == 0); - case 4: - return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); - case 2: - return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); - case 1: - return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); - default: - assert(!"isfreeblock: invalid number of fragments"); - } - return (0); -} - -/* - * Update the frsum fields to reflect addition or deletion - * of some frags. 
- */ -void -fragacct(int fragmap, int32_t fraglist[], int cnt) -{ - int inblk; - int field, subfield; - int siz, pos; - - inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; - fragmap <<= 1; - for (siz = 1; siz < fs->fs_frag; siz++) { - if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) - continue; - field = around[siz]; - subfield = inside[siz]; - for (pos = siz; pos <= fs->fs_frag; pos++) { - if ((fragmap & field) == subfield) { - fraglist[siz] += cnt; - pos += siz; - field <<= siz; - subfield <<= siz; - } - field <<= 1; - subfield <<= 1; - } - } -} - -static void -clusteracct(struct cg *cgp, ufs1_daddr_t blkno) -{ - int32_t *sump; - int32_t *lp; - u_char *freemapp, *mapp; - int i, start, end, forw, back, map, bit; - - if (fs->fs_contigsumsize <= 0) - return; - freemapp = cg_clustersfree(cgp); - sump = cg_clustersum(cgp); - /* - * Clear the actual block. - */ - setbit(freemapp, blkno); - /* - * Find the size of the cluster going forward. - */ - start = blkno + 1; - end = start + fs->fs_contigsumsize; - if (end >= cgp->cg_nclusterblks) - end = cgp->cg_nclusterblks; - mapp = &freemapp[start / NBBY]; - map = *mapp++; - bit = 1 << (start % NBBY); - for (i = start; i < end; i++) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != (NBBY - 1)) { - bit <<= 1; - } else { - map = *mapp++; - bit = 1; - } - } - forw = i - start; - /* - * Find the size of the cluster going backward. - */ - start = blkno - 1; - end = start - fs->fs_contigsumsize; - if (end < 0) - end = -1; - mapp = &freemapp[start / NBBY]; - map = *mapp--; - bit = 1 << (start % NBBY); - for (i = start; i > end; i--) { - if ((map & bit) == 0) - break; - if ((i & (NBBY - 1)) != 0) { - bit >>= 1; - } else { - map = *mapp--; - bit = 1 << (NBBY - 1); - } - } - back = start - i; - /* - * Account for old cluster and the possibly new forward and - * back clusters. 
- */ - i = back + forw + 1; - if (i > fs->fs_contigsumsize) - i = fs->fs_contigsumsize; - sump[i]++; - if (back > 0) - sump[back]--; - if (forw > 0) - sump[forw]--; - /* - * Update cluster summary information. - */ - lp = &sump[fs->fs_contigsumsize]; - for (i = fs->fs_contigsumsize; i > 0; i--) - if (*lp-- > 0) - break; - fs->fs_maxcluster[cgp->cg_cgx] = i; -} - static void blkfree(ufs2_daddr_t bno, long size) { @@ -539,10 +272,10 @@ blkfree(ufs2_daddr_t bno, long size) blksfree = cg_blksfree(cgp); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); - if (!isfreeblock(blksfree, fragno)) + if (!ffs_isfreeblock(fs, blksfree, fragno)) assert(!"blkfree: freeing free block"); - setblock(blksfree, fragno); - clusteracct(cgp, fragno); + ffs_setblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -552,7 +285,7 @@ blkfree(ufs2_daddr_t bno, long size) * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); - fragacct(blk, cgp->cg_frsum, -1); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ @@ -569,16 +302,16 @@ blkfree(ufs2_daddr_t bno, long size) * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); - fragacct(blk, cgp->cg_frsum, 1); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); - if (isblock(blksfree, fragno)) { + if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; - clusteracct(cgp, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; @@ -599,7 +332,7 @@ freeindir(ufs2_daddr_t blk, int level) if (bread(disk, fsbtodb(fs, blk), (void *)&sblks, (size_t)fs->fs_bsize) == -1) err(1, "bread: %s", 
disk->d_error); blks = (ufs2_daddr_t *)&sblks; - for (i = 0; i < howmany(fs->fs_bsize, sizeof(ufs2_daddr_t)); i++) { + for (i = 0; i < NINDIR(fs); i++) { if (blks[i] == 0) break; if (level == 0) @@ -671,7 +404,7 @@ gjournal_check(const char *filesys) int cg, mode; devnam = filesys; - getdisk(); + opendisk(); /* Are there any unreferenced inodes in this file system? */ if (fs->fs_unrefs == 0) { //printf("No unreferenced inodes.\n"); @@ -747,7 +480,7 @@ gjournal_check(const char *filesys) /* Zero-fill the inode. */ *dino = ufs2_zino; /* Write the inode back. */ - if (putino(disk, ino) == -1) + if (putino(disk) == -1) err(1, "putino(cg=%d ino=%d)", cg, ino); if (cgp->cg_unrefs == 0) { //printf("No more unreferenced inodes in cg=%d.\n", cg); @@ -772,5 +505,5 @@ gjournal_check(const char *filesys) /* Write back modified cylinder groups. */ putcgs(); /* Write back updated statistics and super-block. */ - putdisk(); + closedisk(); } diff --git a/sbin/fsck_ffs/main.c b/sbin/fsck_ffs/main.c index 66edd63..e9a9704 100644 --- a/sbin/fsck_ffs/main.c +++ b/sbin/fsck_ffs/main.c @@ -242,8 +242,9 @@ checkfilesys(char *filesys) if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0) exit(3); /* Cannot read superblock */ close(fsreadfd); - if (sblock.fs_flags & FS_NEEDSFSCK) - exit(4); /* Earlier background failed */ + /* Earlier background failed or journaled */ + if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ)) + exit(4); if ((sblock.fs_flags & FS_DOSOFTDEP) == 0) exit(5); /* Not running soft updates */ size = MIBSIZE; @@ -299,7 +300,7 @@ checkfilesys(char *filesys) pfatal("MOUNTED READ-ONLY, CANNOT RUN IN BACKGROUND\n"); } else if ((fsreadfd = open(filesys, O_RDONLY)) >= 0) { if (readsb(0) != 0) { - if (sblock.fs_flags & FS_NEEDSFSCK) { + if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ)) { bkgrdflag = 0; pfatal("UNEXPECTED INCONSISTENCY, %s\n", "CANNOT RUN IN BACKGROUND\n"); @@ -384,6 +385,26 @@ checkfilesys(char *filesys) sblock.fs_cstotal.cs_nffree * 100.0 / 
sblock.fs_dsize); return (0); } + /* + * Determine if we can and should do journal recovery. + */ + if ((sblock.fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == FS_SUJ) { + if (preen || reply("USE JOURNAL?")) { + if (suj_check(filesys) == 0) { + if (chkdoreload(mntp) == 0) + exit(0); + exit(4); + } + /* suj_check failed, fall through. */ + } + printf("** Skipping journal, falling through to full fsck\n"); + /* + * Write the superblock so we don't try to recover the + * journal on another pass. + */ + sblock.fs_mtime = time(NULL); + sbdirty(); + } /* * Cleared if any questions answered no. Used to decide if diff --git a/sbin/fsck_ffs/pass5.c b/sbin/fsck_ffs/pass5.c index 173156e..639ce0f 100644 --- a/sbin/fsck_ffs/pass5.c +++ b/sbin/fsck_ffs/pass5.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <inttypes.h> #include <limits.h> #include <string.h> +#include <libufs.h> #include "fsck.h" diff --git a/sbin/fsck_ffs/suj.c b/sbin/fsck_ffs/suj.c new file mode 100644 index 0000000..f0240bd --- /dev/null +++ b/sbin/fsck_ffs/suj.c @@ -0,0 +1,2634 @@ +/*- + * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/disklabel.h> +#include <sys/mount.h> +#include <sys/stat.h> + +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/dinode.h> +#include <ufs/ufs/dir.h> +#include <ufs/ffs/fs.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <libufs.h> +#include <string.h> +#include <strings.h> +#include <err.h> +#include <assert.h> + +#include "fsck.h" + +#define DOTDOT_OFFSET DIRECTSIZ(1) +#define SUJ_HASHSIZE 2048 +#define SUJ_HASHMASK (SUJ_HASHSIZE - 1) +#define SUJ_HASH(x) ((x * 2654435761) & SUJ_HASHMASK) + +struct suj_seg { + TAILQ_ENTRY(suj_seg) ss_next; + struct jsegrec ss_rec; + uint8_t *ss_blk; +}; + +struct suj_rec { + TAILQ_ENTRY(suj_rec) sr_next; + union jrec *sr_rec; +}; +TAILQ_HEAD(srechd, suj_rec); + +struct suj_ino { + LIST_ENTRY(suj_ino) si_next; + struct srechd si_recs; + struct srechd si_newrecs; + struct srechd si_movs; + struct jtrncrec *si_trunc; + ino_t si_ino; + char si_skipparent; + char si_hasrecs; + char si_blkadj; + char si_linkadj; + int si_mode; + nlink_t si_nlinkadj; + nlink_t si_nlink; + nlink_t si_dotlinks; +}; +LIST_HEAD(inohd, suj_ino); + +struct suj_blk { + LIST_ENTRY(suj_blk) sb_next; + struct srechd sb_recs; + ufs2_daddr_t sb_blk; +}; +LIST_HEAD(blkhd, suj_blk); + +struct data_blk { + LIST_ENTRY(data_blk) db_next; + uint8_t *db_buf; + ufs2_daddr_t db_blk; + int db_size; + int db_dirty; +}; + +struct 
ino_blk { + LIST_ENTRY(ino_blk) ib_next; + uint8_t *ib_buf; + int ib_dirty; + ufs2_daddr_t ib_blk; +}; +LIST_HEAD(iblkhd, ino_blk); + +struct suj_cg { + LIST_ENTRY(suj_cg) sc_next; + struct blkhd sc_blkhash[SUJ_HASHSIZE]; + struct inohd sc_inohash[SUJ_HASHSIZE]; + struct iblkhd sc_iblkhash[SUJ_HASHSIZE]; + struct ino_blk *sc_lastiblk; + struct suj_ino *sc_lastino; + struct suj_blk *sc_lastblk; + uint8_t *sc_cgbuf; + struct cg *sc_cgp; + int sc_dirty; + int sc_cgx; +}; + +LIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE]; +LIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE]; +struct suj_cg *lastcg; +struct data_blk *lastblk; + +TAILQ_HEAD(seghd, suj_seg) allsegs; +uint64_t oldseq; +static struct uufsd *disk = NULL; +static struct fs *fs = NULL; +ino_t sujino; + +/* + * Summary statistics. + */ +uint64_t freefrags; +uint64_t freeblocks; +uint64_t freeinos; +uint64_t freedir; +uint64_t jbytes; +uint64_t jrecs; + +typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int); +static void ino_trunc(ino_t, off_t); +static void ino_decr(ino_t); +static void ino_adjust(struct suj_ino *); +static void ino_build(struct suj_ino *); +static int blk_isfree(ufs2_daddr_t); + +static void * +errmalloc(size_t n) +{ + void *a; + + a = malloc(n); + if (a == NULL) + errx(1, "malloc(%zu)", n); + return (a); +} + +/* + * Open the given provider, load superblock. + */ +static void +opendisk(const char *devnam) +{ + if (disk != NULL) + return; + disk = malloc(sizeof(*disk)); + if (disk == NULL) + errx(1, "malloc(%zu)", sizeof(*disk)); + if (ufs_disk_fillout(disk, devnam) == -1) { + err(1, "ufs_disk_fillout(%s) failed: %s", devnam, + disk->d_error); + } + fs = &disk->d_fs; +} + +/* + * Mark file system as clean, write the super-block back, close the disk. + */ +static void +closedisk(const char *devnam) +{ + struct csum *cgsum; + int i; + + /* + * Recompute the fs summary info from correct cs summaries. 
+ */ + bzero(&fs->fs_cstotal, sizeof(struct csum_total)); + for (i = 0; i < fs->fs_ncg; i++) { + cgsum = &fs->fs_cs(fs, i); + fs->fs_cstotal.cs_nffree += cgsum->cs_nffree; + fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree; + fs->fs_cstotal.cs_nifree += cgsum->cs_nifree; + fs->fs_cstotal.cs_ndir += cgsum->cs_ndir; + } + fs->fs_pendinginodes = 0; + fs->fs_pendingblocks = 0; + fs->fs_clean = 1; + fs->fs_time = time(NULL); + fs->fs_mtime = time(NULL); + if (sbwrite(disk, 0) == -1) + err(1, "sbwrite(%s)", devnam); + if (ufs_disk_close(disk) == -1) + err(1, "ufs_disk_close(%s)", devnam); + free(disk); + disk = NULL; + fs = NULL; +} + +/* + * Lookup a cg by number in the hash so we can keep track of which cgs + * need stats rebuilt. + */ +static struct suj_cg * +cg_lookup(int cgx) +{ + struct cghd *hd; + struct suj_cg *sc; + + if (cgx < 0 || cgx >= fs->fs_ncg) { + abort(); + errx(1, "Bad cg number %d", cgx); + } + if (lastcg && lastcg->sc_cgx == cgx) + return (lastcg); + hd = &cghash[SUJ_HASH(cgx)]; + LIST_FOREACH(sc, hd, sc_next) + if (sc->sc_cgx == cgx) { + lastcg = sc; + return (sc); + } + sc = errmalloc(sizeof(*sc)); + bzero(sc, sizeof(*sc)); + sc->sc_cgbuf = errmalloc(fs->fs_bsize); + sc->sc_cgp = (struct cg *)sc->sc_cgbuf; + sc->sc_cgx = cgx; + LIST_INSERT_HEAD(hd, sc, sc_next); + if (bread(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, + fs->fs_bsize) == -1) + err(1, "Unable to read cylinder group %d", sc->sc_cgx); + + return (sc); +} + +/* + * Lookup an inode number in the hash and allocate a suj_ino if it does + * not exist. 
+ */ +static struct suj_ino * +ino_lookup(ino_t ino, int creat) +{ + struct suj_ino *sino; + struct inohd *hd; + struct suj_cg *sc; + + sc = cg_lookup(ino_to_cg(fs, ino)); + if (sc->sc_lastino && sc->sc_lastino->si_ino == ino) + return (sc->sc_lastino); + hd = &sc->sc_inohash[SUJ_HASH(ino)]; + LIST_FOREACH(sino, hd, si_next) + if (sino->si_ino == ino) + return (sino); + if (creat == 0) + return (NULL); + sino = errmalloc(sizeof(*sino)); + bzero(sino, sizeof(*sino)); + sino->si_ino = ino; + TAILQ_INIT(&sino->si_recs); + TAILQ_INIT(&sino->si_newrecs); + TAILQ_INIT(&sino->si_movs); + LIST_INSERT_HEAD(hd, sino, si_next); + + return (sino); +} + +/* + * Lookup a block number in the hash and allocate a suj_blk if it does + * not exist. + */ +static struct suj_blk * +blk_lookup(ufs2_daddr_t blk, int creat) +{ + struct suj_blk *sblk; + struct suj_cg *sc; + struct blkhd *hd; + + sc = cg_lookup(dtog(fs, blk)); + if (sc->sc_lastblk && sc->sc_lastblk->sb_blk == blk) + return (sc->sc_lastblk); + hd = &sc->sc_blkhash[SUJ_HASH(fragstoblks(fs, blk))]; + LIST_FOREACH(sblk, hd, sb_next) + if (sblk->sb_blk == blk) + return (sblk); + if (creat == 0) + return (NULL); + sblk = errmalloc(sizeof(*sblk)); + bzero(sblk, sizeof(*sblk)); + sblk->sb_blk = blk; + TAILQ_INIT(&sblk->sb_recs); + LIST_INSERT_HEAD(hd, sblk, sb_next); + + return (sblk); +} + +static struct data_blk * +dblk_lookup(ufs2_daddr_t blk) +{ + struct data_blk *dblk; + struct dblkhd *hd; + + hd = &dbhash[SUJ_HASH(fragstoblks(fs, blk))]; + if (lastblk && lastblk->db_blk == blk) + return (lastblk); + LIST_FOREACH(dblk, hd, db_next) + if (dblk->db_blk == blk) + return (dblk); + /* + * The inode block wasn't located, allocate a new one. 
+ */ + dblk = errmalloc(sizeof(*dblk)); + bzero(dblk, sizeof(*dblk)); + LIST_INSERT_HEAD(hd, dblk, db_next); + dblk->db_blk = blk; + return (dblk); +} + +static uint8_t * +dblk_read(ufs2_daddr_t blk, int size) +{ + struct data_blk *dblk; + + dblk = dblk_lookup(blk); + /* + * I doubt size mismatches can happen in practice but it is trivial + * to handle. + */ + if (size != dblk->db_size) { + if (dblk->db_buf) + free(dblk->db_buf); + dblk->db_buf = errmalloc(size); + dblk->db_size = size; + if (bread(disk, fsbtodb(fs, blk), dblk->db_buf, size) == -1) + err(1, "Failed to read data block %jd", blk); + } + return (dblk->db_buf); +} + +static void +dblk_dirty(ufs2_daddr_t blk) +{ + struct data_blk *dblk; + + dblk = dblk_lookup(blk); + dblk->db_dirty = 1; +} + +static void +dblk_write(void) +{ + struct data_blk *dblk; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) { + LIST_FOREACH(dblk, &dbhash[i], db_next) { + if (dblk->db_dirty == 0 || dblk->db_size == 0) + continue; + if (bwrite(disk, fsbtodb(fs, dblk->db_blk), + dblk->db_buf, dblk->db_size) == -1) + err(1, "Unable to write block %jd", + dblk->db_blk); + } + } +} + +static union dinode * +ino_read(ino_t ino) +{ + struct ino_blk *iblk; + struct iblkhd *hd; + struct suj_cg *sc; + ufs2_daddr_t blk; + int off; + + blk = ino_to_fsba(fs, ino); + sc = cg_lookup(ino_to_cg(fs, ino)); + iblk = sc->sc_lastiblk; + if (iblk && iblk->ib_blk == blk) + goto found; + hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))]; + LIST_FOREACH(iblk, hd, ib_next) + if (iblk->ib_blk == blk) + goto found; + /* + * The inode block wasn't located, allocate a new one. 
+ */ + iblk = errmalloc(sizeof(*iblk)); + bzero(iblk, sizeof(*iblk)); + iblk->ib_buf = errmalloc(fs->fs_bsize); + iblk->ib_blk = blk; + LIST_INSERT_HEAD(hd, iblk, ib_next); + if (bread(disk, fsbtodb(fs, blk), iblk->ib_buf, fs->fs_bsize) == -1) + err(1, "Failed to read inode block %jd", blk); +found: + sc->sc_lastiblk = iblk; + off = ino_to_fsbo(fs, ino); + if (fs->fs_magic == FS_UFS1_MAGIC) + return (union dinode *)&((struct ufs1_dinode *)iblk->ib_buf)[off]; + else + return (union dinode *)&((struct ufs2_dinode *)iblk->ib_buf)[off]; +} + +static void +ino_dirty(ino_t ino) +{ + struct ino_blk *iblk; + struct iblkhd *hd; + struct suj_cg *sc; + ufs2_daddr_t blk; + + blk = ino_to_fsba(fs, ino); + sc = cg_lookup(ino_to_cg(fs, ino)); + iblk = sc->sc_lastiblk; + if (iblk && iblk->ib_blk == blk) { + iblk->ib_dirty = 1; + return; + } + hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))]; + LIST_FOREACH(iblk, hd, ib_next) { + if (iblk->ib_blk == blk) { + iblk->ib_dirty = 1; + return; + } + } + ino_read(ino); + ino_dirty(ino); +} + +static void +iblk_write(struct ino_blk *iblk) +{ + + if (iblk->ib_dirty == 0) + return; + if (bwrite(disk, fsbtodb(fs, iblk->ib_blk), iblk->ib_buf, + fs->fs_bsize) == -1) + err(1, "Failed to write inode block %jd", iblk->ib_blk); +} + +static int +blk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags) +{ + ufs2_daddr_t bstart; + ufs2_daddr_t bend; + ufs2_daddr_t end; + + end = start + frags; + bstart = brec->jb_blkno + brec->jb_oldfrags; + bend = bstart + brec->jb_frags; + if (start < bend && end > bstart) + return (1); + return (0); +} + +static int +blk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start, + int frags) +{ + + if (brec->jb_ino != ino || brec->jb_lbn != lbn) + return (0); + if (brec->jb_blkno + brec->jb_oldfrags != start) + return (0); + if (brec->jb_frags != frags) + return (0); + return (1); +} + +static void +blk_setmask(struct jblkrec *brec, int *mask) +{ + int i; + + for (i = 
brec->jb_oldfrags; i < brec->jb_oldfrags + brec->jb_frags; i++) + *mask |= 1 << i; +} + +/* + * Determine whether a given block has been reallocated to a new location. + * Returns a mask of overlapping bits if any frags have been reused or + * zero if the block has not been re-used and the contents can be trusted. + * + * This is used to ensure that an orphaned pointer due to truncate is safe + * to be freed. The mask value can be used to free partial blocks. + */ +static int +blk_freemask(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags) +{ + struct suj_blk *sblk; + struct suj_rec *srec; + struct jblkrec *brec; + int mask; + int off; + + /* + * To be certain we're not freeing a reallocated block we lookup + * this block in the blk hash and see if there is an allocation + * journal record that overlaps with any fragments in the block + * we're concerned with. If any fragments have ben reallocated + * the block has already been freed and re-used for another purpose. + */ + mask = 0; + sblk = blk_lookup(blknum(fs, blk), 0); + if (sblk == NULL) + return (0); + off = blk - sblk->sb_blk; + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + brec = (struct jblkrec *)srec->sr_rec; + /* + * If the block overlaps but does not match + * exactly it's a new allocation. If it matches + * exactly this record refers to the current + * location. + */ + if (blk_overlaps(brec, blk, frags) == 0) + continue; + if (blk_equals(brec, ino, lbn, blk, frags) == 1) + mask = 0; + else + blk_setmask(brec, &mask); + } + if (debug) + printf("blk_freemask: blk %jd sblk %jd off %d mask 0x%X\n", + blk, sblk->sb_blk, off, mask); + return (mask >> off); +} + +/* + * Determine whether it is safe to follow an indirect. It is not safe + * if any part of the indirect has been reallocated or the last journal + * entry was an allocation. Just allocated indirects may not have valid + * pointers yet and all of their children will have their own records. 
+ * It is also not safe to follow an indirect if the cg bitmap has been + * cleared as a new allocation may write to the block prior to the journal + * being written. + * + * Returns 1 if it's safe to follow the indirect and 0 otherwise. + */ +static int +blk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn) +{ + struct suj_blk *sblk; + struct jblkrec *brec; + + sblk = blk_lookup(blk, 0); + if (sblk == NULL) + return (1); + if (TAILQ_EMPTY(&sblk->sb_recs)) + return (1); + brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec; + if (blk_equals(brec, ino, lbn, blk, fs->fs_frag)) + if (brec->jb_op == JOP_FREEBLK) + return (!blk_isfree(blk)); + return (0); +} + +/* + * Clear an inode from the cg bitmap. If the inode was already clear return + * 0 so the caller knows it does not have to check the inode contents. + */ +static int +ino_free(ino_t ino, int mode) +{ + struct suj_cg *sc; + uint8_t *inosused; + struct cg *cgp; + int cg; + + cg = ino_to_cg(fs, ino); + ino = ino % fs->fs_ipg; + sc = cg_lookup(cg); + cgp = sc->sc_cgp; + inosused = cg_inosused(cgp); + /* + * The bitmap may never have made it to the disk so we have to + * conditionally clear. We can avoid writing the cg in this case. + */ + if (isclr(inosused, ino)) + return (0); + freeinos++; + clrbit(inosused, ino); + if (ino < cgp->cg_irotor) + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree++; + if ((mode & IFMT) == IFDIR) { + freedir++; + cgp->cg_cs.cs_ndir--; + } + sc->sc_dirty = 1; + + return (1); +} + +/* + * Free 'frags' frags starting at filesystem block 'bno' skipping any frags + * set in the mask. 
+ */ +static void +blk_free(ufs2_daddr_t bno, int mask, int frags) +{ + ufs1_daddr_t fragno, cgbno; + struct suj_cg *sc; + struct cg *cgp; + int i, cg; + uint8_t *blksfree; + + if (debug) + printf("Freeing %d frags at blk %jd\n", frags, bno); + cg = dtog(fs, bno); + sc = cg_lookup(cg); + cgp = sc->sc_cgp; + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp); + + /* + * If it's not allocated we only wrote the journal entry + * and never the bitmaps. Here we unconditionally clear and + * resolve the cg summary later. + */ + if (frags == fs->fs_frag && mask == 0) { + fragno = fragstoblks(fs, cgbno); + ffs_setblock(fs, blksfree, fragno); + freeblocks++; + } else { + /* + * deallocate the fragment + */ + for (i = 0; i < frags; i++) + if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) { + freefrags++; + setbit(blksfree, cgbno + i); + } + } + sc->sc_dirty = 1; +} + +/* + * Returns 1 if the whole block starting at 'bno' is marked free and 0 + * otherwise. + */ +static int +blk_isfree(ufs2_daddr_t bno) +{ + struct suj_cg *sc; + + sc = cg_lookup(dtog(fs, bno)); + return ffs_isblock(fs, cg_blksfree(sc->sc_cgp), dtogd(fs, bno)); +} + +/* + * Fetch an indirect block to find the block at a given lbn. The lbn + * may be negative to fetch a specific indirect block pointer or positive + * to fetch a specific block. 
+ */ +static ufs2_daddr_t +indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn) +{ + ufs2_daddr_t *bap2; + ufs2_daddr_t *bap1; + ufs_lbn_t lbnadd; + ufs_lbn_t base; + int level; + int i; + + if (blk == 0) + return (0); + level = lbn_level(cur); + if (level == -1) + errx(1, "Invalid indir lbn %jd", lbn); + if (level == 0 && lbn < 0) + errx(1, "Invalid lbn %jd", lbn); + bap2 = (void *)dblk_read(blk, fs->fs_bsize); + bap1 = (void *)bap2; + lbnadd = 1; + base = -(cur + level); + for (i = level; i > 0; i--) + lbnadd *= NINDIR(fs); + if (lbn > 0) + i = (lbn - base) / lbnadd; + else + i = (-lbn - base) / lbnadd; + if (i < 0 || i >= NINDIR(fs)) + errx(1, "Invalid indirect index %d produced by lbn %jd", + i, lbn); + if (level == 0) + cur = base + (i * lbnadd); + else + cur = -(base + (i * lbnadd)) - (level - 1); + if (fs->fs_magic == FS_UFS1_MAGIC) + blk = bap1[i]; + else + blk = bap2[i]; + if (cur == lbn) + return (blk); + if (level == 0) { + abort(); + errx(1, "Invalid lbn %jd at level 0", lbn); + } + return indir_blkatoff(blk, ino, cur, lbn); +} + +/* + * Finds the disk block address at the specified lbn within the inode + * specified by ip. This follows the whole tree and honors di_size and + * di_extsize so it is a true test of reachability. The lbn may be + * negative if an extattr or indirect block is requested. + */ +static ufs2_daddr_t +ino_blkatoff(union dinode *ip, ino_t ino, ufs_lbn_t lbn, int *frags) +{ + ufs_lbn_t tmpval; + ufs_lbn_t cur; + ufs_lbn_t next; + int i; + + /* + * Handle extattr blocks first. + */ + if (lbn < 0 && lbn >= -NXADDR) { + lbn = -1 - lbn; + if (lbn > lblkno(fs, ip->dp2.di_extsize - 1)) + return (0); + *frags = numfrags(fs, sblksize(fs, ip->dp2.di_extsize, lbn)); + return (ip->dp2.di_extb[lbn]); + } + /* + * Now direct and indirect. 
+ */ + if (DIP(ip, di_mode) == IFLNK && + DIP(ip, di_size) < fs->fs_maxsymlinklen) + return (0); + if (lbn >= 0 && lbn < NDADDR) { + *frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn)); + return (DIP(ip, di_db[lbn])); + } + *frags = fs->fs_frag; + + for (i = 0, tmpval = NINDIR(fs), cur = NDADDR; i < NIADDR; i++, + tmpval *= NINDIR(fs), cur = next) { + next = cur + tmpval; + if (lbn == -cur - i) + return (DIP(ip, di_ib[i])); + /* + * Determine whether the lbn in question is within this tree. + */ + if (lbn < 0 && -lbn >= next) + continue; + if (lbn > 0 && lbn >= next) + continue; + return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn); + } + errx(1, "lbn %jd not in ino", lbn); +} + +/* + * Determine whether a block exists at a particular lbn in an inode. + * Returns 1 if found, 0 if not. lbn may be negative for indirects + * or ext blocks. + */ +static int +blk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags) +{ + union dinode *ip; + ufs2_daddr_t nblk; + + ip = ino_read(ino); + + if (DIP(ip, di_nlink) == 0 || DIP(ip, di_mode) == 0) + return (0); + nblk = ino_blkatoff(ip, ino, lbn, frags); + + return (nblk == blk); +} + +/* + * Determines whether a pointer to an inode exists within a directory + * at a specified offset. Returns the mode of the found entry. + */ +static int +ino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot) +{ + union dinode *dip; + struct direct *dp; + ufs2_daddr_t blk; + uint8_t *block; + ufs_lbn_t lbn; + int blksize; + int frags; + int dpoff; + int doff; + + *isdot = 0; + dip = ino_read(parent); + *mode = DIP(dip, di_mode); + if ((*mode & IFMT) != IFDIR) { + if (debug) { + /* + * This can happen if the parent inode + * was reallocated. 
+ */ + if (*mode != 0) + printf("Directory %d has bad mode %o\n", + parent, *mode); + else + printf("Directory %d zero inode\n", parent); + } + return (0); + } + lbn = lblkno(fs, diroff); + doff = blkoff(fs, diroff); + blksize = sblksize(fs, DIP(dip, di_size), lbn); + if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) { + if (debug) + printf("ino %d absent from %d due to offset %jd" + " exceeding size %jd\n", + child, parent, diroff, DIP(dip, di_size)); + return (0); + } + blk = ino_blkatoff(dip, parent, lbn, &frags); + if (blk <= 0) { + if (debug) + printf("Sparse directory %d", parent); + return (0); + } + block = dblk_read(blk, blksize); + /* + * Walk through the records from the start of the block to be + * certain we hit a valid record and not some junk in the middle + * of a file name. Stop when we reach or pass the expected offset. + */ + dpoff = (doff / DIRBLKSIZ) * DIRBLKSIZ; + do { + dp = (struct direct *)&block[dpoff]; + if (dpoff == doff) + break; + if (dp->d_reclen == 0) + break; + dpoff += dp->d_reclen; + } while (dpoff <= doff); + if (dpoff > fs->fs_bsize) + errx(1, "Corrupt directory block in dir ino %d", parent); + /* Not found. */ + if (dpoff != doff) { + if (debug) + printf("ino %d not found in %d, lbn %jd, dpoff %d\n", + child, parent, lbn, dpoff); + return (0); + } + /* + * We found the item in question. Record the mode and whether it's + * a . or .. link for the caller. + */ + if (dp->d_ino == child) { + if (child == parent) + *isdot = 1; + else if (dp->d_namlen == 2 && + dp->d_name[0] == '.' && dp->d_name[1] == '.') + *isdot = 1; + *mode = DTTOIF(dp->d_type); + return (1); + } + if (debug) + printf("ino %d doesn't match dirent ino %d in parent %d\n", + child, dp->d_ino, parent); + return (0); +} + +#define VISIT_INDIR 0x0001 +#define VISIT_EXT 0x0002 +#define VISIT_ROOT 0x0004 /* Operation came via root & valid pointers. */ + +/* + * Read an indirect level which may or may not be linked into an inode. 
+ */ +static void +indir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags, + ino_visitor visitor, int flags) +{ + ufs2_daddr_t *bap2; + ufs1_daddr_t *bap1; + ufs_lbn_t lbnadd; + ufs2_daddr_t nblk; + ufs_lbn_t nlbn; + int level; + int i; + + /* + * Don't visit indirect blocks with contents we can't trust. This + * should only happen when indir_visit() is called to complete a + * truncate that never finished and not when a pointer is found via + * an inode. + */ + if (blk == 0) + return; + level = lbn_level(lbn); + if (level == -1) + errx(1, "Invalid level for lbn %jd", lbn); + if ((flags & VISIT_ROOT) == 0 && blk_isindir(blk, ino, lbn) == 0) { + if (debug) + printf("blk %jd ino %d lbn %jd(%d) is not indir.\n", + blk, ino, lbn, level); + goto out; + } + lbnadd = 1; + for (i = level; i > 0; i--) + lbnadd *= NINDIR(fs); + bap1 = (void *)dblk_read(blk, fs->fs_bsize); + bap2 = (void *)bap1; + for (i = 0; i < NINDIR(fs); i++) { + if (fs->fs_magic == FS_UFS1_MAGIC) + nblk = *bap1++; + else + nblk = *bap2++; + if (nblk == 0) + continue; + if (level == 0) { + nlbn = -lbn + i * lbnadd; + (*frags) += fs->fs_frag; + visitor(ino, nlbn, nblk, fs->fs_frag); + } else { + nlbn = (lbn + 1) - (i * lbnadd); + indir_visit(ino, nlbn, nblk, frags, visitor, flags); + } + } +out: + if (flags & VISIT_INDIR) { + (*frags) += fs->fs_frag; + visitor(ino, lbn, blk, fs->fs_frag); + } +} + +/* + * Visit each block in an inode as specified by 'flags' and call a + * callback function. The callback may inspect or free blocks. The + * count of frags found according to the size in the file is returned. + * This is not valid for sparse files but may be used to determine + * the correct di_blocks for a file. 
+ */ +static uint64_t +ino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags) +{ + ufs_lbn_t nextlbn; + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + uint64_t size; + uint64_t fragcnt; + int mode; + int frags; + int i; + + size = DIP(ip, di_size); + mode = DIP(ip, di_mode) & IFMT; + fragcnt = 0; + if ((flags & VISIT_EXT) && + fs->fs_magic == FS_UFS2_MAGIC && ip->dp2.di_extsize) { + for (i = 0; i < NXADDR; i++) { + if (ip->dp2.di_extb[i] == 0) + continue; + frags = sblksize(fs, ip->dp2.di_extsize, i); + frags = numfrags(fs, frags); + fragcnt += frags; + visitor(ino, -1 - i, ip->dp2.di_extb[i], frags); + } + } + /* Skip datablocks for short links and devices. */ + if (mode == IFBLK || mode == IFCHR || + (mode == IFLNK && size < fs->fs_maxsymlinklen)) + return (fragcnt); + for (i = 0; i < NDADDR; i++) { + if (DIP(ip, di_db[i]) == 0) + continue; + frags = sblksize(fs, size, i); + frags = numfrags(fs, frags); + fragcnt += frags; + visitor(ino, i, DIP(ip, di_db[i]), frags); + } + /* + * We know the following indirects are real as we're following + * real pointers to them. + */ + flags |= VISIT_ROOT; + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++, + lbn = nextlbn) { + nextlbn = lbn + tmpval; + tmpval *= NINDIR(fs); + if (DIP(ip, di_ib[i]) == 0) + continue; + indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor, + flags); + } + return (fragcnt); +} + +/* + * Null visitor function used when we just want to count blocks and + * record the lbn. + */ +ufs_lbn_t visitlbn; +static void +null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + if (lbn > 0) + visitlbn = lbn; +} + +/* + * Recalculate di_blocks when we discover that a block allocation or + * free was not successfully completed. The kernel does not roll this back + * because it would be too expensive to compute which indirects were + * reachable at the time the inode was written. 
+ */ +static void +ino_adjblks(struct suj_ino *sino) +{ + union dinode *ip; + uint64_t blocks; + uint64_t frags; + off_t isize; + off_t size; + ino_t ino; + + ino = sino->si_ino; + ip = ino_read(ino); + /* No need to adjust zero'd inodes. */ + if (DIP(ip, di_mode) == 0) + return; + /* + * Visit all blocks and count them as well as recording the last + * valid lbn in the file. If the file size doesn't agree with the + * last lbn we need to truncate to fix it. Otherwise just adjust + * the blocks count. + */ + visitlbn = 0; + frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT); + blocks = fsbtodb(fs, frags); + /* + * We assume the size and direct block list is kept coherent by + * softdep. For files that have extended into indirects we truncate + * to the size in the inode or the maximum size permitted by + * populated indirects. + */ + if (visitlbn >= NDADDR) { + isize = DIP(ip, di_size); + size = lblktosize(fs, visitlbn + 1); + if (isize > size) + isize = size; + /* Always truncate to free any unpopulated indirects. */ + ino_trunc(sino->si_ino, isize); + return; + } + if (blocks == DIP(ip, di_blocks)) + return; + if (debug) + printf("ino %d adjusting block count from %jd to %jd\n", + ino, DIP(ip, di_blocks), blocks); + DIP_SET(ip, di_blocks, blocks); + ino_dirty(ino); +} + +static void +blk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + int mask; + + mask = blk_freemask(blk, ino, lbn, frags); + if (debug) + printf("blk %jd freemask 0x%X\n", blk, mask); + blk_free(blk, mask, frags); +} + +/* + * Free a block or tree of blocks that was previously rooted in ino at + * the given lbn. If the lbn is an indirect all children are freed + * recursively. 
+ */ +static void +blk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow) +{ + uint64_t resid; + int mask; + + mask = blk_freemask(blk, ino, lbn, frags); + if (debug) + printf("blk %jd freemask 0x%X\n", blk, mask); + resid = 0; + if (lbn <= -NDADDR && follow && mask == 0) + indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR); + else + blk_free(blk, mask, frags); +} + +static void +ino_setskip(struct suj_ino *sino, ino_t parent) +{ + int isdot; + int mode; + + if (ino_isat(sino->si_ino, DOTDOT_OFFSET, parent, &mode, &isdot)) + sino->si_skipparent = 1; +} + +/* + * Free the children of a directory when the directory is discarded. + */ +static void +ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + struct suj_ino *sino; + struct suj_rec *srec; + struct jrefrec *rrec; + struct direct *dp; + off_t diroff; + uint8_t *block; + int skipparent; + int isparent; + int dpoff; + int size; + + sino = ino_lookup(ino, 0); + if (sino) + skipparent = sino->si_skipparent; + else + skipparent = 0; + size = lfragtosize(fs, frags); + block = dblk_read(blk, size); + dp = (struct direct *)&block[0]; + for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) { + dp = (struct direct *)&block[dpoff]; + if (dp->d_ino == 0 || dp->d_ino == WINO) + continue; + if (dp->d_namlen == 1 && dp->d_name[0] == '.') + continue; + isparent = dp->d_namlen == 2 && dp->d_name[0] == '.' && + dp->d_name[1] == '.'; + if (isparent && skipparent == 1) + continue; + if (debug) + printf("Directory %d removing ino %d name %s\n", + ino, dp->d_ino, dp->d_name); + /* + * Lookup this inode to see if we have a record for it. + * If not, we've already adjusted it assuming this path + * was valid and we have to adjust once more. 
+ */ + sino = ino_lookup(dp->d_ino, 0); + if (sino == NULL || sino->si_hasrecs == 0) { + ino_decr(ino); + continue; + } + /* + * Use ino_adjust() so if we lose the last non-dot reference + * to a directory it can be discarded. + */ + if (sino->si_linkadj) { + sino->si_nlink--; + if (isparent) + sino->si_dotlinks--; + ino_adjust(sino); + } + /* + * Tell any child directories we've already removed their + * parent. Don't try to adjust our link down again. + */ + if (isparent == 0) + ino_setskip(sino, ino); + /* + * If we haven't yet processed this inode we need to make + * sure we will successfully discover the lost path. If not + * use nlinkadj to remember. + */ + diroff = lblktosize(fs, lbn) + dpoff; + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + if (rrec->jr_parent == ino && + rrec->jr_diroff == diroff) + break; + } + if (srec == NULL) + sino->si_nlinkadj++; + } +} + +/* + * Reclaim an inode, freeing all blocks and decrementing all children's + * link counts. Free the inode back to the cg. + */ +static void +ino_reclaim(union dinode *ip, ino_t ino, int mode) +{ + uint32_t gen; + + if (ino == ROOTINO) + errx(1, "Attempting to free ROOTINO"); + if (debug) + printf("Truncating and freeing ino %d, nlink %d, mode %o\n", + ino, DIP(ip, di_nlink), DIP(ip, di_mode)); + + /* We are freeing an inode or directory. */ + if ((DIP(ip, di_mode) & IFMT) == IFDIR) + ino_visit(ip, ino, ino_free_children, 0); + DIP_SET(ip, di_nlink, 0); + ino_visit(ip, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR); + /* Here we have to clear the inode and release any blocks it holds. */ + gen = DIP(ip, di_gen); + if (fs->fs_magic == FS_UFS1_MAGIC) + bzero(ip, sizeof(struct ufs1_dinode)); + else + bzero(ip, sizeof(struct ufs2_dinode)); + DIP_SET(ip, di_gen, gen); + ino_dirty(ino); + ino_free(ino, mode); + return; +} + +/* + * Adjust an inode's link count down by one when a directory goes away. 
+ */ +static void +ino_decr(ino_t ino) +{ + union dinode *ip; + int reqlink; + int nlink; + int mode; + + ip = ino_read(ino); + nlink = DIP(ip, di_nlink); + mode = DIP(ip, di_mode); + if (nlink < 1) + errx(1, "Inode %d link count %d invalid", ino, nlink); + if (mode == 0) + errx(1, "Inode %d has a link of %d with 0 mode.", ino, nlink); + nlink--; + if ((mode & IFMT) == IFDIR) + reqlink = 2; + else + reqlink = 1; + if (nlink < reqlink) { + if (debug) + printf("ino %d not enough links to live %d < %d\n", + ino, nlink, reqlink); + ino_reclaim(ip, ino, mode); + return; + } + DIP_SET(ip, di_nlink, nlink); + ino_dirty(ino); +} + +/* + * Adjust the inode link count to 'nlink'. If the count reaches zero + * free it. + */ +static void +ino_adjust(struct suj_ino *sino) +{ + struct jrefrec *rrec; + struct suj_rec *srec; + struct suj_ino *stmp; + union dinode *ip; + nlink_t nlink; + int reqlink; + int mode; + ino_t ino; + + nlink = sino->si_nlink; + ino = sino->si_ino; + /* + * If it's a directory with no real names pointing to it go ahead + * and truncate it. This will free any children. + */ + if ((sino->si_mode & IFMT) == IFDIR && + nlink - sino->si_dotlinks == 0) { + sino->si_nlink = nlink = 0; + /* + * Mark any .. links so they know not to free this inode + * when they are removed. 
+ */ + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + if (rrec->jr_diroff == DOTDOT_OFFSET) { + stmp = ino_lookup(rrec->jr_parent, 0); + if (stmp) + ino_setskip(stmp, ino); + } + } + } + ip = ino_read(ino); + mode = DIP(ip, di_mode) & IFMT; + if (nlink > LINK_MAX) + errx(1, + "ino %d nlink manipulation error, new link %d, old link %d", + ino, nlink, DIP(ip, di_nlink)); + if (debug) + printf("Adjusting ino %d, nlink %d, old link %d lastmode %o\n", + ino, nlink, DIP(ip, di_nlink), sino->si_mode); + if (mode == 0) { + if (debug) + printf("ino %d, zero inode freeing bitmap\n", ino); + ino_free(ino, sino->si_mode); + return; + } + /* XXX Should be an assert? */ + if (mode != sino->si_mode && debug) + printf("ino %d, mode %o != %o\n", ino, mode, sino->si_mode); + if ((mode & IFMT) == IFDIR) + reqlink = 2; + else + reqlink = 1; + /* If the inode doesn't have enough links to live, free it. */ + if (nlink < reqlink) { + if (debug) + printf("ino %d not enough links to live %d < %d\n", + ino, nlink, reqlink); + ino_reclaim(ip, ino, mode); + return; + } + /* If required write the updated link count. */ + if (DIP(ip, di_nlink) == nlink) { + if (debug) + printf("ino %d, link matches, skipping.\n", ino); + return; + } + DIP_SET(ip, di_nlink, nlink); + ino_dirty(ino); +} + +/* + * Truncate some or all blocks in an indirect, freeing any that are required + * and zeroing the indirect. 
+ */ +static void +indir_trunc(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, ufs_lbn_t lastlbn) +{ + ufs2_daddr_t *bap2; + ufs1_daddr_t *bap1; + ufs_lbn_t lbnadd; + ufs2_daddr_t nblk; + ufs_lbn_t next; + ufs_lbn_t nlbn; + int dirty; + int level; + int i; + + if (blk == 0) + return; + dirty = 0; + level = lbn_level(lbn); + if (level == -1) + errx(1, "Invalid level for lbn %jd", lbn); + lbnadd = 1; + for (i = level; i > 0; i--) + lbnadd *= NINDIR(fs); + bap1 = (void *)dblk_read(blk, fs->fs_bsize); + bap2 = (void *)bap1; + for (i = 0; i < NINDIR(fs); i++) { + if (fs->fs_magic == FS_UFS1_MAGIC) + nblk = *bap1++; + else + nblk = *bap2++; + if (nblk == 0) + continue; + if (level != 0) { + nlbn = (lbn + 1) - (i * lbnadd); + /* + * Calculate the lbn of the next indirect to + * determine if any of this indirect must be + * reclaimed. + */ + next = -(lbn + level) + ((i+1) * lbnadd); + if (next <= lastlbn) + continue; + indir_trunc(ino, nlbn, nblk, lastlbn); + /* If all of this indirect was reclaimed, free it. */ + nlbn = next - lbnadd; + if (nlbn < lastlbn) + continue; + } else { + nlbn = -lbn + i * lbnadd; + if (nlbn < lastlbn) + continue; + } + dirty = 1; + blk_free(nblk, 0, fs->fs_frag); + if (fs->fs_magic == FS_UFS1_MAGIC) + *(bap1 - 1) = 0; + else + *(bap2 - 1) = 0; + } + if (dirty) + dblk_dirty(blk); +} + +/* + * Truncate an inode to the minimum of the given size or the last populated + * block after any over size have been discarded. The kernel would allocate + * the last block in the file but fsck does not and neither do we. This + * code never extends files, only shrinks them. 
+ */ +static void +ino_trunc(ino_t ino, off_t size) +{ + union dinode *ip; + ufs2_daddr_t bn; + uint64_t totalfrags; + ufs_lbn_t nextlbn; + ufs_lbn_t lastlbn; + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + ufs_lbn_t i; + int frags; + off_t cursize; + off_t off; + int mode; + + ip = ino_read(ino); + mode = DIP(ip, di_mode) & IFMT; + cursize = DIP(ip, di_size); + if (debug) + printf("Truncating ino %d, mode %o to size %jd from size %jd\n", + ino, mode, size, cursize); + + /* Skip datablocks for short links and devices. */ + if (mode == 0 || mode == IFBLK || mode == IFCHR || + (mode == IFLNK && cursize < fs->fs_maxsymlinklen)) + return; + /* Don't extend. */ + if (size > cursize) + size = cursize; + lastlbn = lblkno(fs, blkroundup(fs, size)); + for (i = lastlbn; i < NDADDR; i++) { + if (DIP(ip, di_db[i]) == 0) + continue; + frags = sblksize(fs, cursize, i); + frags = numfrags(fs, frags); + blk_free(DIP(ip, di_db[i]), 0, frags); + DIP_SET(ip, di_db[i], 0); + } + /* + * Follow indirect blocks, freeing anything required. + */ + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++, + lbn = nextlbn) { + nextlbn = lbn + tmpval; + tmpval *= NINDIR(fs); + /* If we're not freeing any in this indirect range skip it. */ + if (lastlbn >= nextlbn) + continue; + if (DIP(ip, di_ib[i]) == 0) + continue; + indir_trunc(ino, -lbn - i, DIP(ip, di_ib[i]), lastlbn); + /* If we freed everything in this indirect free the indir. */ + if (lastlbn > lbn) + continue; + blk_free(DIP(ip, di_ib[i]), 0, frags); + DIP_SET(ip, di_ib[i], 0); + } + ino_dirty(ino); + /* + * Now that we've freed any whole blocks that exceed the desired + * truncation size, figure out how many blocks remain and what the + * last populated lbn is. We will set the size to this last lbn + * rather than worrying about allocating the final lbn as the kernel + * would've done. This is consistent with normal fsck behavior. 
+ */ + visitlbn = 0; + totalfrags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT); + if (size > lblktosize(fs, visitlbn + 1)) + size = lblktosize(fs, visitlbn + 1); + /* + * If we're truncating direct blocks we have to adjust frags + * accordingly. + */ + if (visitlbn < NDADDR && totalfrags) { + long oldspace, newspace; + + bn = DIP(ip, di_db[visitlbn]); + if (bn == 0) + errx(1, "Bad blk at ino %d lbn %jd\n", ino, visitlbn); + oldspace = sblksize(fs, cursize, visitlbn); + newspace = sblksize(fs, size, visitlbn); + if (oldspace != newspace) { + bn += numfrags(fs, newspace); + frags = numfrags(fs, oldspace - newspace); + blk_free(bn, 0, frags); + totalfrags -= frags; + } + } + DIP_SET(ip, di_blocks, fsbtodb(fs, totalfrags)); + DIP_SET(ip, di_size, size); + /* + * If we've truncated into the middle of a block or frag we have + * to zero it here. Otherwise the file could extend into + * uninitialized space later. + */ + off = blkoff(fs, size); + if (off) { + uint8_t *buf; + long clrsize; + + bn = ino_blkatoff(ip, ino, visitlbn, &frags); + if (bn == 0) + errx(1, "Block missing from ino %d at lbn %jd\n", + ino, visitlbn); + clrsize = frags * fs->fs_fsize; + buf = dblk_read(bn, clrsize); + clrsize -= off; + buf += off; + bzero(buf, clrsize); + dblk_dirty(bn); + } + return; +} + +/* + * Process records available for one inode and determine whether the + * link count is correct or needs adjusting. 
+ */ +static void +ino_check(struct suj_ino *sino) +{ + struct suj_rec *srec; + struct jrefrec *rrec; + nlink_t dotlinks; + int newlinks; + int removes; + int nlink; + ino_t ino; + int isdot; + int isat; + int mode; + + if (sino->si_hasrecs == 0) + return; + ino = sino->si_ino; + rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec; + nlink = rrec->jr_nlink; + newlinks = 0; + dotlinks = 0; + removes = sino->si_nlinkadj; + TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { + rrec = (struct jrefrec *)srec->sr_rec; + isat = ino_isat(rrec->jr_parent, rrec->jr_diroff, + rrec->jr_ino, &mode, &isdot); + if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT)) + errx(1, "Inode mode/directory type mismatch %o != %o", + mode, rrec->jr_mode); + if (debug) + printf("jrefrec: op %d ino %d, nlink %d, parent %d, " + "diroff %jd, mode %o, isat %d, isdot %d\n", + rrec->jr_op, rrec->jr_ino, rrec->jr_nlink, + rrec->jr_parent, rrec->jr_diroff, rrec->jr_mode, + isat, isdot); + mode = rrec->jr_mode & IFMT; + if (rrec->jr_op == JOP_REMREF) + removes++; + newlinks += isat; + if (isdot) + dotlinks += isat; + } + /* + * The number of links that remain are the starting link count + * subtracted by the total number of removes with the total + * links discovered back in. An incomplete remove thus + * makes no change to the link count but an add increases + * by one. + */ + if (debug) + printf("ino %d nlink %d newlinks %d removes %d dotlinks %d\n", + ino, nlink, newlinks, removes, dotlinks); + nlink += newlinks; + nlink -= removes; + sino->si_linkadj = 1; + sino->si_nlink = nlink; + sino->si_dotlinks = dotlinks; + sino->si_mode = mode; + ino_adjust(sino); +} + +/* + * Process records available for one block and determine whether it is + * still allocated and whether the owning inode needs to be updated or + * a free completed. 
+ */ +static void +blk_check(struct suj_blk *sblk) +{ + struct suj_rec *srec; + struct jblkrec *brec; + struct suj_ino *sino; + ufs2_daddr_t blk; + int mask; + int frags; + int isat; + + /* + * Each suj_blk actually contains records for any fragments in that + * block. As a result we must evaluate each record individually. + */ + sino = NULL; + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + brec = (struct jblkrec *)srec->sr_rec; + frags = brec->jb_frags; + blk = brec->jb_blkno + brec->jb_oldfrags; + isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags); + if (sino == NULL || sino->si_ino != brec->jb_ino) { + sino = ino_lookup(brec->jb_ino, 1); + sino->si_blkadj = 1; + } + if (debug) + printf("op %d blk %jd ino %d lbn %jd frags %d isat %d (%d)\n", + brec->jb_op, blk, brec->jb_ino, brec->jb_lbn, + brec->jb_frags, isat, frags); + /* + * If we found the block at this address we still have to + * determine if we need to free the tail end that was + * added by adding contiguous fragments from the same block. + */ + if (isat == 1) { + if (frags == brec->jb_frags) + continue; + mask = blk_freemask(blk, brec->jb_ino, brec->jb_lbn, + brec->jb_frags); + mask >>= frags; + blk += frags; + frags = brec->jb_frags - frags; + blk_free(blk, mask, frags); + continue; + } + /* + * The block wasn't found, attempt to free it. It won't be + * freed if it was actually reallocated. If this was an + * allocation we don't want to follow indirects as they + * may not be written yet. Any children of the indirect will + * have their own records. If it's a free we need to + * recursively free children. + */ + blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags, + brec->jb_op == JOP_FREEBLK); + } +} + +/* + * Walk the list of inode records for this cg and resolve moved and duplicate + * inode references now that we have a complete picture. 
+ */ +static void +cg_build(struct suj_cg *sc) +{ + struct suj_ino *sino; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) + ino_build(sino); +} + +/* + * Handle inodes requiring truncation. This must be done prior to + * looking up any inodes in directories. + */ +static void +cg_trunc(struct suj_cg *sc) +{ + struct suj_ino *sino; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) + if (sino->si_trunc) { + ino_trunc(sino->si_ino, + sino->si_trunc->jt_size); + sino->si_trunc = NULL; + } +} + +/* + * Free any partially allocated blocks and then resolve inode block + * counts. + */ +static void +cg_check_blk(struct suj_cg *sc) +{ + struct suj_ino *sino; + struct suj_blk *sblk; + int i; + + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next) + blk_check(sblk); + /* + * Now that we've freed blocks which are not referenced we + * make a second pass over all inodes to adjust their block + * counts. + */ + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) + if (sino->si_blkadj) + ino_adjblks(sino); +} + +/* + * Walk the list of inode records for this cg, recovering any + * changes which were not complete at the time of crash. + */ +static void +cg_check_ino(struct suj_cg *sc) +{ + struct suj_ino *sino; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) + ino_check(sino); +} + +/* + * Write a potentially dirty cg. Recalculate the summary information and + * update the superblock summary. + */ +static void +cg_write(struct suj_cg *sc) +{ + ufs1_daddr_t fragno, cgbno, maxbno; + u_int8_t *blksfree; + struct cg *cgp; + int blk; + int i; + + if (sc->sc_dirty == 0) + return; + /* + * Fix the frag and cluster summary. 
+ */ + cgp = sc->sc_cgp; + cgp->cg_cs.cs_nbfree = 0; + cgp->cg_cs.cs_nffree = 0; + bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum)); + maxbno = fragstoblks(fs, fs->fs_fpg); + if (fs->fs_contigsumsize > 0) { + for (i = 1; i <= fs->fs_contigsumsize; i++) + cg_clustersum(cgp)[i] = 0; + bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT)); + } + blksfree = cg_blksfree(cgp); + for (cgbno = 0; cgbno < maxbno; cgbno++) { + if (ffs_isfreeblock(fs, blksfree, cgbno)) + continue; + if (ffs_isblock(fs, blksfree, cgbno)) { + ffs_clusteracct(fs, cgp, cgbno, 1); + cgp->cg_cs.cs_nbfree++; + continue; + } + fragno = blkstofrags(fs, cgbno); + blk = blkmap(fs, blksfree, fragno); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); + for (i = 0; i < fs->fs_frag; i++) + if (isset(blksfree, fragno + i)) + cgp->cg_cs.cs_nffree++; + } + /* + * Update the superblock cg summary from our now correct values + * before writing the block. + */ + fs->fs_cs(fs, sc->sc_cgx) = cgp->cg_cs; + if (bwrite(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, + fs->fs_bsize) == -1) + err(1, "Unable to write cylinder group %d", sc->sc_cgx); +} + +/* + * Write out any modified inodes. + */ +static void +cg_write_inos(struct suj_cg *sc) +{ + struct ino_blk *iblk; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next) + if (iblk->ib_dirty) + iblk_write(iblk); +} + +static void +cg_apply(void (*apply)(struct suj_cg *)) +{ + struct suj_cg *scg; + int i; + + for (i = 0; i < SUJ_HASHSIZE; i++) + LIST_FOREACH(scg, &cghash[i], sc_next) + apply(scg); +} + +/* + * Process the unlinked but referenced file list. Freeing all inodes. + */ +static void +ino_unlinked(void) +{ + union dinode *ip; + uint16_t mode; + ino_t inon; + ino_t ino; + + ino = fs->fs_sujfree; + fs->fs_sujfree = 0; + while (ino != 0) { + ip = ino_read(ino); + mode = DIP(ip, di_mode) & IFMT; + inon = DIP(ip, di_freelink); + DIP_SET(ip, di_freelink, 0); + /* + * XXX Should this be an errx? 
+ */ + if (DIP(ip, di_nlink) == 0) { + if (debug) + printf("Freeing unlinked ino %d mode %o\n", + ino, mode); + ino_reclaim(ip, ino, mode); + } else if (debug) + printf("Skipping ino %d mode %o with link %d\n", + ino, mode, DIP(ip, di_nlink)); + ino = inon; + } +} + +/* + * Append a new record to the list of records requiring processing. + */ +static void +ino_append(union jrec *rec) +{ + struct jrefrec *refrec; + struct jmvrec *mvrec; + struct suj_ino *sino; + struct suj_rec *srec; + + mvrec = &rec->rec_jmvrec; + refrec = &rec->rec_jrefrec; + if (debug && mvrec->jm_op == JOP_MVREF) + printf("ino move: ino %d, parent %d, diroff %jd, oldoff %jd\n", + mvrec->jm_ino, mvrec->jm_parent, mvrec->jm_newoff, + mvrec->jm_oldoff); + else if (debug && + (refrec->jr_op == JOP_ADDREF || refrec->jr_op == JOP_REMREF)) + printf("ino ref: op %d, ino %d, nlink %d, " + "parent %d, diroff %jd\n", + refrec->jr_op, refrec->jr_ino, refrec->jr_nlink, + refrec->jr_parent, refrec->jr_diroff); + /* + * Lookup the ino and clear truncate if one is found. Partial + * truncates are always done synchronously so if we discover + * an operation that requires a lock the truncation has completed + * and can be discarded. + */ + sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1); + sino->si_trunc = NULL; + sino->si_hasrecs = 1; + srec = errmalloc(sizeof(*srec)); + srec->sr_rec = rec; + TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next); +} + +/* + * Add a reference adjustment to the sino list and eliminate dups. The + * primary loop in ino_build_ref() checks for dups but new ones may be + * created as a result of offset adjustments. + */ +static void +ino_add_ref(struct suj_ino *sino, struct suj_rec *srec) +{ + struct jrefrec *refrec; + struct suj_rec *srn; + struct jrefrec *rrn; + + refrec = (struct jrefrec *)srec->sr_rec; + /* + * We walk backwards so that the oldest link count is preserved. If + * an add record conflicts with a remove keep the remove. 
Redundant + * removes are eliminated in ino_build_ref. Otherwise we keep the + * oldest record at a given location. + */ + for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn; + srn = TAILQ_PREV(srn, srechd, sr_next)) { + rrn = (struct jrefrec *)srn->sr_rec; + if (rrn->jr_parent != refrec->jr_parent || + rrn->jr_diroff != refrec->jr_diroff) + continue; + if (rrn->jr_op == JOP_REMREF || refrec->jr_op == JOP_ADDREF) { + rrn->jr_mode = refrec->jr_mode; + return; + } + /* + * Adding a remove. + * + * Replace the record in place with the old nlink in case + * we replace the head of the list. Abandon srec as a dup. + */ + refrec->jr_nlink = rrn->jr_nlink; + srn->sr_rec = srec->sr_rec; + return; + } + TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next); +} + +/* + * Create a duplicate of a reference at a previous location. + */ +static void +ino_dup_ref(struct suj_ino *sino, struct jrefrec *refrec, off_t diroff) +{ + struct jrefrec *rrn; + struct suj_rec *srn; + + rrn = errmalloc(sizeof(*refrec)); + *rrn = *refrec; + rrn->jr_op = JOP_ADDREF; + rrn->jr_diroff = diroff; + srn = errmalloc(sizeof(*srn)); + srn->sr_rec = (union jrec *)rrn; + ino_add_ref(sino, srn); +} + +/* + * Add a reference to the list at all known locations. We follow the offset + * changes for a single instance and create duplicate add refs at each so + * that we can tolerate any version of the directory block. Eliminate + * removes which collide with adds that are seen in the journal. They should + * not adjust the link count down. + */ +static void +ino_build_ref(struct suj_ino *sino, struct suj_rec *srec) +{ + struct jrefrec *refrec; + struct jmvrec *mvrec; + struct suj_rec *srp; + struct suj_rec *srn; + struct jrefrec *rrn; + off_t diroff; + + refrec = (struct jrefrec *)srec->sr_rec; + /* + * Search for a mvrec that matches this offset. Whether it's an add + * or a remove we can delete the mvref after creating a dup record in + * the old location. 
+ */ + if (!TAILQ_EMPTY(&sino->si_movs)) { + diroff = refrec->jr_diroff; + for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; srn = srp) { + srp = TAILQ_PREV(srn, srechd, sr_next); + mvrec = (struct jmvrec *)srn->sr_rec; + if (mvrec->jm_parent != refrec->jr_parent || + mvrec->jm_newoff != diroff) + continue; + diroff = mvrec->jm_oldoff; + TAILQ_REMOVE(&sino->si_movs, srn, sr_next); + ino_dup_ref(sino, refrec, diroff); + } + } + /* + * If a remove wasn't eliminated by an earlier add just append it to + * the list. + */ + if (refrec->jr_op == JOP_REMREF) { + ino_add_ref(sino, srec); + return; + } + /* + * Walk the list of records waiting to be added to the list. We + * must check for moves that apply to our current offset and remove + * them from the list. Remove any duplicates to eliminate removes + * with corresponding adds. + */ + TAILQ_FOREACH_SAFE(srn, &sino->si_newrecs, sr_next, srp) { + switch (srn->sr_rec->rec_jrefrec.jr_op) { + case JOP_ADDREF: + /* + * This should actually be an error we should + * have a remove for every add journaled. + */ + rrn = (struct jrefrec *)srn->sr_rec; + if (rrn->jr_parent != refrec->jr_parent || + rrn->jr_diroff != refrec->jr_diroff) + break; + TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); + break; + case JOP_REMREF: + /* + * Once we remove the current iteration of the + * record at this address we're done. + */ + rrn = (struct jrefrec *)srn->sr_rec; + if (rrn->jr_parent != refrec->jr_parent || + rrn->jr_diroff != refrec->jr_diroff) + break; + TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); + ino_add_ref(sino, srec); + return; + case JOP_MVREF: + /* + * Update our diroff based on any moves that match + * and remove the move. 
+ */ + mvrec = (struct jmvrec *)srn->sr_rec; + if (mvrec->jm_parent != refrec->jr_parent || + mvrec->jm_oldoff != refrec->jr_diroff) + break; + ino_dup_ref(sino, refrec, mvrec->jm_oldoff); + refrec->jr_diroff = mvrec->jm_newoff; + TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); + break; + default: + errx(1, "ino_build_ref: Unknown op %d", + srn->sr_rec->rec_jrefrec.jr_op); + } + } + ino_add_ref(sino, srec); +} + +/* + * Walk the list of new records and add them in-order resolving any + * dups and adjusted offsets. + */ +static void +ino_build(struct suj_ino *sino) +{ + struct suj_rec *srec; + + while ((srec = TAILQ_FIRST(&sino->si_newrecs)) != NULL) { + TAILQ_REMOVE(&sino->si_newrecs, srec, sr_next); + switch (srec->sr_rec->rec_jrefrec.jr_op) { + case JOP_ADDREF: + case JOP_REMREF: + ino_build_ref(sino, srec); + break; + case JOP_MVREF: + /* + * Add this mvrec to the queue of pending mvs. + */ + TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next); + break; + default: + errx(1, "ino_build: Unknown op %d", + srec->sr_rec->rec_jrefrec.jr_op); + } + } + if (TAILQ_EMPTY(&sino->si_recs)) + sino->si_hasrecs = 0; +} + +/* + * Modify journal records so they refer to the base block number + * and a start and end frag range. This is to facilitate the discovery + * of overlapping fragment allocations. + */ +static void +blk_build(struct jblkrec *blkrec) +{ + struct suj_rec *srec; + struct suj_blk *sblk; + struct jblkrec *blkrn; + struct suj_ino *sino; + ufs2_daddr_t blk; + off_t foff; + int frag; + + if (debug) + printf("blk_build: op %d blkno %jd frags %d oldfrags %d " + "ino %d lbn %jd\n", + blkrec->jb_op, blkrec->jb_blkno, blkrec->jb_frags, + blkrec->jb_oldfrags, blkrec->jb_ino, blkrec->jb_lbn); + + /* + * Look up the inode and clear the truncate if any lbns after the + * truncate lbn are freed or allocated. 
+ */ + sino = ino_lookup(blkrec->jb_ino, 0); + if (sino && sino->si_trunc) { + foff = lblktosize(fs, blkrec->jb_lbn); + foff += lfragtosize(fs, blkrec->jb_frags); + if (foff > sino->si_trunc->jt_size) + sino->si_trunc = NULL; + } + blk = blknum(fs, blkrec->jb_blkno); + frag = fragnum(fs, blkrec->jb_blkno); + sblk = blk_lookup(blk, 1); + /* + * Rewrite the record using oldfrags to indicate the offset into + * the block. Leave jb_frags as the actual allocated count. + */ + blkrec->jb_blkno -= frag; + blkrec->jb_oldfrags = frag; + if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag) + errx(1, "Invalid fragment count %d oldfrags %d", + blkrec->jb_frags, frag); + /* + * Detect dups. If we detect a dup we always discard the oldest + * record as it is superseded by the new record. This speeds up + * later stages but also eliminates free records which are used + * to indicate that the contents of indirects can be trusted. + */ + TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { + blkrn = (struct jblkrec *)srec->sr_rec; + if (blkrn->jb_ino != blkrec->jb_ino || + blkrn->jb_lbn != blkrec->jb_lbn || + blkrn->jb_blkno != blkrec->jb_blkno || + blkrn->jb_frags != blkrec->jb_frags || + blkrn->jb_oldfrags != blkrec->jb_oldfrags) + continue; + if (debug) + printf("Removed dup.\n"); + /* Discard the free which is a dup with an alloc. */ + if (blkrec->jb_op == JOP_FREEBLK) + return; + TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next); + free(srec); + break; + } + srec = errmalloc(sizeof(*srec)); + srec->sr_rec = (union jrec *)blkrec; + TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next); +} + +static void +ino_build_trunc(struct jtrncrec *rec) +{ + struct suj_ino *sino; + + if (debug) + printf("ino_build_trunc: ino %d, size %jd\n", + rec->jt_ino, rec->jt_size); + sino = ino_lookup(rec->jt_ino, 1); + sino->si_trunc = rec; +} + +/* + * Build up tables of the operations we need to recover. 
+ */ +static void +suj_build(void) +{ + struct suj_seg *seg; + union jrec *rec; + int off; + int i; + + TAILQ_FOREACH(seg, &allsegs, ss_next) { + if (debug) + printf("seg %jd has %d records, oldseq %jd.\n", + seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt, + seg->ss_rec.jsr_oldest); + off = 0; + rec = (union jrec *)seg->ss_blk; + for (i = 0; i < seg->ss_rec.jsr_cnt; off += JREC_SIZE, rec++) { + /* skip the segrec. */ + if ((off % DEV_BSIZE) == 0) + continue; + switch (rec->rec_jrefrec.jr_op) { + case JOP_ADDREF: + case JOP_REMREF: + case JOP_MVREF: + ino_append(rec); + break; + case JOP_NEWBLK: + case JOP_FREEBLK: + blk_build((struct jblkrec *)rec); + break; + case JOP_TRUNC: + ino_build_trunc((struct jtrncrec *)rec); + break; + default: + errx(1, "Unknown journal operation %d (%d)", + rec->rec_jrefrec.jr_op, off); + } + i++; + } + } +} + +/* + * Prune the journal segments to those we care about based on the + * oldest sequence in the newest segment. Order the segment list + * based on sequence number. + */ +static void +suj_prune(void) +{ + struct suj_seg *seg; + struct suj_seg *segn; + uint64_t newseq; + int discard; + + if (debug) + printf("Pruning up to %jd\n", oldseq); + /* First free the expired segments. */ + TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { + if (seg->ss_rec.jsr_seq >= oldseq) + continue; + TAILQ_REMOVE(&allsegs, seg, ss_next); + free(seg->ss_blk); + free(seg); + } + /* Next ensure that segments are ordered properly. 
*/ + seg = TAILQ_FIRST(&allsegs); + if (seg == NULL) { + if (debug) + printf("Empty journal\n"); + return; + } + newseq = seg->ss_rec.jsr_seq; + for (;;) { + seg = TAILQ_LAST(&allsegs, seghd); + if (seg->ss_rec.jsr_seq >= newseq) + break; + TAILQ_REMOVE(&allsegs, seg, ss_next); + TAILQ_INSERT_HEAD(&allsegs, seg, ss_next); + newseq = seg->ss_rec.jsr_seq; + + } + if (newseq != oldseq) + errx(1, "Journal file sequence mismatch %jd != %jd", + newseq, oldseq); + /* + * The kernel may asynchronously write segments which can create + * gaps in the sequence space. Throw away any segments after the + * gap as the kernel guarantees only those that are contiguously + * reachable are marked as completed. + */ + discard = 0; + TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { + if (!discard && newseq++ == seg->ss_rec.jsr_seq) { + jrecs += seg->ss_rec.jsr_cnt; + jbytes += seg->ss_rec.jsr_blocks * DEV_BSIZE; + continue; + } + discard = 1; + if (debug) + printf("Journal order mismatch %jd != %jd pruning\n", + newseq-1, seg->ss_rec.jsr_seq); + TAILQ_REMOVE(&allsegs, seg, ss_next); + free(seg->ss_blk); + free(seg); + } + if (debug) + printf("Processing journal segments from %jd to %jd\n", + oldseq, newseq-1); +} + +/* + * Verify the journal inode before attempting to read records. 
+ */ +static int +suj_verifyino(union dinode *ip) +{ + + if (DIP(ip, di_nlink) != 1) { + printf("Invalid link count %d for journal inode %d\n", + DIP(ip, di_nlink), sujino); + return (-1); + } + + if ((DIP(ip, di_flags) & (SF_IMMUTABLE | SF_NOUNLINK)) != + (SF_IMMUTABLE | SF_NOUNLINK)) { + printf("Invalid flags 0x%X for journal inode %d\n", + DIP(ip, di_flags), sujino); + return (-1); + } + + if (DIP(ip, di_mode) != (IFREG | IREAD)) { + printf("Invalid mode %o for journal inode %d\n", + DIP(ip, di_mode), sujino); + return (-1); + } + + if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX) { + printf("Invalid size %jd for journal inode %d\n", + DIP(ip, di_size), sujino); + return (-1); + } + + if (DIP(ip, di_modrev) != fs->fs_mtime) { + printf("Journal timestamp does not match fs mount time\n"); + return (-1); + } + + return (0); +} + +struct jblocks { + struct jextent *jb_extent; /* Extent array. */ + int jb_avail; /* Available extents. */ + int jb_used; /* Last used extent. */ + int jb_head; /* Allocator head. */ + int jb_off; /* Allocator extent offset. */ +}; +struct jextent { + ufs2_daddr_t je_daddr; /* Disk block address. */ + int je_blocks; /* Disk block count. */ +}; + +struct jblocks *suj_jblocks; + +static struct jblocks * +jblocks_create(void) +{ + struct jblocks *jblocks; + int size; + + jblocks = errmalloc(sizeof(*jblocks)); + jblocks->jb_avail = 10; + jblocks->jb_used = 0; + jblocks->jb_head = 0; + jblocks->jb_off = 0; + size = sizeof(struct jextent) * jblocks->jb_avail; + jblocks->jb_extent = errmalloc(size); + bzero(jblocks->jb_extent, size); + + return (jblocks); +} + +/* + * Return the next available disk block and the amount of contiguous + * free space it contains. 
+ */ +static ufs2_daddr_t +jblocks_next(struct jblocks *jblocks, int bytes, int *actual) +{ + struct jextent *jext; + ufs2_daddr_t daddr; + int freecnt; + int blocks; + + blocks = bytes / DEV_BSIZE; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks - jblocks->jb_off; + if (freecnt == 0) { + jblocks->jb_off = 0; + if (++jblocks->jb_head > jblocks->jb_used) + return (0); + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks; + } + if (freecnt > blocks) + freecnt = blocks; + *actual = freecnt * DEV_BSIZE; + daddr = jext->je_daddr + jblocks->jb_off; + + return (daddr); +} + +/* + * Advance the allocation head by a specified number of bytes, consuming + * one journal segment. + */ +static void +jblocks_advance(struct jblocks *jblocks, int bytes) +{ + + jblocks->jb_off += bytes / DEV_BSIZE; +} + +static void +jblocks_destroy(struct jblocks *jblocks) +{ + + free(jblocks->jb_extent); + free(jblocks); +} + +static void +jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks) +{ + struct jextent *jext; + int size; + + jext = &jblocks->jb_extent[jblocks->jb_used]; + /* Adding the first block. */ + if (jext->je_daddr == 0) { + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; + } + /* Extending the last extent. */ + if (jext->je_daddr + jext->je_blocks == daddr) { + jext->je_blocks += blocks; + return; + } + /* Adding a new extent. */ + if (++jblocks->jb_used == jblocks->jb_avail) { + jblocks->jb_avail *= 2; + size = sizeof(struct jextent) * jblocks->jb_avail; + jext = errmalloc(size); + bzero(jext, size); + bcopy(jblocks->jb_extent, jext, + sizeof(struct jextent) * jblocks->jb_used); + free(jblocks->jb_extent); + jblocks->jb_extent = jext; + } + jext = &jblocks->jb_extent[jblocks->jb_used]; + jext->je_daddr = daddr; + jext->je_blocks = blocks; + + return; +} + +/* + * Add a file block from the journal to the extent map. 
We can't read + * each file block individually because the kernel treats it as a circular + * buffer and segments may span multiple contiguous blocks. + */ +static void +suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + + jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags)); +} + +static void +suj_read(void) +{ + uint8_t block[1 * 1024 * 1024]; + struct suj_seg *seg; + struct jsegrec *recn; + struct jsegrec *rec; + ufs2_daddr_t blk; + int readsize; + int blocks; + int recsize; + int size; + int i; + + /* + * Read records until we exhaust the journal space. If we find + * an invalid record we start searching for a valid segment header + * at the next block. This is because we don't have a head/tail + * pointer and must recover the information indirectly. At the gap + * between the head and tail we won't necessarily have a valid + * segment. + */ +restart: + for (;;) { + size = sizeof(block); + blk = jblocks_next(suj_jblocks, size, &readsize); + if (blk == 0) + return; + size = readsize; + /* + * Read 1MB at a time and scan for records within this block. + */ + if (bread(disk, blk, &block, size) == -1) + err(1, "Error reading journal block %jd", + (intmax_t)blk); + for (rec = (void *)block; size; size -= recsize, + rec = (struct jsegrec *)((uintptr_t)rec + recsize)) { + recsize = DEV_BSIZE; + if (rec->jsr_time != fs->fs_mtime) { + if (debug) + printf("Rec time %jd != fs mtime %jd\n", + rec->jsr_time, fs->fs_mtime); + jblocks_advance(suj_jblocks, recsize); + continue; + } + if (rec->jsr_cnt == 0) { + if (debug) + printf("Found illegal count %d\n", + rec->jsr_cnt); + jblocks_advance(suj_jblocks, recsize); + continue; + } + blocks = rec->jsr_blocks; + recsize = blocks * DEV_BSIZE; + if (recsize > size) { + /* + * We may just have run out of buffer, restart + * the loop to re-read from this spot.
+ */ + if (size < fs->fs_bsize && + size != readsize && + recsize <= fs->fs_bsize) + goto restart; + if (debug) + printf("Found invalid segsize %d > %d\n", + recsize, size); + recsize = DEV_BSIZE; + jblocks_advance(suj_jblocks, recsize); + continue; + } + /* + * Verify that all blocks in the segment are present. + */ + for (i = 1; i < blocks; i++) { + recn = (void *) + ((uintptr_t)rec) + i * DEV_BSIZE; + if (recn->jsr_seq == rec->jsr_seq && + recn->jsr_time == rec->jsr_time) + continue; + if (debug) + printf("Incomplete record %jd (%d)\n", + rec->jsr_seq, i); + recsize = i * DEV_BSIZE; + jblocks_advance(suj_jblocks, recsize); + goto restart; + } + seg = errmalloc(sizeof(*seg)); + seg->ss_blk = errmalloc(recsize); + seg->ss_rec = *rec; + bcopy((void *)rec, seg->ss_blk, recsize); + if (rec->jsr_oldest > oldseq) + oldseq = rec->jsr_oldest; + TAILQ_INSERT_TAIL(&allsegs, seg, ss_next); + jblocks_advance(suj_jblocks, recsize); + } + } +} + +/* + * Search a directory block for the SUJ_FILE. + */ +static void +suj_find(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) +{ + char block[MAXBSIZE]; + struct direct *dp; + int bytes; + int off; + + if (sujino) + return; + bytes = lfragtosize(fs, frags); + if (bread(disk, fsbtodb(fs, blk), block, bytes) <= 0) + err(1, "Failed to read ROOTINO directory block %jd", blk); + for (off = 0; off < bytes; off += dp->d_reclen) { + dp = (struct direct *)&block[off]; + if (dp->d_reclen == 0) + break; + if (dp->d_ino == 0) + continue; + if (dp->d_namlen != strlen(SUJ_FILE)) + continue; + if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0) + continue; + sujino = dp->d_ino; + return; + } +} + +/* + * Orchestrate the verification of a filesystem via the softupdates journal. + */ +int +suj_check(const char *filesys) +{ + union dinode *jip; + union dinode *ip; + uint64_t blocks; + + opendisk(filesys); + TAILQ_INIT(&allsegs); + /* + * Find the journal inode. 
+ */ + ip = ino_read(ROOTINO); + sujino = 0; + ino_visit(ip, ROOTINO, suj_find, 0); + if (sujino == 0) + errx(1, "Journal inode removed. Use tunefs to re-create."); + /* + * Fetch the journal inode and verify it. + */ + jip = ino_read(sujino); + printf("** SU+J Recovering %s\n", filesys); + if (suj_verifyino(jip) != 0) + return (-1); + /* + * Build a list of journal blocks in jblocks before parsing the + * available journal blocks in with suj_read(). + */ + printf("** Reading %jd byte journal from inode %d.\n", + DIP(jip, di_size), sujino); + suj_jblocks = jblocks_create(); + blocks = ino_visit(jip, sujino, suj_add_block, 0); + if (blocks != numfrags(fs, DIP(jip, di_size))) + errx(1, "Sparse journal inode %d.\n", sujino); + suj_read(); + jblocks_destroy(suj_jblocks); + suj_jblocks = NULL; + if (preen || reply("RECOVER")) { + printf("** Building recovery table.\n"); + suj_prune(); + suj_build(); + cg_apply(cg_build); + printf("** Resolving unreferenced inode list.\n"); + ino_unlinked(); + printf("** Processing journal entries.\n"); + cg_apply(cg_trunc); + cg_apply(cg_check_blk); + cg_apply(cg_check_ino); + } + if (preen == 0 && reply("WRITE CHANGES") == 0) + return (0); + /* + * To remain idempotent with partial truncations the free bitmaps + * must be written followed by indirect blocks and lastly inode + * blocks. This preserves access to the modified pointers until + * they are freed. + */ + cg_apply(cg_write); + dblk_write(); + cg_apply(cg_write_inos); + /* Write back superblock. 
*/ + closedisk(filesys); + printf("** %jd journal records in %jd bytes for %.2f%% utilization\n", + jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100); + printf("** Freed %jd inodes (%jd dirs) %jd blocks, and %jd frags.\n", + freeinos, freedir, freeblocks, freefrags); + + return (0); +} diff --git a/sbin/fsdb/fsdb.c b/sbin/fsdb/fsdb.c index f7354e8..5622cbb 100644 --- a/sbin/fsdb/fsdb.c +++ b/sbin/fsdb/fsdb.c @@ -396,7 +396,8 @@ const char *typename[] = { "unregistered #13", "whiteout", }; - + +int diroff; int slot; int @@ -404,9 +405,10 @@ scannames(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; - printf("slot %d ino %d reclen %d: %s, `%.*s'\n", - slot++, dirp->d_ino, dirp->d_reclen, typename[dirp->d_type], - dirp->d_namlen, dirp->d_name); + printf("slot %d off %d ino %d reclen %d: %s, `%.*s'\n", + slot++, diroff, dirp->d_ino, dirp->d_reclen, + typename[dirp->d_type], dirp->d_namlen, dirp->d_name); + diroff += dirp->d_reclen; return (KEEPON); } @@ -416,6 +418,7 @@ CMDFUNCSTART(ls) checkactivedir(); /* let it go on anyway */ slot = 0; + diroff = 0; idesc.id_number = curinum; idesc.id_func = scannames; idesc.id_type = DATA; diff --git a/sbin/fsdb/fsdbutil.c b/sbin/fsdb/fsdbutil.c index d50c6c0..2c5710a 100644 --- a/sbin/fsdb/fsdbutil.c +++ b/sbin/fsdb/fsdbutil.c @@ -52,7 +52,7 @@ static const char rcsid[] = #include "fsck.h" static int charsperline(void); -static int printindir(ufs2_daddr_t blk, int level, char *bufp); +static void printindir(ufs2_daddr_t blk, int level, char *bufp); static void printblocks(ino_t inum, union dinode *dp); char ** @@ -226,7 +226,7 @@ charsperline(void) /* * Recursively print a list of indirect blocks. 
*/ -static int +static void printindir(ufs2_daddr_t blk, int level, char *bufp) { struct bufarea buf, *bp; @@ -234,6 +234,9 @@ printindir(ufs2_daddr_t blk, int level, char *bufp) int i, j, cpl, charssofar; ufs2_daddr_t blkno; + if (blk == 0) + return; + printf("%jd (%d) =>\n", (intmax_t)blk, level); if (level == 0) { /* for the final indirect level, don't use the cache */ bp = &buf; @@ -251,11 +254,8 @@ printindir(ufs2_daddr_t blk, int level, char *bufp) blkno = bp->b_un.b_indir1[i]; else blkno = bp->b_un.b_indir2[i]; - if (blkno == 0) { - if (level == 0) - putchar('\n'); - return 0; - } + if (blkno == 0) + continue; j = sprintf(tempbuf, "%jd", (intmax_t)blkno); if (level == 0) { charssofar += j; @@ -270,13 +270,14 @@ printindir(ufs2_daddr_t blk, int level, char *bufp) charssofar += 2; } else { printf(" =>\n"); - if (printindir(blkno, level - 1, bufp) == 0) - return 0; + printindir(blkno, level - 1, bufp); + printf("\n"); + charssofar = 0; } } if (level == 0) putchar('\n'); - return 1; + return; } @@ -309,7 +310,7 @@ printblocks(ino_t inum, union dinode *dp) } } putchar('\n'); - if (DIP(dp, di_ib[0]) == 0) + if (ndb == 0) return; bufp = malloc((unsigned int)sblock.fs_bsize); @@ -317,8 +318,7 @@ printblocks(ino_t inum, union dinode *dp) errx(EEXIT, "cannot allocate indirect block buffer"); printf("Indirect blocks:\n"); for (i = 0; i < NIADDR; i++) - if (printindir(DIP(dp, di_ib[i]), i, bufp) == 0) - break; + printindir(DIP(dp, di_ib[i]), i, bufp); free(bufp); } diff --git a/sbin/geom/class/Makefile b/sbin/geom/class/Makefile index 591f79f..0611cdd 100644 --- a/sbin/geom/class/Makefile +++ b/sbin/geom/class/Makefile @@ -15,6 +15,7 @@ SUBDIR+=multipath SUBDIR+=nop SUBDIR+=part SUBDIR+=raid3 +SUBDIR+=sched SUBDIR+=shsec SUBDIR+=stripe SUBDIR+=virstor diff --git a/sbin/geom/class/cache/gcache.8 b/sbin/geom/class/cache/gcache.8 index d3f782c..b9f03bd 100644 --- a/sbin/geom/class/cache/gcache.8 +++ b/sbin/geom/class/cache/gcache.8 @@ -14,14 +14,14 @@ .\" THIS SOFTWARE IS 
PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR THE VOICES IN HIS HEAD BE -.\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -.\" POSSIBILITY OF SUCH DAMAGE. +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" diff --git a/sbin/geom/class/mountver/gmountver.8 b/sbin/geom/class/mountver/gmountver.8 index c7a8f04..02d77db 100644 --- a/sbin/geom/class/mountver/gmountver.8 +++ b/sbin/geom/class/mountver/gmountver.8 @@ -14,14 +14,14 @@ .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR THE VOICES IN HIS HEAD BE -.\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -.\" POSSIBILITY OF SUCH DAMAGE. +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" diff --git a/sbin/geom/class/multipath/geom_multipath.c b/sbin/geom/class/multipath/geom_multipath.c index 4319d04..288492c 100644 --- a/sbin/geom/class/multipath/geom_multipath.c +++ b/sbin/geom/class/multipath/geom_multipath.c @@ -48,6 +48,7 @@ uint32_t version = G_MULTIPATH_VERSION; static void mp_main(struct gctl_req *, unsigned int); static void mp_label(struct gctl_req *); static void mp_clear(struct gctl_req *); +static void mp_add(struct gctl_req *); struct g_command class_commands[] = { { @@ -55,6 +56,10 @@ struct g_command class_commands[] = { NULL, "[-v] name prov ..." }, { + "add", G_FLAG_VERBOSE | G_FLAG_LOADKLD, mp_main, G_NULL_OPTS, + NULL, "[-v] name prov ..." + }, + { "destroy", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, NULL, "[-v] prov ..." 
}, @@ -62,6 +67,14 @@ struct g_command class_commands[] = { "clear", G_FLAG_VERBOSE, mp_main, G_NULL_OPTS, NULL, "[-v] prov ..." }, + { + "rotate", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, + NULL, "[-v] prov ..." + }, + { + "getactive", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, + NULL, "[-v] prov ..." + }, G_CMD_SENTINEL }; @@ -77,6 +90,8 @@ mp_main(struct gctl_req *req, unsigned int flags __unused) } if (strcmp(name, "label") == 0) { mp_label(req); + } else if (strcmp(name, "add") == 0) { + mp_add(req); } else if (strcmp(name, "clear") == 0) { mp_clear(req); } else { @@ -93,7 +108,7 @@ mp_label(struct gctl_req *req) char *ptr; uuid_t uuid; uint32_t secsize = 0, ssize, status; - const char *name; + const char *name, *mpname; int error, i, nargs; nargs = gctl_get_int(req, "nargs"); @@ -148,8 +163,8 @@ mp_label(struct gctl_req *req) */ strlcpy(md.md_magic, G_MULTIPATH_MAGIC, sizeof(md.md_magic)); md.md_version = G_MULTIPATH_VERSION; - name = gctl_get_ascii(req, "arg0"); - strlcpy(md.md_name, name, sizeof(md.md_name)); + mpname = gctl_get_ascii(req, "arg0"); + strlcpy(md.md_name, mpname, sizeof(md.md_name)); md.md_size = disksiz; md.md_sectorsize = secsize; uuid_create(&uuid, &status); @@ -166,46 +181,44 @@ mp_label(struct gctl_req *req) free(ptr); /* - * Clear last sector first for each provider to spoil anything extant + * Clear metadata on initial provider first. */ - for (i = 1; i < nargs; i++) { - name = gctl_get_ascii(req, "arg%d", i); - error = g_metadata_clear(name, NULL); - if (error != 0) { - gctl_error(req, "cannot clear metadata on %s: %s.", - name, strerror(error)); - return; - } + name = gctl_get_ascii(req, "arg1"); + error = g_metadata_clear(name, NULL); + if (error != 0) { + gctl_error(req, "cannot clear metadata on %s: %s.", name, strerror(error)); + return; } + /* + * encode the metadata + */ multipath_metadata_encode(&md, sector); /* - * Ok, store metadata. + * Store metadata on the initial provider. 
*/ - for (i = 1; i < nargs; i++) { - name = gctl_get_ascii(req, "arg%d", i); - error = g_metadata_store(name, sector, secsize); - if (error != 0) { - fprintf(stderr, "Can't store metadata on %s: %s.\n", - name, strerror(error)); - goto fail; - } + error = g_metadata_store(name, sector, secsize); + if (error != 0) { + gctl_error(req, "cannot store metadata on %s: %s.", name, strerror(error)); + return; } - return; -fail: /* - * Clear last sector first for each provider to spoil anything extant + * Now add the rest of the providers. */ - for (i = 1; i < nargs; i++) { - name = gctl_get_ascii(req, "arg%d", i); - error = g_metadata_clear(name, NULL); - if (error != 0) { - gctl_error(req, "cannot clear metadata on %s: %s.", - name, strerror(error)); + error = gctl_change_param(req, "verb", -1, "add"); + if (error) { + gctl_error(req, "unable to change verb to \"add\": %s.", strerror(error)); + return; + } + for (i = 2; i < nargs; i++) { + error = gctl_change_param(req, "arg1", -1, gctl_get_ascii(req, "arg%d", i)); + if (error) { + gctl_error(req, "unable to add %s to %s: %s.", gctl_get_ascii(req, "arg%d", i), mpname, strerror(error)); continue; } + mp_add(req); } } @@ -213,22 +226,23 @@ static void mp_clear(struct gctl_req *req) { const char *name; - int error, i, nargs; + int error; - nargs = gctl_get_int(req, "nargs"); - if (nargs < 1) { - gctl_error(req, "Too few arguments."); - return; + name = gctl_get_ascii(req, "arg1"); + error = g_metadata_clear(name, G_MULTIPATH_MAGIC); + if (error != 0) { + fprintf(stderr, "Can't clear metadata on %s: %s.\n", name, strerror(error)); + gctl_error(req, "Not fully done."); } +} - for (i = 0; i < nargs; i++) { - name = gctl_get_ascii(req, "arg%d", i); - error = g_metadata_clear(name, G_MULTIPATH_MAGIC); - if (error != 0) { - fprintf(stderr, "Can't clear metadata on %s: %s.\n", - name, strerror(error)); - gctl_error(req, "Not fully done."); - continue; - } - } +static void +mp_add(struct gctl_req *req) +{ + const char *errstr; + + 
errstr = gctl_issue(req); + if (errstr != NULL && errstr[0] != '\0') { + gctl_error(req, "%s", errstr); + } } diff --git a/sbin/geom/class/part/geom_part.c b/sbin/geom/class/part/geom_part.c index e2a045e..3ddbd7a 100644 --- a/sbin/geom/class/part/geom_part.c +++ b/sbin/geom/class/part/geom_part.c @@ -133,6 +133,13 @@ struct g_command PUBSYM(class_commands)[] = { G_OPT_SENTINEL }, "geom", NULL }, + { "resize", 0, gpart_issue, { + { 's', "size", autofill, G_TYPE_ASCLBA }, + { 'i', index_param, NULL, G_TYPE_ASCNUM }, + { 'f', "flags", flags, G_TYPE_STRING }, + G_OPT_SENTINEL }, + "geom", NULL + }, G_CMD_SENTINEL }; @@ -243,6 +250,99 @@ fmtattrib(struct gprovider *pp) } static int +gpart_autofill_resize(struct gctl_req *req) +{ + struct gmesh mesh; + struct gclass *cp; + struct ggeom *gp; + struct gprovider *pp; + unsigned long long last, size, start, new_size; + unsigned long long lba, new_lba; + const char *s; + char *val; + int error, idx; + + s = gctl_get_ascii(req, "size"); + if (*s == '*') + new_size = (unsigned long long)atoll(s); + else + return (0); + + s = gctl_get_ascii(req, index_param); + idx = strtol(s, &val, 10); + if (idx < 1 || *s == '\0' || *val != '\0') + errx(EXIT_FAILURE, "invalid partition index"); + + error = geom_gettree(&mesh); + if (error) + return (error); + s = gctl_get_ascii(req, "class"); + if (s == NULL) + abort(); + cp = find_class(&mesh, s); + if (cp == NULL) + errx(EXIT_FAILURE, "Class %s not found.", s); + s = gctl_get_ascii(req, "geom"); + if (s == NULL) + abort(); + gp = find_geom(cp, s); + if (gp == NULL) + errx(EXIT_FAILURE, "No such geom: %s.", s); + last = atoll(find_geomcfg(gp, "last")); + + LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { + s = find_provcfg(pp, "index"); + if (s == NULL) + continue; + if (atoi(s) == idx) + break; + } + if (pp == NULL) + errx(EXIT_FAILURE, "invalid partition index"); + + s = find_provcfg(pp, "start"); + if (s == NULL) { + s = find_provcfg(pp, "offset"); + start = atoll(s) / pp->lg_sectorsize; 
+ } else + start = atoll(s); + s = find_provcfg(pp, "end"); + if (s == NULL) { + s = find_provcfg(pp, "length"); + lba = start + atoll(s) / pp->lg_sectorsize; + } else + lba = atoll(s) + 1; + + if (lba > last) + return (ENOSPC); + size = lba - start; + pp = find_provider(gp, lba); + if (pp == NULL) + new_size = last - start + 1; + else { + s = find_provcfg(pp, "start"); + if (s == NULL) { + s = find_provcfg(pp, "offset"); + new_lba = atoll(s) / pp->lg_sectorsize; + } else + new_lba = atoll(s); + /* Is there any free space between current and + * next providers? + */ + if (new_lba > lba) + new_size = new_lba - start; + else + return (ENOSPC); + } + asprintf(&val, "%llu", new_size); + if (val == NULL) + return (ENOMEM); + gctl_change_param(req, "size", -1, val); + + return (0); +} + +static int gpart_autofill(struct gctl_req *req) { struct gmesh mesh; @@ -257,6 +357,8 @@ gpart_autofill(struct gctl_req *req) int error, has_size, has_start; s = gctl_get_ascii(req, "verb"); + if (strcmp(s, "resize") == 0) + return gpart_autofill_resize(req); if (strcmp(s, "add") != 0) return (0); diff --git a/sbin/geom/class/part/gpart.8 b/sbin/geom/class/part/gpart.8 index 3de658c..66557f3 100644 --- a/sbin/geom/class/part/gpart.8 +++ b/sbin/geom/class/part/gpart.8 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd November 18, 2008 +.Dd April 22, 2010 .Dt GPART 8 .Os .Sh NAME @@ -37,6 +37,7 @@ lines in your kernel configuration file: .Bd -ragged -offset indent .Cd "options GEOM_PART_APM" .Cd "options GEOM_PART_BSD" +.Cd "options GEOM_PART_EBR" .Cd "options GEOM_PART_GPT" .Cd "options GEOM_PART_MBR" .Cd "options GEOM_PART_PC98" @@ -53,6 +54,10 @@ option adds support for the traditional .Bx disklabel. The +.Dv GEOM_PART_EBR +option adds support for the Extended Boot Record (EBR), +which is used to define a logical partition. +The .Dv GEOM_PART_GPT option adds support for the GUID Partition Table (GPT) found on Intel Itanium computers and Intel-based Macintosh computers. 
@@ -120,6 +125,13 @@ utility: .Op Fl t Ar type .Op Fl f Ar flags .Ar geom +.\" ==== RESIZE ==== +.Nm +.Cm resize +.Fl i Ar index +.Op Fl s Ar size +.Op Fl f Ar flags +.Ar geom .\" ==== SET ==== .Nm .Cm set @@ -325,6 +337,30 @@ See the section entitled below for a discussion about its use. .El +.\" ==== RESIZE ==== +.It Cm resize +Resize a partition from geom +.Ar geom +and further identified by the +.Fl i Ar index +option. New partition size is expressed in logical block +numbers and can be given by the +.Fl s Ar size +option. If +.Fl s +option is omitted then new size is automatically calculated +to maximum available from given geom +.Ar geom . +.Pp +Additional options include: +.Bl -tag -width 10n +.It Fl f Ar flags +Additional operational flags. +See the section entitled +.Sx "OPERATIONAL FLAGS" +below for a discussion +about its use. +.El .\" ==== SET ==== .It Cm set Set the named attribute on the partition entry. diff --git a/sbin/geom/class/sched/Makefile b/sbin/geom/class/sched/Makefile new file mode 100644 index 0000000..6656cdd --- /dev/null +++ b/sbin/geom/class/sched/Makefile @@ -0,0 +1,18 @@ +# GEOM_LIBRARY_PATH +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../misc +#CFLAGS += -I/usr/src/sbin/geom + +CLASS=sched + +WARNS?= 6 +CLASS_DIR?=/lib/geom + +SHLIBDIR?=${CLASS_DIR} +SHLIB_NAME?=geom_${CLASS}.so +LINKS= ${BINDIR}/geom ${BINDIR}/g${CLASS} +MAN= g${CLASS}.8 +SRCS+= geom_${CLASS}.c subr.c + +.include <bsd.lib.mk> diff --git a/sbin/geom/class/sched/geom_sched.c b/sbin/geom/class/sched/geom_sched.c new file mode 100644 index 0000000..ca05350 --- /dev/null +++ b/sbin/geom/class/sched/geom_sched.c @@ -0,0 +1,124 @@ +/*- + * Copyright (c) 2009 Fabio Checconi + * Copyright (c) 2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $Id$ + * $FreeBSD$ + * + * This file implements the userspace library used by the 'geom' + * command to load and manipulate disk schedulers. + */ + +#include <sys/cdefs.h> +#include <sys/param.h> +#include <sys/linker.h> +#include <sys/module.h> + +#include <stdio.h> +#include <stdint.h> +#include <libgeom.h> + +#include "core/geom.h" +#include "misc/subr.h" + +#define G_SCHED_VERSION 0 + +uint32_t lib_version = G_LIB_VERSION; +uint32_t version = G_SCHED_VERSION; + +/* + * storage for parameters used by this geom class. + * Right now only the scheduler name is used. + */ +static char algo[] = "rr"; /* default scheduler */ + +/* + * Adapt to differences in geom library. 
+ * in V1 struct g_command misses gc_argname, eld, and G_BOOL is undefined + */ +#if G_LIB_VERSION == 1 +#define G_ARGNAME +#define G_TYPE_BOOL G_TYPE_NUMBER +#else +#define G_ARGNAME NULL, +#endif + +static void +gcmd_createinsert(struct gctl_req *req, unsigned flags __unused) +{ + const char *reqalgo; + char name[64]; + + if (gctl_has_param(req, "algo")) + reqalgo = gctl_get_ascii(req, "algo"); + else + reqalgo = algo; + + snprintf(name, sizeof(name), "gsched_%s", reqalgo); + /* + * Do not complain about errors here, gctl_issue() + * will fail anyway. + */ + if (modfind(name) < 0) + kldload(name); + gctl_issue(req); +} + +struct g_command class_commands[] = { + { "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, + { + { 'a', "algo", algo, G_TYPE_STRING }, + G_OPT_SENTINEL + }, + G_ARGNAME "[-v] [-a algorithm_name] dev ..." + }, + { "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, + { + { 'a', "algo", algo, G_TYPE_STRING }, + G_OPT_SENTINEL + }, + G_ARGNAME "[-v] [-a algorithm_name] dev ..." + }, + { "configure", G_FLAG_VERBOSE, NULL, + { + { 'a', "algo", algo, G_TYPE_STRING }, + G_OPT_SENTINEL + }, + G_ARGNAME "[-v] [-a algorithm_name] prov ..." + }, + { "destroy", G_FLAG_VERBOSE, NULL, + { + { 'f', "force", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL + }, + G_ARGNAME "[-fv] prov ..." + }, + { "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, + G_ARGNAME "[-v] prov ..." + }, + G_CMD_SENTINEL +}; diff --git a/sbin/geom/class/sched/gsched.8 b/sbin/geom/class/sched/gsched.8 new file mode 100644 index 0000000..cde9485 --- /dev/null +++ b/sbin/geom/class/sched/gsched.8 @@ -0,0 +1,163 @@ +.\" Copyright (c) 2009-2010 Fabio Checconi +.\" Copyright (c) 2009-2010 Luigi Rizzo, Universita` di Pisa +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. 
Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd April 12, 2010 +.Dt GSCHED 8 +.Os +.Sh NAME +.Nm gsched +.Nd "control utility for disk scheduler GEOM class" +.Sh SYNOPSIS +.Nm +.Cm create +.Op Fl v +.Op Fl a Ar algorithm +.Ar provider ... +.Nm +.Cm insert +.Op Fl v +.Op Fl a Ar algorithm +.Ar provider ... +.Nm +.Cm configure +.Op Fl v +.Op Fl a Ar algorithm +.Ar node ... +.Nm +.Cm destroy +.Op Fl fv +.Ar node ... +.Nm +.Cm reset +.Op Fl v +.Ar node ... +.Nm +.Cm { list | status | load | unload } +.Sh DESCRIPTION +The +.Nm +utility (also callable as +.Nm geom sched ... ) +changes the scheduling policy of the requests going to a provider. +.Pp +The first argument to +.Nm +indicates an action to be performed: +.Bl -tag -width ".Cm configure" +.It Cm create +Create a new provider and geom node using the specified scheduling algorithm. 
+.Ar algorithm +is the name of the scheduling algorithm used for the provider. +Available algorithms include: +.Ar rr , +which implements anticipatory scheduling with round robin service +among clients; +.Ar as , +which implements a simple form of anticipatory scheduling with +no per-client queue. +.Pp +If the operation succeeds, the new provider should appear with name +.Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. . +The kernel module +.Pa geom_sched.ko +will be loaded if it is not loaded already. +.It Cm insert +Operates as "create", but the insertion is "transparent", +i.e. the existing provider is rerouted to the newly created geom, +which in turn forwards requests to the existing geom. +This operation allows one to start/stop a scheduling service +on an already existing provider. +.Pp +A subsequent 'destroy' will remove the newly created geom and +hook the provider back to the original geom. +.Ar algorithm +.It Cm configure +Configure existing scheduling provider. It supports the same options +as the +.Nm create +command. +.It Cm destroy +Destroy the geom specified in the parameter. +.It Cm reset +Do nothing. +.It Cm list | status | load | unload +See +.Xr geom 8 . +.El +.Pp +Additional options: +.Bl -tag -width ".Fl f" +.It Fl f +Force the removal of the specified provider. +.It Fl v +Be more verbose. +.El +.Sh SYSCTL VARIABLES +The following +.Xr sysctl 8 +variables can be used to control the behavior of the +.Nm SCHED +GEOM class. +The default value is shown next to each variable. +.Bl -tag -width indent +.It Va kern.geom.sched.debug : No 0 +Debug level of the +.Nm SCHED +GEOM class. +This can be set to a number between 0 and 2 inclusive. +If set to 0 minimal debug information is printed, and if set to 2 the +maximum amount of debug information is printed. +.El +.Sh EXIT STATUS +Exit status is 0 on success, and 1 if the command fails. +.Sh EXAMPLES +The following example shows how to create a scheduling provider for disk +.Pa /dev/da0 +, and how to destroy it. 
+.Bd -literal -offset indent +# Load the geom_sched module: +kldload geom_sched +# Load some scheduler classes used by geom_sched: +kldload gsched_rr gsched_as +# Configure device ad0 to use scheduler 'rr': +geom sched insert -a rr ad0 +# Now provider ad0 uses the 'rr' algorithm; +# the new geom is ad0.sched. +# Remove the scheduler on the device: +geom sched destroy -v ad0.sched. +.Ed +.Pp +.Sh SEE ALSO +.Xr geom 4 , +.Xr geom 8 +.Sh HISTORY +The +.Nm +utility appeared in April 2010. +.Sh AUTHORS +.An Fabio Checconi Aq fabio@FreeBSD.org +.An Luigi Rizzo Aq luigi@FreeBSD.org diff --git a/sbin/geom/misc/subr.c b/sbin/geom/misc/subr.c index 64df7c6..21deac8 100644 --- a/sbin/geom/misc/subr.c +++ b/sbin/geom/misc/subr.c @@ -236,6 +236,7 @@ g_metadata_store(const char *name, u_char *md, size_t size) error = errno; goto out; } + (void)ioctl(fd, DIOCGFLUSH, NULL); out: if (sector != NULL) free(sector); @@ -293,6 +294,7 @@ g_metadata_clear(const char *name, const char *magic) error = errno; goto out; } + (void)ioctl(fd, DIOCGFLUSH, NULL); out: if (sector != NULL) free(sector); diff --git a/sbin/gvinum/gvinum.c b/sbin/gvinum/gvinum.c index 7f97f9d..041f140 100644 --- a/sbin/gvinum/gvinum.c +++ b/sbin/gvinum/gvinum.c @@ -83,8 +83,9 @@ void printconfig(FILE *, char *); char *create_drive(char *); void create_volume(int, char **, char *); char *find_name(const char *, int, int); -char *find_drive(const char *); char *find_pattern(char *, char *); +void copy_device(struct gv_drive *, const char *); +#define find_drive() find_name("gvinumdrive", GV_TYPE_DRIVE, GV_MAXDRIVENAME) int main(int argc, char **argv) @@ -424,7 +425,7 @@ create_drive(char *device) drives = 1; dname = NULL; - drivename = find_drive(device); + drivename = find_drive(); if (drivename == NULL) return (NULL); @@ -436,7 +437,7 @@ create_drive(char *device) err(1, "unable to allocate for gv_drive object"); strlcpy(d->name, drivename, sizeof(d->name)); - strlcpy(d->device, device, sizeof(d->device)); + 
copy_device(d, device); gctl_ro_param(req, "drive0", sizeof(*d), d); gctl_ro_param(req, "flags", sizeof(int), &flags); gctl_ro_param(req, "drives", sizeof(int), &drives); @@ -626,14 +627,13 @@ find_name(const char *prefix, int type, int namelen) return (NULL); } -char * -find_drive(const char *device) +void +copy_device(struct gv_drive *d, const char *device) { - - /* Strip possible /dev/ in front. */ if (strncmp(device, "/dev/", 5) == 0) - device += 5; - return (find_name("gvinumdrive", GV_TYPE_DRIVE, GV_MAXDRIVENAME)); + strlcpy(d->device, (device + 5), sizeof(d->device)); + else + strlcpy(d->device, device, sizeof(d->device)); } /* Detach a plex or subdisk from its parent. */ @@ -1275,7 +1275,7 @@ gvinum_grow(int argc, char **argv) return; } /* Lookup device and set an appropriate drive name. */ - drive = find_drive(argv[2]); + drive = find_drive(); if (drive == NULL) { warn("unable to find an appropriate drive name"); free(s); @@ -1283,10 +1283,8 @@ gvinum_grow(int argc, char **argv) return; } strlcpy(d->name, drive, sizeof(d->name)); - if (strncmp(argv[2], "/dev/", 5) == 0) - strlcpy(d->device, (argv[2] + 5), sizeof(d->device)); - else - strlcpy(d->device, argv[2], sizeof(d->device)); + copy_device(d, argv[2]); + drives = 1; /* We try to use the plex name as basis for the subdisk name. */ diff --git a/sbin/hastctl/Makefile b/sbin/hastctl/Makefile index 43c8c20..301493c 100644 --- a/sbin/hastctl/Makefile +++ b/sbin/hastctl/Makefile @@ -15,7 +15,6 @@ SRCS+= proto.c proto_common.c proto_tcp4.c proto_uds.c SRCS+= token.l SRCS+= subr.c SRCS+= y.tab.h -WARNS?= 6 MAN= hastctl.8 CFLAGS+=-I${.CURDIR}/../hastd @@ -26,8 +25,13 @@ CFLAGS+=-DINET6 # This is needed to have WARNS > 1. 
CFLAGS+=-DYY_NO_UNPUT -DPADD= ${LIBCRYPTO} ${LIBL} -LDADD= -lcrypto -ll +DPADD= ${LIBL} +LDADD= -ll +.if ${MK_OPENSSL} != "no" +DPADD+= ${LIBCRYPTO} +LDADD+= -lcrypto +CFLAGS+=-DHAVE_CRYPTO +.endif YFLAGS+=-v diff --git a/sbin/hastd/Makefile b/sbin/hastd/Makefile index 4311807..bfb62f4 100644 --- a/sbin/hastd/Makefile +++ b/sbin/hastd/Makefile @@ -16,7 +16,6 @@ SRCS+= rangelock.c SRCS+= subr.c SRCS+= token.l SRCS+= y.tab.h -WARNS?= 6 MAN= hastd.8 hast.conf.5 CFLAGS+=-I${.CURDIR} @@ -27,9 +26,13 @@ CFLAGS+=-DINET6 # This is needed to have WARNS > 1. CFLAGS+=-DYY_NO_UNPUT -DPADD= ${LIBCRYPTO} ${LIBGEOM} ${LIBBSDXML} ${LIBSBUF} ${LIBL} \ - ${LIBPTHREAD} ${LIBUTIL} -LDADD= -lcrypto -lgeom -lbsdxml -lsbuf -ll -lpthread -lutil +DPADD= ${LIBGEOM} ${LIBBSDXML} ${LIBSBUF} ${LIBL} ${LIBPTHREAD} ${LIBUTIL} +LDADD= -lgeom -lbsdxml -lsbuf -ll -lpthread -lutil +.if ${MK_OPENSSL} != "no" +DPADD+= ${LIBCRYPTO} +LDADD+= -lcrypto +CFLAGS+=-DHAVE_CRYPTO +.endif YFLAGS+=-v diff --git a/sbin/hastd/hast_proto.c b/sbin/hastd/hast_proto.c index 6e66006..348dfc8 100644 --- a/sbin/hastd/hast_proto.c +++ b/sbin/hastd/hast_proto.c @@ -37,7 +37,9 @@ __FBSDID("$FreeBSD$"); #include <string.h> #include <strings.h> +#ifdef HAVE_CRYPTO #include <openssl/sha.h> +#endif #include <hast.h> #include <ebuf.h> @@ -67,14 +69,18 @@ static int compression_send(struct hast_resource *res, struct nv *nv, void **datap, size_t *sizep, bool *freedatap); static int compression_recv(struct hast_resource *res, struct nv *nv, void **datap, size_t *sizep, bool *freedatap); +#ifdef HAVE_CRYPTO static int checksum_send(struct hast_resource *res, struct nv *nv, void **datap, size_t *sizep, bool *freedatap); static int checksum_recv(struct hast_resource *res, struct nv *nv, void **datap, size_t *sizep, bool *freedatap); +#endif static struct hast_pipe_stage pipeline[] = { { "compression", compression_send, compression_recv }, +#ifdef HAVE_CRYPTO { "checksum", checksum_send, checksum_recv } +#endif }; static int @@ -161,6 
+167,7 @@ compression_recv(struct hast_resource *res, struct nv *nv, void **datap, return (0); } +#ifdef HAVE_CRYPTO static int checksum_send(struct hast_resource *res, struct nv *nv, void **datap, size_t *sizep, bool *freedatap __unused) @@ -221,6 +228,7 @@ checksum_recv(struct hast_resource *res, struct nv *nv, void **datap, return (0); } +#endif /* HAVE_CRYPTO */ /* * Send the given nv structure via conn. diff --git a/sbin/hastd/hastd.c b/sbin/hastd/hastd.c index 19f0893..957885d 100644 --- a/sbin/hastd/hastd.c +++ b/sbin/hastd/hastd.c @@ -137,6 +137,7 @@ child_exit(void) pjdlog_error("Worker process failed (pid=%u, status=%d).", (unsigned int)pid, WEXITSTATUS(status)); } + proto_close(res->hr_ctrl); res->hr_workerpid = 0; if (res->hr_role == HAST_ROLE_PRIMARY) { sleep(1); diff --git a/sbin/hastd/pjdlog.c b/sbin/hastd/pjdlog.c index 38c5539..9f8b3f4 100644 --- a/sbin/hastd/pjdlog.c +++ b/sbin/hastd/pjdlog.c @@ -228,7 +228,7 @@ pjdlogv_common(int loglevel, int debuglevel, int error, const char *fmt, len = snprintf(log, sizeof(log), "%s", pjdlog_prefix); if ((size_t)len < sizeof(log)) - len = vsnprintf(log + len, sizeof(log) - len, fmt, ap); + len += vsnprintf(log + len, sizeof(log) - len, fmt, ap); if (error != -1 && (size_t)len < sizeof(log)) { (void)snprintf(log + len, sizeof(log) - len, ": %s.", strerror(error)); diff --git a/sbin/hastd/primary.c b/sbin/hastd/primary.c index ed6e91c..0915154 100644 --- a/sbin/hastd/primary.c +++ b/sbin/hastd/primary.c @@ -460,9 +460,11 @@ init_local(struct hast_resource *res) exit(EX_NOINPUT); } -static void -init_remote(struct hast_resource *res) +static bool +init_remote(struct hast_resource *res, struct proto_conn **inp, + struct proto_conn **outp) { + struct proto_conn *in, *out; struct nv *nvout, *nvin; const unsigned char *token; unsigned char *map; @@ -472,13 +474,17 @@ init_remote(struct hast_resource *res) uint32_t mapsize; size_t size; + assert((inp == NULL && outp == NULL) || (inp != NULL && outp != NULL)); + + in = 
out = NULL; + /* Prepare outgoing connection with remote node. */ - if (proto_client(res->hr_remoteaddr, &res->hr_remoteout) < 0) { + if (proto_client(res->hr_remoteaddr, &out) < 0) { primary_exit(EX_OSERR, "Unable to create connection to %s", res->hr_remoteaddr); } /* Try to connect, but accept failure. */ - if (proto_connect(res->hr_remoteout) < 0) { + if (proto_connect(out) < 0) { pjdlog_errno(LOG_WARNING, "Unable to connect to %s", res->hr_remoteaddr); goto close; @@ -496,7 +502,7 @@ init_remote(struct hast_resource *res) nv_free(nvout); goto close; } - if (hast_proto_send(res, res->hr_remoteout, nvout, NULL, 0) < 0) { + if (hast_proto_send(res, out, nvout, NULL, 0) < 0) { pjdlog_errno(LOG_WARNING, "Unable to send handshake header to %s", res->hr_remoteaddr); @@ -504,7 +510,7 @@ init_remote(struct hast_resource *res) goto close; } nv_free(nvout); - if (hast_proto_recv_hdr(res->hr_remoteout, &nvin) < 0) { + if (hast_proto_recv_hdr(out, &nvin) < 0) { pjdlog_errno(LOG_WARNING, "Unable to receive handshake header from %s", res->hr_remoteaddr); @@ -536,12 +542,12 @@ init_remote(struct hast_resource *res) * Second handshake step. * Setup incoming connection with remote node. */ - if (proto_client(res->hr_remoteaddr, &res->hr_remotein) < 0) { + if (proto_client(res->hr_remoteaddr, &in) < 0) { pjdlog_errno(LOG_WARNING, "Unable to create connection to %s", res->hr_remoteaddr); } /* Try to connect, but accept failure. 
*/ - if (proto_connect(res->hr_remotein) < 0) { + if (proto_connect(in) < 0) { pjdlog_errno(LOG_WARNING, "Unable to connect to %s", res->hr_remoteaddr); goto close; @@ -560,7 +566,7 @@ init_remote(struct hast_resource *res) nv_free(nvout); goto close; } - if (hast_proto_send(res, res->hr_remotein, nvout, NULL, 0) < 0) { + if (hast_proto_send(res, in, nvout, NULL, 0) < 0) { pjdlog_errno(LOG_WARNING, "Unable to send handshake header to %s", res->hr_remoteaddr); @@ -568,7 +574,7 @@ init_remote(struct hast_resource *res) goto close; } nv_free(nvout); - if (hast_proto_recv_hdr(res->hr_remoteout, &nvin) < 0) { + if (hast_proto_recv_hdr(out, &nvin) < 0) { pjdlog_errno(LOG_WARNING, "Unable to receive handshake header from %s", res->hr_remoteaddr); @@ -611,7 +617,7 @@ init_remote(struct hast_resource *res) * Remote node have some dirty extents on its own, lets * download its activemap. */ - if (hast_proto_recv_data(res, res->hr_remoteout, nvin, map, + if (hast_proto_recv_data(res, out, nvin, map, mapsize) < 0) { pjdlog_errno(LOG_ERR, "Unable to receive remote activemap"); @@ -631,18 +637,29 @@ init_remote(struct hast_resource *res) (void)hast_activemap_flush(res); } pjdlog_info("Connected to %s.", res->hr_remoteaddr); + if (inp != NULL && outp != NULL) { + *inp = in; + *outp = out; + } else { + res->hr_remotein = in; + res->hr_remoteout = out; + } + return (true); +close: + proto_close(out); + if (in != NULL) + proto_close(in); + return (false); +} + +static void +sync_start(void) +{ + mtx_lock(&sync_lock); sync_inprogress = true; mtx_unlock(&sync_lock); cv_signal(&sync_cond); - return; -close: - proto_close(res->hr_remoteout); - res->hr_remoteout = NULL; - if (res->hr_remotein != NULL) { - proto_close(res->hr_remotein); - res->hr_remotein = NULL; - } } static void @@ -665,7 +682,7 @@ init_ggate(struct hast_resource *res) ggiocreate.gctl_mediasize = res->hr_datasize; ggiocreate.gctl_sectorsize = res->hr_local_sectorsize; ggiocreate.gctl_flags = 0; - ggiocreate.gctl_maxcount 
= 128; + ggiocreate.gctl_maxcount = G_GATE_MAX_QUEUE_SIZE; ggiocreate.gctl_timeout = 0; ggiocreate.gctl_unit = G_GATE_NAME_GIVEN; snprintf(ggiocreate.gctl_name, sizeof(ggiocreate.gctl_name), "hast/%s", @@ -735,7 +752,8 @@ hastd_primary(struct hast_resource *res) setproctitle("%s (primary)", res->hr_name); init_local(res); - init_remote(res); + if (init_remote(res, NULL, NULL)) + sync_start(); init_ggate(res); init_environment(res); error = pthread_create(&td, NULL, ggate_recv_thread, res); @@ -1695,6 +1713,7 @@ static void * guard_thread(void *arg) { struct hast_resource *res = arg; + struct proto_conn *in, *out; unsigned int ii, ncomps; int timeout; @@ -1738,26 +1757,31 @@ guard_thread(void *arg) * connected. */ rw_unlock(&hio_remote_lock[ii]); - rw_wlock(&hio_remote_lock[ii]); - assert(res->hr_remotein == NULL); - assert(res->hr_remoteout == NULL); pjdlog_debug(2, "remote_guard: Reconnecting to %s.", res->hr_remoteaddr); - init_remote(res); - if (ISCONNECTED(res, ii)) { + in = out = NULL; + if (init_remote(res, &in, &out)) { + rw_wlock(&hio_remote_lock[ii]); + assert(res->hr_remotein == NULL); + assert(res->hr_remoteout == NULL); + assert(in != NULL && out != NULL); + res->hr_remotein = in; + res->hr_remoteout = out; + rw_unlock(&hio_remote_lock[ii]); pjdlog_info("Successfully reconnected to %s.", res->hr_remoteaddr); + sync_start(); } else { /* Both connections should be NULL. 
*/ assert(res->hr_remotein == NULL); assert(res->hr_remoteout == NULL); + assert(in == NULL && out == NULL); pjdlog_debug(2, "remote_guard: Reconnect to %s failed.", res->hr_remoteaddr); timeout = RECONNECT_SLEEP; } - rw_unlock(&hio_remote_lock[ii]); } } (void)cv_timedwait(&hio_guard_cond, &hio_guard_lock, timeout); diff --git a/sbin/ifconfig/ifconfig.c b/sbin/ifconfig/ifconfig.c index aebcdc0..aa96175 100644 --- a/sbin/ifconfig/ifconfig.c +++ b/sbin/ifconfig/ifconfig.c @@ -881,7 +881,7 @@ unsetifdescr(const char *val, int value, int s, const struct afswtch *afp) #define IFCAPBITS \ "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \ "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \ -"\21VLAN_HWFILTER\23VLAN_HWTSO" +"\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE" /* * Print the status of the interface. If an address family was @@ -922,19 +922,20 @@ status(const struct afswtch *afp, const struct sockaddr_dl *sdl, ifr.ifr_buffer.buffer = descr; ifr.ifr_buffer.length = descrlen; if (ioctl(s, SIOCGIFDESCR, &ifr) == 0) { - if (strlen(descr) > 0) - printf("\tdescription: %s\n", descr); - break; - } else if (errno == ENAMETOOLONG) - descrlen = ifr.ifr_buffer.length; - else - break; - } else { + if (ifr.ifr_buffer.buffer == descr) { + if (strlen(descr) > 0) + printf("\tdescription: %s\n", + descr); + } else if (ifr.ifr_buffer.length > descrlen) { + descrlen = ifr.ifr_buffer.length; + continue; + } + } + } else warn("unable to allocate memory for interface" "description"); - break; - } - }; + break; + } if (ioctl(s, SIOCGIFCAP, (caddr_t)&ifr) == 0) { if (ifr.ifr_curcap != 0) { diff --git a/sbin/ifconfig/ifieee80211.c b/sbin/ifconfig/ifieee80211.c index 0709242..6cb4d2c 100644 --- a/sbin/ifconfig/ifieee80211.c +++ b/sbin/ifconfig/ifieee80211.c @@ -4509,6 +4509,7 @@ end: } else { LINE_BREAK(); list_roam(s); + LINE_BREAK(); } } diff --git a/sbin/ipf/ipftest/Makefile b/sbin/ipf/ipftest/Makefile index 8903f25..d089b2b 100644 --- 
a/sbin/ipf/ipftest/Makefile +++ b/sbin/ipf/ipftest/Makefile @@ -1,7 +1,5 @@ # $FreeBSD$ -WARNS=0 - PROG= ipftest SRCS= ${GENHDRS} ipftest.c fil.c ip_frag.c ip_state.c ip_nat.c \ ip_proxy.c ip_auth.c ip_htable.c ip_lookup.c \ @@ -10,6 +8,7 @@ SRCS= ${GENHDRS} ipftest.c fil.c ip_frag.c ip_state.c ip_nat.c \ ipf_l.c ipnat_y.c ipnat_l.c md5.c radix.c bpf_filter.c MAN= ipftest.1 +WARNS?= 0 CFLAGS+= -DIPFILTER_LOG -DIPFILTER_COMPILED -DIPFILTER_LOOKUP \ -DIPFILTER_SCAN -DIPFILTER_SYNC -DIPFILTER_CKSUM -I. diff --git a/sbin/ipfw/altq.c b/sbin/ipfw/altq.c index b00a1e0..8cf19e5 100644 --- a/sbin/ipfw/altq.c +++ b/sbin/ipfw/altq.c @@ -39,6 +39,7 @@ #include <net/if.h> /* IFNAMSIZ */ #include <net/pfvar.h> +#include <netinet/in.h> /* in_addr */ #include <netinet/ip_fw.h> /* diff --git a/sbin/ipfw/dummynet.c b/sbin/ipfw/dummynet.c index 490aa53..19d52a4 100644 --- a/sbin/ipfw/dummynet.c +++ b/sbin/ipfw/dummynet.c @@ -1,10 +1,5 @@ /* - * Copyright (c) 2002-2003 Luigi Rizzo - * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp - * Copyright (c) 1994 Ugen J.S.Antsilevich - * - * Idea and grammar partially left from: - * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 2002-2003,2010 Luigi Rizzo * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. @@ -15,8 +10,6 @@ * * This software is provided ``AS IS'' without any warranties of any kind. 
* - * NEW command line interface for IP firewall facility - * * $FreeBSD$ * * dummynet support @@ -24,7 +17,6 @@ #include <sys/types.h> #include <sys/socket.h> -#include <sys/queue.h> /* XXX there are several sysctl leftover here */ #include <sys/sysctl.h> @@ -46,6 +38,7 @@ #include <netinet/ip_dummynet.h> #include <arpa/inet.h> /* inet_ntoa */ + static struct _s_x dummynet_params[] = { { "plr", TOK_PLR }, { "noerror", TOK_NOERROR }, @@ -56,27 +49,59 @@ static struct _s_x dummynet_params[] = { { "src-port", TOK_SRCPORT }, { "proto", TOK_PROTO }, { "weight", TOK_WEIGHT }, + { "lmax", TOK_LMAX }, + { "maxlen", TOK_LMAX }, { "all", TOK_ALL }, - { "mask", TOK_MASK }, + { "mask", TOK_MASK }, /* alias for both */ + { "sched_mask", TOK_SCHED_MASK }, + { "flow_mask", TOK_FLOW_MASK }, { "droptail", TOK_DROPTAIL }, { "red", TOK_RED }, { "gred", TOK_GRED }, { "bw", TOK_BW }, { "bandwidth", TOK_BW }, { "delay", TOK_DELAY }, + { "link", TOK_LINK }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, + { "flowset", TOK_FLOWSET }, + { "sched", TOK_SCHED }, + { "pri", TOK_PRI }, + { "priority", TOK_PRI }, + { "type", TOK_TYPE }, { "flow-id", TOK_FLOWID}, { "dst-ipv6", TOK_DSTIP6}, { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, - { "profile", TOK_PIPE_PROFILE}, + { "profile", TOK_PROFILE}, { "burst", TOK_BURST}, { "dummynet-params", TOK_NULL }, { NULL, 0 } /* terminator */ }; +#define O_NEXT(p, len) ((void *)((char *)p + len)) + +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} + +/* make room in the buffer and move the pointer forward */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + +#if 0 static int sort_q(void *arg, const void *pa, const void *pb) { @@ -108,117 +133,84 @@ sort_q(void *arg, const void *pa, const void *pb) res = 1; 
return (int)(rev ? res : -res); } +#endif +/* print a mask and header for the subsequent list of flows */ static void -list_queues(struct dn_flow_set *fs, struct dn_flow_queue *q) +print_mask(struct ipfw_flow_id *id) +{ + if (!IS_IP6_FLOW_ID(id)) { + printf(" " + "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", + id->extra ? "queue," : "", + id->proto, + id->src_ip, id->src_port, + id->dst_ip, id->dst_port); + + printf("BKT Prot ___Source IP/port____ " + "____Dest. IP/port____ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); + } else { + char buf[255]; + printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ", + id->extra ? "queue," : "", + id->proto, id->flow_id6); + inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); + printf("%s/0x%04x -> ", buf, id->src_port); + inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); + printf("%s/0x%04x\n", buf, id->dst_port); + + printf("BKT ___Prot___ _flow-id_ " + "______________Source IPv6/port_______________ " + "_______________Dest. IPv6/port_______________ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); + } +} + +static void +list_flow(struct dn_flow *ni) { - int l; - int index_printed, indexes = 0; char buff[255]; struct protoent *pe; + struct in_addr ina; + struct ipfw_flow_id *id = &ni->fid; - if (fs->rq_elements == 0) - return; - - if (co.do_sort != 0) - qsort_r(q, fs->rq_elements, sizeof *q, NULL, sort_q); - - /* Print IPv4 flows */ - index_printed = 0; - for (l = 0; l < fs->rq_elements; l++) { - struct in_addr ina; - + pe = getprotobynumber(id->proto); /* XXX: Should check for IPv4 flows */ - if (IS_IP6_FLOW_ID(&(q[l].id))) - continue; - - if (!index_printed) { - index_printed = 1; - if (indexes > 0) /* currently a no-op */ - printf("\n"); - indexes++; - printf(" " - "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", - fs->flow_mask.proto, - fs->flow_mask.src_ip, fs->flow_mask.src_port, - fs->flow_mask.dst_ip, fs->flow_mask.dst_port); - - printf("BKT Prot ___Source IP/port____ " - "____Dest. 
IP/port____ " - "Tot_pkt/bytes Pkt/Byte Drp\n"); - } - - printf("%3d ", q[l].hash_slot); - pe = getprotobynumber(q[l].id.proto); + printf("%3u%c", (ni->oid.id) & 0xff, + id->extra ? '*' : ' '); + if (!IS_IP6_FLOW_ID(id)) { if (pe) printf("%-4s ", pe->p_name); else - printf("%4u ", q[l].id.proto); - ina.s_addr = htonl(q[l].id.src_ip); + printf("%4u ", id->proto); + ina.s_addr = htonl(id->src_ip); printf("%15s/%-5d ", - inet_ntoa(ina), q[l].id.src_port); - ina.s_addr = htonl(q[l].id.dst_ip); + inet_ntoa(ina), id->src_port); + ina.s_addr = htonl(id->dst_ip); printf("%15s/%-5d ", - inet_ntoa(ina), q[l].id.dst_port); - printf("%4llu %8llu %2u %4u %3u\n", - align_uint64(&q[l].tot_pkts), - align_uint64(&q[l].tot_bytes), - q[l].len, q[l].len_bytes, q[l].drops); - if (co.verbose) - printf(" S %20llu F %20llu\n", - align_uint64(&q[l].S), align_uint64(&q[l].F)); - } - - /* Print IPv6 flows */ - index_printed = 0; - for (l = 0; l < fs->rq_elements; l++) { - if (!IS_IP6_FLOW_ID(&(q[l].id))) - continue; - - if (!index_printed) { - index_printed = 1; - if (indexes > 0) - printf("\n"); - indexes++; - printf("\n mask: proto: 0x%02x, flow_id: 0x%08x, ", - fs->flow_mask.proto, fs->flow_mask.flow_id6); - inet_ntop(AF_INET6, &(fs->flow_mask.src_ip6), - buff, sizeof(buff)); - printf("%s/0x%04x -> ", buff, fs->flow_mask.src_port); - inet_ntop( AF_INET6, &(fs->flow_mask.dst_ip6), - buff, sizeof(buff) ); - printf("%s/0x%04x\n", buff, fs->flow_mask.dst_port); - - printf("BKT ___Prot___ _flow-id_ " - "______________Source IPv6/port_______________ " - "_______________Dest. 
IPv6/port_______________ " - "Tot_pkt/bytes Pkt/Byte Drp\n"); - } - printf("%3d ", q[l].hash_slot); - pe = getprotobynumber(q[l].id.proto); + inet_ntoa(ina), id->dst_port); + } else { + /* Print IPv6 flows */ if (pe != NULL) printf("%9s ", pe->p_name); else - printf("%9u ", q[l].id.proto); - printf("%7d %39s/%-5d ", q[l].id.flow_id6, - inet_ntop(AF_INET6, &(q[l].id.src_ip6), buff, sizeof(buff)), - q[l].id.src_port); + printf("%9u ", id->proto); + printf("%7d %39s/%-5d ", id->flow_id6, + inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), + id->src_port); printf(" %39s/%-5d ", - inet_ntop(AF_INET6, &(q[l].id.dst_ip6), buff, sizeof(buff)), - q[l].id.dst_port); - printf(" %4llu %8llu %2u %4u %3u\n", - align_uint64(&q[l].tot_pkts), - align_uint64(&q[l].tot_bytes), - q[l].len, q[l].len_bytes, q[l].drops); - if (co.verbose) - printf(" S %20llu F %20llu\n", - align_uint64(&q[l].S), - align_uint64(&q[l].F)); + inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), + id->dst_port); } + pr_u64(&ni->tot_pkts, 4); + pr_u64(&ni->tot_bytes, 8); + printf("%2u %4u %3u\n", + ni->length, ni->len_bytes, ni->drops); } static void -print_flowset_parms(struct dn_flow_set *fs, char *prefix) +print_flowset_parms(struct dn_fs *fs, char *prefix) { int l; char qs[30]; @@ -226,7 +218,7 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix) char red[90]; /* Display RED parameters */ l = fs->qsize; - if (fs->flags_fs & DN_QSIZE_IS_BYTES) { + if (fs->flags & DN_QSIZE_BYTES) { if (l >= 8192) sprintf(qs, "%d KB", l / 1024); else @@ -237,23 +229,34 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix) sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); else plr[0] = '\0'; - if (fs->flags_fs & DN_IS_RED) /* RED parameters */ + + if (fs->flags & DN_IS_RED) /* RED parameters */ sprintf(red, "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", - (fs->flags_fs & DN_IS_GENTLE_RED) ? 'G' : ' ', + (fs->flags & DN_IS_GENTLE_RED) ? 
'G' : ' ', 1.0 * fs->w_q / (double)(1 << SCALE_RED), - SCALE_VAL(fs->min_th), - SCALE_VAL(fs->max_th), + fs->min_th, + fs->max_th, 1.0 * fs->max_p / (double)(1 << SCALE_RED)); else sprintf(red, "droptail"); - printf("%s %s%s %d queues (%d buckets) %s\n", - prefix, qs, plr, fs->rq_elements, fs->rq_size, red); + if (prefix[0]) { + printf("%s %s%s %d queues (%d buckets) %s\n", + prefix, qs, plr, fs->oid.id, fs->buckets, red); + prefix[0] = '\0'; + } else { + printf("q%05d %s%s %d flows (%d buckets) sched %d " + "weight %d lmax %d pri %d %s\n", + fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, + fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); + if (fs->flags & DN_HAVE_MASK) + print_mask(&fs->flow_mask); + } } static void -print_extra_delay_parms(struct dn_pipe *p) +print_extra_delay_parms(struct dn_profile *p) { double loss; if (p->samples_no <= 0) @@ -265,105 +268,126 @@ print_extra_delay_parms(struct dn_pipe *p) p->name, loss, p->samples_no); } -void -ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[]) +static void +flush_buf(char *buf) { - int rulenum; - void *next = data; - struct dn_pipe *p = (struct dn_pipe *) data; - struct dn_flow_set *fs; - struct dn_flow_queue *q; - int l; - - if (ac > 0) - rulenum = strtoul(*av++, NULL, 10); - else - rulenum = 0; - for (; nbytes >= sizeof *p; p = (struct dn_pipe *)next) { - double b = p->bandwidth; - char buf[30]; - char prefix[80]; - char burst[5 + 7]; - - if (SLIST_NEXT(p, next) != (struct dn_pipe *)DN_IS_PIPE) - break; /* done with pipes, now queues */ - - /* - * compute length, as pipe have variable size - */ - l = sizeof(*p) + p->fs.rq_elements * sizeof(*q); - next = (char *)p + l; - nbytes -= l; - - if ((rulenum != 0 && rulenum != p->pipe_nr) || co.do_pipe == 2) - continue; - - /* - * Print rate (or clocking interface) - */ - if (p->if_name[0] != '\0') - sprintf(buf, "%s", p->if_name); - else if (b == 0) - sprintf(buf, "unlimited"); - else if (b >= 1000000) - sprintf(buf, "%7.3f Mbit/s", b/1000000); - 
else if (b >= 1000) - sprintf(buf, "%7.3f Kbit/s", b/1000); - else - sprintf(buf, "%7.3f bit/s ", b); - - sprintf(prefix, "%05d: %s %4d ms ", - p->pipe_nr, buf, p->delay); - - print_flowset_parms(&(p->fs), prefix); - - if (humanize_number(burst, sizeof(burst), p->burst, - "Byte", HN_AUTOSCALE, 0) < 0 || co.verbose) - printf("\t burst: %ju Byte\n", p->burst); - else - printf("\t burst: %s\n", burst); - - print_extra_delay_parms(p); - - q = (struct dn_flow_queue *)(p+1); - list_queues(&(p->fs), q); - } - for (fs = next; nbytes >= sizeof *fs; fs = next) { - char prefix[80]; - - if (SLIST_NEXT(fs, next) != (struct dn_flow_set *)DN_IS_QUEUE) - break; - l = sizeof(*fs) + fs->rq_elements * sizeof(*q); - next = (char *)fs + l; - nbytes -= l; - - if (rulenum != 0 && ((rulenum != fs->fs_nr && co.do_pipe == 2) || - (rulenum != fs->parent_nr && co.do_pipe == 1))) { - continue; - } - - q = (struct dn_flow_queue *)(fs+1); - sprintf(prefix, "q%05d: weight %d pipe %d ", - fs->fs_nr, fs->weight, fs->parent_nr); - print_flowset_parms(fs, prefix); - list_queues(fs, q); + if (buf[0]) + printf("%s\n", buf); + buf[0] = '\0'; +} + +/* + * generic list routine. We expect objects in a specific order, i.e. + * PIPES AND SCHEDULERS: + * link; scheduler; internal flowset if any; instances + * we can tell a pipe from the number. 
+ * + * FLOWSETS: + * flowset; queues; + * link i (int queue); scheduler i; si(i) { flowsets() : queues } + */ +static void +list_pipes(struct dn_id *oid, struct dn_id *end) +{ + char buf[160]; /* pending buffer */ + + buf[0] = '\0'; + for (; oid != end; oid = O_NEXT(oid, oid->len)) { + if (oid->len < sizeof(*oid)) + errx(1, "invalid oid len %d\n", oid->len); + + switch (oid->type) { + default: + flush_buf(buf); + printf("unrecognized object %d size %d\n", oid->type, oid->len); + break; + case DN_TEXT: /* list of attached flowsets */ + { + int i, l; + struct { + struct dn_id id; + uint32_t p[0]; + } *d = (void *)oid; + l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); + if (l == 0) + break; + printf(" Children flowsets: "); + for (i = 0; i < l; i++) + printf("%u ", d->p[i]); + printf("\n"); + break; + } + case DN_CMD_GET: + if (co.verbose) + printf("answer for cmd %d, len %d\n", oid->type, oid->id); + break; + case DN_SCH: { + struct dn_sch *s = (struct dn_sch *)oid; + flush_buf(buf); + printf(" sched %d type %s flags 0x%x %d buckets %d active\n", + s->sched_nr, + s->name, s->flags, s->buckets, s->oid.id); + if (s->flags & DN_HAVE_MASK) + print_mask(&s->sched_mask); + } + break; + + case DN_FLOW: + list_flow((struct dn_flow *)oid); + break; + + case DN_LINK: { + struct dn_link *p = (struct dn_link *)oid; + double b = p->bandwidth; + char bwbuf[30]; + char burst[5 + 7]; + + /* This starts a new object so flush buffer */ + flush_buf(buf); + /* data rate */ + if (b == 0) + sprintf(bwbuf, "unlimited "); + else if (b >= 1000000) + sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); + else if (b >= 1000) + sprintf(bwbuf, "%7.3f Kbit/s", b/1000); + else + sprintf(bwbuf, "%7.3f bit/s ", b); + + if (humanize_number(burst, sizeof(burst), p->burst, + "", HN_AUTOSCALE, 0) < 0 || co.verbose) + sprintf(burst, "%d", (int)p->burst); + sprintf(buf, "%05d: %s %4d ms burst %s", + p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); + } + break; + + case DN_FS: + print_flowset_parms((struct dn_fs 
*)oid, buf); + break; + case DN_PROFILE: + flush_buf(buf); + print_extra_delay_parms((struct dn_profile *)oid); } + flush_buf(buf); // XXX does it really go here ? + } } /* - * Delete pipe or queue i + * Delete pipe, queue or scheduler i */ int -ipfw_delete_pipe(int pipe_or_queue, int i) +ipfw_delete_pipe(int do_pipe, int i) { - struct dn_pipe p; - - memset(&p, 0, sizeof p); - if (pipe_or_queue == 1) - p.pipe_nr = i; /* pipe */ - else - p.fs.fs_nr = i; /* queue */ - i = do_cmd(IP_DUMMYNET_DEL, &p, sizeof p); + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : + ( (do_pipe == 2) ? DN_FS : DN_SCH); + cmd.a[0] = i; + i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); if (i) { i = 1; warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); @@ -400,7 +424,7 @@ ipfw_delete_pipe(int pipe_or_queue, int i) * The empirical curve may have both vertical and horizontal lines. * Vertical lines represent constant delay for a range of * probabilities; horizontal lines correspond to a discontinuty - * in the delay distribution: the pipe will use the largest delay + * in the delay distribution: the link will use the largest delay * for a given probability. 
* * To pass the curve to dummynet, we must store the parameters @@ -490,9 +514,12 @@ static void read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) { if (*bandwidth != -1) - warn("duplicate token, override bandwidth value!"); + warnx("duplicate token, override bandwidth value!"); if (arg[0] >= 'a' && arg[0] <= 'z') { + if (!if_name) { + errx(1, "no if support"); + } if (namelen >= IFNAMSIZ) warn("interface name truncated"); namelen--; @@ -508,7 +535,7 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) if (*end == 'K' || *end == 'k') { end++; bw *= 1000; - } else if (*end == 'M') { + } else if (*end == 'M' || *end == 'm') { end++; bw *= 1000000; } @@ -521,7 +548,8 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) errx(EX_DATAERR, "bandwidth too large"); *bandwidth = bw; - if_name[0] = '\0'; + if (if_name) + if_name[0] = '\0'; } } @@ -551,7 +579,8 @@ compare_points(const void *vp1, const void *vp2) #define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno static void -load_extra_delays(const char *filename, struct dn_pipe *p) +load_extra_delays(const char *filename, struct dn_profile *p, + struct dn_link *link) { char line[ED_MAX_LINE_LEN]; FILE *f; @@ -566,6 +595,9 @@ load_extra_delays(const char *filename, struct dn_pipe *p) struct point points[ED_MAX_SAMPLES_NO]; int points_no = 0; + /* XXX link never NULL? 
*/ + p->link_nr = link->link_nr; + profile_name[0] = '\0'; f = fopen(filename, "r"); if (f == NULL) @@ -606,7 +638,8 @@ load_extra_delays(const char *filename, struct dn_pipe *p) ED_MAX_SAMPLES_NO); do_points = 0; } else if (!strcasecmp(name, ED_TOK_BW)) { - read_bandwidth(arg, &p->bandwidth, p->if_name, sizeof(p->if_name)); + char buf[IFNAMSIZ]; + read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); } else if (!strcasecmp(name, ED_TOK_LOSS)) { if (loss != -1.0) errx(ED_EFMT("duplicated token: %s"), name); @@ -676,17 +709,17 @@ load_extra_delays(const char *filename, struct dn_pipe *p) double y2 = points[i+1].prob * samples; double x2 = points[i+1].delay; - int index = y1; + int ix = y1; int stop = y2; if (x1 == x2) { - for (; index<stop; ++index) - p->samples[index] = x1; + for (; ix<stop; ++ix) + p->samples[ix] = x1; } else { double m = (y2-y1)/(x2-x1); double c = y1 - m*x1; - for (; index<stop ; ++index) - p->samples[index] = (index - c)/m; + for (; ix<stop ; ++ix) + p->samples[ix] = (ix - c)/m; } } p->samples_no = samples; @@ -694,27 +727,120 @@ load_extra_delays(const char *filename, struct dn_pipe *p) strncpy(p->name, profile_name, sizeof(p->name)); } +/* + * configuration of pipes, schedulers, flowsets. + * When we configure a new scheduler, an empty pipe is created, so: + * + * do_pipe = 1 -> "pipe N config ..." only for backward compatibility + * sched N+Delta type fifo sched_mask ... + * pipe N+Delta <parameters> + * flowset N+Delta pipe N+Delta (no parameters) + * sched N type wf2q+ sched_mask ... + * pipe N <parameters> + * + * do_pipe = 2 -> flowset N config + * flowset N parameters + * + * do_pipe = 3 -> sched N config + * sched N parameters (default no pipe) + * optional Pipe N config ... 
+ * pipe ==> + */ void ipfw_config_pipe(int ac, char **av) { - int samples[ED_MAX_SAMPLES_NO]; - struct dn_pipe p; - int i; + int i, j; char *end; void *par = NULL; - - memset(&p, 0, sizeof p); - p.bandwidth = -1; + struct dn_id *buf, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + struct ipfw_flow_id *mask = NULL; + int lmax; + uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; + + /* + * allocate space for 1 header, + * 1 scheduler, 1 link, 1 flowset, 1 profile + */ + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); av++; ac--; /* Pipe number */ if (ac && isdigit(**av)) { i = atoi(*av); av++; ac--; - if (co.do_pipe == 1) - p.pipe_nr = i; - else - p.fs.fs_nr = i; + } else + i = -1; + if (i <= 0) + errx(EX_USAGE, "need a pipe/flowset/sched number"); + base = buf = safe_calloc(1, lmax); + /* all commands start with a 'CONFIGURE' and a version */ + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + switch (co.do_pipe) { + case 1: /* "pipe N config ..." */ + /* Allocate space for the WF2Q+ scheduler, its link + * and the FIFO flowset. Set the number, but leave + * the scheduler subtype and other parameters to 0 + * so the kernel will use appropriate defaults. + * XXX todo: add a flag to record if a parameter + * is actually configured. + * If we do a 'pipe config' mask -> sched_mask. + * The FIFO scheduler and link are derived from the + * WF2Q+ one in the kernel. 
+ */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + sch->sched_nr = i; + sch->oid.subtype = 0; /* defaults to WF2Q+ */ + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + *flags |= DN_PIPE_CMD; + + p->link_nr = i; + + /* This flowset is only for the FIFO scheduler */ + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + break; + + case 2: /* "queue N config ... " */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + fs->fs_nr = i; + mask = &fs->flow_mask; + flags = &fs->flags; + buckets = &fs->buckets; + break; + + case 3: /* "sched N config ..." */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + fs = o_next(&buf, sizeof(*fs), DN_FS); + sch->sched_nr = i; + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + /* fs is used only with !MULTIQUEUE schedulers */ + fs->fs_nr = i + DN_MAX_ID; + fs->sched_nr = i; + break; } + /* set to -1 those fields for which we want to reuse existing + * values from the kernel. + * Also, *_nr and subtype = 0 mean reuse the value from the kernel. + * XXX todo: support reuse of the mask. 
+ */ + if (p) + p->bandwidth = -1; + for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) + fs->par[j] = -1; while (ac > 0) { double d; int tok = match_token(dummynet_params, *av); @@ -722,41 +848,48 @@ ipfw_config_pipe(int ac, char **av) switch(tok) { case TOK_NOERROR: - p.fs.flags_fs |= DN_NOERROR; + NEED(fs, "noerror is only for pipes"); + fs->flags |= DN_NOERROR; break; case TOK_PLR: + NEED(fs, "plr is only for pipes"); NEED1("plr needs argument 0..1\n"); d = strtod(av[0], NULL); if (d > 1) d = 1; else if (d < 0) d = 0; - p.fs.plr = (int)(d*0x7fffffff); + fs->plr = (int)(d*0x7fffffff); ac--; av++; break; case TOK_QUEUE: + NEED(fs, "queue is only for pipes or flowsets"); NEED1("queue needs queue size\n"); end = NULL; - p.fs.qsize = strtoul(av[0], &end, 0); + fs->qsize = strtoul(av[0], &end, 0); if (*end == 'K' || *end == 'k') { - p.fs.flags_fs |= DN_QSIZE_IS_BYTES; - p.fs.qsize *= 1024; + fs->flags |= DN_QSIZE_BYTES; + fs->qsize *= 1024; } else if (*end == 'B' || _substrcmp2(end, "by", "bytes") == 0) { - p.fs.flags_fs |= DN_QSIZE_IS_BYTES; + fs->flags |= DN_QSIZE_BYTES; } ac--; av++; break; case TOK_BUCKETS: + NEED(fs, "buckets is only for pipes or flowsets"); NEED1("buckets needs argument\n"); - p.fs.rq_size = strtoul(av[0], NULL, 0); + *buckets = strtoul(av[0], NULL, 0); ac--; av++; break; + case TOK_FLOW_MASK: + case TOK_SCHED_MASK: case TOK_MASK: + NEED(mask, "tok_mask"); NEED1("mask needs mask specifier\n"); /* * per-flow queue, mask is dst_ip, dst_port, @@ -764,7 +897,7 @@ ipfw_config_pipe(int ac, char **av) */ par = NULL; - bzero(&p.fs.flow_mask, sizeof(p.fs.flow_mask)); + bzero(mask, sizeof(*mask)); end = NULL; while (ac >= 1) { @@ -780,44 +913,55 @@ ipfw_config_pipe(int ac, char **av) case TOK_ALL: /* * special case, all bits significant + * except 'extra' (the queue number) */ - p.fs.flow_mask.dst_ip = ~0; - p.fs.flow_mask.src_ip = ~0; - p.fs.flow_mask.dst_port = ~0; - p.fs.flow_mask.src_port = ~0; - p.fs.flow_mask.proto = ~0; - 
n2mask(&(p.fs.flow_mask.dst_ip6), 128); - n2mask(&(p.fs.flow_mask.src_ip6), 128); - p.fs.flow_mask.flow_id6 = ~0; - p.fs.flags_fs |= DN_HAVE_FLOW_MASK; + mask->dst_ip = ~0; + mask->src_ip = ~0; + mask->dst_port = ~0; + mask->src_port = ~0; + mask->proto = ~0; + n2mask(&mask->dst_ip6, 128); + n2mask(&mask->src_ip6, 128); + mask->flow_id6 = ~0; + *flags |= DN_HAVE_MASK; + goto end_mask; + + case TOK_QUEUE: + mask->extra = ~0; + *flags |= DN_HAVE_MASK; goto end_mask; case TOK_DSTIP: - p32 = &p.fs.flow_mask.dst_ip; + mask->addr_type = 4; + p32 = &mask->dst_ip; break; case TOK_SRCIP: - p32 = &p.fs.flow_mask.src_ip; + mask->addr_type = 4; + p32 = &mask->src_ip; break; case TOK_DSTIP6: - pa6 = &(p.fs.flow_mask.dst_ip6); + mask->addr_type = 6; + pa6 = &mask->dst_ip6; break; case TOK_SRCIP6: - pa6 = &(p.fs.flow_mask.src_ip6); + mask->addr_type = 6; + pa6 = &mask->src_ip6; break; case TOK_FLOWID: - p20 = &p.fs.flow_mask.flow_id6; + mask->addr_type = 6; + p20 = &mask->flow_id6; break; case TOK_DSTPORT: - p16 = &p.fs.flow_mask.dst_port; + p16 = &mask->dst_port; break; case TOK_SRCPORT: - p16 = &p.fs.flow_mask.src_port; + p16 = &mask->src_port; break; case TOK_PROTO: @@ -857,10 +1001,10 @@ ipfw_config_pipe(int ac, char **av) if (a > 0xFF) errx(EX_DATAERR, "proto mask must be 8 bit"); - p.fs.flow_mask.proto = (uint8_t)a; + mask->proto = (uint8_t)a; } if (a != 0) - p.fs.flags_fs |= DN_HAVE_FLOW_MASK; + *flags |= DN_HAVE_MASK; ac--; av++; } /* end while, config masks */ end_mask: @@ -869,9 +1013,9 @@ end_mask: case TOK_RED: case TOK_GRED: NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); - p.fs.flags_fs |= DN_IS_RED; + fs->flags |= DN_IS_RED; if (tok == TOK_GRED) - p.fs.flags_fs |= DN_IS_GENTLE_RED; + fs->flags |= DN_IS_GENTLE_RED; /* * the format for parameters is w_q/min_th/max_th/max_p */ @@ -879,82 +1023,108 @@ end_mask: double w_q = strtod(end, NULL); if (w_q > 1 || w_q <= 0) errx(EX_DATAERR, "0 < w_q <= 1"); - p.fs.w_q = (int) (w_q * (1 << SCALE_RED)); + fs->w_q = (int) 
(w_q * (1 << SCALE_RED)); } if ((end = strsep(&av[0], "/"))) { - p.fs.min_th = strtoul(end, &end, 0); + fs->min_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') - p.fs.min_th *= 1024; + fs->min_th *= 1024; } if ((end = strsep(&av[0], "/"))) { - p.fs.max_th = strtoul(end, &end, 0); + fs->max_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') - p.fs.max_th *= 1024; + fs->max_th *= 1024; } if ((end = strsep(&av[0], "/"))) { double max_p = strtod(end, NULL); if (max_p > 1 || max_p <= 0) errx(EX_DATAERR, "0 < max_p <= 1"); - p.fs.max_p = (int)(max_p * (1 << SCALE_RED)); + fs->max_p = (int)(max_p * (1 << SCALE_RED)); } ac--; av++; break; case TOK_DROPTAIL: - p.fs.flags_fs &= ~(DN_IS_RED|DN_IS_GENTLE_RED); + NEED(fs, "droptail is only for flowsets"); + fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); break; case TOK_BW: + NEED(p, "bw is only for links"); NEED1("bw needs bandwidth or interface\n"); - if (co.do_pipe != 1) - errx(EX_DATAERR, "bandwidth only valid for pipes"); - read_bandwidth(av[0], &p.bandwidth, p.if_name, sizeof(p.if_name)); + read_bandwidth(av[0], &p->bandwidth, NULL, 0); ac--; av++; break; case TOK_DELAY: - if (co.do_pipe != 1) - errx(EX_DATAERR, "delay only valid for pipes"); + NEED(p, "delay is only for links"); NEED1("delay needs argument 0..10000ms\n"); - p.delay = strtoul(av[0], NULL, 0); + p->delay = strtoul(av[0], NULL, 0); + ac--; av++; + break; + + case TOK_TYPE: { + int l; + NEED(sch, "type is only for schedulers"); + NEED1("type needs a string"); + l = strlen(av[0]); + if (l == 0 || l > 15) + errx(1, "type %s too long\n", av[0]); + strcpy(sch->name, av[0]); + sch->oid.subtype = 0; /* use string */ ac--; av++; break; + } case TOK_WEIGHT: - if (co.do_pipe == 1) - errx(EX_DATAERR,"weight only valid for queues"); - NEED1("weight needs argument 0..100\n"); - p.fs.weight = strtoul(av[0], &end, 0); + NEED(fs, "weight is only for flowsets"); + NEED1("weight needs argument\n"); + fs->par[0] = strtol(av[0], &end, 0); + ac--; av++; + 
break; + + case TOK_LMAX: + NEED(fs, "lmax is only for flowsets"); + NEED1("lmax needs argument\n"); + fs->par[1] = strtol(av[0], &end, 0); ac--; av++; break; + case TOK_PRI: + NEED(fs, "priority is only for flowsets"); + NEED1("priority needs argument\n"); + fs->par[2] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_SCHED: case TOK_PIPE: - if (co.do_pipe == 1) - errx(EX_DATAERR,"pipe only valid for queues"); - NEED1("pipe needs pipe_number\n"); - p.fs.parent_nr = strtoul(av[0], &end, 0); + NEED(fs, "pipe/sched"); + NEED1("pipe/link/sched needs number\n"); + fs->sched_nr = strtoul(av[0], &end, 0); ac--; av++; break; - case TOK_PIPE_PROFILE: - if (co.do_pipe != 1) - errx(EX_DATAERR, "extra delay only valid for pipes"); + case TOK_PROFILE: + NEED((!pf), "profile already set"); + NEED(p, "profile"); + { NEED1("extra delay needs the file name\n"); - p.samples = &samples[0]; - load_extra_delays(av[0], &p); + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + load_extra_delays(av[0], pf, p); //XXX can't fail? 
--ac; ++av; + } break; case TOK_BURST: - if (co.do_pipe != 1) - errx(EX_DATAERR, "burst only valid for pipes"); + NEED(p, "burst"); NEED1("burst needs argument\n"); errno = 0; - if (expand_number(av[0], (int64_t *)&p.burst) < 0) + if (expand_number(av[0], (int64_t *)&p->burst) < 0) if (errno != ERANGE) errx(EX_DATAERR, "burst: invalid argument"); - if (errno || p.burst > (1ULL << 48) - 1) + if (errno || p->burst > (1ULL << 48) - 1) errx(EX_DATAERR, "burst: out of range (0..2^48-1)"); ac--; av++; @@ -964,26 +1134,17 @@ end_mask: errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); } } - if (co.do_pipe == 1) { - if (p.pipe_nr == 0) - errx(EX_DATAERR, "pipe_nr must be > 0"); - if (p.delay > 10000) - errx(EX_DATAERR, "delay must be < 10000"); - } else { /* co.do_pipe == 2, queue */ - if (p.fs.parent_nr == 0) - errx(EX_DATAERR, "pipe must be > 0"); - if (p.fs.weight >100) - errx(EX_DATAERR, "weight must be <= 100"); - } - /* check for bandwidth value */ - if (p.bandwidth == -1) { - p.bandwidth = 0; - if (p.samples_no > 0) - errx(EX_DATAERR, "profile requires a bandwidth limit"); + /* check validity of parameters */ + if (p) { + if (p->delay > 10000) + errx(EX_DATAERR, "delay must be < 10000"); + if (p->bandwidth == -1) + p->bandwidth = 0; } - - if (p.fs.flags_fs & DN_QSIZE_IS_BYTES) { + if (fs) { + /* XXX accept a 0 scheduler to keep the default */ + if (fs->flags & DN_QSIZE_BYTES) { size_t len; long limit; @@ -991,9 +1152,9 @@ end_mask: if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", &limit, &len, NULL, 0) == -1) limit = 1024*1024; - if (p.fs.qsize > limit) + if (fs->qsize > limit) errx(EX_DATAERR, "queue size must be < %ldB", limit); - } else { + } else { size_t len; long limit; @@ -1001,27 +1162,25 @@ end_mask: if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", &limit, &len, NULL, 0) == -1) limit = 100; - if (p.fs.qsize > limit) + if (fs->qsize > limit) errx(EX_DATAERR, "2 <= queue size <= %ld", limit); - } - if (p.fs.flags_fs & DN_IS_RED) { + } + + 
if (fs->flags & DN_IS_RED) { size_t len; int lookup_depth, avg_pkt_size; - double s, idle, weight, w_q; - struct clockinfo ck; - int t; + double w_q; - if (p.fs.min_th >= p.fs.max_th) + if (fs->min_th >= fs->max_th) errx(EX_DATAERR, "min_th %d must be < than max_th %d", - p.fs.min_th, p.fs.max_th); - if (p.fs.max_th == 0) + fs->min_th, fs->max_th); + if (fs->max_th == 0) errx(EX_DATAERR, "max_th must be > 0"); len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", &lookup_depth, &len, NULL, 0) == -1) - errx(1, "sysctlbyname(\"%s\")", - "net.inet.ip.dummynet.red_lookup_depth"); + lookup_depth = 256; if (lookup_depth == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" " must be greater than zero"); @@ -1029,18 +1188,13 @@ end_mask: len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", &avg_pkt_size, &len, NULL, 0) == -1) + avg_pkt_size = 512; - errx(1, "sysctlbyname(\"%s\")", - "net.inet.ip.dummynet.red_avg_pkt_size"); if (avg_pkt_size == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_avg_pkt_size must" " be greater than zero"); - len = sizeof(struct clockinfo); - if (sysctlbyname("kern.clockrate", &ck, &len, NULL, 0) == -1) - errx(1, "sysctlbyname(\"%s\")", "kern.clockrate"); - /* * Ticks needed for sending a medium-sized packet. * Unfortunately, when we are configuring a WF2Q+ queue, we @@ -1050,38 +1204,181 @@ end_mask: * correct. But on the other hand, why do we want RED with * WF2Q+ ? */ +#if 0 if (p.bandwidth==0) /* this is a WF2Q+ queue */ s = 0; else s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; - +#endif /* * max idle time (in ticks) before avg queue size becomes 0. * NOTA: (3/w_q) is approx the value x so that * (1-w_q)^x < 10^-3. */ - w_q = ((double)p.fs.w_q) / (1 << SCALE_RED); + w_q = ((double)fs->w_q) / (1 << SCALE_RED); +#if 0 // go in kernel idle = s * 3. 
/ w_q; - p.fs.lookup_step = (int)idle / lookup_depth; - if (!p.fs.lookup_step) - p.fs.lookup_step = 1; + fs->lookup_step = (int)idle / lookup_depth; + if (!fs->lookup_step) + fs->lookup_step = 1; weight = 1 - w_q; - for (t = p.fs.lookup_step; t > 1; --t) + for (t = fs->lookup_step; t > 1; --t) weight *= 1 - w_q; - p.fs.lookup_weight = (int)(weight * (1 << SCALE_RED)); + fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); +#endif + } } - if (p.samples_no <= 0) { - i = do_cmd(IP_DUMMYNET_CONFIGURE, &p, sizeof p); - } else { - struct dn_pipe_max pm; - int len = sizeof(pm); - memcpy(&pm.pipe, &p, sizeof(pm.pipe)); - memcpy(&pm.samples, samples, sizeof(pm.samples)); - - i = do_cmd(IP_DUMMYNET_CONFIGURE, &pm, len); - } + i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); if (i) err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); } + +void +dummynet_flush(void) +{ + struct dn_id oid; + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_cmd(IP_DUMMYNET3, &oid, oid.len); +} + +/* Parse input for 'ipfw [pipe|sched|queue] show [range list]' + * Returns the number of ranges, and possibly stores them + * in the array v of size len. + */ +static int +parse_range(int ac, char *av[], uint32_t *v, int len) +{ + int n = 0; + char *endptr, *s; + uint32_t base[2]; + + if (v == NULL || len < 2) { + v = base; + len = 2; + } + + for (s = *av; s != NULL; av++, ac--) { + v[0] = strtoul(s, &endptr, 10); + v[1] = (*endptr != '-') ? v[0] : + strtoul(endptr+1, &endptr, 10); + if (*endptr == '\0') { /* prepare for next round */ + s = (ac > 0) ? 
*(av+1) : NULL; + } else { + if (*endptr != ',') { + warn("invalid number: %s", s); + s = ++endptr; + continue; + } + /* continue processing from here */ + s = ++endptr; + ac++; + av--; + } + if (v[1] < v[0] || + v[1] < 0 || v[1] >= DN_MAX_ID-1 || + v[0] < 0 || v[1] >= DN_MAX_ID-1) { + continue; /* invalid entry */ + } + n++; + /* translate if 'pipe list' */ + if (co.do_pipe == 1) { + v[0] += DN_MAX_ID; + v[1] += DN_MAX_ID; + } + v = (n*2 < len) ? v + 2 : base; + } + return n; +} + +/* main entry point for dummynet list functions. co.do_pipe indicates + * which function we want to support. + * av may contain filtering arguments, either individual entries + * or ranges, or lists (space or commas are valid separators). + * Format for a range can be n1-n2 or n3 n4 n5 ... + * In a range n1 must be <= n2, otherwise the range is ignored. + * A number 'n4' is translate in a range 'n4-n4' + * All number must be > 0 and < DN_MAX_ID-1 + */ +void +dummynet_list(int ac, char *av[], int show_counters) +{ + struct dn_id *oid, *x = NULL; + int ret, i, l; + int n; /* # of ranges */ + int buflen; + int max_size; /* largest obj passed up */ + + ac--; + av++; /* skip 'list' | 'show' word */ + + n = parse_range(ac, av, NULL, 0); /* Count # of ranges. */ + + /* Allocate space to store ranges */ + l = sizeof(*oid) + sizeof(uint32_t) * n * 2; + oid = safe_calloc(1, l); + oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); + + if (n > 0) /* store ranges in idx */ + parse_range(ac, av, (uint32_t *)(oid + 1), n*2); + /* + * Compute the size of the largest object returned. If the + * response leaves at least this much spare space in the + * buffer, then surely the response is complete; otherwise + * there might be a risk of truncation and we will need to + * retry with a larger buffer. + * XXX don't bother with smaller structs. 
+ */ + max_size = sizeof(struct dn_fs); + if (max_size < sizeof(struct dn_sch)) + max_size = sizeof(struct dn_sch); + if (max_size < sizeof(struct dn_flow)) + max_size = sizeof(struct dn_flow); + + switch (co.do_pipe) { + case 1: + oid->subtype = DN_LINK; /* list pipe */ + break; + case 2: + oid->subtype = DN_FS; /* list queue */ + break; + case 3: + oid->subtype = DN_SCH; /* list sched */ + break; + } + + /* + * Ask the kernel an estimate of the required space (result + * in oid.id), unless we are requesting a subset of objects, + * in which case the kernel does not give an exact answer. + * In any case, space might grow in the meantime due to the + * creation of new queues, so we must be prepared to retry. + */ + if (n > 0) { + buflen = 4*1024; + } else { + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); + if (ret != 0 || oid->id <= sizeof(*oid)) + goto done; + buflen = oid->id + max_size; + oid->len = sizeof(*oid); /* restore */ + } + /* Try a few times, until the buffer fits */ + for (i = 0; i < 20; i++) { + l = buflen; + x = safe_realloc(x, l); + bcopy(oid, x, oid->len); + ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); + if (ret != 0 || x->id <= sizeof(*oid)) + goto done; /* no response */ + if (l + max_size <= buflen) + break; /* ok */ + buflen *= 2; /* double for next attempt */ + } + list_pipes(x, O_NEXT(x, l)); +done: + if (x) + free(x); + free(oid); +} diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8 index fc83ecc..01dad12 100644 --- a/sbin/ipfw/ipfw.8 +++ b/sbin/ipfw/ipfw.8 @@ -1,13 +1,15 @@ .\" .\" $FreeBSD$ .\" -.Dd June 24, 2009 +.Dd March 20, 2010 .Dt IPFW 8 .Os .Sh NAME .Nm ipfw -.Nd IP firewall and traffic shaper control program +.Nd User interface for firewall, traffic shaper, packet scheduler, +in-kernel NAT. .Sh SYNOPSIS +.Ss FIREWALL CONFIGURATION .Nm .Op Fl cq .Cm add @@ -26,12 +28,6 @@ .Op Cm set Ar N .Brq Cm delete | zero | resetlog .Op Ar number ... 
-.Nm -.Cm enable -.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive -.Nm -.Cm disable -.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive .Pp .Nm .Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... @@ -43,8 +39,17 @@ .Cm set swap Ar number number .Nm .Cm set show +.Ss SYSCTL SHORTCUTS .Pp .Nm +.Cm enable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Nm +.Cm disable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Pp +.Ss LOOKUP TABLES +.Nm .Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value .Nm .Cm table Ar number Cm delete Ar addr Ns Op / Ns Ar masklen @@ -57,17 +62,19 @@ .Brq Ar number | all .Cm list .Pp +.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) .Nm -.Brq Cm pipe | queue +.Brq Cm pipe | queue | sched .Ar number .Cm config .Ar config-options .Nm .Op Fl s Op Ar field -.Brq Cm pipe | queue +.Brq Cm pipe | queue | sched .Brq Cm delete | list | show .Op Ar number ... .Pp +.Ss IN-KERNEL NAT .Nm .Op Fl q .Cm nat @@ -89,28 +96,27 @@ The .Nm utility is the user interface for controlling the .Xr ipfw 4 -firewall and the +firewall, the .Xr dummynet 4 -traffic shaper in -.Fx . +traffic shaper/packet scheduler, and the +in-kernel NAT services. .Pp -An -.Nm -configuration, or +A firewall configuration, or .Em ruleset , is made of a list of .Em rules numbered from 1 to 65535. -Packets are passed to -.Nm +Packets are passed to the firewall from a number of different places in the protocol stack (depending on the source and destination of the packet, -it is possible that -.Nm -is invoked multiple times on the same packet). +it is possible for the firewall to be +invoked multiple times on the same packet). The packet passed to the firewall is compared -against each of the rules in the firewall -.Em ruleset . 
+against each of the rules in the +.Em ruleset , +in rule-number order +(multiple rules with the same number are permitted, in which case +they are processed in order of insertion). When a match is found, the action corresponding to the matching rule is performed. .Pp @@ -118,9 +124,7 @@ Depending on the action and certain system settings, packets can be reinjected into the firewall at some rule after the matching one for further processing. .Pp -An -.Nm -ruleset always includes a +A ruleset always includes a .Em default rule (numbered 65535) which cannot be modified or deleted, and matches all packets. @@ -137,14 +141,14 @@ If the ruleset includes one or more rules with the or .Cm limit option, -.Nm -will have a +the firewall will have a .Em stateful -behaviour, i.e., upon a match it will create dynamic rules matching -the exact parameters (source and destination addresses and ports) -of the matching packet. -.Pp -These dynamic rules, which have a limited lifetime, are checked +behaviour, i.e., upon a match it will create +.Em dynamic rules , +i.e. rules that match packets with the same 5-tuple +(protocol, source and destination addresses and ports) +as the packet which caused their creation. +Dynamic rules, which have a limited lifetime, are checked at the first occurrence of a .Cm check-state , .Cm keep-state @@ -283,6 +287,7 @@ When listing, show last match timestamp as seconds from the epoch. This form can be more convenient for postprocessing by scripts. .El .Pp +.Ss LIST OF RULES AND PREPROCESSING To ease configuration, rules can be put into a file which is processed using .Nm @@ -322,14 +327,16 @@ This allows for flexible configuration files (like conditionalizing them on the local hostname) and the use of macros to centralize frequently required arguments like IP addresses. 
.Pp +.Ss TRAFFIC SHAPER CONFIGURATION The .Nm -.Cm pipe +.Cm pipe , queue and -.Cm queue -commands are used to configure the traffic shaper, as shown in the +.Cm sched +commands are used to configure the traffic shaper and packet scheduler. +See the .Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION -Section below. +Section below for details. .Pp If the world and the kernel get out of sync the .Nm @@ -362,7 +369,7 @@ have this picture in mind in order to design a correct ruleset. | to devices | .Ed .Pp -As can be noted from the above picture, the number of +The number of times the same packet goes through the firewall can vary between 0 and 4 depending on packet source and destination, and system configuration. @@ -421,9 +428,9 @@ Keywords are case-sensitive, whereas arguments may or may not be case-sensitive depending on their nature (e.g.\& uid's are, hostnames are not). .Pp -In -.Nm ipfw2 -you can introduce spaces after commas ',' to make +Some arguments (e.g. port or address lists) are comma-separated +lists of values. +In this case, spaces after commas ',' are allowed to make the line more readable. You can also put the entire command (including flags) into a single argument. @@ -434,9 +441,7 @@ ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" .Ed .Sh RULE FORMAT -The format of -.Nm -rules is the following: +The format of firewall rules is the following: .Bd -ragged -offset indent .Bk -words .Op Ar rule_number @@ -496,7 +501,7 @@ in future forwarding decisions. .El .Pp Note that some of the above information, e.g.\& source MAC or IP addresses and -TCP/UDP ports, could easily be spoofed, so filtering on those fields +TCP/UDP ports, can be easily spoofed, so filtering on those fields alone might not guarantee the desired results. .Bl -tag -width indent .It Ar rule_number @@ -1399,7 +1404,7 @@ If not found, the match fails. Otherwise, the match succeeds and .Cm tablearg is set to the value extracted from the table. 
-.Br +.Pp This option can be useful to quickly dispatch traffic based on certain packet fields. See the @@ -1496,7 +1501,7 @@ is invalid) whenever .Cm xmit is used. .Pp -A packet may not have a receive or transmit interface: packets +A packet might not have a receive or transmit interface: packets originating from the local host have no receive interface, while packets destined for the local host have no transmit interface. @@ -1643,15 +1648,17 @@ because it engages only on packets with source addresses of directly connected networks instead of all source addresses. .El .Sh LOOKUP TABLES -Lookup tables are useful to handle large sparse address sets, -typically from a hundred to several thousands of entries. +Lookup tables are useful to handle large sparse sets of +addresses or other search keys (e.g. ports, jail IDs). +In the rest of this section we will use the term ``address'' +to mean any unsigned value of up to 32 bits. There may be up to 128 different lookup tables, numbered 0 to 127. .Pp Each entry is represented by an .Ar addr Ns Op / Ns Ar masklen and will match all addresses with base .Ar addr -(specified as an IP address or a hostname) +(specified as an IP address, a hostname or an unsigned integer) and mask width of .Ar masklen bits. @@ -1669,9 +1676,9 @@ is not specified, it defaults to 0. .Pp An entry can be added to a table .Pq Cm add , -removed from a table -.Pq Cm delete , -a table can be examined +or removed from a table +.Pq Cm delete . +A table can be examined .Pq Cm list or flushed .Pq Cm flush . @@ -1680,7 +1687,7 @@ Internally, each table is stored in a Radix tree, the same way as the routing table (see .Xr route 4 ) . .Pp -Lookup tables currently support IPv4 addresses only. +Lookup tables currently support only ports, jail IDs and IPv4 addresses. .Pp The .Cm tablearg @@ -1838,9 +1845,9 @@ for more examples on how to use dynamic rules.
.Nm is also the user interface for the .Nm dummynet -traffic shaper and network emulator, a subsystem that +traffic shaper, packet scheduler and network emulator, a subsystem that can artificially queue, delay or drop packets -emulator the behaviour of certain network links +emulating the behaviour of certain network links or queueing systems. .Pp .Nm dummynet @@ -1852,26 +1859,33 @@ Matching packets are then passed to either of two different objects, which implement the traffic regulation: .Bl -hang -offset XXXX .It Em pipe -A pipe emulates a link with given bandwidth, propagation delay, +A +.Em pipe +emulates a +.Em link +with given bandwidth and propagation delay, +driven by a FIFO scheduler and a single queue with programmable queue size and packet loss rate. -Packets are queued in front of the pipe as they come out from the classifier, -and then transferred to the pipe according to the pipe's parameters. +Packets are appended to the queue as they come out from +.Nm ipfw , +and then transferred in FIFO order to the link at the desired rate. .It Em queue -A queue -is an abstraction used to implement the WF2Q+ -(Worst-case Fair Weighted Fair Queueing) policy, which is -an efficient variant of the WFQ policy. -.Pp -The queue associates a -.Em weight -and a reference pipe to each flow (a flow is a set of packets -with the same addresses and ports after masking). -All backlogged flows (i.e., those -with packets queued) linked to the same pipe share the pipe's -bandwidth proportionally to their weights. -Note that weights are not priorities; a flow with a lower weight -is still guaranteed to get its fraction of the bandwidth even if a -flow with a higher weight is permanently backlogged. +A +.Em queue +is an abstraction used to implement packet scheduling +using one of several packet scheduling algorithms. +Packets sent to a +.Em queue +are first grouped into flows according to a mask on the 5-tuple. 
+Flows are then passed to the scheduler associated to the +.Em queue , +and each flow uses scheduling parameters (weight and others) +as configured in the +.Em queue +itself. +A scheduler in turn is connected to an emulated link, +and arbitrates the link's bandwidth among backlogged flows according to +weights and to the features of the scheduling algorithm in use. .El .Pp In practice, @@ -1880,6 +1894,52 @@ can be used to set hard limits to the bandwidth that a flow can use, whereas .Em queues can be used to determine how different flows share the available bandwidth. .Pp +A graphical representation of the binding of queues, +flows, schedulers and links is below. +.Bd -literal -offset indent + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ +.Ed +It is important to understand the role of the SCHED_MASK +and FLOW_MASK, which are configured through the commands +.Dl "ipfw sched N config mask SCHED_MASK ..." +and +.Dl "ipfw queue X config mask FLOW_MASK ..." . +.Pp +The SCHED_MASK is used to assign flows to one or more +scheduler instances, one for each +value of the packet's 5-tuple after applying SCHED_MASK. +As an example, using ``src-ip 0xffffff00'' creates one instance +for each /24 source subnet. +.Pp +The FLOW_MASK, together with the SCHED_MASK, is used to split +packets into flows. As an example, using +``src-ip 0x000000ff'' +together with the previous SCHED_MASK makes a flow for +each individual source address. In turn, flows for each /24 +subnet will be sent to the same scheduler instance.
+.Pp +The above diagram holds even for the +.Em pipe +case, with the only restriction that a +.Em pipe +only supports a SCHED_MASK, and forces the use of a FIFO +scheduler (these are for backward compatibility reasons; +in fact, internally, a +.Nm dummynet's +pipe is implemented exactly as above). +.Pp There are two modes of .Nm dummynet operation: @@ -1911,16 +1971,19 @@ mode can be enabled by setting the .Xr sysctl 8 variable to a non-zero value. .Pp -.Ss PIPE AND QUEUE CONFIGURATION +.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION The -.Em pipe -and +.Em pipe , .Em queue +and +.Em scheduler configuration commands are the following: .Bd -ragged -offset indent .Cm pipe Ar number Cm config Ar pipe-configuration .Pp .Cm queue Ar number Cm config Ar queue-configuration +.Pp +.Cm sched Ar number Cm config Ar sched-configuration .Ed .Pp The following parameters can be configured for a pipe: @@ -2073,6 +2136,41 @@ Specifies the weight to be used for flows matching this queue. The weight must be in the range 1..100, and defaults to 1. .El .Pp +The following parameters can be configured for a scheduler: +.Pp +.Bl -tag -width indent -compact +.It Cm type Ar {fifo | wf2qp | rr | qfq} +specifies the scheduling algorithm to use. +.Bl -tag -width indent -compact +.It Cm fifo +is just a FIFO scheduler (which means that all packets +are stored in the same queue as they arrive at the scheduler). +FIFO has O(1) per-packet time complexity, with very low +constants (estimate 60-80ns on a 2GHz desktop machine) +but gives no service guarantees. +.It Cm wf2qp +implements the WF2Q+ algorithm, which is a Weighted Fair Queueing +algorithm which permits flows to share bandwidth according to +their weights. Note that weights are not priorities; even a flow +with a minuscule weight will never starve. +WF2Q+ has O(log N) per-packet processing cost, where N is the number +of flows, and is the default algorithm used by previous versions of +dummynet's queues.
+.It Cm rr +implements the Deficit Round Robin algorithm, which has O(1) processing +costs (roughly, 100-150ns per packet) +and permits bandwidth allocation according to weights, but +with poor service guarantees. +.It Cm qfq +implements the QFQ algorithm, which is a very fast variant of +WF2Q+, with similar service guarantees and O(1) processing +costs (roughly, 200-250ns per packet). +.El +.El +.Pp +In addition to the type, all parameters allowed for a pipe can also +be specified for a scheduler. +.Pp Finally, the following parameters can be configured for both pipes and queues: .Pp diff --git a/sbin/ipfw/ipfw2.c b/sbin/ipfw/ipfw2.c index d4740c9..f313b51 100644 --- a/sbin/ipfw/ipfw2.c +++ b/sbin/ipfw/ipfw2.c @@ -57,7 +57,7 @@ struct cmdline_opts co; /* global options */ int resvd_set_number = RESVD_SET; #define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ - if (!ac) \ + if (!av[0]) \ errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ if (_substrcmp(*av, "tablearg") == 0) { \ arg = IP_FW_TABLEARG; \ @@ -65,23 +65,23 @@ int resvd_set_number = RESVD_SET; } \ \ { \ - long val; \ + long _xval; \ char *end; \ \ - val = strtol(*av, &end, 10); \ + _xval = strtol(*av, &end, 10); \ \ - if (!isdigit(**av) || *end != '\0' || (val == 0 && errno == EINVAL)) \ + if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ errx(EX_DATAERR, "%s: invalid argument: %s", \ match_value(s_x, tok), *av); \ \ - if (errno == ERANGE || val < min || val > max) \ + if (errno == ERANGE || _xval < min || _xval > max) \ errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ match_value(s_x, tok), min, max, *av); \ \ - if (val == IP_FW_TABLEARG) \ + if (_xval == IP_FW_TABLEARG) \ errx(EX_DATAERR, "%s: illegal argument value: %s", \ match_value(s_x, tok), *av); \ - arg = val; \ + arg = _xval; \ } \ } while (0) @@ -231,7 +231,7 @@ static struct _s_x rule_action_params[] = { */ static int lookup_key[] = { TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, - 
TOK_UID, TOK_JAIL, -1 }; + TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; static struct _s_x rule_options[] = { { "tagged", TOK_TAGGED }, @@ -258,6 +258,7 @@ static struct _s_x rule_options[] = { { "iplen", TOK_IPLEN }, { "ipid", TOK_IPID }, { "ipprecedence", TOK_IPPRECEDENCE }, + { "dscp", TOK_DSCP }, { "iptos", TOK_IPTOS }, { "ipttl", TOK_IPTTL }, { "ipversion", TOK_IPVER }, @@ -313,22 +314,29 @@ static struct _s_x rule_options[] = { { NULL, 0 } /* terminator */ }; -/* - * The following is used to generate a printable argument for - * 64-bit numbers, irrespective of platform alignment and bit size. - * Because all the printf in this program use %llu as a format, - * we just return an unsigned long long, which is larger than - * we need in certain cases, but saves the hassle of using - * PRIu64 as a format specifier. - * We don't care about inlining, this is not performance critical code. +/* + * Helper routine to print a possibly unaligned uint64_t on + * various platform. If width > 0, print the value with + * the desired width, followed by a space; + * otherwise, return the required width. */ -unsigned long long -align_uint64(const uint64_t *pll) +int +pr_u64(uint64_t *pd, int width) { - uint64_t ret; - - bcopy (pll, &ret, sizeof(ret)); - return ret; +#ifdef TCC +#define U64_FMT "I64" +#else +#define U64_FMT "llu" +#endif + uint64_t u; + unsigned long long d; + + bcopy (pd, &u, sizeof(u)); + d = u; + return (width > 0) ? + printf("%*" U64_FMT " ", width, d) : + snprintf(NULL, 0, "%" U64_FMT, d) ; +#undef U64_FMT } void * @@ -353,6 +361,7 @@ safe_realloc(void *ptr, size_t size) /* * conditionally runs the command. 
+ * Selected options or negative -> getsockopt */ int do_cmd(int optname, void *optval, uintptr_t optlen) @@ -372,11 +381,15 @@ do_cmd(int optname, void *optval, uintptr_t optlen) optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST || optname == IP_FW_TABLE_GETSIZE || optname == IP_FW_NAT_GET_CONFIG || - optname == IP_FW_NAT_GET_LOG) + optname < 0 || + optname == IP_FW_NAT_GET_LOG) { + if (optname < 0) + optname = -optname; i = getsockopt(s, IPPROTO_IP, optname, optval, (socklen_t *)optlen); - else + } else { i = setsockopt(s, IPPROTO_IP, optname, optval, optlen); + } return i; } @@ -749,7 +762,7 @@ static void print_ip(ipfw_insn_ip *cmd, char const *s) { struct hostent *he = NULL; - int len = F_LEN((ipfw_insn *)cmd); + uint32_t len = F_LEN((ipfw_insn *)cmd); uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { @@ -915,9 +928,9 @@ print_icmptypes(ipfw_insn_u32 *cmd) #define HAVE_DSTIP 0x0004 #define HAVE_PROTO4 0x0008 #define HAVE_PROTO6 0x0010 +#define HAVE_IP 0x0100 #define HAVE_OPTIONS 0x8000 -#define HAVE_IP (HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP) static void show_prerequisites(int *flags, int want, int cmd __unused) { @@ -967,9 +980,10 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) } printf("%05u ", rule->rulenum); - if (pcwidth>0 || bcwidth>0) - printf("%*llu %*llu ", pcwidth, align_uint64(&rule->pcnt), - bcwidth, align_uint64(&rule->bcnt)); + if (pcwidth > 0 || bcwidth > 0) { + pr_u64(&rule->pcnt, pcwidth); + pr_u64(&rule->bcnt, bcwidth); + } if (co.do_time == 2) printf("%10u ", rule->timestamp); @@ -1018,7 +1032,9 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) switch(cmd->opcode) { case O_CHECK_STATE: printf("check-state"); - flags = HAVE_IP; /* avoid printing anything else */ + /* avoid printing anything else */ + flags = HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP; break; case O_ACCEPT: @@ -1126,9 +1142,11 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) 
else printf(" log"); } +#ifndef NO_ALTQ if (altqptr) { print_altq_cmd(altqptr); } +#endif if (tagptr) { if (tagptr->len & F_NOT) PRINT_UINT_ARG(" untag ", tagptr->arg1); @@ -1156,7 +1174,8 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) show_prerequisites(&flags, HAVE_PROTO, 0); printf(" from any to any"); } - flags |= HAVE_IP | HAVE_OPTIONS; + flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO | + HAVE_SRCIP | HAVE_DSTIP; } if (co.comment_only) @@ -1245,9 +1264,12 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) break; case O_IP_DSTPORT: - show_prerequisites(&flags, HAVE_IP, 0); + show_prerequisites(&flags, + HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP, 0); case O_IP_SRCPORT: - show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); + show_prerequisites(&flags, + HAVE_PROTO | HAVE_SRCIP, 0); if ((cmd->len & F_OR) && !or_block) printf(" {"); if (cmd->len & F_NOT) @@ -1268,7 +1290,8 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) && !(flags & HAVE_PROTO)) show_prerequisites(&flags, - HAVE_IP | HAVE_OPTIONS, 0); + HAVE_PROTO | HAVE_IP | HAVE_SRCIP | + HAVE_DSTIP | HAVE_OPTIONS, 0); if (flags & HAVE_OPTIONS) printf(" proto"); if (pe) @@ -1286,7 +1309,8 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) ((cmd->opcode == O_IP4) && (flags & HAVE_PROTO4))) break; - show_prerequisites(&flags, HAVE_IP | HAVE_OPTIONS, 0); + show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | + HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0); if ((cmd->len & F_OR) && !or_block) printf(" {"); if (cmd->len & F_NOT && cmd->opcode != O_IN) @@ -1540,7 +1564,8 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) or_block = 0; } } - show_prerequisites(&flags, HAVE_IP, 0); + show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP + | HAVE_IP, 0); if (comment) printf(" // %s", comment); printf("\n"); @@ -1560,10 +1585,12 @@ show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth) } bcopy(&d->rule, &rulenum, 
sizeof(rulenum)); printf("%05d", rulenum); - if (pcwidth>0 || bcwidth>0) - printf(" %*llu %*llu (%ds)", pcwidth, - align_uint64(&d->pcnt), bcwidth, - align_uint64(&d->bcnt), d->expire); + if (pcwidth > 0 || bcwidth > 0) { + printf(" "); + pr_u64(&d->pcnt, pcwidth); + pr_u64(&d->bcnt, bcwidth); + printf("(%ds)", d->expire); + } switch (d->dyn_type) { case O_LIMIT_PARENT: printf(" PARENT %d", d->count); @@ -1606,26 +1633,33 @@ show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth) * ipfw set move rule X to Y */ void -ipfw_sets_handler(int ac, char *av[]) +ipfw_sets_handler(char *av[]) { uint32_t set_disable, masks[2]; int i, nbytes; uint16_t rulenum; uint8_t cmd, new_set; - ac--; av++; - if (!ac) + if (av[0] == NULL) errx(EX_USAGE, "set needs command"); if (_substrcmp(*av, "show") == 0) { - void *data; + void *data = NULL; char const *msg; + int nalloc; + + nalloc = nbytes = sizeof(struct ip_fw); + while (nbytes >= nalloc) { + if (data) + free(data); + nalloc = nalloc * 2 + 200; + nbytes = nalloc; + data = safe_calloc(1, nbytes); + if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0) + err(EX_OSERR, "getsockopt(IP_FW_GET)"); + } - nbytes = sizeof(struct ip_fw); - data = safe_calloc(1, nbytes); - if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0) - err(EX_OSERR, "getsockopt(IP_FW_GET)"); bcopy(&((struct ip_fw *)data)->next_rule, &set_disable, sizeof(set_disable)); @@ -1642,8 +1676,8 @@ ipfw_sets_handler(int ac, char *av[]) } printf("\n"); } else if (_substrcmp(*av, "swap") == 0) { - ac--; av++; - if (ac != 2) + av++; + if ( av[0] == NULL || av[1] == NULL ) errx(EX_USAGE, "set swap needs 2 set numbers\n"); rulenum = atoi(av[0]); new_set = atoi(av[1]); @@ -1654,13 +1688,14 @@ ipfw_sets_handler(int ac, char *av[]) masks[0] = (4 << 24) | (new_set << 16) | (rulenum); i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); } else if (_substrcmp(*av, "move") == 0) { - ac--; av++; - if (ac && _substrcmp(*av, "rule") == 0) { + av++; + if (av[0] && _substrcmp(*av, "rule") == 0) { 
cmd = 2; - ac--; av++; + av++; } else cmd = 3; - if (ac != 3 || _substrcmp(av[1], "to") != 0) + if (av[0] == NULL || av[1] == NULL || av[2] == NULL || + av[3] != NULL || _substrcmp(av[1], "to") != 0) errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); rulenum = atoi(av[0]); new_set = atoi(av[2]); @@ -1675,10 +1710,10 @@ ipfw_sets_handler(int ac, char *av[]) _substrcmp(*av, "enable") == 0 ) { int which = _substrcmp(*av, "enable") == 0 ? 1 : 0; - ac--; av++; + av++; masks[0] = masks[1] = 0; - while (ac) { + while (av[0]) { if (isdigit(**av)) { i = atoi(*av); if (i < 0 || i > RESVD_SET) @@ -1692,7 +1727,7 @@ ipfw_sets_handler(int ac, char *av[]) else errx(EX_DATAERR, "invalid set command %s\n", *av); - av++; ac--; + av++; } if ( (masks[0] & masks[1]) != 0 ) errx(EX_DATAERR, @@ -1706,16 +1741,17 @@ ipfw_sets_handler(int ac, char *av[]) } void -ipfw_sysctl_handler(int ac, char *av[], int which) +ipfw_sysctl_handler(char *av[], int which) { - ac--; av++; - if (ac == 0) { + if (av[0] == NULL) { warnx("missing keyword to enable/disable\n"); } else if (_substrcmp(*av, "firewall") == 0) { sysctlbyname("net.inet.ip.fw.enable", NULL, 0, &which, sizeof(which)); + sysctlbyname("net.inet6.ip6.fw.enable", NULL, 0, + &which, sizeof(which)); } else if (_substrcmp(*av, "one_pass") == 0) { sysctlbyname("net.inet.ip.fw.one_pass", NULL, 0, &which, sizeof(which)); @@ -1728,8 +1764,10 @@ ipfw_sysctl_handler(int ac, char *av[], int which) } else if (_substrcmp(*av, "dyn_keepalive") == 0) { sysctlbyname("net.inet.ip.fw.dyn_keepalive", NULL, 0, &which, sizeof(which)); +#ifndef NO_ALTQ } else if (_substrcmp(*av, "altq") == 0) { altq_set_enabled(which); +#endif } else { warnx("unrecognize enable/disable keyword: %s\n", *av); } @@ -1762,6 +1800,10 @@ ipfw_list(int ac, char *av[], int show_counters) fprintf(stderr, "Testing only, list disabled\n"); return; } + if (co.do_pipe) { + dummynet_list(ac, av, show_counters); + return; + } ac--; av++; @@ -1778,11 +1820,6 @@ ipfw_list(int ac, char *av[], 
int show_counters) co.do_pipe ? "DUMMYNET" : "FW"); } - if (co.do_pipe) { - ipfw_list_pipes(data, nbytes, ac, av); - goto done; - } - /* * Count static rules. They have variable size so we * need to scan the list to count them. @@ -1810,14 +1847,12 @@ ipfw_list(int ac, char *av[], int show_counters) continue; /* packet counter */ - width = snprintf(NULL, 0, "%llu", - align_uint64(&r->pcnt)); + width = pr_u64(&r->pcnt, 0); if (width > pcwidth) pcwidth = width; /* byte counter */ - width = snprintf(NULL, 0, "%llu", - align_uint64(&r->bcnt)); + width = pr_u64(&r->bcnt, 0); if (width > bcwidth) bcwidth = width; } @@ -1831,13 +1866,11 @@ ipfw_list(int ac, char *av[], int show_counters) if (set != co.use_set - 1) continue; } - width = snprintf(NULL, 0, "%llu", - align_uint64(&d->pcnt)); + width = pr_u64(&d->pcnt, 0); if (width > pcwidth) pcwidth = width; - width = snprintf(NULL, 0, "%llu", - align_uint64(&d->bcnt)); + width = pr_u64(&d->bcnt, 0); if (width > bcwidth) bcwidth = width; } @@ -2130,7 +2163,7 @@ fill_ip(ipfw_insn_ip *cmd, char *av) return; } /* A single IP can be stored in an optimized format */ - if (d[1] == ~0 && av == NULL && len == 0) { + if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); return; } @@ -2199,29 +2232,28 @@ fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode, void -ipfw_delete(int ac, char *av[]) +ipfw_delete(char *av[]) { uint32_t rulenum; int i; int exitval = EX_OK; int do_set = 0; - - av++; ac--; + av++; NEED1("missing rule specification"); - if (ac > 0 && _substrcmp(*av, "set") == 0) { + if ( *av && _substrcmp(*av, "set") == 0) { /* Do not allow using the following syntax: * ipfw set N delete set M */ if (co.use_set) errx(EX_DATAERR, "invalid syntax"); do_set = 1; /* delete set */ - ac--; av++; + av++; } /* Rule number */ - while (ac && isdigit(**av)) { - i = atoi(*av); av++; ac--; + while (*av && isdigit(**av)) { + i = atoi(*av); av++; if (co.do_nat) { exitval = do_cmd(IP_FW_NAT_DEL, &i, 
sizeof i); if (exitval) { @@ -2275,7 +2307,8 @@ fill_iface(ipfw_insn_if *cmd, char *arg) static void get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) { - int i, l; + int i; + size_t l; char *ap, *ptr, *optr; struct ether_addr *mac; const char *macset = "0123456789abcdefABCDEF:"; @@ -2297,11 +2330,11 @@ get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) if (ptr != NULL) { /* we have mask? */ if (p[ptr - optr - 1] == '/') { /* mask len */ - l = strtol(ptr, &ap, 10); - if (*ap != 0 || l > ETHER_ADDR_LEN * 8 || l < 0) + long ml = strtol(ptr, &ap, 10); + if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) errx(EX_DATAERR, "Incorrect mask length"); - for (i = 0; l > 0 && i < ETHER_ADDR_LEN; l -= 8, i++) - mask[i] = (l >= 8) ? 0xff: (~0) << (8 - l); + for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) + mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); } else { /* mask */ l = strlen(ptr); if (strspn(ptr, macset) != l || @@ -2336,7 +2369,7 @@ next_cmd(ipfw_insn *cmd) * Takes arguments and copies them into a comment */ static void -fill_comment(ipfw_insn *cmd, int ac, char **av) +fill_comment(ipfw_insn *cmd, char **av) { int i, l; char *p = (char *)(cmd + 1); @@ -2345,7 +2378,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av) cmd->len = (cmd->len & (F_NOT | F_OR)); /* Compute length of comment string. */ - for (i = 0, l = 0; i < ac; i++) + for (i = 0, l = 0; av[i] != NULL; i++) l += strlen(av[i]) + 1; if (l == 0) return; @@ -2354,7 +2387,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av) "comment too long (max 80 chars)"); l = 1 + (l+3)/4; cmd->len = (cmd->len & (F_NOT | F_OR)) | l; - for (i = 0; i < ac; i++) { + for (i = 0; av[i] != NULL; i++) { strcpy(p, av[i]); p += strlen(av[i]); *p++ = ' '; @@ -2379,11 +2412,11 @@ fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) * two microinstructions, and returns the pointer to the last one. 
*/ static ipfw_insn * -add_mac(ipfw_insn *cmd, int ac, char *av[]) +add_mac(ipfw_insn *cmd, char *av[]) { ipfw_insn_mac *mac; - if (ac < 2) + if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) errx(EX_DATAERR, "MAC dst src"); cmd->opcode = O_MACADDR2; @@ -2397,9 +2430,9 @@ add_mac(ipfw_insn *cmd, int ac, char *av[]) } static ipfw_insn * -add_mactype(ipfw_insn *cmd, int ac, char *av) +add_mactype(ipfw_insn *cmd, char *av) { - if (ac < 1) + if (!av) errx(EX_DATAERR, "missing MAC type"); if (strcmp(av, "any") != 0) { /* we have a non-null type */ fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE); @@ -2507,6 +2540,7 @@ add_dstip(ipfw_insn *cmd, char *av) static ipfw_insn * add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode) { + /* XXX "any" is trapped before. Perhaps "to" */ if (_substrcmp(av, "any") == 0) { return NULL; } else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) { @@ -2530,11 +2564,11 @@ add_src(ipfw_insn *cmd, char *av, u_char proto) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || - inet_pton(AF_INET6, host, &a)) + inet_pton(AF_INET6, host, &a) == 1) ret = add_srcip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || - !inet_pton(AF_INET6, host, &a))) + inet_pton(AF_INET6, host, &a) != 1)) ret = add_srcip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; @@ -2556,11 +2590,11 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || - inet_pton(AF_INET6, host, &a)) + inet_pton(AF_INET6, host, &a) == 1) ret = add_dstip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || - !inet_pton(AF_INET6, host, &a))) + inet_pton(AF_INET6, host, &a) != 1)) ret = add_dstip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; @@ -2582,7 +2616,7 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto) * */ void 
-ipfw_add(int ac, char *av[]) +ipfw_add(char *av[]) { /* * rules are added into the 'rulebuf' and then copied in @@ -2621,37 +2655,36 @@ ipfw_add(int ac, char *av[]) cmd = (ipfw_insn *)cmdbuf; action = (ipfw_insn *)actbuf; - av++; ac--; + av++; /* [rule N] -- Rule number optional */ - if (ac && isdigit(**av)) { + if (av[0] && isdigit(**av)) { rule->rulenum = atoi(*av); av++; - ac--; } /* [set N] -- set number (0..RESVD_SET), optional */ - if (ac > 1 && _substrcmp(*av, "set") == 0) { + if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { int set = strtoul(av[1], NULL, 10); if (set < 0 || set > RESVD_SET) errx(EX_DATAERR, "illegal set %s", av[1]); rule->set = set; - av += 2; ac -= 2; + av += 2; } /* [prob D] -- match probability, optional */ - if (ac > 1 && _substrcmp(*av, "prob") == 0) { + if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { match_prob = strtod(av[1], NULL); if (match_prob <= 0 || match_prob > 1) errx(EX_DATAERR, "illegal match prob. %s", av[1]); - av += 2; ac -= 2; + av += 2; } /* action -- mandatory */ NEED1("missing action"); i = match_token(rule_actions, *av); - ac--; av++; + av++; action->len = 1; /* default */ switch(i) { case TOK_CHECKSTATE: @@ -2687,14 +2720,14 @@ ipfw_add(int ac, char *av[]) action->opcode = O_REJECT; NEED1("missing reject code"); fill_reject_code(&action->arg1, *av); - ac--; av++; + av++; break; case TOK_UNREACH6: action->opcode = O_UNREACH6; NEED1("missing unreach code"); fill_unreach6_code(&action->arg1, *av); - ac--; av++; + av++; break; case TOK_COUNT: @@ -2727,7 +2760,7 @@ ipfw_add(int ac, char *av[]) case TOK_TEE: action->opcode = O_TEE; chkarg: - if (!ac) + if (!av[0]) errx(EX_USAGE, "missing argument for %s", *(av - 1)); if (isdigit(**av)) { action->arg1 = strtoul(*av, NULL, 10); @@ -2746,7 +2779,7 @@ chkarg: errx(EX_DATAERR, "illegal divert/tee port"); } else errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); - ac--; av++; + av++; break; case TOK_FORWARD: { @@ -2784,13 +2817,13 @@ chkarg: 
p->sa.sin_addr.s_addr = INADDR_ANY; else lookup_host(*av, &(p->sa.sin_addr)); - ac--; av++; + av++; break; } case TOK_COMMENT: /* pretend it is a 'count' rule followed by the comment */ action->opcode = O_COUNT; - ac++; av--; /* go back... */ + av--; /* go back... */ break; case TOK_SETFIB: @@ -2805,7 +2838,7 @@ chkarg: errx(EX_DATAERR, "fibs not suported.\n"); if (action->arg1 >= numfibs) /* Temporary */ errx(EX_DATAERR, "fib too large.\n"); - ac--; av++; + av++; break; } @@ -2825,8 +2858,8 @@ chkarg: * If they exist, it go first in the cmdbuf, but then it is * skipped in the copy section to the end of the buffer. */ - while (ac != 0 && (i = match_token(rule_action_params, *av)) != -1) { - ac--; av++; + while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { + av++; switch (i) { case TOK_LOG: { @@ -2839,15 +2872,15 @@ chkarg: have_log = (ipfw_insn *)c; cmd->len = F_INSN_SIZE(ipfw_insn_log); cmd->opcode = O_LOG; - if (ac && _substrcmp(*av, "logamount") == 0) { - ac--; av++; + if (av[0] && _substrcmp(*av, "logamount") == 0) { + av++; NEED1("logamount requires argument"); l = atoi(*av); if (l < 0) errx(EX_DATAERR, "logamount must be positive"); c->max_log = l; - ac--; av++; + av++; } else { len = sizeof(c->max_log); if (sysctlbyname("net.inet.ip.fw.verbose_limit", @@ -2858,6 +2891,7 @@ chkarg: } break; +#ifndef NO_ALTQ case TOK_ALTQ: { ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; @@ -2870,9 +2904,10 @@ chkarg: cmd->len = F_INSN_SIZE(ipfw_insn_altq); cmd->opcode = O_ALTQ; a->qid = altq_name_to_qid(*av); - ac--; av++; + av++; } break; +#endif case TOK_TAG: case TOK_UNTAG: { @@ -2885,7 +2920,7 @@ chkarg: rule_action_params); have_tag = cmd; fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 
0: F_NOT, tag); - ac--; av++; + av++; break; } @@ -2899,13 +2934,13 @@ chkarg: goto done; #define OR_START(target) \ - if (ac && (*av[0] == '(' || *av[0] == '{')) { \ + if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ if (open_par) \ errx(EX_USAGE, "nested \"(\" not allowed\n"); \ prev = NULL; \ open_par = 1; \ if ( (av[0])[1] == '\0') { \ - ac--; av++; \ + av++; \ } else \ (*av)++; \ } \ @@ -2914,30 +2949,30 @@ chkarg: #define CLOSE_PAR \ if (open_par) { \ - if (ac && ( \ + if (av[0] && ( \ strcmp(*av, ")") == 0 || \ strcmp(*av, "}") == 0)) { \ prev = NULL; \ open_par = 0; \ - ac--; av++; \ + av++; \ } else \ errx(EX_USAGE, "missing \")\"\n"); \ } #define NOT_BLOCK \ - if (ac && _substrcmp(*av, "not") == 0) { \ + if (av[0] && _substrcmp(*av, "not") == 0) { \ if (cmd->len & F_NOT) \ errx(EX_USAGE, "double \"not\" not allowed\n"); \ cmd->len |= F_NOT; \ - ac--; av++; \ + av++; \ } #define OR_BLOCK(target) \ - if (ac && _substrcmp(*av, "or") == 0) { \ + if (av[0] && _substrcmp(*av, "or") == 0) { \ if (prev == NULL || open_par == 0) \ errx(EX_DATAERR, "invalid OR block"); \ prev->len |= F_OR; \ - ac--; av++; \ + av++; \ goto target; \ } \ CLOSE_PAR; @@ -2954,15 +2989,15 @@ chkarg: NEED1("missing protocol"); if (_substrcmp(*av, "MAC") == 0 || _substrcmp(*av, "mac") == 0) { - ac--; av++; /* the "MAC" keyword */ - add_mac(cmd, ac, av); /* exits in case of errors */ + av++; /* the "MAC" keyword */ + add_mac(cmd, av); /* exits in case of errors */ cmd = next_cmd(cmd); - ac -= 2; av += 2; /* dst-mac and src-mac */ + av += 2; /* dst-mac and src-mac */ NOT_BLOCK; NEED1("missing mac type"); - if (add_mactype(cmd, ac, av[0])) + if (add_mactype(cmd, av[0])) cmd = next_cmd(cmd); - ac--; av++; /* any or mac-type */ + av++; /* any or mac-type */ goto read_options; } #endif @@ -2974,7 +3009,7 @@ chkarg: NOT_BLOCK; NEED1("missing protocol"); if (add_proto_compat(cmd, *av, &proto)) { - av++; ac--; + av++; if (F_LEN(cmd) != 0) { prev = cmd; cmd = next_cmd(cmd); @@ -2988,9 +3023,9 @@ 
chkarg: /* * "from", mandatory */ - if (!ac || _substrcmp(*av, "from") != 0) + if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) errx(EX_USAGE, "missing ``from''"); - ac--; av++; + av++; /* * source IP, mandatory @@ -2999,7 +3034,7 @@ chkarg: NOT_BLOCK; /* optional "not" */ NEED1("missing source address"); if (add_src(cmd, *av, proto)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); @@ -3012,10 +3047,10 @@ chkarg: * source ports, optional */ NOT_BLOCK; /* optional "not" */ - if (ac) { + if ( av[0] != NULL ) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } @@ -3024,9 +3059,9 @@ chkarg: /* * "to", mandatory */ - if (!ac || _substrcmp(*av, "to") != 0) + if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) errx(EX_USAGE, "missing ``to''"); - av++; ac--; + av++; /* * destination, mandatory @@ -3035,7 +3070,7 @@ chkarg: NOT_BLOCK; /* optional "not" */ NEED1("missing dst address"); if (add_dst(cmd, *av, proto)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); @@ -3048,17 +3083,17 @@ chkarg: * dest. ports, optional */ NOT_BLOCK; /* optional "not" */ - if (ac) { + if (av[0]) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } } read_options: - if (ac && first_cmd == cmd) { + if (av[0] && first_cmd == cmd) { /* * nothing specified so far, store in the rule to ease * printout later. 
@@ -3066,7 +3101,7 @@ read_options: rule->_pad = 1; } prev = NULL; - while (ac) { + while ( av[0] != NULL ) { char *s; ipfw_insn_u32 *cmd32; /* alias for cmd */ @@ -3080,7 +3115,7 @@ read_options: s++; } i = match_token(rule_options, s); - ac--; av++; + av++; switch(i) { case TOK_NOT: if (cmd->len & F_NOT) @@ -3142,7 +3177,7 @@ read_options: NEED1("recv, xmit, via require interface name" " or address"); fill_iface((ipfw_insn_if *)cmd, av[0]); - ac--; av++; + av++; if (F_LEN(cmd) == 0) /* not a valid address */ break; if (i == TOK_XMIT) @@ -3156,13 +3191,13 @@ read_options: case TOK_ICMPTYPES: NEED1("icmptypes requires list of types"); fill_icmptypes((ipfw_insn_u32 *)cmd, *av); - av++; ac--; + av++; break; case TOK_ICMP6TYPES: NEED1("icmptypes requires list of types"); fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av); - av++; ac--; + av++; break; case TOK_IPTTL: @@ -3172,7 +3207,7 @@ read_options: errx(EX_DATAERR, "invalid ipttl %s", *av); } else fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPID: @@ -3182,7 +3217,7 @@ read_options: errx(EX_DATAERR, "invalid ipid %s", *av); } else fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPLEN: @@ -3192,32 +3227,32 @@ read_options: errx(EX_DATAERR, "invalid ip len %s", *av); } else fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPVER: NEED1("ipver requires version"); fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPPRECEDENCE: NEED1("ipprecedence requires value"); fill_cmd(cmd, O_IPPRECEDENCE, 0, (strtoul(*av, NULL, 0) & 7) << 5); - ac--; av++; + av++; break; case TOK_IPOPTS: NEED1("missing argument for ipoptions"); fill_flags(cmd, O_IPOPT, f_ipopts, *av); - ac--; av++; + av++; break; case TOK_IPTOS: NEED1("missing argument for iptos"); fill_flags(cmd, O_IPTOS, f_iptos, *av); - ac--; av++; + av++; break; case TOK_UID: @@ -3234,7 +3269,7 @@ read_options: errx(EX_DATAERR, 
"uid \"%s\" nonexistent", *av); cmd32->d[0] = pwd->pw_uid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3252,7 +3287,7 @@ read_options: errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); cmd32->d[0] = grp->gr_gid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3268,7 +3303,7 @@ read_options: errx(EX_DATAERR, "jail requires prison ID"); cmd32->d[0] = (uint32_t)jid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3289,13 +3324,13 @@ read_options: } else fill_cmd(cmd, O_TCPDATALEN, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_TCPOPTS: NEED1("missing argument for tcpoptions"); fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av); - ac--; av++; + av++; break; case TOK_TCPSEQ: @@ -3304,21 +3339,21 @@ read_options: cmd->len = F_INSN_SIZE(ipfw_insn_u32); cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK; cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_TCPWIN: NEED1("tcpwin requires length"); fill_cmd(cmd, O_TCPWIN, 0, htons(strtoul(*av, NULL, 0))); - ac--; av++; + av++; break; case TOK_TCPFLAGS: NEED1("missing argument for tcpflags"); cmd->opcode = O_TCPFLAGS; fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av); - ac--; av++; + av++; break; case TOK_KEEPSTATE: @@ -3348,11 +3383,11 @@ read_options: cmd->opcode = O_LIMIT; c->limit_mask = c->conn_limit = 0; - while (ac > 0) { + while ( av[0] != NULL ) { if ((val = match_token(limit_masks, *av)) <= 0) break; c->limit_mask |= val; - ac--; av++; + av++; } if (c->limit_mask == 0) @@ -3361,14 +3396,14 @@ read_options: GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_LIMIT, rule_options); - ac--; av++; + av++; break; } case TOK_PROTO: NEED1("missing protocol"); if (add_proto(cmd, *av, &proto)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid protocol ``%s''", *av); @@ -3377,28 +3412,28 @@ read_options: case TOK_SRCIP: NEED1("missing source IP"); if (add_srcip(cmd, *av)) { - ac--; av++; + 
av++; } break; case TOK_DSTIP: NEED1("missing destination IP"); if (add_dstip(cmd, *av)) { - ac--; av++; + av++; } break; case TOK_SRCIP6: NEED1("missing source IP6"); if (add_srcip6(cmd, *av)) { - ac--; av++; + av++; } break; case TOK_DSTIP6: NEED1("missing destination IP6"); if (add_dstip6(cmd, *av)) { - ac--; av++; + av++; } break; @@ -3406,7 +3441,7 @@ read_options: NEED1("missing source port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid source port %s", *av); break; @@ -3415,23 +3450,22 @@ read_options: NEED1("missing destination port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid destination port %s", *av); break; case TOK_MAC: - if (add_mac(cmd, ac, av)) { - ac -= 2; av += 2; - } + if (add_mac(cmd, av)) + av += 2; break; case TOK_MACTYPE: NEED1("missing mac type"); - if (!add_mactype(cmd, ac, *av)) + if (!add_mactype(cmd, *av)) errx(EX_DATAERR, "invalid mac type %s", *av); - ac--; av++; + av++; break; case TOK_VERREVPATH: @@ -3460,7 +3494,7 @@ read_options: case TOK_EXT6HDR: fill_ext6hdr( cmd, *av ); - ac--; av++; + av++; break; case TOK_FLOWID: @@ -3468,17 +3502,16 @@ read_options: errx( EX_USAGE, "flow-id filter is active " "only for ipv6 protocol\n"); fill_flow6( (ipfw_insn_u32 *) cmd, *av ); - ac--; av++; + av++; break; case TOK_COMMENT: - fill_comment(cmd, ac, av); - av += ac; - ac = 0; + fill_comment(cmd, av); + av[0]=NULL; break; case TOK_TAGGED: - if (ac > 0 && strpbrk(*av, "-,")) { + if (av[0] && strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TAGGED)) errx(EX_DATAERR, "tagged: invalid tag" " list: %s", *av); @@ -3490,13 +3523,13 @@ read_options: TOK_TAGGED, rule_options); fill_cmd(cmd, O_TAGGED, 0, tag); } - ac--; av++; + av++; break; case TOK_FIB: NEED1("fib requires fib number"); fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case 
TOK_LOOKUP: { @@ -3504,7 +3537,7 @@ read_options: char *p; int j; - if (ac < 2) + if (!av[0] || !av[1]) errx(EX_USAGE, "format: lookup argument tablenum"); cmd->opcode = O_IP_DST_LOOKUP; cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; @@ -3516,11 +3549,11 @@ read_options: if (lookup_key[j] <= 0) errx(EX_USAGE, "format: cannot lookup on %s", *av); c->d[1] = j; // i converted to option - ac--; av++; + av++; cmd->arg1 = strtoul(*av, &p, 0); if (p && *p) errx(EX_USAGE, "format: lookup argument tablenum"); - ac--; av++; + av++; } break; @@ -3698,6 +3731,10 @@ ipfw_flush(int force) if (c == 'N') /* user said no */ return; } + if (co.do_pipe) { + dummynet_flush(); + return; + } /* `ipfw set N flush` - is the same that `ipfw delete set N` */ if (co.use_set) { uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24); @@ -3811,14 +3848,14 @@ ipfw_table_handler(int ac, char *av[]) } } } else if (_substrcmp(*av, "flush") == 0) { - a = is_all ? tables_max : (ent.tbl + 1); + a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); do { if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl, sizeof(ent.tbl)) < 0) err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)"); } while (++ent.tbl < a); } else if (_substrcmp(*av, "list") == 0) { - a = is_all ? tables_max : (ent.tbl + 1); + a = is_all ? 
tables_max : (uint32_t)(ent.tbl + 1); do { table_list(ent, is_all); } while (++ent.tbl < a); diff --git a/sbin/ipfw/ipfw2.h b/sbin/ipfw/ipfw2.h index b393a7d..8566cde 100644 --- a/sbin/ipfw/ipfw2.h +++ b/sbin/ipfw/ipfw2.h @@ -35,7 +35,7 @@ struct cmdline_opts { int do_resolv; /* try to resolve all ip to names */ int do_time; /* Show time stamps */ int do_quiet; /* Be quiet in add and flush */ - int do_pipe; /* this cmd refers to a pipe */ + int do_pipe; /* this cmd refers to a pipe/queue/sched */ int do_nat; /* this cmd refers to a nat config */ int do_dynamic; /* display dynamic rules */ int do_expired; /* display expired dynamic rules */ @@ -82,7 +82,10 @@ enum tokens { TOK_ACCEPT, TOK_COUNT, TOK_PIPE, + TOK_LINK, TOK_QUEUE, + TOK_FLOWSET, + TOK_SCHED, TOK_DIVERT, TOK_TEE, TOK_NETGRAPH, @@ -122,6 +125,7 @@ enum tokens { TOK_IPLEN, TOK_IPID, TOK_IPPRECEDENCE, + TOK_DSCP, TOK_IPTOS, TOK_IPTTL, TOK_IPVER, @@ -151,15 +155,23 @@ enum tokens { TOK_SRCPORT, TOK_ALL, TOK_MASK, + TOK_FLOW_MASK, + TOK_SCHED_MASK, TOK_BW, TOK_DELAY, - TOK_PIPE_PROFILE, + TOK_PROFILE, TOK_BURST, TOK_RED, TOK_GRED, TOK_DROPTAIL, TOK_PROTO, + /* dummynet tokens */ TOK_WEIGHT, + TOK_LMAX, + TOK_PRI, + TOK_TYPE, + TOK_SLOTSIZE, + TOK_IP, TOK_IF, TOK_ALOG, @@ -192,9 +204,10 @@ enum tokens { * the following macro returns an error message if we run out of * arguments. 
*/ -#define NEED1(msg) {if (!ac) errx(EX_USAGE, msg);} +#define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} +#define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} -unsigned long long align_uint64(const uint64_t *pll); +int pr_u64(uint64_t *pd, int width); /* memory allocation support */ void *safe_calloc(size_t number, size_t size); @@ -236,14 +249,14 @@ struct _ipfw_insn_icmp6; extern int resvd_set_number; /* first-level command handlers */ -void ipfw_add(int ac, char *av[]); +void ipfw_add(char *av[]); void ipfw_show_nat(int ac, char **av); void ipfw_config_pipe(int ac, char **av); void ipfw_config_nat(int ac, char **av); -void ipfw_sets_handler(int ac, char *av[]); +void ipfw_sets_handler(char *av[]); void ipfw_table_handler(int ac, char *av[]); -void ipfw_sysctl_handler(int ac, char *av[], int which); -void ipfw_delete(int ac, char *av[]); +void ipfw_sysctl_handler(char *av[], int which); +void ipfw_delete(char *av[]); void ipfw_flush(int force); void ipfw_zero(int ac, char *av[], int optname); void ipfw_list(int ac, char *av[], int show_counters); @@ -255,7 +268,8 @@ u_int32_t altq_name_to_qid(const char *name); void print_altq_cmd(struct _ipfw_insn_altq *altqptr); /* dummynet.c */ -void ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[]); +void dummynet_list(int ac, char *av[], int show_counters); +void dummynet_flush(void); int ipfw_delete_pipe(int pipe_or_queue, int n); /* ipv6.c */ diff --git a/sbin/ipfw/main.c b/sbin/ipfw/main.c index 3916057..43693e0 100644 --- a/sbin/ipfw/main.c +++ b/sbin/ipfw/main.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 2002-2003,2010 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * @@ -80,31 +80,27 @@ help(void) } /* - * Free a the (locally allocated) copy of command line arguments. 
- */ -static void -free_args(int ac, char **av) -{ - int i; - - for (i=0; i < ac; i++) - free(av[i]); - free(av); -} - -/* * Called with the arguments, including program name because getopt * wants it to be present. * Returns 0 if successful, 1 if empty command, errx() in case of errors. + * First thing we do is process parameters creating an argv[] array + * which includes the program name and a NULL entry at the end. + * If we are called with a single string, we split it on whitespace. + * Also, arguments with a trailing ',' are joined to the next one. + * The pointers (av[]) and data are in a a single chunk of memory. + * av[0] points to the original program name, all other entries + * point into the allocated chunk. */ static int ipfw_main(int oldac, char **oldav) { - int ch, ac, save_ac; + int ch, ac; const char *errstr; char **av, **save_av; int do_acct = 0; /* Show packet/byte count */ int try_next = 0; /* set if pipe cmd not found */ + int av_size; /* compute the av size */ + char *av_p; /* used to build the av list */ #define WHITESP " \t\f\v\n\r" if (oldac < 2) @@ -112,10 +108,9 @@ ipfw_main(int oldac, char **oldav) if (oldac == 2) { /* - * If we are called with a single string, try to split it into - * arguments for subsequent parsing. - * But first, remove spaces after a ',', by copying the string - * in-place. + * If we are called with one argument, try to split it into + * words for subsequent parsing. Spaces after a ',' are + * removed by copying the string in-place. */ char *arg = oldav[1]; /* The string is the first arg. */ int l = strlen(arg); @@ -150,31 +145,59 @@ ipfw_main(int oldac, char **oldav) ac++; /* - * Allocate the argument list, including one entry for - * the program name because getopt expects it. + * Allocate the argument list structure as a single block + * of memory, containing pointers and the argument + * strings. 
We include one entry for the program name + * because getopt expects it, and a NULL at the end + * to simplify further parsing. */ - av = safe_calloc(ac + 1, sizeof(char *)); + ac++; /* add 1 for the program name */ + av_size = (ac+1) * sizeof(char *) + l + 1; + av = safe_calloc(av_size, 1); /* - * Second, copy arguments from arg[] to av[]. For each one, + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[]. For each one, * j is the initial character, i is the one past the end. */ - for (ac = 1, i = j = 0; i < l; i++) + av_p = (char *)&av[ac+1]; + for (ac = 1, i = j = 0; i < l; i++) { if (index(WHITESP, arg[i]) != NULL || i == l-1) { if (i == l-1) i++; - av[ac] = safe_calloc(i-j+1, 1); - bcopy(arg+j, av[ac], i-j); + bcopy(arg+j, av_p, i-j); + av[ac] = av_p; + av_p += i-j; /* the lenght of the string */ + *av_p++ = '\0'; ac++; j = i + 1; } + } } else { /* * If an argument ends with ',' join with the next one. */ - int first, i, l; + int first, i, l=0; + + /* + * Allocate the argument list structure as a single block + * of memory, containing both pointers and the argument + * strings. We include some space for the program name + * because getopt expects it. + * We add an extra pointer to the end of the array, + * to make simpler further parsing. + */ + for (i=0; i<oldac; i++) + l += strlen(oldav[i]); - av = safe_calloc(oldac, sizeof(char *)); + av_size = (oldac+1) * sizeof(char *) + l + oldac; + av = safe_calloc(av_size, 1); + + /* + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[] + */ + av_p = (char *)&av[oldac+1]; for (first = i = ac = 1, l = 0; i < oldac; i++) { char *arg = oldav[i]; int k = strlen(arg); @@ -182,11 +205,12 @@ ipfw_main(int oldac, char **oldav) l += k; if (arg[k-1] != ',' || i == oldac-1) { /* Time to copy. 
*/ - av[ac] = safe_calloc(l+1, 1); + av[ac] = av_p; for (l=0; first <= i; first++) { - strcat(av[ac]+l, oldav[first]); - l += strlen(oldav[first]); + strcat(av_p, oldav[first]); + av_p += strlen(oldav[first]); } + *av_p++ = '\0'; ac++; l = 0; first = i+1; @@ -194,13 +218,47 @@ ipfw_main(int oldac, char **oldav) } } - av[0] = strdup(oldav[0]); /* copy progname from the caller */ + /* + * set the progname pointer to the original string + * and terminate the array with null + */ + av[0] = oldav[0]; + av[ac] = NULL; + /* Set the force flag for non-interactive processes */ if (!co.do_force) co.do_force = !isatty(STDIN_FILENO); +#ifdef EMULATE_SYSCTL /* sysctl emulation */ + if ( ac >= 2 && !strcmp(av[1], "sysctl")) { + char *s; + int i; + + if (ac != 3) { + printf( "sysctl emulation usage:\n" + " ipfw sysctl name[=value]\n" + " ipfw sysctl -a\n"); + return 0; + } + s = index(av[2], '='); + if (s == NULL) { + s = !strcmp(av[2], "-a") ? NULL : av[2]; + sysctlbyname(s, NULL, NULL, NULL, 0); + } else { /* ipfw sysctl x.y.z=value */ + /* assume an INT value, will extend later */ + if (s[1] == '\0') { + printf("ipfw sysctl: missing value\n\n"); + return 0; + } + *s = '\0'; + i = strtol(s+1, NULL, 0); + sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); + } + return 0; + } +#endif + /* Save arguments for final freeing of memory. 
*/ - save_ac = ac; save_av = av; optind = optreset = 1; /* restart getopt() */ @@ -232,7 +290,7 @@ ipfw_main(int oldac, char **oldav) break; case 'h': /* help */ - free_args(save_ac, save_av); + free(save_av); help(); break; /* NOTREACHED */ @@ -273,7 +331,7 @@ ipfw_main(int oldac, char **oldav) break; default: - free_args(save_ac, save_av); + free(save_av); return 1; } @@ -304,6 +362,10 @@ ipfw_main(int oldac, char **oldav) co.do_pipe = 1; else if (_substrcmp(*av, "queue") == 0) co.do_pipe = 2; + else if (_substrcmp(*av, "flowset") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "sched") == 0) + co.do_pipe = 3; else if (!strncmp(*av, "set", strlen(*av))) { if (ac > 1 && isdigit(av[1][0])) { co.use_set = strtonum(av[1], 0, resvd_set_number, @@ -335,7 +397,7 @@ ipfw_main(int oldac, char **oldav) if (co.use_set == 0) { if (_substrcmp(*av, "add") == 0) - ipfw_add(ac, av); + ipfw_add(av); else if (co.do_nat && _substrcmp(*av, "show") == 0) ipfw_show_nat(ac, av); else if (co.do_pipe && _substrcmp(*av, "config") == 0) @@ -343,20 +405,20 @@ ipfw_main(int oldac, char **oldav) else if (co.do_nat && _substrcmp(*av, "config") == 0) ipfw_config_nat(ac, av); else if (_substrcmp(*av, "set") == 0) - ipfw_sets_handler(ac, av); + ipfw_sets_handler(av); else if (_substrcmp(*av, "table") == 0) ipfw_table_handler(ac, av); else if (_substrcmp(*av, "enable") == 0) - ipfw_sysctl_handler(ac, av, 1); + ipfw_sysctl_handler(av, 1); else if (_substrcmp(*av, "disable") == 0) - ipfw_sysctl_handler(ac, av, 0); + ipfw_sysctl_handler(av, 0); else try_next = 1; } if (co.use_set || try_next) { if (_substrcmp(*av, "delete") == 0) - ipfw_delete(ac, av); + ipfw_delete(av); else if (_substrcmp(*av, "flush") == 0) ipfw_flush(co.do_force); else if (_substrcmp(*av, "zero") == 0) @@ -373,7 +435,7 @@ ipfw_main(int oldac, char **oldav) } /* Free memory allocated in the argument parsing. 
*/ - free_args(save_ac, save_av); + free(save_av); return 0; } @@ -491,11 +553,11 @@ ipfw_readfile(int ac, char *av[]) } while (fgets(buf, BUFSIZ, f)) { /* read commands */ - char linename[10]; + char linename[20]; char *args[2]; lineno++; - sprintf(linename, "Line %d", lineno); + snprintf(linename, sizeof(linename), "Line %d", lineno); setprogname(linename); /* XXX */ args[0] = progname; args[1] = buf; @@ -521,6 +583,20 @@ ipfw_readfile(int ac, char *av[]) int main(int ac, char *av[]) { +#if defined(_WIN32) && defined(TCC) + { + WSADATA wsaData; + int ret=0; + unsigned short wVersionRequested = MAKEWORD(2, 2); + ret = WSAStartup(wVersionRequested, &wsaData); + if (ret != 0) { + /* Tell the user that we could not find a usable */ + /* Winsock DLL. */ + printf("WSAStartup failed with error: %d\n", ret); + return 1; + } + } +#endif /* * If the last argument is an absolute pathname, interpret it * as a file to be preprocessed. diff --git a/sbin/iscontrol/iscsi.conf.5 b/sbin/iscontrol/iscsi.conf.5 index 2edcd35..0de5122 100644 --- a/sbin/iscontrol/iscsi.conf.5 +++ b/sbin/iscontrol/iscsi.conf.5 @@ -25,8 +25,8 @@ .\" $FreeBSD$ .\" .Dd June 5, 2007 -.Os .Dt ISCSI.CONF 5 +.Os .Sh NAME .Nm iscsi.conf .Nd key options to be negotiated in an iSCSI session diff --git a/sbin/mca/mca.c b/sbin/mca/mca.c index 0c4e1a0..9934454 100644 --- a/sbin/mca/mca.c +++ b/sbin/mca/mca.c @@ -53,10 +53,12 @@ __FBSDID("$FreeBSD$"); #define BCD(x) ((x >> 4) * 10 + (x & 15)) +#define HW_MCA_MAX_CPUID 255 + static char hw_mca_count[] = "hw.mca.count"; static char hw_mca_first[] = "hw.mca.first"; static char hw_mca_last[] = "hw.mca.last"; -static char hw_mca_recid[] = "hw.mca.%d"; +static char hw_mca_recid[] = "hw.mca.%lu.%u"; static char default_dumpfile[] = "/var/log/mca.log"; @@ -372,10 +374,13 @@ show_section(struct mca_section_header *sh) } static void -show(char *data) +show(char *data, const char *mib) { size_t reclen, seclen; + if (mib != NULL) + printf("<!-- MIB: %s -->\n", mib); + 
printf("<record>\n"); reclen = show_header((void*)data) - sizeof(struct mca_record_header); data += sizeof(struct mca_record_header); @@ -402,7 +407,7 @@ showall(char *buf, size_t buflen) if (buflen < reclen) return; - show(buf); + show(buf, NULL); buf += reclen; buflen -= reclen; @@ -442,7 +447,7 @@ main(int argc, char **argv) char *buf; size_t len; int ch, error, fd; - int count, first, last; + int count, first, last, cpuid; while ((ch = getopt(argc, argv, "df:")) != -1) { switch(ch) { @@ -481,12 +486,19 @@ main(int argc, char **argv) if (error) err(1, hw_mca_last); + cpuid = 0; while (count && first <= last) { - sprintf(mib, hw_mca_recid, first); - len = 0; - error = sysctlbyname(mib, NULL, &len, NULL, 0); - if (error == ENOENT) { + do { + sprintf(mib, hw_mca_recid, first, cpuid); + len = 0; + error = sysctlbyname(mib, NULL, &len, NULL, 0); + if (error != ENOENT) + break; + cpuid++; + } while (cpuid <= HW_MCA_MAX_CPUID); + if (error == ENOENT && cpuid > HW_MCA_MAX_CPUID) { first++; + cpuid = 0; continue; } if (error) @@ -503,11 +515,15 @@ main(int argc, char **argv) if (fl_dump) dump(buf); else - show(buf); + show(buf, mib); free(buf); - first++; count--; + if (cpuid == HW_MCA_MAX_CPUID) { + first++; + cpuid = 0; + } else + cpuid++; } } else { fd = open(file, O_RDONLY); diff --git a/sbin/mount/mount.c b/sbin/mount/mount.c index 907f754..b39a7d1 100644 --- a/sbin/mount/mount.c +++ b/sbin/mount/mount.c @@ -91,7 +91,7 @@ char *flags2opts(int); /* Map from mount options to printable formats. 
*/ static struct opt { - int o_opt; + uint64_t o_opt; const char *o_name; } optnames[] = { { MNT_ASYNC, "asynchronous" }, @@ -612,7 +612,7 @@ mountfs(const char *vfstype, const char *spec, const char *name, int flags, void prmount(struct statfs *sfp) { - int flags; + uint64_t flags; unsigned int i; struct opt *o; struct passwd *pw; @@ -621,7 +621,7 @@ prmount(struct statfs *sfp) sfp->f_fstypename); flags = sfp->f_flags & MNT_VISFLAGMASK; - for (o = optnames; flags && o->o_opt; o++) + for (o = optnames; flags != 0 && o->o_opt != 0; o++) if (flags & o->o_opt) { (void)printf(", %s", o->o_name); flags &= ~o->o_opt; diff --git a/sbin/newfs/Makefile b/sbin/newfs/Makefile index f89499e..d45143b 100644 --- a/sbin/newfs/Makefile +++ b/sbin/newfs/Makefile @@ -4,8 +4,8 @@ .PATH: ${.CURDIR}/../../sys/geom PROG= newfs -DPADD= ${LIBUFS} -LDADD= -lufs +DPADD= ${LIBUFS} ${LIBUTIL} +LDADD= -lufs -lutil SRCS= newfs.c mkfs.c geom_bsd_enc.c WARNS?= 3 diff --git a/sbin/newfs/newfs.8 b/sbin/newfs/newfs.8 index 9aea040..ec9bcd8 100644 --- a/sbin/newfs/newfs.8 +++ b/sbin/newfs/newfs.8 @@ -78,10 +78,10 @@ The following options define the general layout policies: .It Fl E Erase the content of the disk before making the filesystem. The reserved area in front of the superblock (for bootcode) will not be erased. - +.Pp This is a relevant option for flash based storage devices that use wear levelling algorithms. - +.Pp NB: Erasing may take as long time as writing every sector on the disk. .It Fl J Enable journaling on the new file system via gjournal. 
diff --git a/sbin/newfs/newfs.c b/sbin/newfs/newfs.c index 8867306..e5a42c0 100644 --- a/sbin/newfs/newfs.c +++ b/sbin/newfs/newfs.c @@ -77,6 +77,8 @@ __FBSDID("$FreeBSD$"); #include <syslog.h> #include <unistd.h> +#include <libutil.h> + #include "newfs.h" int Eflag; /* Erase previous disk contents */ @@ -117,6 +119,7 @@ static void getfssize(intmax_t *, const char *p, intmax_t, intmax_t); static struct disklabel *getdisklabel(char *s); static void rewritelabel(char *s, struct disklabel *lp); static void usage(void); +static int expand_number_int(const char *buf, int *num); ufs2_daddr_t part_ofs; /* partition offset in blocks, used with files */ @@ -129,7 +132,7 @@ main(int argc, char *argv[]) struct stat st; char *cp, *special; intmax_t reserved; - int ch, i; + int ch, i, rval; off_t mediasize; char part_name; /* partition name, default to full disk */ @@ -169,7 +172,8 @@ main(int argc, char *argv[]) Rflag = 1; break; case 'S': - if ((sectorsize = atoi(optarg)) <= 0) + rval = expand_number_int(optarg, &sectorsize); + if (rval < 0 || sectorsize <= 0) errx(1, "%s: bad sector size", optarg); break; case 'T': @@ -182,12 +186,17 @@ main(int argc, char *argv[]) Xflag++; break; case 'a': - if ((maxcontig = atoi(optarg)) <= 0) + rval = expand_number_int(optarg, &maxcontig); + if (rval < 0 || maxcontig <= 0) errx(1, "%s: bad maximum contiguous blocks", optarg); break; case 'b': - if ((bsize = atoi(optarg)) < MINBSIZE) + rval = expand_number_int(optarg, &bsize); + if (rval < 0) + errx(1, "%s: bad block size", + optarg); + if (bsize < MINBSIZE) errx(1, "%s: block size too small, min is %d", optarg, MINBSIZE); if (bsize > MAXBSIZE) @@ -195,33 +204,40 @@ main(int argc, char *argv[]) optarg, MAXBSIZE); break; case 'c': - if ((maxblkspercg = atoi(optarg)) <= 0) + rval = expand_number_int(optarg, &maxblkspercg); + if (rval < 0 || maxblkspercg <= 0) errx(1, "%s: bad blocks per cylinder group", optarg); break; case 'd': - if ((maxbsize = atoi(optarg)) < MINBSIZE) + rval =
expand_number_int(optarg, &maxbsize); + if (rval < 0 || maxbsize < MINBSIZE) errx(1, "%s: bad extent block size", optarg); break; case 'e': - if ((maxbpg = atoi(optarg)) <= 0) + rval = expand_number_int(optarg, &maxbpg); + if (rval < 0 || maxbpg <= 0) errx(1, "%s: bad blocks per file in a cylinder group", optarg); break; case 'f': - if ((fsize = atoi(optarg)) <= 0) + rval = expand_number_int(optarg, &fsize); + if (rval < 0 || fsize <= 0) errx(1, "%s: bad fragment size", optarg); break; case 'g': - if ((avgfilesize = atoi(optarg)) <= 0) + rval = expand_number_int(optarg, &avgfilesize); + if (rval < 0 || avgfilesize <= 0) errx(1, "%s: bad average file size", optarg); break; case 'h': - if ((avgfilesperdir = atoi(optarg)) <= 0) + rval = expand_number_int(optarg, &avgfilesperdir); + if (rval < 0 || avgfilesperdir <= 0) errx(1, "%s: bad average files per dir", optarg); break; case 'i': - if ((density = atoi(optarg)) <= 0) + rval = expand_number_int(optarg, &density); + if (rval < 0 || density <= 0) errx(1, "%s: bad bytes per inode", optarg); break; case 'l': @@ -481,3 +497,20 @@ usage() fprintf(stderr, "\t-s file system size (sectors)\n"); exit(1); } + +static int +expand_number_int(const char *buf, int *num) +{ + int64_t num64; + int rval; + + rval = expand_number(buf, &num64); + if (rval < 0) + return (rval); + if (num64 > INT_MAX || num64 < INT_MIN) { + errno = ERANGE; + return (-1); + } + *num = (int)num64; + return (0); +} diff --git a/sbin/nos-tun/Makefile b/sbin/nos-tun/Makefile index e128b62..9f1024f 100644 --- a/sbin/nos-tun/Makefile +++ b/sbin/nos-tun/Makefile @@ -1,8 +1,8 @@ # $FreeBSD$ PROG= nos-tun -WARNS?= 0 MAN= nos-tun.8 +WARNS?= 3 .include <bsd.prog.mk> diff --git a/sbin/nos-tun/nos-tun.c b/sbin/nos-tun/nos-tun.c index 9966840..83e7144 100644 --- a/sbin/nos-tun/nos-tun.c +++ b/sbin/nos-tun/nos-tun.c @@ -89,7 +89,8 @@ int tun; /* tunnel descriptor */ static void usage(void); -int Set_address(char *addr, struct sockaddr_in *sin) +static int 
+Set_address(char *addr, struct sockaddr_in *sin) { struct hostent *hp; @@ -107,15 +108,16 @@ int Set_address(char *addr, struct sockaddr_in *sin) return 0; } -int tun_open(char *devname, struct sockaddr *ouraddr, char *theiraddr) +static int +tun_open(char *dev_name, struct sockaddr *ouraddr, char *theiraddr) { int s; struct sockaddr_in *sin; /* Open tun device */ - tun = open (devname, O_RDWR); + tun = open(dev_name, O_RDWR); if (tun < 0) { - syslog(LOG_ERR,"can't open %s - %m",devname); + syslog(LOG_ERR,"can't open %s - %m", dev_name); return(1); } @@ -125,8 +127,8 @@ int tun_open(char *devname, struct sockaddr *ouraddr, char *theiraddr) bzero((char *)&ifra, sizeof(ifra)); bzero((char *)&ifrq, sizeof(ifrq)); - strncpy(ifrq.ifr_name, devname+5, IFNAMSIZ); - strncpy(ifra.ifra_name, devname+5, IFNAMSIZ); + strncpy(ifrq.ifr_name, dev_name+5, IFNAMSIZ); + strncpy(ifra.ifra_name, dev_name+5, IFNAMSIZ); s = socket(AF_INET, SOCK_DGRAM, 0); if (s < 0) { @@ -189,7 +191,8 @@ tunc_return: return(1); } -void Finish(int signum) +static void +Finish(int signum) { int s; @@ -238,7 +241,7 @@ int main (int argc, char **argv) { int c, len, ipoff; - char *devname = NULL; + char *dev_name = NULL; char *point_to = NULL; char *to_point = NULL; char *target; @@ -268,7 +271,7 @@ int main (int argc, char **argv) point_to = optarg; break; case 't': - devname = optarg; + dev_name = optarg; break; case 'p': protocol = optarg; @@ -278,7 +281,7 @@ int main (int argc, char **argv) argc -= optind; argv += optind; - if ((argc != 1 && argc != 2) || (devname == NULL) || + if ((argc != 1 && argc != 2) || (dev_name == NULL) || (point_to == NULL) || (to_point == NULL)) { usage(); } @@ -302,7 +305,7 @@ int main (int argc, char **argv) exit(2); } - if(tun_open(devname, &t_laddr, to_point)) { + if(tun_open(dev_name, &t_laddr, to_point)) { closelog(); exit(3); } @@ -386,7 +389,7 @@ int main (int argc, char **argv) } static void -usage() +usage(void) { fprintf(stderr, "usage: nos-tun -t tunnel -s source 
-d destination -p protocol_number [source] target\n"); diff --git a/sbin/ping6/ping6.8 b/sbin/ping6/ping6.8 index 3897a90..3298dea 100644 --- a/sbin/ping6/ping6.8 +++ b/sbin/ping6/ping6.8 @@ -29,7 +29,7 @@ .\" .\" $FreeBSD$ .\" -.Dd August 27, 2008 +.Dd April 20, 2010 .Dt PING6 8 .Os .Sh NAME @@ -40,9 +40,9 @@ packets to network hosts .Sh SYNOPSIS .Nm .\" without ipsec, or new ipsec -.Op Fl dfHmnNoqrRtvwW +.Op Fl DdfHmnNoqrRtvwW .\" old ipsec -.\" .Op Fl AdEfmnNqRtvwW +.\" .Op Fl ADdEfmnNqRtvwW .Bk -words .Op Fl a Ar addrtype .Ek @@ -141,6 +141,8 @@ Stop after sending .Ar count .Tn ECHO_RESPONSE packets. +.It Fl D +Disable IPv6 fragmentation. .It Fl d Set the .Dv SO_DEBUG diff --git a/sbin/ping6/ping6.c b/sbin/ping6/ping6.c index f7dba27..69a98b2 100644 --- a/sbin/ping6/ping6.c +++ b/sbin/ping6/ping6.c @@ -191,6 +191,7 @@ struct tv32 { #define F_ONCE 0x200000 #define F_AUDIBLE 0x400000 #define F_MISSED 0x800000 +#define F_DONTFRAG 0x1000000 #define F_NOUSERDATA (F_NODEADDR | F_FQDN | F_FQDNOLD | F_SUPTYPES) u_int options; @@ -349,7 +350,7 @@ main(argc, argv) #endif /*IPSEC_POLICY_IPSEC*/ #endif while ((ch = getopt(argc, argv, - "a:b:c:dfHg:h:I:i:l:mnNop:qrRS:s:tvwW" ADDOPTS)) != -1) { + "a:b:c:DdfHg:h:I:i:l:mnNop:qrRS:s:tvwW" ADDOPTS)) != -1) { #undef ADDOPTS switch (ch) { case 'a': @@ -415,6 +416,9 @@ main(argc, argv) errx(1, "illegal number of packets -- %s", optarg); break; + case 'D': + options |= F_DONTFRAG; + break; case 'd': options |= F_SO_DEBUG; break; @@ -742,7 +746,11 @@ main(argc, argv) for (i = 0; i < sizeof(nonce); i += sizeof(u_int32_t)) *((u_int32_t *)&nonce[i]) = arc4random(); #endif - + optval = 1; + if (options & F_DONTFRAG) + if (setsockopt(s, IPPROTO_IPV6, IPV6_DONTFRAG, + &optval, sizeof(optval)) == -1) + err(1, "IPV6_DONTFRAG"); hold = 1; if (options & F_SO_DEBUG) @@ -2780,7 +2788,7 @@ usage() "A" #endif "usage: ping6 [-" - "d" + "Dd" #if defined(IPSEC) && !defined(IPSEC_POLICY_IPSEC) "E" #endif diff --git a/sbin/quotacheck/quotacheck.8 
b/sbin/quotacheck/quotacheck.8 index e8f4d9c..1c34cd3 100644 --- a/sbin/quotacheck/quotacheck.8 +++ b/sbin/quotacheck/quotacheck.8 @@ -100,7 +100,7 @@ is zero, parallel passes are run as per .Xr fsck 8 . This option is deprecated and parallel passes are always run as per -.Xf fsck 8. +.Xr fsck 8 . .It Fl u Only user quotas listed in .Pa /etc/fstab diff --git a/sbin/setkey/setkey.8 b/sbin/setkey/setkey.8 index ffb9115..c66860f 100644 --- a/sbin/setkey/setkey.8 +++ b/sbin/setkey/setkey.8 @@ -674,7 +674,7 @@ add 10.0.11.41 10.0.11.33 esp 0x10001 -A hmac-md5 "authentication!!" ; .Ed -Get the SA information assocaited with first example above: +Get the SA information associated with first example above: .Bd -literal -offset get 3ffe:501:4819::1 3ffe:501:481d::1 ah 123456 ; diff --git a/sbin/spppcontrol/spppcontrol.8 b/sbin/spppcontrol/spppcontrol.8 index 4389c8f..4d948a6 100644 --- a/sbin/spppcontrol/spppcontrol.8 +++ b/sbin/spppcontrol/spppcontrol.8 @@ -25,8 +25,8 @@ .\" $FreeBSD$ .\" .Dd December 30, 2001 -.Os .Dt SPPPCONTROL 8 +.Os .Sh NAME .Nm spppcontrol .Nd display or set parameters for an sppp interface diff --git a/sbin/sysctl/sysctl.c b/sbin/sysctl/sysctl.c index 5d0025b..d96450b 100644 --- a/sbin/sysctl/sysctl.c +++ b/sbin/sysctl/sysctl.c @@ -382,6 +382,7 @@ S_timeval(int l2, void *p) if (*p2 == '\n') *p2 = '\0'; fputs(p1, stdout); + free(p1); return (0); } diff --git a/sbin/tunefs/Makefile b/sbin/tunefs/Makefile index d501d10..d5313c4 100644 --- a/sbin/tunefs/Makefile +++ b/sbin/tunefs/Makefile @@ -6,4 +6,6 @@ DPADD= ${LIBUFS} LDADD= -lufs MAN= tunefs.8 +WARNS= 3 + .include <bsd.prog.mk> diff --git a/sbin/tunefs/tunefs.8 b/sbin/tunefs/tunefs.8 index 53e463c..a883cd4 100644 --- a/sbin/tunefs/tunefs.8 +++ b/sbin/tunefs/tunefs.8 @@ -28,7 +28,7 @@ .\" @(#)tunefs.8 8.2 (Berkeley) 12/11/93 .\" $FreeBSD$ .\" -.Dd October 21, 2009 +.Dd March 6, 2010 .Dt TUNEFS 8 .Os .Sh NAME @@ -40,6 +40,7 @@ .Op Fl a Cm enable | disable .Op Fl e Ar maxbpg .Op Fl f Ar avgfilesize 
+.Op Fl j Cm enable | disable .Op Fl J Cm enable | disable .Op Fl L Ar volname .Op Fl l Cm enable | disable @@ -49,6 +50,7 @@ .Op Fl o Cm space | time .Op Fl p .Op Fl s Ar avgfpdir +.Op Fl S Ar size .Ar special | filesystem .Sh DESCRIPTION The @@ -89,6 +91,8 @@ For file systems with exclusively large files, this parameter should be set higher. .It Fl f Ar avgfilesize Specify the expected average file size. +.It Fl j Cm enable | disable +Turn on/off soft updates journaling. .It Fl J Cm enable | disable Turn on/off gjournal flag. .It Fl L Ar volname @@ -136,6 +140,9 @@ obtained from the utility. .It Fl s Ar avgfpdir Specify the expected number of files per directory. +.It Fl S Ar size +Specify the softdep journal size in bytes. +The minimum is 4M. .El .Pp At least one of the above flags is required. diff --git a/sbin/tunefs/tunefs.c b/sbin/tunefs/tunefs.c index e4adb52..a10b35d 100644 --- a/sbin/tunefs/tunefs.c +++ b/sbin/tunefs/tunefs.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/dinode.h> #include <ufs/ffs/fs.h> +#include <ufs/ufs/dir.h> #include <ctype.h> #include <err.h> @@ -61,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include <paths.h> #include <stdio.h> #include <stdlib.h> +#include <stdint.h> #include <string.h> #include <unistd.h> @@ -72,16 +74,20 @@ struct uufsd disk; void usage(void); void printfs(void); +int journal_alloc(int64_t size); +void journal_clear(void); +void sbdirty(void); int main(int argc, char *argv[]) { - char *avalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue; + char *avalue, *jvalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue; const char *special, *on; const char *name; int active; - int Aflag, aflag, eflag, evalue, fflag, fvalue, Jflag, Lflag, lflag; - int mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag, svalue; + int Aflag, aflag, eflag, evalue, fflag, fvalue, jflag, Jflag, Lflag; + int lflag, mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag; + int svalue, Sflag, Svalue; int ch, 
found_arg, i; const char *chg[2]; struct ufs_args args; @@ -89,13 +95,13 @@ main(int argc, char *argv[]) if (argc < 3) usage(); - Aflag = aflag = eflag = fflag = Jflag = Lflag = lflag = mflag = 0; - Nflag = nflag = oflag = pflag = sflag = 0; - avalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL; - evalue = fvalue = mvalue = ovalue = svalue = 0; + Aflag = aflag = eflag = fflag = jflag = Jflag = Lflag = lflag = 0; + mflag = Nflag = nflag = oflag = pflag = sflag = 0; + avalue = jvalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL; + evalue = fvalue = mvalue = ovalue = svalue = Svalue = 0; active = 0; found_arg = 0; /* At least one arg is required. */ - while ((ch = getopt(argc, argv, "Aa:e:f:J:L:l:m:N:n:o:ps:")) != -1) + while ((ch = getopt(argc, argv, "Aa:e:f:j:J:L:l:m:N:n:o:ps:S:")) != -1) switch (ch) { case 'A': @@ -135,6 +141,18 @@ main(int argc, char *argv[]) fflag = 1; break; + case 'j': + found_arg = 1; + name = "softdep journaled file system"; + jvalue = optarg; + if (strcmp(jvalue, "enable") && + strcmp(jvalue, "disable")) { + errx(10, "bad %s (options are %s)", + name, "`enable' or `disable'"); + } + jflag = 1; + break; + case 'J': found_arg = 1; name = "gjournaled file system"; @@ -240,6 +258,16 @@ main(int argc, char *argv[]) sflag = 1; break; + case 'S': + found_arg = 1; + name = "Softdep Journal Size"; + Svalue = atoi(optarg); + if (Svalue < SUJ_MIN) + errx(10, "%s must be >= %d (was %s)", + name, SUJ_MIN, optarg); + Sflag = 1; + break; + default: usage(); } @@ -310,6 +338,33 @@ main(int argc, char *argv[]) sblock.fs_avgfilesize = fvalue; } } + if (jflag) { + name = "soft updates journaling"; + if (strcmp(jvalue, "enable") == 0) { + if ((sblock.fs_flags & (FS_DOSOFTDEP | FS_SUJ)) == + (FS_DOSOFTDEP | FS_SUJ)) { + warnx("%s remains unchanged as enabled", name); + } else if (sblock.fs_clean == 0) { + warnx("%s cannot be enabled until fsck is run", + name); + } else if (journal_alloc(Svalue) != 0) { + warnx("%s can not be enabled", name); + } 
else { + sblock.fs_flags |= FS_DOSOFTDEP | FS_SUJ; + warnx("%s set", name); + } + } else if (strcmp(jvalue, "disable") == 0) { + if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) { + warnx("%s remains unchanged as disabled", name); + } else { + journal_clear(); + sblock.fs_flags &= ~(FS_DOSOFTDEP | FS_SUJ); + sblock.fs_sujfree = 0; + warnx("%s cleared, " + "remove .sujournal to reclaim space", name); + } + } + } if (Jflag) { name = "gjournal"; if (strcmp(Jvalue, "enable") == 0) { @@ -456,6 +511,500 @@ err: } void +sbdirty(void) +{ + disk.d_fs.fs_flags |= FS_UNCLEAN | FS_NEEDSFSCK; + disk.d_fs.fs_clean = 0; +} + +int blocks; +static char clrbuf[MAXBSIZE]; + +static ufs2_daddr_t +journal_balloc(void) +{ + ufs2_daddr_t blk; + struct cg *cgp; + int valid; + static int contig = 1; + + cgp = &disk.d_cg; + for (;;) { + blk = cgballoc(&disk); + if (blk > 0) + break; + /* + * If we failed to allocate a block from this cg, move to + * the next. + */ + if (cgwrite(&disk) < 0) { + warn("Failed to write updated cg"); + return (-1); + } + while ((valid = cgread(&disk)) == 1) { + /* + * Try to minimize fragmentation by requiring a minimum + * number of blocks present. + */ + if (cgp->cg_cs.cs_nbfree > blocks / 8) + break; + if (contig == 0 && cgp->cg_cs.cs_nbfree) + break; + } + if (valid) + continue; + /* + * Try once through looking only for large contiguous regions + * and again taking any space we can find. + */ + if (contig) { + contig = 0; + disk.d_ccg = 0; + warnx("Journal file fragmented."); + continue; + } + warnx("Failed to find sufficient free blocks for the journal"); + return -1; + } + if (bwrite(&disk, fsbtodb(&sblock, blk), clrbuf, + sblock.fs_bsize) <= 0) { + warn("Failed to initialize new block"); + return -1; + } + return (blk); +} + +/* + * Search a directory block for the SUJ_FILE. 
+ */ +static ino_t +dir_search(ufs2_daddr_t blk, int bytes) +{ + char block[MAXBSIZE]; + struct direct *dp; + int off; + + if (bread(&disk, fsbtodb(&sblock, blk), block, bytes) <= 0) { + warn("Failed to read dir block"); + return (-1); + } + for (off = 0; off < bytes; off += dp->d_reclen) { + dp = (struct direct *)&block[off]; + if (dp->d_reclen == 0) + break; + if (dp->d_ino == 0) + continue; + if (dp->d_namlen != strlen(SUJ_FILE)) + continue; + if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0) + continue; + return (dp->d_ino); + } + + return (0); +} + +/* + * Search in the ROOTINO for the SUJ_FILE. If it exists we can not enable + * journaling. + */ +static ino_t +journal_findfile(void) +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + ino_t ino; + int mode; + void *ip; + int i; + + if (getino(&disk, &ip, ROOTINO, &mode) != 0) { + warn("Failed to get root inode"); + return (-1); + } + dp2 = ip; + dp1 = ip; + if (sblock.fs_magic == FS_UFS1_MAGIC) { + if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) { + warnx("ROOTINO extends beyond direct blocks."); + return (-1); + } + for (i = 0; i < NDADDR; i++) { + if (dp1->di_db[i] == 0) + break; + if ((ino = dir_search(dp1->di_db[i], + sblksize(&sblock, (off_t)dp1->di_size, i))) != 0) + return (ino); + } + } else { + if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) { + warnx("ROOTINO extends beyond direct blocks."); + return (-1); + } + for (i = 0; i < NDADDR; i++) { + if (dp2->di_db[i] == 0) + break; + if ((ino = dir_search(dp2->di_db[i], + sblksize(&sblock, (off_t)dp2->di_size, i))) != 0) + return (ino); + } + } + + return (0); +} + +/* + * Insert the journal at inode 'ino' into directory blk 'blk' at the first + * free offset of 'off'. DIRBLKSIZ blocks after off are initialized as + * empty. 
+ */ +static int +dir_insert(ufs2_daddr_t blk, off_t off, ino_t ino) +{ + struct direct *dp; + char block[MAXBSIZE]; + + if (bread(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) { + warn("Failed to read dir block"); + return (-1); + } + bzero(&block[off], sblock.fs_bsize - off); + dp = (struct direct *)&block[off]; + dp->d_ino = ino; + dp->d_reclen = DIRBLKSIZ; + dp->d_type = DT_REG; + dp->d_namlen = strlen(SUJ_FILE); + bcopy(SUJ_FILE, &dp->d_name, strlen(SUJ_FILE)); + off += DIRBLKSIZ; + for (; off < sblock.fs_bsize; off += DIRBLKSIZ) { + dp = (struct direct *)&block[off]; + dp->d_ino = 0; + dp->d_reclen = DIRBLKSIZ; + dp->d_type = DT_UNKNOWN; + } + if (bwrite(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) { + warn("Failed to write dir block"); + return (-1); + } + return (0); +} + +/* + * Extend a directory block in 'blk' by copying it to a full size block + * and inserting the new journal inode into .sujournal. + */ +static int +dir_extend(ufs2_daddr_t blk, ufs2_daddr_t nblk, off_t size, ino_t ino) +{ + char block[MAXBSIZE]; + + if (bread(&disk, fsbtodb(&sblock, blk), block, size) <= 0) { + warn("Failed to read dir block"); + return (-1); + } + if (bwrite(&disk, fsbtodb(&sblock, nblk), block, size) <= 0) { + warn("Failed to write dir block"); + return (-1); + } + + return dir_insert(nblk, size, ino); +} + +/* + * Insert the journal file into the ROOTINO directory. 
We always extend the + * last frag + */ +static int +journal_insertfile(ino_t ino) +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + void *ip; + ufs2_daddr_t nblk; + ufs2_daddr_t blk; + ufs_lbn_t lbn; + int size; + int mode; + int off; + + if (getino(&disk, &ip, ROOTINO, &mode) != 0) { + warn("Failed to get root inode"); + sbdirty(); + return (-1); + } + dp2 = ip; + dp1 = ip; + blk = 0; + size = 0; + nblk = journal_balloc(); + if (nblk <= 0) + return (-1); + /* + * For simplicity sake we aways extend the ROOTINO into a new + * directory block rather than searching for space and inserting + * into an existing block. However, if the rootino has frags + * have to free them and extend the block. + */ + if (sblock.fs_magic == FS_UFS1_MAGIC) { + lbn = lblkno(&sblock, dp1->di_size); + off = blkoff(&sblock, dp1->di_size); + blk = dp1->di_db[lbn]; + size = sblksize(&sblock, (off_t)dp1->di_size, lbn); + } else { + lbn = lblkno(&sblock, dp2->di_size); + off = blkoff(&sblock, dp2->di_size); + blk = dp2->di_db[lbn]; + size = sblksize(&sblock, (off_t)dp2->di_size, lbn); + } + if (off != 0) { + if (dir_extend(blk, nblk, off, ino) == -1) + return (-1); + } else { + blk = 0; + if (dir_insert(nblk, 0, ino) == -1) + return (-1); + } + if (sblock.fs_magic == FS_UFS1_MAGIC) { + dp1->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE; + dp1->di_db[lbn] = nblk; + dp1->di_size = lblktosize(&sblock, lbn+1); + } else { + dp2->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE; + dp2->di_db[lbn] = nblk; + dp2->di_size = lblktosize(&sblock, lbn+1); + } + if (putino(&disk) < 0) { + warn("Failed to write root inode"); + return (-1); + } + if (cgwrite(&disk) < 0) { + warn("Failed to write updated cg"); + sbdirty(); + return (-1); + } + if (blk) { + if (cgbfree(&disk, blk, size) < 0) { + warn("Failed to write cg"); + return (-1); + } + } + + return (0); +} + +static int +indir_fill(ufs2_daddr_t blk, int level, int *resid) +{ + char indirbuf[MAXBSIZE]; + ufs1_daddr_t *bap1; + ufs2_daddr_t 
*bap2; + ufs2_daddr_t nblk; + int ncnt; + int cnt; + int i; + + bzero(indirbuf, sizeof(indirbuf)); + bap1 = (ufs1_daddr_t *)indirbuf; + bap2 = (void *)bap1; + cnt = 0; + for (i = 0; i < NINDIR(&sblock) && *resid != 0; i++) { + nblk = journal_balloc(); + if (nblk <= 0) + return (-1); + cnt++; + if (sblock.fs_magic == FS_UFS1_MAGIC) + *bap1++ = nblk; + else + *bap2++ = nblk; + if (level != 0) { + ncnt = indir_fill(nblk, level - 1, resid); + if (ncnt <= 0) + return (-1); + cnt += ncnt; + } else + (*resid)--; + } + if (bwrite(&disk, fsbtodb(&sblock, blk), indirbuf, + sblock.fs_bsize) <= 0) { + warn("Failed to write indirect"); + return (-1); + } + return (cnt); +} + +/* + * Clear the flag bits so the journal can be removed. + */ +void +journal_clear(void) +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + ino_t ino; + int mode; + void *ip; + + ino = journal_findfile(); + if (ino == (ino_t)-1 || ino == 0) { + warnx("Journal file does not exist"); + return; + } + printf("Clearing journal flags from inode %d\n", ino); + if (getino(&disk, &ip, ino, &mode) != 0) { + warn("Failed to get journal inode"); + return; + } + dp2 = ip; + dp1 = ip; + if (sblock.fs_magic == FS_UFS1_MAGIC) + dp1->di_flags = 0; + else + dp2->di_flags = 0; + if (putino(&disk) < 0) { + warn("Failed to write journal inode"); + return; + } +} + +int +journal_alloc(int64_t size) +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + ufs2_daddr_t blk; + void *ip; + struct cg *cgp; + int resid; + ino_t ino; + int blks; + int mode; + int i; + + cgp = &disk.d_cg; + ino = 0; + + /* + * If the journal file exists we can't allocate it. + */ + ino = journal_findfile(); + if (ino == (ino_t)-1) + return (-1); + if (ino > 0) { + warnx("Journal file %s already exists, please remove.", + SUJ_FILE); + return (-1); + } + /* + * If the user didn't supply a size pick one based on the filesystem + * size constrained with hardcoded MIN and MAX values. 
We opt for + * 1/1024th of the filesystem up to MAX but not exceeding one CG and + * not less than the MIN. + */ + if (size == 0) { + size = (sblock.fs_size * sblock.fs_bsize) / 1024; + size = MIN(SUJ_MAX, size); + if (size / sblock.fs_fsize > sblock.fs_fpg) + size = sblock.fs_fpg * sblock.fs_fsize; + size = MAX(SUJ_MIN, size); + } + resid = blocks = size / sblock.fs_bsize; + if (sblock.fs_cstotal.cs_nbfree < blocks) { + warn("Insufficient free space for %jd byte journal", size); + return (-1); + } + /* + * Find a cg with enough blocks to satisfy the journal + * size. Presently the journal does not span cgs. + */ + while (cgread(&disk) == 1) { + if (cgp->cg_cs.cs_nifree == 0) + continue; + ino = cgialloc(&disk); + if (ino <= 0) + break; + printf("Using inode %d in cg %d for %jd byte journal\n", + ino, cgp->cg_cgx, size); + if (getino(&disk, &ip, ino, &mode) != 0) { + warn("Failed to get allocated inode"); + sbdirty(); + goto out; + } + /* + * We leave fields unrelated to the number of allocated + * blocks and size uninitialized. This causes legacy + * fsck implementations to clear the inode. 
+ */ + dp2 = ip; + dp1 = ip; + if (sblock.fs_magic == FS_UFS1_MAGIC) { + bzero(dp1, sizeof(*dp1)); + dp1->di_size = size; + dp1->di_mode = IFREG | IREAD; + dp1->di_nlink = 1; + dp1->di_flags = SF_IMMUTABLE | SF_NOUNLINK | UF_NODUMP; + } else { + bzero(dp2, sizeof(*dp2)); + dp2->di_size = size; + dp2->di_mode = IFREG | IREAD; + dp2->di_nlink = 1; + dp2->di_flags = SF_IMMUTABLE | SF_NOUNLINK | UF_NODUMP; + } + for (i = 0; i < NDADDR && resid; i++, resid--) { + blk = journal_balloc(); + if (blk <= 0) + goto out; + if (sblock.fs_magic == FS_UFS1_MAGIC) { + dp1->di_db[i] = blk; + dp1->di_blocks++; + } else { + dp2->di_db[i] = blk; + dp2->di_blocks++; + } + } + for (i = 0; i < NIADDR && resid; i++) { + blk = journal_balloc(); + if (blk <= 0) + goto out; + blks = indir_fill(blk, i, &resid) + 1; + if (blks <= 0) { + sbdirty(); + goto out; + } + if (sblock.fs_magic == FS_UFS1_MAGIC) { + dp1->di_ib[i] = blk; + dp1->di_blocks += blks; + } else { + dp2->di_ib[i] = blk; + dp2->di_blocks += blks; + } + } + if (sblock.fs_magic == FS_UFS1_MAGIC) + dp1->di_blocks *= sblock.fs_bsize / disk.d_bsize; + else + dp2->di_blocks *= sblock.fs_bsize / disk.d_bsize; + if (putino(&disk) < 0) { + warn("Failed to write inode"); + sbdirty(); + return (-1); + } + if (cgwrite(&disk) < 0) { + warn("Failed to write updated cg"); + sbdirty(); + return (-1); + } + if (journal_insertfile(ino) < 0) { + sbdirty(); + return (-1); + } + sblock.fs_sujfree = 0; + return (0); + } + warnx("Insufficient free space for the journal."); +out: + return (-1); +} + +void usage(void) { fprintf(stderr, "%s\n%s\n%s\n%s\n", @@ -477,6 +1026,8 @@ printfs(void) (sblock.fs_flags & FS_MULTILABEL)? "enabled" : "disabled"); warnx("soft updates: (-n) %s", (sblock.fs_flags & FS_DOSOFTDEP)? "enabled" : "disabled"); + warnx("soft update journaling: (-j) %s", + (sblock.fs_flags & FS_SUJ)? "enabled" : "disabled"); warnx("gjournal: (-J) %s", (sblock.fs_flags & FS_GJOURNAL)? 
"enabled" : "disabled"); warnx("maximum blocks per file in a cylinder group: (-e) %d", |