| author | rgrimes <rgrimes@FreeBSD.org> | 1994-05-24 10:09:53 +0000 |
|---|---|---|
| committer | rgrimes <rgrimes@FreeBSD.org> | 1994-05-24 10:09:53 +0000 |
| commit | 8fb65ce818b3e3c6f165b583b910af24000768a5 (patch) | |
| tree | ba751e4f2166aefec707c9d7401c7ff432506642 /sys/ufs | |
| parent | a6ce65d368e623088a4c1a29865889f431b15420 (diff) | |
BSD 4.4 Lite Kernel Sources
Diffstat (limited to 'sys/ufs')
46 files changed, 17206 insertions, 0 deletions
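
The bulk of this import is ffs_alloc.c, whose comments describe the cylinder-group search order shared by every allocator in the file: try the preferred group, quadratically rehash across the other groups, then fall back to a brute-force sweep. As a reading aid before the listing, the stand-alone C sketch below mirrors that search order from ffs_hashalloc(); the helper try_alloc_in_cg() and the parameter ncg are hypothetical stand-ins and are not part of the committed sources.

```c
/*
 * Reading aid only: a stand-alone sketch of the cylinder-group search order
 * documented in ffs_hashalloc() in ffs_alloc.c below.  try_alloc_in_cg()
 * and ncg are hypothetical stand-ins, not part of the committed sources.
 */
typedef long (*cg_allocator)(int cg);

static long
hashalloc_sketch(int cg, int ncg, cg_allocator try_alloc_in_cg)
{
	long result;
	int i, icg = cg;

	/* 1: try the preferred cylinder group. */
	if ((result = try_alloc_in_cg(cg)) != 0)
		return (result);
	/* 2: quadratic rehash: probe cg+1, cg+3, cg+7, ... modulo ncg. */
	for (i = 1; i < ncg; i *= 2) {
		cg += i;
		if (cg >= ncg)
			cg -= ncg;
		if ((result = try_alloc_in_cg(cg)) != 0)
			return (result);
	}
	/* 3: brute force over the remaining groups, starting at icg + 2. */
	cg = (icg + 2) % ncg;
	for (i = 2; i < ncg; i++) {
		if ((result = try_alloc_in_cg(cg)) != 0)
			return (result);
		if (++cg == ncg)
			cg = 0;
	}
	return (0);	/* nothing free anywhere; the caller reports ENOSPC */
}
```

In the committed code the allocator argument is ffs_alloccg, ffs_nodealloccg, or ffs_clusteralloc, and a zero return is what sends ffs_alloc() and ffs_valloc() to their "file system full" / "out of inodes" paths.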
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c new file mode 100644 index 0000000..cdd2e4b --- /dev/null +++ b/sys/ufs/ffs/ffs_alloc.c @@ -0,0 +1,1474 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_alloc.c 8.8 (Berkeley) 2/21/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/kernel.h> +#include <sys/syslog.h> + +#include <vm/vm.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + +#include <ufs/ffs/fs.h> +#include <ufs/ffs/ffs_extern.h> + +extern u_long nextgennumber; + +static daddr_t ffs_alloccg __P((struct inode *, int, daddr_t, int)); +static daddr_t ffs_alloccgblk __P((struct fs *, struct cg *, daddr_t)); +static daddr_t ffs_clusteralloc __P((struct inode *, int, daddr_t, int)); +static ino_t ffs_dirpref __P((struct fs *)); +static daddr_t ffs_fragextend __P((struct inode *, int, long, int, int)); +static void ffs_fserr __P((struct fs *, u_int, char *)); +static u_long ffs_hashalloc + __P((struct inode *, int, long, int, u_long (*)())); +static ino_t ffs_nodealloccg __P((struct inode *, int, daddr_t, int)); +static daddr_t ffs_mapsearch __P((struct fs *, struct cg *, daddr_t, int)); + +/* + * Allocate a block in the file system. + * + * The size of the requested block is given, which must be some + * multiple of fs_fsize and <= fs_bsize. + * A preference may be optionally specified. If a preference is given + * the following hierarchy is used to allocate a block: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate a block in the same cylinder group. 
+ * 4) quadradically rehash into other cylinder groups, until an + * available block is located. + * If no block preference is given the following heirarchy is used + * to allocate a block: + * 1) allocate a block in the cylinder group that contains the + * inode for the file. + * 2) quadradically rehash into other cylinder groups, until an + * available block is located. + */ +ffs_alloc(ip, lbn, bpref, size, cred, bnp) + register struct inode *ip; + daddr_t lbn, bpref; + int size; + struct ucred *cred; + daddr_t *bnp; +{ + register struct fs *fs; + daddr_t bno; + int cg, error; + + *bnp = 0; + fs = ip->i_fs; +#ifdef DIAGNOSTIC + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); + panic("ffs_alloc: bad size"); + } + if (cred == NOCRED) + panic("ffs_alloc: missing credential\n"); +#endif /* DIAGNOSTIC */ + if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) + goto nospace; + if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0) + goto nospace; +#ifdef QUOTA + if (error = chkdq(ip, (long)btodb(size), cred, 0)) + return (error); +#endif + if (bpref >= fs->fs_size) + bpref = 0; + if (bpref == 0) + cg = ino_to_cg(fs, ip->i_number); + else + cg = dtog(fs, bpref); + bno = (daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size, + (u_long (*)())ffs_alloccg); + if (bno > 0) { + ip->i_blocks += btodb(size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bnp = bno; + return (0); + } +#ifdef QUOTA + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, (long)-btodb(size), cred, FORCE); +#endif +nospace: + ffs_fserr(fs, cred->cr_uid, "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Reallocate a fragment to a bigger size + * + * The number and size of the old block is given, and a preference + * and new size is also specified. The allocator attempts to extend + * the original block. Failing that, the regular block allocator is + * invoked to get an appropriate block. + */ +ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) + register struct inode *ip; + daddr_t lbprev; + daddr_t bpref; + int osize, nsize; + struct ucred *cred; + struct buf **bpp; +{ + register struct fs *fs; + struct buf *bp; + int cg, request, error; + daddr_t bprev, bno; + + *bpp = 0; + fs = ip->i_fs; +#ifdef DIAGNOSTIC + if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || + (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { + printf( + "dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); + panic("ffs_realloccg: bad size"); + } + if (cred == NOCRED) + panic("ffs_realloccg: missing credential\n"); +#endif /* DIAGNOSTIC */ + if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0) + goto nospace; + if ((bprev = ip->i_db[lbprev]) == 0) { + printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt); + panic("ffs_realloccg: bad bprev"); + } + /* + * Allocate the extra space in the buffer. + */ + if (error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) { + brelse(bp); + return (error); + } +#ifdef QUOTA + if (error = chkdq(ip, (long)btodb(nsize - osize), cred, 0)) { + brelse(bp); + return (error); + } +#endif + /* + * Check for extension in the existing location. 
+ */ + cg = dtog(fs, bprev); + if (bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) { + if (bp->b_blkno != fsbtodb(fs, bno)) + panic("bad blockno"); + ip->i_blocks += btodb(nsize - osize); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + bzero((char *)bp->b_data + osize, (u_int)nsize - osize); + *bpp = bp; + return (0); + } + /* + * Allocate a new disk location. + */ + if (bpref >= fs->fs_size) + bpref = 0; + switch ((int)fs->fs_optim) { + case FS_OPTSPACE: + /* + * Allocate an exact sized fragment. Although this makes + * best use of space, we will waste time relocating it if + * the file continues to grow. If the fragmentation is + * less than half of the minimum free reserve, we choose + * to begin optimizing for time. + */ + request = nsize; + if (fs->fs_minfree < 5 || + fs->fs_cstotal.cs_nffree > + fs->fs_dsize * fs->fs_minfree / (2 * 100)) + break; + log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", + fs->fs_fsmnt); + fs->fs_optim = FS_OPTTIME; + break; + case FS_OPTTIME: + /* + * At this point we have discovered a file that is trying to + * grow a small fragment to a larger fragment. To save time, + * we allocate a full sized block, then free the unused portion. + * If the file continues to grow, the `ffs_fragextend' call + * above will be able to grow it in place without further + * copying. If aberrant programs cause disk fragmentation to + * grow within 2% of the free reserve, we choose to begin + * optimizing for space. + */ + request = fs->fs_bsize; + if (fs->fs_cstotal.cs_nffree < + fs->fs_dsize * (fs->fs_minfree - 2) / 100) + break; + log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", + fs->fs_fsmnt); + fs->fs_optim = FS_OPTSPACE; + break; + default: + printf("dev = 0x%x, optim = %d, fs = %s\n", + ip->i_dev, fs->fs_optim, fs->fs_fsmnt); + panic("ffs_realloccg: bad optim"); + /* NOTREACHED */ + } + bno = (daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request, + (u_long (*)())ffs_alloccg); + if (bno > 0) { + bp->b_blkno = fsbtodb(fs, bno); + (void) vnode_pager_uncache(ITOV(ip)); + ffs_blkfree(ip, bprev, (long)osize); + if (nsize < request) + ffs_blkfree(ip, bno + numfrags(fs, nsize), + (long)(request - nsize)); + ip->i_blocks += btodb(nsize - osize); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + bzero((char *)bp->b_data + osize, (u_int)nsize - osize); + *bpp = bp; + return (0); + } +#ifdef QUOTA + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE); +#endif + brelse(bp); +nospace: + /* + * no space available + */ + ffs_fserr(fs, cred->cr_uid, "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Reallocate a sequence of blocks into a contiguous sequence of blocks. + * + * The vnode and an array of buffer pointers for a range of sequential + * logical blocks to be made contiguous is given. The allocator attempts + * to find a range of sequential blocks starting as close as possible to + * an fs_rotdelay offset from the end of the allocation for the logical + * block immediately preceeding the current range. If successful, the + * physical block numbers in the buffer pointers and in the inode are + * changed to reflect the new allocation. If unsuccessful, the allocation + * is left unchanged. The success in doing the reallocation is returned. + * Note that the error return is not reflected back to the user. 
Rather + * the previous block allocation will be used. + */ +#include <sys/sysctl.h> +int doasyncfree = 1; +struct ctldebug debug14 = { "doasyncfree", &doasyncfree }; +int +ffs_reallocblks(ap) + struct vop_reallocblks_args /* { + struct vnode *a_vp; + struct cluster_save *a_buflist; + } */ *ap; +{ + struct fs *fs; + struct inode *ip; + struct vnode *vp; + struct buf *sbp, *ebp; + daddr_t *bap, *sbap, *ebap; + struct cluster_save *buflist; + daddr_t start_lbn, end_lbn, soff, eoff, newblk, blkno; + struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; + int i, len, start_lvl, end_lvl, pref, ssize; + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_fs; + if (fs->fs_contigsumsize <= 0) + return (ENOSPC); + buflist = ap->a_buflist; + len = buflist->bs_nchildren; + start_lbn = buflist->bs_children[0]->b_lblkno; + end_lbn = start_lbn + len - 1; +#ifdef DIAGNOSTIC + for (i = 1; i < len; i++) + if (buflist->bs_children[i]->b_lblkno != start_lbn + i) + panic("ffs_reallocblks: non-cluster"); +#endif + /* + * If the latest allocation is in a new cylinder group, assume that + * the filesystem has decided to move and do not force it back to + * the previous cylinder group. + */ + if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != + dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) + return (ENOSPC); + if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || + ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) + return (ENOSPC); + /* + * Get the starting offset and block map for the first block. + */ + if (start_lvl == 0) { + sbap = &ip->i_db[0]; + soff = start_lbn; + } else { + idp = &start_ap[start_lvl - 1]; + if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { + brelse(sbp); + return (ENOSPC); + } + sbap = (daddr_t *)sbp->b_data; + soff = idp->in_off; + } + /* + * Find the preferred location for the cluster. + */ + pref = ffs_blkpref(ip, start_lbn, soff, sbap); + /* + * If the block range spans two block maps, get the second map. + */ + if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { + ssize = len; + } else { +#ifdef DIAGNOSTIC + if (start_ap[start_lvl-1].in_lbn == idp->in_lbn) + panic("ffs_reallocblk: start == end"); +#endif + ssize = len - (idp->in_off + 1); + if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) + goto fail; + ebap = (daddr_t *)ebp->b_data; + } + /* + * Search the block map looking for an allocation of the desired size. + */ + if ((newblk = (daddr_t)ffs_hashalloc(ip, dtog(fs, pref), (long)pref, + len, (u_long (*)())ffs_clusteralloc)) == 0) + goto fail; + /* + * We have found a new contiguous block. + * + * First we have to replace the old block pointers with the new + * block pointers in the inode and indirect blocks associated + * with the file. + */ + blkno = newblk; + for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { + if (i == ssize) + bap = ebap; +#ifdef DIAGNOSTIC + if (buflist->bs_children[i]->b_blkno != fsbtodb(fs, *bap)) + panic("ffs_reallocblks: alloc mismatch"); +#endif + *bap++ = blkno; + } + /* + * Next we must write out the modified inode and indirect blocks. + * For strict correctness, the writes should be synchronous since + * the old block values may have been written to disk. In practise + * they are almost never written, but if we are concerned about + * strict correctness, the `doasyncfree' flag should be set to zero. + * + * The test on `doasyncfree' should be changed to test a flag + * that shows whether the associated buffers and inodes have + * been written. 
The flag should be set when the cluster is + * started and cleared whenever the buffer or inode is flushed. + * We can then check below to see if it is set, and do the + * synchronous write only when it has been cleared. + */ + if (sbap != &ip->i_db[0]) { + if (doasyncfree) + bdwrite(sbp); + else + bwrite(sbp); + } else { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (!doasyncfree) + VOP_UPDATE(vp, &time, &time, MNT_WAIT); + } + if (ssize < len) + if (doasyncfree) + bdwrite(ebp); + else + bwrite(ebp); + /* + * Last, free the old blocks and assign the new blocks to the buffers. + */ + for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { + ffs_blkfree(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), + fs->fs_bsize); + buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); + } + return (0); + +fail: + if (ssize < len) + brelse(ebp); + if (sbap != &ip->i_db[0]) + brelse(sbp); + return (ENOSPC); +} + +/* + * Allocate an inode in the file system. + * + * If allocating a directory, use ffs_dirpref to select the inode. + * If allocating in a directory, the following hierarchy is followed: + * 1) allocate the preferred inode. + * 2) allocate an inode in the same cylinder group. + * 3) quadradically rehash into other cylinder groups, until an + * available inode is located. + * If no inode preference is given the following heirarchy is used + * to allocate an inode: + * 1) allocate an inode in cylinder group 0. + * 2) quadradically rehash into other cylinder groups, until an + * available inode is located. + */ +ffs_valloc(ap) + struct vop_valloc_args /* { + struct vnode *a_pvp; + int a_mode; + struct ucred *a_cred; + struct vnode **a_vpp; + } */ *ap; +{ + register struct vnode *pvp = ap->a_pvp; + register struct inode *pip; + register struct fs *fs; + register struct inode *ip; + mode_t mode = ap->a_mode; + ino_t ino, ipref; + int cg, error; + + *ap->a_vpp = NULL; + pip = VTOI(pvp); + fs = pip->i_fs; + if (fs->fs_cstotal.cs_nifree == 0) + goto noinodes; + + if ((mode & IFMT) == IFDIR) + ipref = ffs_dirpref(fs); + else + ipref = pip->i_number; + if (ipref >= fs->fs_ncg * fs->fs_ipg) + ipref = 0; + cg = ino_to_cg(fs, ipref); + ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_nodealloccg); + if (ino == 0) + goto noinodes; + error = VFS_VGET(pvp->v_mount, ino, ap->a_vpp); + if (error) { + VOP_VFREE(pvp, ino, mode); + return (error); + } + ip = VTOI(*ap->a_vpp); + if (ip->i_mode) { + printf("mode = 0%o, inum = %d, fs = %s\n", + ip->i_mode, ip->i_number, fs->fs_fsmnt); + panic("ffs_valloc: dup alloc"); + } + if (ip->i_blocks) { /* XXX */ + printf("free inode %s/%d had %d blocks\n", + fs->fs_fsmnt, ino, ip->i_blocks); + ip->i_blocks = 0; + } + ip->i_flags = 0; + /* + * Set up a new generation number for this inode. + */ + if (++nextgennumber < (u_long)time.tv_sec) + nextgennumber = time.tv_sec; + ip->i_gen = nextgennumber; + return (0); +noinodes: + ffs_fserr(fs, ap->a_cred->cr_uid, "out of inodes"); + uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Find a cylinder to place a directory. + * + * The policy implemented by this algorithm is to select from + * among those cylinder groups with above the average number of + * free inodes, the one with the smallest number of directories. 
+ */ +static ino_t +ffs_dirpref(fs) + register struct fs *fs; +{ + int cg, minndir, mincg, avgifree; + + avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; + minndir = fs->fs_ipg; + mincg = 0; + for (cg = 0; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + return ((ino_t)(fs->fs_ipg * mincg)); +} + +/* + * Select the desired position for the next block in a file. The file is + * logically divided into sections. The first section is composed of the + * direct blocks. Each additional section contains fs_maxbpg blocks. + * + * If no blocks have been allocated in the first section, the policy is to + * request a block in the same cylinder group as the inode that describes + * the file. If no blocks have been allocated in any other section, the + * policy is to place the section in a cylinder group with a greater than + * average number of free blocks. An appropriate cylinder group is found + * by using a rotor that sweeps the cylinder groups. When a new group of + * blocks is needed, the sweep begins in the cylinder group following the + * cylinder group from which the previous allocation was made. The sweep + * continues until a cylinder group with greater than the average number + * of free blocks is found. If the allocation is for the first block in an + * indirect block, the information on the previous allocation is unavailable; + * here a best guess is made based upon the logical block number being + * allocated. + * + * If a section is already partially allocated, the policy is to + * contiguously allocate fs_maxcontig blocks. The end of one of these + * contiguous blocks and the beginning of the next is physically separated + * so that the disk head will be in transit between them for at least + * fs_rotdelay milliseconds. This is to allow time for the processor to + * schedule another I/O transfer. + */ +daddr_t +ffs_blkpref(ip, lbn, indx, bap) + struct inode *ip; + daddr_t lbn; + int indx; + daddr_t *bap; +{ + register struct fs *fs; + register int cg; + int avgbfree, startcg; + daddr_t nextblk; + + fs = ip->i_fs; + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + if (lbn < NDADDR) { + cg = ino_to_cg(fs, ip->i_number); + return (fs->fs_fpg * cg + fs->fs_frag); + } + /* + * Find a cylinder with greater than average number of + * unused data blocks. + */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = + ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, bap[indx - 1]) + 1; + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (fs->fs_fpg * cg + fs->fs_frag); + } + for (cg = 0; cg <= startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (fs->fs_fpg * cg + fs->fs_frag); + } + return (NULL); + } + /* + * One or more previous blocks have been laid out. If less + * than fs_maxcontig previous blocks are contiguous, the + * next block is requested contiguously, otherwise it is + * requested rotationally delayed by fs_rotdelay milliseconds. 
+ */ + nextblk = bap[indx - 1] + fs->fs_frag; + if (indx < fs->fs_maxcontig || bap[indx - fs->fs_maxcontig] + + blkstofrags(fs, fs->fs_maxcontig) != nextblk) + return (nextblk); + if (fs->fs_rotdelay != 0) + /* + * Here we convert ms of delay to frags as: + * (frags) = (ms) * (rev/sec) * (sect/rev) / + * ((sect/frag) * (ms/sec)) + * then round up to the next block. + */ + nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / + (NSPF(fs) * 1000), fs->fs_frag); + return (nextblk); +} + +/* + * Implement the cylinder overflow algorithm. + * + * The policy implemented by this algorithm is: + * 1) allocate the block in its requested cylinder group. + * 2) quadradically rehash on the cylinder group number. + * 3) brute force search for a free block. + */ +/*VARARGS5*/ +static u_long +ffs_hashalloc(ip, cg, pref, size, allocator) + struct inode *ip; + int cg; + long pref; + int size; /* size for data blocks, mode for inodes */ + u_long (*allocator)(); +{ + register struct fs *fs; + long result; + int i, icg = cg; + + fs = ip->i_fs; + /* + * 1: preferred cylinder group + */ + result = (*allocator)(ip, cg, pref, size); + if (result) + return (result); + /* + * 2: quadratic rehash + */ + for (i = 1; i < fs->fs_ncg; i *= 2) { + cg += i; + if (cg >= fs->fs_ncg) + cg -= fs->fs_ncg; + result = (*allocator)(ip, cg, 0, size); + if (result) + return (result); + } + /* + * 3: brute force search + * Note that we start at i == 2, since 0 was checked initially, + * and 1 is always checked in the quadratic rehash. + */ + cg = (icg + 2) % fs->fs_ncg; + for (i = 2; i < fs->fs_ncg; i++) { + result = (*allocator)(ip, cg, 0, size); + if (result) + return (result); + cg++; + if (cg == fs->fs_ncg) + cg = 0; + } + return (NULL); +} + +/* + * Determine whether a fragment can be extended. + * + * Check to see if the necessary fragments are available, and + * if they are, allocate them. + */ +static daddr_t +ffs_fragextend(ip, cg, bprev, osize, nsize) + struct inode *ip; + int cg; + long bprev; + int osize, nsize; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + long bno; + int frags, bbase; + int i, error; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) + return (NULL); + frags = numfrags(fs, nsize); + bbase = fragnum(fs, bprev); + if (bbase > fragnum(fs, (bprev + frags - 1))) { + /* cannot extend across a block boundary */ + return (NULL); + } + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + bno = dtogd(fs, bprev); + for (i = numfrags(fs, osize); i < frags; i++) + if (isclr(cg_blksfree(cgp), bno + i)) { + brelse(bp); + return (NULL); + } + /* + * the current fragment can be extended + * deduct the count on fragment being extended into + * increase the count on the remaining fragment (if any) + * allocate the extended piece + */ + for (i = frags; i < fs->fs_frag - bbase; i++) + if (isclr(cg_blksfree(cgp), bno + i)) + break; + cgp->cg_frsum[i - numfrags(fs, osize)]--; + if (i != frags) + cgp->cg_frsum[i - frags]++; + for (i = numfrags(fs, osize); i < frags; i++) { + clrbit(cg_blksfree(cgp), bno + i); + cgp->cg_cs.cs_nffree--; + fs->fs_cstotal.cs_nffree--; + fs->fs_cs(fs, cg).cs_nffree--; + } + fs->fs_fmod = 1; + bdwrite(bp); + return (bprev); +} + +/* + * Determine whether a block can be allocated. 
+ * + * Check to see if a block of the appropriate size is available, + * and if it is, allocate it. + */ +static daddr_t +ffs_alloccg(ip, cg, bpref, size) + struct inode *ip; + int cg; + daddr_t bpref; + int size; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + register int i; + int error, bno, frags, allocsiz; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) + return (NULL); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp) || + (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + if (size == fs->fs_bsize) { + bno = ffs_alloccgblk(fs, cgp, bpref); + bdwrite(bp); + return (bno); + } + /* + * check to see if any fragments are already available + * allocsiz is the size which will be allocated, hacking + * it down to a smaller size if necessary + */ + frags = numfrags(fs, size); + for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) + if (cgp->cg_frsum[allocsiz] != 0) + break; + if (allocsiz == fs->fs_frag) { + /* + * no fragments were available, so a block will be + * allocated, and hacked up + */ + if (cgp->cg_cs.cs_nbfree == 0) { + brelse(bp); + return (NULL); + } + bno = ffs_alloccgblk(fs, cgp, bpref); + bpref = dtogd(fs, bno); + for (i = frags; i < fs->fs_frag; i++) + setbit(cg_blksfree(cgp), bpref + i); + i = fs->fs_frag - frags; + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + fs->fs_fmod = 1; + cgp->cg_frsum[i]++; + bdwrite(bp); + return (bno); + } + bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); + if (bno < 0) { + brelse(bp); + return (NULL); + } + for (i = 0; i < frags; i++) + clrbit(cg_blksfree(cgp), bno + i); + cgp->cg_cs.cs_nffree -= frags; + fs->fs_cstotal.cs_nffree -= frags; + fs->fs_cs(fs, cg).cs_nffree -= frags; + fs->fs_fmod = 1; + cgp->cg_frsum[allocsiz]--; + if (frags != allocsiz) + cgp->cg_frsum[allocsiz - frags]++; + bdwrite(bp); + return (cg * fs->fs_fpg + bno); +} + +/* + * Allocate a block in a cylinder group. + * + * This algorithm implements the following policy: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate the next available block on the block rotor for the + * specified cylinder group. + * Note that this routine only allocates fs_bsize blocks; these + * blocks may be fragmented by the routine that allocates them. + */ +static daddr_t +ffs_alloccgblk(fs, cgp, bpref) + register struct fs *fs; + register struct cg *cgp; + daddr_t bpref; +{ + daddr_t bno, blkno; + int cylno, pos, delta; + short *cylbp; + register int i; + + if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) { + bpref = cgp->cg_rotor; + goto norot; + } + bpref = blknum(fs, bpref); + bpref = dtogd(fs, bpref); + /* + * if the requested block is available, use it + */ + if (ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bpref))) { + bno = bpref; + goto gotit; + } + /* + * check for a block available on the same cylinder + */ + cylno = cbtocylno(fs, bpref); + if (cg_blktot(cgp)[cylno] == 0) + goto norot; + if (fs->fs_cpc == 0) { + /* + * Block layout information is not available. + * Leaving bpref unchanged means we take the + * next available free block following the one + * we just allocated. Hopefully this will at + * least hit a track cache on drives of unknown + * geometry (e.g. SCSI). 
+ */ + goto norot; + } + /* + * check the summary information to see if a block is + * available in the requested cylinder starting at the + * requested rotational position and proceeding around. + */ + cylbp = cg_blks(fs, cgp, cylno); + pos = cbtorpos(fs, bpref); + for (i = pos; i < fs->fs_nrpos; i++) + if (cylbp[i] > 0) + break; + if (i == fs->fs_nrpos) + for (i = 0; i < pos; i++) + if (cylbp[i] > 0) + break; + if (cylbp[i] > 0) { + /* + * found a rotational position, now find the actual + * block. A panic if none is actually there. + */ + pos = cylno % fs->fs_cpc; + bno = (cylno - pos) * fs->fs_spc / NSPB(fs); + if (fs_postbl(fs, pos)[i] == -1) { + printf("pos = %d, i = %d, fs = %s\n", + pos, i, fs->fs_fsmnt); + panic("ffs_alloccgblk: cyl groups corrupted"); + } + for (i = fs_postbl(fs, pos)[i];; ) { + if (ffs_isblock(fs, cg_blksfree(cgp), bno + i)) { + bno = blkstofrags(fs, (bno + i)); + goto gotit; + } + delta = fs_rotbl(fs)[i]; + if (delta <= 0 || + delta + i > fragstoblks(fs, fs->fs_fpg)) + break; + i += delta; + } + printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); + panic("ffs_alloccgblk: can't find blk in cyl"); + } +norot: + /* + * no blocks in the requested cylinder, so take next + * available one in this cylinder group. + */ + bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); + if (bno < 0) + return (NULL); + cgp->cg_rotor = bno; +gotit: + blkno = fragstoblks(fs, bno); + ffs_clrblock(fs, cg_blksfree(cgp), (long)blkno); + ffs_clusteracct(fs, cgp, blkno, -1); + cgp->cg_cs.cs_nbfree--; + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; + cylno = cbtocylno(fs, bno); + cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--; + cg_blktot(cgp)[cylno]--; + fs->fs_fmod = 1; + return (cgp->cg_cgx * fs->fs_fpg + bno); +} + +/* + * Determine whether a cluster can be allocated. + * + * We do not currently check for optimal rotational layout if there + * are multiple choices in the same cylinder group. Instead we just + * take the first one that we find following bpref. + */ +static daddr_t +ffs_clusteralloc(ip, cg, bpref, len) + struct inode *ip; + int cg; + daddr_t bpref; + int len; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + int i, run, bno, bit, map; + u_char *mapp; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nbfree < len) + return (NULL); + if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, + NOCRED, &bp)) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) + goto fail; + /* + * Check to see if a cluster of the needed size (or bigger) is + * available in this cylinder group. + */ + for (i = len; i <= fs->fs_contigsumsize; i++) + if (cg_clustersum(cgp)[i] > 0) + break; + if (i > fs->fs_contigsumsize) + goto fail; + /* + * Search the cluster map to find a big enough cluster. + * We take the first one that we find, even if it is larger + * than we need as we prefer to get one close to the previous + * block allocation. We do not search before the current + * preference point as we do not want to allocate a block + * that is allocated before the previous one (as we will + * then have to wait for another pass of the elevator + * algorithm before it will be read). We prefer to fail and + * be recalled to try an allocation in the next cylinder group. 
+ */ + if (dtog(fs, bpref) != cg) + bpref = 0; + else + bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref))); + mapp = &cg_clustersfree(cgp)[bpref / NBBY]; + map = *mapp++; + bit = 1 << (bpref % NBBY); + for (run = 0, i = bpref; i < cgp->cg_nclusterblks; i++) { + if ((map & bit) == 0) { + run = 0; + } else { + run++; + if (run == len) + break; + } + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + if (i == cgp->cg_nclusterblks) + goto fail; + /* + * Allocate the cluster that we have found. + */ + bno = cg * fs->fs_fpg + blkstofrags(fs, i - run + 1); + len = blkstofrags(fs, len); + for (i = 0; i < len; i += fs->fs_frag) + if (ffs_alloccgblk(fs, cgp, bno + i) != bno + i) + panic("ffs_clusteralloc: lost block"); + brelse(bp); + return (bno); + +fail: + brelse(bp); + return (0); +} + +/* + * Determine whether an inode can be allocated. + * + * Check to see if an inode is available, and if it is, + * allocate it using the following policy: + * 1) allocate the requested inode. + * 2) allocate the next available inode after the requested + * inode in the specified cylinder group. + */ +static ino_t +ffs_nodealloccg(ip, cg, ipref, mode) + struct inode *ip; + int cg; + daddr_t ipref; + int mode; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + int error, start, len, loc, map, i; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nifree == 0) + return (NULL); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + if (ipref) { + ipref %= fs->fs_ipg; + if (isclr(cg_inosused(cgp), ipref)) + goto gotit; + } + start = cgp->cg_irotor / NBBY; + len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); + loc = skpc(0xff, len, &cg_inosused(cgp)[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &cg_inosused(cgp)[0]); + if (loc == 0) { + printf("cg = %d, irotor = %d, fs = %s\n", + cg, cgp->cg_irotor, fs->fs_fsmnt); + panic("ffs_nodealloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = cg_inosused(cgp)[i]; + ipref = i * NBBY; + for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { + if ((map & i) == 0) { + cgp->cg_irotor = ipref; + goto gotit; + } + } + printf("fs = %s\n", fs->fs_fsmnt); + panic("ffs_nodealloccg: block not in map"); + /* NOTREACHED */ +gotit: + setbit(cg_inosused(cgp), ipref); + cgp->cg_cs.cs_nifree--; + fs->fs_cstotal.cs_nifree--; + fs->fs_cs(fs, cg).cs_nifree--; + fs->fs_fmod = 1; + if ((mode & IFMT) == IFDIR) { + cgp->cg_cs.cs_ndir++; + fs->fs_cstotal.cs_ndir++; + fs->fs_cs(fs, cg).cs_ndir++; + } + bdwrite(bp); + return (cg * fs->fs_ipg + ipref); +} + +/* + * Free a block or fragment. + * + * The specified block or fragment is placed back in the + * free map. If a fragment is deallocated, a possible + * block reassembly is checked. 
+ */ +ffs_blkfree(ip, bno, size) + register struct inode *ip; + daddr_t bno; + long size; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + daddr_t blkno; + int i, error, cg, blk, frags, bbase; + + fs = ip->i_fs; + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); + panic("blkfree: bad size"); + } + cg = dtog(fs, bno); + if ((u_int)bno >= fs->fs_size) { + printf("bad block %d, ino %d\n", bno, ip->i_number); + ffs_fserr(fs, ip->i_uid, "bad block"); + return; + } + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return; + } + cgp->cg_time = time.tv_sec; + bno = dtogd(fs, bno); + if (size == fs->fs_bsize) { + blkno = fragstoblks(fs, bno); + if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) { + printf("dev = 0x%x, block = %d, fs = %s\n", + ip->i_dev, bno, fs->fs_fsmnt); + panic("blkfree: freeing free block"); + } + ffs_setblock(fs, cg_blksfree(cgp), blkno); + ffs_clusteracct(fs, cgp, blkno, 1); + cgp->cg_cs.cs_nbfree++; + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + i = cbtocylno(fs, bno); + cg_blks(fs, cgp, i)[cbtorpos(fs, bno)]++; + cg_blktot(cgp)[i]++; + } else { + bbase = bno - fragnum(fs, bno); + /* + * decrement the counts associated with the old frags + */ + blk = blkmap(fs, cg_blksfree(cgp), bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1); + /* + * deallocate the fragment + */ + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isset(cg_blksfree(cgp), bno + i)) { + printf("dev = 0x%x, block = %d, fs = %s\n", + ip->i_dev, bno + i, fs->fs_fsmnt); + panic("blkfree: freeing free frag"); + } + setbit(cg_blksfree(cgp), bno + i); + } + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + /* + * add back in counts associated with the new frags + */ + blk = blkmap(fs, cg_blksfree(cgp), bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); + /* + * if a complete block has been reassembled, account for it + */ + blkno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) { + cgp->cg_cs.cs_nffree -= fs->fs_frag; + fs->fs_cstotal.cs_nffree -= fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; + ffs_clusteracct(fs, cgp, blkno, 1); + cgp->cg_cs.cs_nbfree++; + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + i = cbtocylno(fs, bbase); + cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++; + cg_blktot(cgp)[i]++; + } + } + fs->fs_fmod = 1; + bdwrite(bp); +} + +/* + * Free an inode. + * + * The specified inode is placed back in the free map. 
+ */ +int +ffs_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + register struct fs *fs; + register struct cg *cgp; + register struct inode *pip; + ino_t ino = ap->a_ino; + struct buf *bp; + int error, cg; + + pip = VTOI(ap->a_pvp); + fs = pip->i_fs; + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + panic("ifree: range: dev = 0x%x, ino = %d, fs = %s\n", + pip->i_dev, ino, fs->fs_fsmnt); + cg = ino_to_cg(fs, ino); + error = bread(pip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (0); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (0); + } + cgp->cg_time = time.tv_sec; + ino %= fs->fs_ipg; + if (isclr(cg_inosused(cgp), ino)) { + printf("dev = 0x%x, ino = %d, fs = %s\n", + pip->i_dev, ino, fs->fs_fsmnt); + if (fs->fs_ronly == 0) + panic("ifree: freeing free inode"); + } + clrbit(cg_inosused(cgp), ino); + if (ino < cgp->cg_irotor) + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree++; + fs->fs_cstotal.cs_nifree++; + fs->fs_cs(fs, cg).cs_nifree++; + if ((ap->a_mode & IFMT) == IFDIR) { + cgp->cg_cs.cs_ndir--; + fs->fs_cstotal.cs_ndir--; + fs->fs_cs(fs, cg).cs_ndir--; + } + fs->fs_fmod = 1; + bdwrite(bp); + return (0); +} + +/* + * Find a block of the specified size in the specified cylinder group. + * + * It is a panic if a request is made to find a block if none are + * available. + */ +static daddr_t +ffs_mapsearch(fs, cgp, bpref, allocsiz) + register struct fs *fs; + register struct cg *cgp; + daddr_t bpref; + int allocsiz; +{ + daddr_t bno; + int start, len, loc, i; + int blk, field, subfield, pos; + + /* + * find the fragment by searching through the free block + * map for an appropriate bit pattern + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = cgp->cg_frotor / NBBY; + len = howmany(fs->fs_fpg, NBBY) - start; + loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[start], + (u_char *)fragtbl[fs->fs_frag], + (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); + if (loc == 0) { + len = start + 1; + start = 0; + loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[0], + (u_char *)fragtbl[fs->fs_frag], + (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); + if (loc == 0) { + printf("start = %d, len = %d, fs = %s\n", + start, len, fs->fs_fsmnt); + panic("ffs_alloccg: map corrupted"); + /* NOTREACHED */ + } + } + bno = (start + len - loc) * NBBY; + cgp->cg_frotor = bno; + /* + * found the byte in the map + * sift through the bits to find the selected frag + */ + for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { + blk = blkmap(fs, cg_blksfree(cgp), bno); + blk <<= 1; + field = around[allocsiz]; + subfield = inside[allocsiz]; + for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { + if ((blk & field) == subfield) + return (bno + pos); + field <<= 1; + subfield <<= 1; + } + } + printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt); + panic("ffs_alloccg: block not in map"); + return (-1); +} + +/* + * Update the cluster map because of an allocation or free. + * + * Cnt == 1 means free; cnt == -1 means allocating. + */ +ffs_clusteracct(fs, cgp, blkno, cnt) + struct fs *fs; + struct cg *cgp; + daddr_t blkno; + int cnt; +{ + long *sump; + u_char *freemapp, *mapp; + int i, start, end, forw, back, map, bit; + + if (fs->fs_contigsumsize <= 0) + return; + freemapp = cg_clustersfree(cgp); + sump = cg_clustersum(cgp); + /* + * Allocate or clear the actual block. 
+ */ + if (cnt > 0) + setbit(freemapp, blkno); + else + clrbit(freemapp, blkno); + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if (end >= cgp->cg_nclusterblks) + end = cgp->cg_nclusterblks; + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + sump[i] += cnt; + if (back > 0) + sump[back] -= cnt; + if (forw > 0) + sump[forw] -= cnt; +} + +/* + * Fserr prints the name of a file system with an error diagnostic. + * + * The form of the error message is: + * fs: error message + */ +static void +ffs_fserr(fs, uid, cp) + struct fs *fs; + u_int uid; + char *cp; +{ + + log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp); +} diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c new file mode 100644 index 0000000..752feec --- /dev/null +++ b/sys/ufs/ffs/ffs_balloc.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/vnode.h> + +#include <vm/vm.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/ffs/fs.h> +#include <ufs/ffs/ffs_extern.h> + +/* + * Balloc defines the structure of file system storage + * by allocating the physical blocks on a device given + * the inode and the logical block number in a file. + */ +ffs_balloc(ip, bn, size, cred, bpp, flags) + register struct inode *ip; + register daddr_t bn; + int size; + struct ucred *cred; + struct buf **bpp; + int flags; +{ + register struct fs *fs; + register daddr_t nb; + struct buf *bp, *nbp; + struct vnode *vp = ITOV(ip); + struct indir indirs[NIADDR + 2]; + daddr_t newb, lbn, *bap, pref; + int osize, nsize, num, i, error; + + *bpp = NULL; + if (bn < 0) + return (EFBIG); + fs = ip->i_fs; + lbn = bn; + + /* + * If the next write will extend the file into a new block, + * and the file is currently composed of a fragment + * this fragment has to be extended to be a full block. + */ + nb = lblkno(fs, ip->i_size); + if (nb < NDADDR && nb < bn) { + osize = blksize(fs, ip, nb); + if (osize < fs->fs_bsize && osize > 0) { + error = ffs_realloccg(ip, nb, + ffs_blkpref(ip, nb, (int)nb, &ip->i_db[0]), + osize, (int)fs->fs_bsize, cred, &bp); + if (error) + return (error); + ip->i_size = (nb + 1) * fs->fs_bsize; + vnode_pager_setsize(vp, (u_long)ip->i_size); + ip->i_db[nb] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (flags & B_SYNC) + bwrite(bp); + else + bawrite(bp); + } + } + /* + * The first NDADDR blocks are direct blocks + */ + if (bn < NDADDR) { + nb = ip->i_db[bn]; + if (nb != 0 && ip->i_size >= (bn + 1) * fs->fs_bsize) { + error = bread(vp, bn, fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + *bpp = bp; + return (0); + } + if (nb != 0) { + /* + * Consider need to reallocate a fragment. + */ + osize = fragroundup(fs, blkoff(fs, ip->i_size)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + error = bread(vp, bn, osize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + } else { + error = ffs_realloccg(ip, bn, + ffs_blkpref(ip, bn, (int)bn, &ip->i_db[0]), + osize, nsize, cred, &bp); + if (error) + return (error); + } + } else { + if (ip->i_size < (bn + 1) * fs->fs_bsize) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + error = ffs_alloc(ip, bn, + ffs_blkpref(ip, bn, (int)bn, &ip->i_db[0]), + nsize, cred, &newb); + if (error) + return (error); + bp = getblk(vp, bn, nsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + if (flags & B_CLRBUF) + clrbuf(bp); + } + ip->i_db[bn] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bpp = bp; + return (0); + } + /* + * Determine the number of levels of indirection. + */ + pref = 0; + if (error = ufs_getlbns(vp, bn, indirs, &num)) + return(error); +#ifdef DIAGNOSTIC + if (num < 1) + panic ("ffs_balloc: ufs_bmaparray returned indirect block\n"); +#endif + /* + * Fetch the first indirect block allocating if necessary. 
+ */ + --num; + nb = ip->i_ib[indirs[0].in_off]; + if (nb == 0) { + pref = ffs_blkpref(ip, lbn, 0, (daddr_t *)0); + if (error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + cred, &newb)) + return (error); + nb = newb; + bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + clrbuf(bp); + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if (error = bwrite(bp)) { + ffs_blkfree(ip, nb, fs->fs_bsize); + return (error); + } + ip->i_ib[indirs[0].in_off] = newb; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * Fetch through the indirect blocks, allocating as necessary. + */ + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + bap = (daddr_t *)bp->b_data; + nb = bap[indirs[i].in_off]; + if (i == num) + break; + i += 1; + if (nb != 0) { + brelse(bp); + continue; + } + if (pref == 0) + pref = ffs_blkpref(ip, lbn, 0, (daddr_t *)0); + if (error = + ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) { + brelse(bp); + return (error); + } + nb = newb; + nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + clrbuf(nbp); + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if (error = bwrite(nbp)) { + ffs_blkfree(ip, nb, fs->fs_bsize); + brelse(bp); + return (error); + } + bap[indirs[i - 1].in_off] = nb; + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + } + /* + * Get the data block, allocating if necessary. + */ + if (nb == 0) { + pref = ffs_blkpref(ip, lbn, indirs[i].in_off, &bap[0]); + if (error = ffs_alloc(ip, + lbn, pref, (int)fs->fs_bsize, cred, &newb)) { + brelse(bp); + return (error); + } + nb = newb; + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & B_CLRBUF) + clrbuf(nbp); + bap[indirs[i].in_off] = nb; + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & B_SYNC) { + bwrite(bp); + } else { + bdwrite(bp); + } + *bpp = nbp; + return (0); + } + brelse(bp); + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); + if (error) { + brelse(nbp); + return (error); + } + } else { + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + } + *bpp = nbp; + return (0); +} diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h new file mode 100644 index 0000000..ab467a2 --- /dev/null +++ b/sys/ufs/ffs/ffs_extern.h @@ -0,0 +1,101 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_extern.h 8.3 (Berkeley) 4/16/94 + */ + +struct buf; +struct fid; +struct fs; +struct inode; +struct mount; +struct nameidata; +struct proc; +struct statfs; +struct timeval; +struct ucred; +struct uio; +struct vnode; +struct mbuf; + +__BEGIN_DECLS +int ffs_alloc __P((struct inode *, + daddr_t, daddr_t, int, struct ucred *, daddr_t *)); +int ffs_balloc __P((struct inode *, + daddr_t, int, struct ucred *, struct buf **, int)); +int ffs_blkatoff __P((struct vop_blkatoff_args *)); +int ffs_blkfree __P((struct inode *, daddr_t, long)); +daddr_t ffs_blkpref __P((struct inode *, daddr_t, int, daddr_t *)); +int ffs_bmap __P((struct vop_bmap_args *)); +void ffs_clrblock __P((struct fs *, u_char *, daddr_t)); +int ffs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, + struct vnode **, int *, struct ucred **)); +void ffs_fragacct __P((struct fs *, int, long [], int)); +int ffs_fsync __P((struct vop_fsync_args *)); +int ffs_init __P((void)); +int ffs_isblock __P((struct fs *, u_char *, daddr_t)); +int ffs_mount __P((struct mount *, + char *, caddr_t, struct nameidata *, struct proc *)); +int ffs_mountfs __P((struct vnode *, struct mount *, struct proc *)); +int ffs_mountroot __P((void)); +int ffs_read __P((struct vop_read_args *)); +int ffs_reallocblks __P((struct vop_reallocblks_args *)); +int ffs_realloccg __P((struct inode *, + daddr_t, daddr_t, int, int, struct ucred *, struct buf **)); +int ffs_reclaim __P((struct vop_reclaim_args *)); +void ffs_setblock __P((struct fs *, u_char *, daddr_t)); +int ffs_statfs __P((struct mount *, struct statfs *, struct proc *)); +int ffs_sync __P((struct mount *, int, struct ucred *, struct proc *)); +int ffs_truncate __P((struct vop_truncate_args *)); +int ffs_unmount __P((struct mount *, int, struct proc *)); +int ffs_update __P((struct vop_update_args *)); +int ffs_valloc __P((struct vop_valloc_args *)); +int ffs_vfree __P((struct vop_vfree_args *)); +int ffs_vget __P((struct mount *, ino_t, struct vnode **)); +int ffs_vptofh __P((struct vnode *, struct fid *)); +int ffs_write __P((struct vop_write_args *)); + +int bwrite(); /* FFS needs a bwrite routine. 
XXX */ + +#ifdef DIAGNOSTIC +void ffs_checkoverlap __P((struct buf *, struct inode *)); +#endif +__END_DECLS + +extern int (**ffs_vnodeop_p)(); +extern int (**ffs_specop_p)(); +#ifdef FIFO +extern int (**ffs_fifoop_p)(); +#define FFS_FIFOOPS ffs_fifoop_p +#else +#define FFS_FIFOOPS NULL +#endif diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c new file mode 100644 index 0000000..b45aee5 --- /dev/null +++ b/sys/ufs/ffs/ffs_inode.c @@ -0,0 +1,488 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_inode.c 8.5 (Berkeley) 12/30/93 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/trace.h> +#include <sys/resourcevar.h> + +#include <vm/vm.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/ffs/fs.h> +#include <ufs/ffs/ffs_extern.h> + +static int ffs_indirtrunc __P((struct inode *, daddr_t, daddr_t, daddr_t, int, + long *)); + +int +ffs_init() +{ + return (ufs_init()); +} + +/* + * Update the access, modified, and inode change times as specified by the + * IACCESS, IUPDATE, and ICHANGE flags respectively. The IMODIFIED flag is + * used to specify that the inode needs to be updated but that the times have + * already been set. The access and modified times are taken from the second + * and third parameters; the inode change time is always taken from the current + * time. If waitfor is set, then wait for the disk write of the inode to + * complete. 
+ */ +int +ffs_update(ap) + struct vop_update_args /* { + struct vnode *a_vp; + struct timeval *a_access; + struct timeval *a_modify; + int a_waitfor; + } */ *ap; +{ + register struct fs *fs; + struct buf *bp; + struct inode *ip; + int error; + + ip = VTOI(ap->a_vp); + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) { + ip->i_flag &= + ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); + return (0); + } + if ((ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) + return (0); + if (ip->i_flag & IN_ACCESS) + ip->i_atime.ts_sec = ap->a_access->tv_sec; + if (ip->i_flag & IN_UPDATE) { + ip->i_mtime.ts_sec = ap->a_modify->tv_sec; + ip->i_modrev++; + } + if (ip->i_flag & IN_CHANGE) + ip->i_ctime.ts_sec = time.tv_sec; + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); + fs = ip->i_fs; + /* + * Ensure that uid and gid are correct. This is a temporary + * fix until fsck has been changed to do the update. + */ + if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ + ip->i_din.di_ouid = ip->i_uid; /* XXX */ + ip->i_din.di_ogid = ip->i_gid; /* XXX */ + } /* XXX */ + if (error = bread(ip->i_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp)) { + brelse(bp); + return (error); + } + *((struct dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)) = ip->i_din; + if (ap->a_waitfor) + return (bwrite(bp)); + else { + bdwrite(bp); + return (0); + } +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode oip to at most length size, freeing the + * disk blocks. + */ +ffs_truncate(ap) + struct vop_truncate_args /* { + struct vnode *a_vp; + off_t a_length; + int a_flags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *ovp = ap->a_vp; + register daddr_t lastblock; + register struct inode *oip; + daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; + daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; + off_t length = ap->a_length; + register struct fs *fs; + struct buf *bp; + int offset, size, level; + long count, nblocks, vflags, blocksreleased = 0; + struct timeval tv; + register int i; + int aflags, error, allerror; + off_t osize; + + oip = VTOI(ovp); + tv = time; + if (ovp->v_type == VLNK && + oip->i_size < ovp->v_mount->mnt_maxsymlinklen) { +#ifdef DIAGNOSTIC + if (length != 0) + panic("ffs_truncate: partial truncate of symlink"); +#endif + bzero((char *)&oip->i_shortlink, (u_int)oip->i_size); + oip->i_size = 0; + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(ovp, &tv, &tv, 1)); + } + if (oip->i_size == length) { + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(ovp, &tv, &tv, 0)); + } +#ifdef QUOTA + if (error = getinoquota(oip)) + return (error); +#endif + vnode_pager_setsize(ovp, (u_long)length); + fs = oip->i_fs; + osize = oip->i_size; + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of oszie is 0, length will be at least 1. 
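+ * For example, assuming an 8192 byte block size, growing a 100 byte file
+ * to length 8192 calls ffs_balloc() for logical block 0 with a size of
+ * blkoff(fs, 8191) + 1 = 8192, so the last byte of the new length is
+ * backed by real storage before i_size is changed.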
+ */ + if (osize < length) { + offset = blkoff(fs, length - 1); + lbn = lblkno(fs, length - 1); + aflags = B_CLRBUF; + if (ap->a_flags & IO_SYNC) + aflags |= B_SYNC; + if (error = ffs_balloc(oip, lbn, offset + 1, ap->a_cred, &bp, + aflags)) + return (error); + oip->i_size = length; + (void) vnode_pager_uncache(ovp); + if (aflags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(ovp, &tv, &tv, 1)); + } + /* + * Shorten the size of the file. If the file is not being + * truncated to a block boundry, the contents of the + * partial block following the end of the file must be + * zero'ed in case it ever become accessable again because + * of subsequent file growth. + */ + offset = blkoff(fs, length); + if (offset == 0) { + oip->i_size = length; + } else { + lbn = lblkno(fs, length); + aflags = B_CLRBUF; + if (ap->a_flags & IO_SYNC) + aflags |= B_SYNC; + if (error = ffs_balloc(oip, lbn, offset, ap->a_cred, &bp, + aflags)) + return (error); + oip->i_size = length; + size = blksize(fs, oip, lbn); + (void) vnode_pager_uncache(ovp); + bzero((char *)bp->b_data + offset, (u_int)(size - offset)); + allocbuf(bp, size); + if (aflags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + } + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btodb(fs->fs_bsize); + /* + * Update file and block pointers on disk before we start freeing + * blocks. If we crash before free'ing blocks below, the blocks + * will be returned to the free list. lastiblock values are also + * normalized to -1 for calls to ffs_indirtrunc below. + */ + bcopy((caddr_t)&oip->i_db[0], (caddr_t)oldblks, sizeof oldblks); + for (level = TRIPLE; level >= SINGLE; level--) + if (lastiblock[level] < 0) { + oip->i_ib[level] = 0; + lastiblock[level] = -1; + } + for (i = NDADDR - 1; i > lastblock; i--) + oip->i_db[i] = 0; + oip->i_flag |= IN_CHANGE | IN_UPDATE; + if (error = VOP_UPDATE(ovp, &tv, &tv, MNT_WAIT)) + allerror = error; + /* + * Having written the new inode to disk, save its new configuration + * and put back the old block pointers long enough to process them. + * Note that we save the new block configuration so we can check it + * when we are done. + */ + bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof newblks); + bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof oldblks); + oip->i_size = osize; + vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; + allerror = vinvalbuf(ovp, vflags, ap->a_cred, ap->a_p, 0, 0); + + /* + * Indirect blocks first. + */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + bn = oip->i_ib[level]; + if (bn != 0) { + error = ffs_indirtrunc(oip, indir_lbn[level], + fsbtodb(fs, bn), lastiblock[level], level, &count); + if (error) + allerror = error; + blocksreleased += count; + if (lastiblock[level] < 0) { + oip->i_ib[level] = 0; + ffs_blkfree(oip, bn, fs->fs_bsize); + blocksreleased += nblocks; + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. 
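+ * (These are freed last to first: with lastblock == -1, i.e. a truncation
+ * to length zero, the loop below releases i_db[NDADDR - 1] down through
+ * i_db[0], skipping entries that were never allocated.)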
+ */ + for (i = NDADDR - 1; i > lastblock; i--) { + register long bsize; + + bn = oip->i_db[i]; + if (bn == 0) + continue; + oip->i_db[i] = 0; + bsize = blksize(fs, oip, i); + ffs_blkfree(oip, bn, bsize); + blocksreleased += btodb(bsize); + } + if (lastblock < 0) + goto done; + + /* + * Finally, look for a change in size of the + * last direct block; release any frags. + */ + bn = oip->i_db[lastblock]; + if (bn != 0) { + long oldspace, newspace; + + /* + * Calculate amount of space we're giving + * back as old block size minus new block size. + */ + oldspace = blksize(fs, oip, lastblock); + oip->i_size = length; + newspace = blksize(fs, oip, lastblock); + if (newspace == 0) + panic("itrunc: newspace"); + if (oldspace - newspace > 0) { + /* + * Block number of space to be free'd is + * the old block # plus the number of frags + * required for the storage we're keeping. + */ + bn += numfrags(fs, newspace); + ffs_blkfree(oip, bn, oldspace - newspace); + blocksreleased += btodb(oldspace - newspace); + } + } +done: +#ifdef DIAGNOSTIC + for (level = SINGLE; level <= TRIPLE; level++) + if (newblks[NDADDR + level] != oip->i_ib[level]) + panic("itrunc1"); + for (i = 0; i < NDADDR; i++) + if (newblks[i] != oip->i_db[i]) + panic("itrunc2"); + if (length == 0 && + (ovp->v_dirtyblkhd.lh_first || ovp->v_cleanblkhd.lh_first)) + panic("itrunc3"); +#endif /* DIAGNOSTIC */ + /* + * Put back the real size. + */ + oip->i_size = length; + oip->i_blocks -= blocksreleased; + if (oip->i_blocks < 0) /* sanity */ + oip->i_blocks = 0; + oip->i_flag |= IN_CHANGE; +#ifdef QUOTA + (void) chkdq(oip, -blocksreleased, NOCRED, 0); +#endif + return (allerror); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + * + * NB: triple indirect blocks are untested. + */ +static int +ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) + register struct inode *ip; + daddr_t lbn, lastbn; + daddr_t dbn; + int level; + long *countp; +{ + register int i; + struct buf *bp; + register struct fs *fs = ip->i_fs; + register daddr_t *bap; + struct vnode *vp; + daddr_t *copy, nb, nlbn, last; + long blkcount, factor; + int nblocks, blocksreleased = 0; + int error = 0, allerror = 0; + + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = 1; + for (i = SINGLE; i < level; i++) + factor *= NINDIR(fs); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btodb(fs->fs_bsize); + /* + * Get buffer of block pointers, zero those entries corresponding + * to blocks to be free'd, and update on disk copy first. Since + * double(triple) indirect before single(double) indirect, calls + * to bmap on these blocks will fail. However, we already have + * the on disk address, so we have to set the b_blkno field + * explicitly instead of letting bread do everything for us. + */ + vp = ITOV(ip); + bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0); + if (bp->b_flags & (B_DONE | B_DELWRI)) { + /* Braces must be here in case trace evaluates to nothing. 
*/ + trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn); + } else { + trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn); + curproc->p_stats->p_ru.ru_inblock++; /* pay for read */ + bp->b_flags |= B_READ; + if (bp->b_bcount > bp->b_bufsize) + panic("ffs_indirtrunc: bad buffer size"); + bp->b_blkno = dbn; + VOP_STRATEGY(bp); + error = biowait(bp); + } + if (error) { + brelse(bp); + *countp = 0; + return (error); + } + + bap = (daddr_t *)bp->b_data; + MALLOC(copy, daddr_t *, fs->fs_bsize, M_TEMP, M_WAITOK); + bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->fs_bsize); + bzero((caddr_t)&bap[last + 1], + (u_int)(NINDIR(fs) - (last + 1)) * sizeof (daddr_t)); + if (last == -1) + bp->b_flags |= B_INVAL; + error = bwrite(bp); + if (error) + allerror = error; + bap = copy; + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; + i--, nlbn += factor) { + nb = bap[i]; + if (nb == 0) + continue; + if (level > SINGLE) { + if (error = ffs_indirtrunc(ip, nlbn, + fsbtodb(fs, nb), (daddr_t)-1, level - 1, &blkcount)) + allerror = error; + blocksreleased += blkcount; + } + ffs_blkfree(ip, nb, fs->fs_bsize); + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. + */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + nb = bap[i]; + if (nb != 0) { + if (error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + last, level - 1, &blkcount)) + allerror = error; + blocksreleased += blkcount; + } + } + FREE(copy, M_TEMP); + *countp = blocksreleased; + return (allerror); +} diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c new file mode 100644 index 0000000..c251b16 --- /dev/null +++ b/sys/ufs/ffs/ffs_subr.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93 + */ + +#include <sys/param.h> +#include <ufs/ffs/fs.h> + +#ifdef KERNEL +#include <sys/systm.h> +#include <sys/vnode.h> +#include <ufs/ffs/ffs_extern.h> +#include <sys/buf.h> +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + +/* + * Return buffer with the contents of block "offset" from the beginning of + * directory "ip". If "res" is non-zero, fill it in with a pointer to the + * remaining space in the directory. + */ +int +ffs_blkatoff(ap) + struct vop_blkatoff_args /* { + struct vnode *a_vp; + off_t a_offset; + char **a_res; + struct buf **a_bpp; + } */ *ap; +{ + struct inode *ip; + register struct fs *fs; + struct buf *bp; + daddr_t lbn; + int bsize, error; + + ip = VTOI(ap->a_vp); + fs = ip->i_fs; + lbn = lblkno(fs, ap->a_offset); + bsize = blksize(fs, ip, lbn); + + *ap->a_bpp = NULL; + if (error = bread(ap->a_vp, lbn, bsize, NOCRED, &bp)) { + brelse(bp); + return (error); + } + if (ap->a_res) + *ap->a_res = (char *)bp->b_data + blkoff(fs, ap->a_offset); + *ap->a_bpp = bp; + return (0); +} +#endif + +/* + * Update the frsum fields to reflect addition or deletion + * of some frags. + */ +void +ffs_fragacct(fs, fragmap, fraglist, cnt) + struct fs *fs; + int fragmap; + long fraglist[]; + int cnt; +{ + int inblk; + register int field, subfield; + register int siz, pos; + + inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; + fragmap <<= 1; + for (siz = 1; siz < fs->fs_frag; siz++) { + if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) + continue; + field = around[siz]; + subfield = inside[siz]; + for (pos = siz; pos <= fs->fs_frag; pos++) { + if ((fragmap & field) == subfield) { + fraglist[siz] += cnt; + pos += siz; + field <<= siz; + subfield <<= siz; + } + field <<= 1; + subfield <<= 1; + } + } +} + +#if defined(KERNEL) && defined(DIAGNOSTIC) +void +ffs_checkoverlap(bp, ip) + struct buf *bp; + struct inode *ip; +{ + register struct buf *ebp, *ep; + register daddr_t start, last; + struct vnode *vp; + + ebp = &buf[nbuf]; + start = bp->b_blkno; + last = start + btodb(bp->b_bcount) - 1; + for (ep = buf; ep < ebp; ep++) { + if (ep == bp || (ep->b_flags & B_INVAL) || + ep->b_vp == NULLVP) + continue; + if (VOP_BMAP(ep->b_vp, (daddr_t)0, &vp, (daddr_t)0, NULL)) + continue; + if (vp != ip->i_devvp) + continue; + /* look for overlap */ + if (ep->b_bcount == 0 || ep->b_blkno > last || + ep->b_blkno + btodb(ep->b_bcount) <= start) + continue; + vprint("Disk overlap", vp); + (void)printf("\tstart %d, end %d overlap start %d, end %d\n", + start, last, ep->b_blkno, + ep->b_blkno + btodb(ep->b_bcount) - 1); + panic("Disk buffer overlap"); + } +} +#endif /* DIAGNOSTIC */ + +/* + * block operations + * + * check if a block is available + */ +int +ffs_isblock(fs, cp, h) + struct fs *fs; + unsigned char *cp; + daddr_t h; +{ + unsigned char mask; + + switch ((int)fs->fs_frag) { + case 8: + return (cp[h] == 0xff); + case 4: + mask = 0x0f << ((h & 0x1) << 2); + return ((cp[h >> 1] & mask) == mask); + case 2: + mask = 0x03 << 
((h & 0x3) << 1); + return ((cp[h >> 2] & mask) == mask); + case 1: + mask = 0x01 << (h & 0x7); + return ((cp[h >> 3] & mask) == mask); + default: + panic("ffs_isblock"); + } +} + +/* + * take a block out of the map + */ +void +ffs_clrblock(fs, cp, h) + struct fs *fs; + u_char *cp; + daddr_t h; +{ + + switch ((int)fs->fs_frag) { + case 8: + cp[h] = 0; + return; + case 4: + cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); + return; + case 2: + cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); + return; + case 1: + cp[h >> 3] &= ~(0x01 << (h & 0x7)); + return; + default: + panic("ffs_clrblock"); + } +} + +/* + * put a block into the map + */ +void +ffs_setblock(fs, cp, h) + struct fs *fs; + unsigned char *cp; + daddr_t h; +{ + + switch ((int)fs->fs_frag) { + + case 8: + cp[h] = 0xff; + return; + case 4: + cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); + return; + case 2: + cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); + return; + case 1: + cp[h >> 3] |= (0x01 << (h & 0x7)); + return; + default: + panic("ffs_setblock"); + } +} diff --git a/sys/ufs/ffs/ffs_tables.c b/sys/ufs/ffs/ffs_tables.c new file mode 100644 index 0000000..8cf46b0 --- /dev/null +++ b/sys/ufs/ffs/ffs_tables.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_tables.c 8.1 (Berkeley) 6/11/93 + */ + +#include <sys/param.h> + +/* + * Bit patterns for identifying fragments in the block map + * used as ((map & around) == inside) + */ +int around[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff +}; +int inside[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe +}; + +/* + * Given a block map bit pattern, the frag tables tell whether a + * particular size fragment is available. 
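+ * For example, fragtbl8[0x0f] is 0x08: a map byte whose low four
+ * fragments are free has a free fragment of size 4, i.e. bit
+ * (1 << (4 - 1)) is set, per the usage shown below.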
+ * + * used as: + * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map] { + * at least one fragment of the indicated size is available + * } + * + * These tables are used by the scanc instruction on the VAX to + * quickly find an appropriate fragment. + */ +u_char fragtbl124[256] = { + 0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e, + 0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e, + 0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce, + 0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a, +}; + +u_char fragtbl8[256] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, + 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c, + 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +/* + * The actual fragtbl 
array. + */ +u_char *fragtbl[MAXFRAG + 1] = { + 0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8, +}; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c new file mode 100644 index 0000000..505dd5d --- /dev/null +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -0,0 +1,843 @@ +/* + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/socket.h> +#include <sys/mount.h> +#include <sys/buf.h> +#include <sys/mbuf.h> +#include <sys/file.h> +#include <sys/disklabel.h> +#include <sys/ioctl.h> +#include <sys/errno.h> +#include <sys/malloc.h> + +#include <miscfs/specfs/specdev.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/ffs/fs.h> +#include <ufs/ffs/ffs_extern.h> + +int ffs_sbupdate __P((struct ufsmount *, int)); + +struct vfsops ufs_vfsops = { + ffs_mount, + ufs_start, + ffs_unmount, + ufs_root, + ufs_quotactl, + ffs_statfs, + ffs_sync, + ffs_vget, + ffs_fhtovp, + ffs_vptofh, + ffs_init, +}; + +extern u_long nextgennumber; + +/* + * Called by main() when ufs is going to be mounted as root. + * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "root_device" + +ffs_mountroot() +{ + extern struct vnode *rootvp; + register struct fs *fs; + register struct mount *mp; + struct proc *p = curproc; /* XXX */ + struct ufsmount *ump; + u_int size; + int error; + + /* + * Get vnodes for swapdev and rootdev. 
+ */ + if (bdevvp(swapdev, &swapdev_vp) || bdevvp(rootdev, &rootvp)) + panic("ffs_mountroot: can't setup bdevvp's"); + + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = &ufs_vfsops; + mp->mnt_flag = MNT_RDONLY; + if (error = ffs_mountfs(rootvp, mp, p)) { + free(mp, M_MOUNT); + return (error); + } + if (error = vfs_lock(mp)) { + (void)ffs_unmount(mp, 0, p); + free(mp, M_MOUNT); + return (error); + } + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mp->mnt_flag |= MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + bzero(fs->fs_fsmnt, sizeof(fs->fs_fsmnt)); + fs->fs_fsmnt[0] = '/'; + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void)ffs_statfs(mp, &mp->mnt_stat, p); + vfs_unlock(mp); + inittodr(fs->fs_time); + return (0); +} + +/* + * VFS Operations. + * + * mount system call + */ +int +ffs_mount(mp, path, data, ndp, p) + register struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct vnode *devvp; + struct ufs_args args; + struct ufsmount *ump; + register struct fs *fs; + u_int size; + int error, flags; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct ufs_args))) + return (error); + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. + */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + error = 0; + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + if (vfs_busy(mp)) + return (EBUSY); + error = ffs_flushfiles(mp, flags, p); + vfs_unbusy(mp); + } + if (!error && (mp->mnt_flag & MNT_RELOAD)) + error = ffs_reload(mp, ndp->ni_cnd.cn_cred, p); + if (error) + return (error); + if (fs->fs_ronly && (mp->mnt_flag & MNT_WANTRDWR)) + fs->fs_ronly = 0; + if (args.fspec == 0) { + /* + * Process export requests. + */ + return (vfs_export(mp, &ump->um_export, &args.export)); + } + } + /* + * Not an update, or updating the name: look up the name + * and verify that it refers to a sensible block device. + */ + NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); + if (error = namei(ndp)) + return (error); + devvp = ndp->ni_vp; + + if (devvp->v_type != VBLK) { + vrele(devvp); + return (ENOTBLK); + } + if (major(devvp->v_rdev) >= nblkdev) { + vrele(devvp); + return (ENXIO); + } + if ((mp->mnt_flag & MNT_UPDATE) == 0) + error = ffs_mountfs(devvp, mp, p); + else { + if (devvp != ump->um_devvp) + error = EINVAL; /* needs translation */ + else + vrele(devvp); + } + if (error) { + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_fs; + (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); + bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void)ffs_statfs(mp, &mp->mnt_stat, p); + return (0); +} + +/* + * Reload all incore data for a filesystem (used after running fsck on + * the root filesystem and finding things to fix). The filesystem must + * be mounted read-only. 
+ * + * Things to do to update the mount: + * 1) invalidate all cached meta-data. + * 2) re-read superblock from disk. + * 3) re-read summary information from disk. + * 4) invalidate all inactive vnodes. + * 5) invalidate all cached file data. + * 6) re-read inode data for all active vnodes. + */ +ffs_reload(mountp, cred, p) + register struct mount *mountp; + struct ucred *cred; + struct proc *p; +{ + register struct vnode *vp, *nvp, *devvp; + struct inode *ip; + struct csum *space; + struct buf *bp; + struct fs *fs; + int i, blks, size, error; + + if ((mountp->mnt_flag & MNT_RDONLY) == 0) + return (EINVAL); + /* + * Step 1: invalidate all cached meta-data. + */ + devvp = VFSTOUFS(mountp)->um_devvp; + if (vinvalbuf(devvp, 0, cred, p, 0, 0)) + panic("ffs_reload: dirty1"); + /* + * Step 2: re-read superblock from disk. + */ + if (error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) + return (error); + fs = (struct fs *)bp->b_data; + if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) { + brelse(bp); + return (EIO); /* XXX needs translation */ + } + fs = VFSTOUFS(mountp)->um_fs; + bcopy(&fs->fs_csp[0], &((struct fs *)bp->b_data)->fs_csp[0], + sizeof(fs->fs_csp)); + bcopy(bp->b_data, fs, (u_int)fs->fs_sbsize); + if (fs->fs_sbsize < SBSIZE) + bp->b_flags |= B_INVAL; + brelse(bp); + ffs_oldfscompat(fs); + /* + * Step 3: re-read summary information from disk. + */ + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = fs->fs_csp[0]; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + if (error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, + NOCRED, &bp)) + return (error); + bcopy(bp->b_data, fs->fs_csp[fragstoblks(fs, i)], (u_int)size); + brelse(bp); + } +loop: + for (vp = mountp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { + nvp = vp->v_mntvnodes.le_next; + /* + * Step 4: invalidate all inactive vnodes. + */ + if (vp->v_usecount == 0) { + vgone(vp); + continue; + } + /* + * Step 5: invalidate all cached file data. + */ + if (vget(vp, 1)) + goto loop; + if (vinvalbuf(vp, 0, cred, p, 0, 0)) + panic("ffs_reload: dirty2"); + /* + * Step 6: re-read inode data for all active vnodes. + */ + ip = VTOI(vp); + if (error = + bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp)) { + vput(vp); + return (error); + } + ip->i_din = *((struct dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)); + brelse(bp); + vput(vp); + if (vp->v_mount != mountp) + goto loop; + } + return (0); +} + +/* + * Common code for mount and mountroot + */ +int +ffs_mountfs(devvp, mp, p) + register struct vnode *devvp; + struct mount *mp; + struct proc *p; +{ + register struct ufsmount *ump; + struct buf *bp; + register struct fs *fs; + dev_t dev = devvp->v_rdev; + struct partinfo dpart; + caddr_t base, space; + int havepart = 0, blks; + int error, i, size; + int ronly; + extern struct vnode *rootvp; + + /* + * Disallow multiple mounts of the same device. + * Disallow mounting of a device that is currently in use + * (except for root, which might share swap device for miniroot). + * Flush out any old buffers remaining from a previous use. + */ + if (error = vfs_mountedon(devvp)) + return (error); + if (vcount(devvp) > 1 && devvp != rootvp) + return (EBUSY); + if (error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + if (error = VOP_OPEN(devvp, ronly ? 
FREAD : FREAD|FWRITE, FSCRED, p)) + return (error); + if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) + size = DEV_BSIZE; + else { + havepart = 1; + size = dpart.disklab->d_secsize; + } + + bp = NULL; + ump = NULL; + if (error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) + goto out; + fs = (struct fs *)bp->b_data; + if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) { + error = EINVAL; /* XXX needs translation */ + goto out; + } + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); + bzero((caddr_t)ump, sizeof *ump); + ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, + M_WAITOK); + bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); + if (fs->fs_sbsize < SBSIZE) + bp->b_flags |= B_INVAL; + brelse(bp); + bp = NULL; + fs = ump->um_fs; + fs->fs_ronly = ronly; + if (ronly == 0) + fs->fs_fmod = 1; + blks = howmany(fs->fs_cssize, fs->fs_fsize); + base = space = malloc((u_long)fs->fs_cssize, M_UFSMNT, + M_WAITOK); + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, + NOCRED, &bp); + if (error) { + free(base, M_UFSMNT); + goto out; + } + bcopy(bp->b_data, space, (u_int)size); + fs->fs_csp[fragstoblks(fs, i)] = (struct csum *)space; + space += size; + brelse(bp); + bp = NULL; + } + mp->mnt_data = (qaddr_t)ump; + mp->mnt_stat.f_fsid.val[0] = (long)dev; + mp->mnt_stat.f_fsid.val[1] = MOUNT_UFS; + mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; + mp->mnt_flag |= MNT_LOCAL; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_nindir = fs->fs_nindir; + ump->um_bptrtodb = fs->fs_fsbtodb; + ump->um_seqinc = fs->fs_frag; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; + devvp->v_specflags |= SI_MOUNTEDON; + ffs_oldfscompat(fs); + return (0); +out: + if (bp) + brelse(bp); + (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); + if (ump) { + free(ump->um_fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = (qaddr_t)0; + } + return (error); +} + +/* + * Sanity checks for old file systems. + * + * XXX - goes away some day. + */ +ffs_oldfscompat(fs) + struct fs *fs; +{ + int i; + + fs->fs_npsect = max(fs->fs_npsect, fs->fs_nsect); /* XXX */ + fs->fs_interleave = max(fs->fs_interleave, 1); /* XXX */ + if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ + fs->fs_nrpos = 8; /* XXX */ + if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ + quad_t sizepb = fs->fs_bsize; /* XXX */ + /* XXX */ + fs->fs_maxfilesize = fs->fs_bsize * NDADDR - 1; /* XXX */ + for (i = 0; i < NIADDR; i++) { /* XXX */ + sizepb *= NINDIR(fs); /* XXX */ + fs->fs_maxfilesize += sizepb; /* XXX */ + } /* XXX */ + fs->fs_qbmask = ~fs->fs_bmask; /* XXX */ + fs->fs_qfmask = ~fs->fs_fmask; /* XXX */ + } /* XXX */ + return (0); +} + +/* + * unmount system call + */ +int +ffs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + register struct ufsmount *ump; + register struct fs *fs; + int error, flags, ronly; + + flags = 0; + if (mntflags & MNT_FORCE) { + if (mp->mnt_flag & MNT_ROOTFS) + return (EINVAL); + flags |= FORCECLOSE; + } + if (error = ffs_flushfiles(mp, flags, p)) + return (error); + ump = VFSTOUFS(mp); + fs = ump->um_fs; + ronly = !fs->fs_ronly; + ump->um_devvp->v_specflags &= ~SI_MOUNTEDON; + error = VOP_CLOSE(ump->um_devvp, ronly ? 
FREAD : FREAD|FWRITE, + NOCRED, p); + vrele(ump->um_devvp); + free(fs->fs_csp[0], M_UFSMNT); + free(fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = (qaddr_t)0; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Flush out all the files in a filesystem. + */ +ffs_flushfiles(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + extern int doforce; + register struct ufsmount *ump; + int i, error; + + if (!doforce) + flags &= ~FORCECLOSE; + ump = VFSTOUFS(mp); +#ifdef QUOTA + if (mp->mnt_flag & MNT_QUOTA) { + if (error = vflush(mp, NULLVP, SKIPSYSTEM|flags)) + return (error); + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] == NULLVP) + continue; + quotaoff(p, mp, i); + } + /* + * Here we fall through to vflush again to ensure + * that we have gotten rid of all the system vnodes. + */ + } +#endif + error = vflush(mp, NULLVP, flags); + return (error); +} + +/* + * Get file system statistics. + */ +int +ffs_statfs(mp, sbp, p) + struct mount *mp; + register struct statfs *sbp; + struct proc *p; +{ + register struct ufsmount *ump; + register struct fs *fs; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_magic != FS_MAGIC) + panic("ffs_statfs"); + sbp->f_type = MOUNT_UFS; + sbp->f_bsize = fs->fs_fsize; + sbp->f_iosize = fs->fs_bsize; + sbp->f_blocks = fs->fs_dsize; + sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + + fs->fs_cstotal.cs_nffree; + sbp->f_bavail = (fs->fs_dsize * (100 - fs->fs_minfree) / 100) - + (fs->fs_dsize - sbp->f_bfree); + sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; + sbp->f_ffree = fs->fs_cstotal.cs_nifree; + if (sbp != &mp->mnt_stat) { + bcopy((caddr_t)mp->mnt_stat.f_mntonname, + (caddr_t)&sbp->f_mntonname[0], MNAMELEN); + bcopy((caddr_t)mp->mnt_stat.f_mntfromname, + (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); + } + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. + */ +int +ffs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + register struct vnode *vp; + register struct inode *ip; + register struct ufsmount *ump = VFSTOUFS(mp); + register struct fs *fs; + int error, allerror = 0; + + fs = ump->um_fs; + /* + * Write back modified superblock. + * Consistency check that the superblock + * is still in the buffer cache. + */ + if (fs->fs_fmod != 0) { + if (fs->fs_ronly != 0) { /* XXX */ + printf("fs = %s\n", fs->fs_fsmnt); + panic("update: rofs mod"); + } + fs->fs_fmod = 0; + fs->fs_time = time.tv_sec; + allerror = ffs_sbupdate(ump, waitfor); + } + /* + * Write back each (modified) inode. + */ +loop: + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + if (VOP_ISLOCKED(vp)) + continue; + ip = VTOI(vp); + if ((ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && + vp->v_dirtyblkhd.lh_first == NULL) + continue; + if (vget(vp, 1)) + goto loop; + if (error = VOP_FSYNC(vp, cred, waitfor, p)) + allerror = error; + vput(vp); + } + /* + * Force stale file system control information to be flushed. 
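+ * (um_devvp carries the buffers for inode and cylinder group blocks, so
+ * fsyncing it writes back metadata that is not attached to any file vnode.)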
+ */ + if (error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) + allerror = error; +#ifdef QUOTA + qsync(mp); +#endif + return (allerror); +} + +/* + * Look up a FFS dinode number to find its incore vnode, otherwise read it + * in from disk. If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount points must be + * done by the calling routine. + */ +int +ffs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + register struct fs *fs; + register struct inode *ip; + struct ufsmount *ump; + struct buf *bp; + struct vnode *vp; + dev_t dev; + int i, type, error; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + if ((*vpp = ufs_ihashget(dev, ino)) != NULL) + return (0); + + /* Allocate a new vnode/inode. */ + if (error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp)) { + *vpp = NULL; + return (error); + } + type = ump->um_devvp->v_tag == VT_MFS ? M_MFSNODE : M_FFSNODE; /* XXX */ + MALLOC(ip, struct inode *, sizeof(struct inode), type, M_WAITOK); + bzero((caddr_t)ip, sizeof(struct inode)); + vp->v_data = ip; + ip->i_vnode = vp; + ip->i_fs = fs = ump->um_fs; + ip->i_dev = dev; + ip->i_number = ino; +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) + ip->i_dquot[i] = NODQUOT; +#endif + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ufs_ihashins(ip); + + /* Read in the disk contents for the inode, copy into the inode. */ + if (error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->fs_bsize, NOCRED, &bp)) { + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + vput(vp); + brelse(bp); + *vpp = NULL; + return (error); + } + ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino)); + brelse(bp); + + /* + * Initialize the vnode from the inode, check for aliases. + * Note that the underlying vnode may have changed. + */ + if (error = ufs_vinit(mp, ffs_specop_p, FFS_FIFOOPS, &vp)) { + vput(vp); + *vpp = NULL; + return (error); + } + /* + * Finish inode initialization now that aliasing has been resolved. + */ + ip->i_devvp = ump->um_devvp; + VREF(ip->i_devvp); + /* + * Set up a generation number for this inode if it does not + * already have one. This should only happen on old filesystems. + */ + if (ip->i_gen == 0) { + if (++nextgennumber < (u_long)time.tv_sec) + nextgennumber = time.tv_sec; + ip->i_gen = nextgennumber; + if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ip->i_flag |= IN_MODIFIED; + } + /* + * Ensure that uid and gid are correct. This is a temporary + * fix until fsck has been changed to do the update. + */ + if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ + ip->i_uid = ip->i_din.di_ouid; /* XXX */ + ip->i_gid = ip->i_din.di_ogid; /* XXX */ + } /* XXX */ + + *vpp = vp; + return (0); +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is valid + * - call ffs_vget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + * - check that the given client host has export rights and return + * those rights via. 
exflagsp and credanonp + */ +int +ffs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) + register struct mount *mp; + struct fid *fhp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + register struct ufid *ufhp; + struct fs *fs; + + ufhp = (struct ufid *)fhp; + fs = VFSTOUFS(mp)->um_fs; + if (ufhp->ufid_ino < ROOTINO || + ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) + return (ESTALE); + return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +ffs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + register struct inode *ip; + register struct ufid *ufhp; + + ip = VTOI(vp); + ufhp = (struct ufid *)fhp; + ufhp->ufid_len = sizeof(struct ufid); + ufhp->ufid_ino = ip->i_number; + ufhp->ufid_gen = ip->i_gen; + return (0); +} + +/* + * Write a superblock and associated information back to disk. + */ +int +ffs_sbupdate(mp, waitfor) + struct ufsmount *mp; + int waitfor; +{ + register struct fs *fs = mp->um_fs; + register struct buf *bp; + int blks; + caddr_t space; + int i, size, error = 0; + + bp = getblk(mp->um_devvp, SBLOCK, (int)fs->fs_sbsize, 0, 0); + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + /* Restore compatibility to old file systems. XXX */ + if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ + ((struct fs *)bp->b_data)->fs_nrpos = -1; /* XXX */ + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = (caddr_t)fs->fs_csp[0]; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), + size, 0, 0); + bcopy(space, bp->b_data, (u_int)size); + space += size; + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + } + return (error); +} diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c new file mode 100644 index 0000000..59814f2 --- /dev/null +++ b/sys/ufs/ffs/ffs_vnops.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vnops.c 8.7 (Berkeley) 2/3/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/resourcevar.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/conf.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/malloc.h> + +#include <vm/vm.h> + +#include <miscfs/specfs/specdev.h> +#include <miscfs/fifofs/fifo.h> + +#include <ufs/ufs/lockf.h> +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/dir.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/ffs/fs.h> +#include <ufs/ffs/ffs_extern.h> + +/* Global vfs data structures for ufs. */ +int (**ffs_vnodeop_p)(); +struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ufs_lookup }, /* lookup */ + { &vop_create_desc, ufs_create }, /* create */ + { &vop_mknod_desc, ufs_mknod }, /* mknod */ + { &vop_open_desc, ufs_open }, /* open */ + { &vop_close_desc, ufs_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ffs_read }, /* read */ + { &vop_write_desc, ffs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_select_desc, ufs_select }, /* select */ + { &vop_mmap_desc, ufs_mmap }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, ufs_remove }, /* remove */ + { &vop_link_desc, ufs_link }, /* link */ + { &vop_rename_desc, ufs_rename }, /* rename */ + { &vop_mkdir_desc, ufs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, ufs_rmdir }, /* rmdir */ + { &vop_symlink_desc, ufs_symlink }, /* symlink */ + { &vop_readdir_desc, ufs_readdir }, /* readdir */ + { &vop_readlink_desc, ufs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ufs_bmap }, /* bmap */ + { &vop_strategy_desc, ufs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ufs_advlock }, /* advlock */ + { &vop_blkatoff_desc, ffs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, ffs_valloc }, /* valloc */ + { &vop_reallocblks_desc, ffs_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, ffs_vfree }, /* vfree */ + { &vop_truncate_desc, ffs_truncate }, /* truncate */ + { &vop_update_desc, ffs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc ffs_vnodeop_opv_desc = + { &ffs_vnodeop_p, ffs_vnodeop_entries }; + +int (**ffs_specop_p)(); +struct vnodeopv_entry_desc 
ffs_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, ufsspec_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_reallocblks_desc, spec_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, ffs_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, ffs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc ffs_specop_opv_desc = + { &ffs_specop_p, ffs_specop_entries }; + +#ifdef FIFO +int (**ffs_fifoop_p)(); +struct vnodeopv_entry_desc ffs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, fifo_create }, /* create */ + { &vop_mknod_desc, fifo_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, ufsfifo_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek */ + { &vop_remove_desc, fifo_remove }, /* remove */ + { &vop_link_desc, fifo_link }, /* link */ + { &vop_rename_desc, fifo_rename }, /* rename */ + { &vop_mkdir_desc, fifo_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fifo_rmdir }, /* rmdir */ + { &vop_symlink_desc, fifo_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop 
*/ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_reallocblks_desc, fifo_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, ffs_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, ffs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc ffs_fifoop_opv_desc = + { &ffs_fifoop_p, ffs_fifoop_entries }; +#endif /* FIFO */ + +#ifdef DEBUG +/* + * Enabling cluster read/write operations. + */ +#include <sys/sysctl.h> +int doclusterread = 1; +struct ctldebug debug11 = { "doclusterread", &doclusterread }; +int doclusterwrite = 1; +struct ctldebug debug12 = { "doclusterwrite", &doclusterwrite }; +#else +/* XXX for ufs_readwrite */ +#define doclusterread 1 +#define doclusterwrite 1 +#endif + +#include <ufs/ufs/ufs_readwrite.c> + +/* + * Synch an open file. + */ +/* ARGSUSED */ +int +ffs_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct buf *bp; + struct timeval tv; + struct buf *nbp; + int s; + + /* + * Flush all dirty buffers associated with a vnode. + */ +loop: + s = splbio(); + for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->b_vnbufs.le_next; + if ((bp->b_flags & B_BUSY)) + continue; + if ((bp->b_flags & B_DELWRI) == 0) + panic("ffs_fsync: not dirty"); + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); + /* + * Wait for I/O associated with indirect blocks to complete, + * since there is no way to quickly wait for them below. + */ + if (bp->b_vp == vp || ap->a_waitfor == MNT_NOWAIT) + (void) bawrite(bp); + else + (void) bwrite(bp); + goto loop; + } + if (ap->a_waitfor == MNT_WAIT) { + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); + } +#ifdef DIAGNOSTIC + if (vp->v_dirtyblkhd.lh_first) { + vprint("ffs_fsync: dirty", vp); + goto loop; + } +#endif + } + splx(s); + tv = time; + return (VOP_UPDATE(ap->a_vp, &tv, &tv, ap->a_waitfor == MNT_WAIT)); +} diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h new file mode 100644 index 0000000..bef052f --- /dev/null +++ b/sys/ufs/ffs/fs.h @@ -0,0 +1,489 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)fs.h 8.7 (Berkeley) 4/19/94
+ */
+
+/*
+ * Each disk drive contains some number of file systems.
+ * A file system consists of a number of cylinder groups.
+ * Each cylinder group has inodes and data.
+ *
+ * A file system is described by its super-block, which in turn
+ * describes the cylinder groups. The super-block is critical
+ * data and is replicated in each cylinder group to protect against
+ * catastrophic loss. This is done at `newfs' time and the critical
+ * super-block data does not change, so the copies need not be
+ * referenced further unless disaster strikes.
+ *
+ * For file system fs, the offsets of the various blocks of interest
+ * are given in the super block as:
+ * [fs->fs_sblkno] Super-block
+ * [fs->fs_cblkno] Cylinder group block
+ * [fs->fs_iblkno] Inode blocks
+ * [fs->fs_dblkno] Data blocks
+ * The beginning of cylinder group cg in fs is given by
+ * the ``cgbase(fs, cg)'' macro.
+ *
+ * The first boot and super blocks are given in absolute disk addresses.
+ * The byte-offset forms are preferred, as they don't imply a sector size.
+ */
+#define BBSIZE 8192
+#define SBSIZE 8192
+#define BBOFF ((off_t)(0))
+#define SBOFF ((off_t)(BBOFF + BBSIZE))
+#define BBLOCK ((daddr_t)(0))
+#define SBLOCK ((daddr_t)(BBLOCK + BBSIZE / DEV_BSIZE))
+
+/*
+ * Addresses stored in inodes are capable of addressing fragments
+ * of `blocks'. File system blocks of at most size MAXBSIZE can
+ * be optionally broken into 2, 4, or 8 pieces, each of which is
+ * addressable; these pieces may be DEV_BSIZE, or some multiple of
+ * a DEV_BSIZE unit.
+ *
+ * Large files consist exclusively of large data blocks. To avoid
+ * undue wasted disk space, the last data block of a small file may be
+ * allocated only as many fragments of a large block as are
+ * necessary. The file system format retains only a single pointer
+ * to such a fragment, which is a piece of a single large block that
+ * has been divided. The size of such a fragment is determinable from
+ * information in the inode, using the ``blksize(fs, ip, lbn)'' macro.
+ *
+ * The file system records space availability at the fragment level;
+ * to determine block availability, aligned fragments are examined.
+ */
+
+/*
+ * MINBSIZE is the smallest allowable block size.
+ * In order to insure that it is possible to create files of size
+ * 2^32 with only two levels of indirection, MINBSIZE is set to 4096.
+ * MINBSIZE must be big enough to hold a cylinder group block,
+ * thus changes to (struct cg) must keep its size within MINBSIZE.
+ * Note that super blocks are always of size SBSIZE,
+ * and that both SBSIZE and MAXBSIZE must be >= MINBSIZE.
+ */
+#define MINBSIZE 4096
+
+/*
+ * The path name on which the file system is mounted is maintained
+ * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in
+ * the super block for this name.
+ * The limit on the amount of summary information per file system
+ * is defined by MAXCSBUFS. It is currently parameterized for a
+ * maximum of two million cylinders.
+ */
+#define MAXMNTLEN 512
+#define MAXCSBUFS 32
+
+/*
+ * A summary of contiguous blocks of various sizes is maintained
+ * in each cylinder group. Normally this is set by the initial
+ * value of fs_maxcontig. To conserve space, a maximum summary size
+ * is set by FS_MAXCONTIG.
+ */
+#define FS_MAXCONTIG 16
+
+/*
+ * MINFREE gives the minimum acceptable percentage of file system
+ * blocks which may be free. If the freelist drops below this level
+ * only the superuser may continue to allocate blocks. This may
+ * be set to 0 if no reserve of free blocks is deemed necessary;
+ * however, throughput drops by fifty percent if the file system
+ * is run at between 95% and 100% full; thus the minimum default
+ * value of fs_minfree is 5%. However, to get good clustering
+ * performance, 10% is a better choice. Hence we use 10% as our
+ * default value. With 10% free space, fragmentation is not a
+ * problem, so we choose to optimize for time.
+ */
+#define MINFREE 5
+#define DEFAULTOPT FS_OPTTIME
+
+/*
+ * Per cylinder group information; summarized in blocks allocated
+ * from first cylinder group data blocks. These blocks have to be
+ * read in from fs_csaddr (size fs_cssize) in addition to the
+ * super block.
+ *
+ * N.B. sizeof(struct csum) must be a power of two in order for
+ * the ``fs_cs'' macro to work (see below).
+ */
+struct csum {
+ long cs_ndir; /* number of directories */
+ long cs_nbfree; /* number of free blocks */
+ long cs_nifree; /* number of free inodes */
+ long cs_nffree; /* number of free frags */
+};
+
+/*
+ * Super block for a file system.
+ */ +struct fs { + struct fs *fs_link; /* linked list of file systems */ + struct fs *fs_rlink; /* used for incore super blocks */ + daddr_t fs_sblkno; /* addr of super-block in filesys */ + daddr_t fs_cblkno; /* offset of cyl-block in filesys */ + daddr_t fs_iblkno; /* offset of inode-blocks in filesys */ + daddr_t fs_dblkno; /* offset of first data after cg */ + long fs_cgoffset; /* cylinder group offset in cylinder */ + long fs_cgmask; /* used to calc mod fs_ntrak */ + time_t fs_time; /* last time written */ + long fs_size; /* number of blocks in fs */ + long fs_dsize; /* number of data blocks in fs */ + long fs_ncg; /* number of cylinder groups */ + long fs_bsize; /* size of basic blocks in fs */ + long fs_fsize; /* size of frag blocks in fs */ + long fs_frag; /* number of frags in a block in fs */ +/* these are configuration parameters */ + long fs_minfree; /* minimum percentage of free blocks */ + long fs_rotdelay; /* num of ms for optimal next block */ + long fs_rps; /* disk revolutions per second */ +/* these fields can be computed from the others */ + long fs_bmask; /* ``blkoff'' calc of blk offsets */ + long fs_fmask; /* ``fragoff'' calc of frag offsets */ + long fs_bshift; /* ``lblkno'' calc of logical blkno */ + long fs_fshift; /* ``numfrags'' calc number of frags */ +/* these are configuration parameters */ + long fs_maxcontig; /* max number of contiguous blks */ + long fs_maxbpg; /* max number of blks per cyl group */ +/* these fields can be computed from the others */ + long fs_fragshift; /* block to frag shift */ + long fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + long fs_sbsize; /* actual size of super block */ + long fs_csmask; /* csum block offset */ + long fs_csshift; /* csum block number */ + long fs_nindir; /* value of NINDIR */ + long fs_inopb; /* value of INOPB */ + long fs_nspf; /* value of NSPF */ +/* yet another configuration parameter */ + long fs_optim; /* optimization preference, see below */ +/* these fields are derived from the hardware */ + long fs_npsect; /* # sectors/track including spares */ + long fs_interleave; /* hardware sector interleave */ + long fs_trackskew; /* sector 0 skew, per track */ + long fs_headswitch; /* head switch time, usec */ + long fs_trkseek; /* track-to-track seek, usec */ +/* sizes determined by number of cylinder groups and their sizes */ + daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ + long fs_cssize; /* size of cyl grp summary area */ + long fs_cgsize; /* cylinder group size */ +/* these fields are derived from the hardware */ + long fs_ntrak; /* tracks per cylinder */ + long fs_nsect; /* sectors per track */ + long fs_spc; /* sectors per cylinder */ +/* this comes from the disk driver partitioning */ + long fs_ncyl; /* cylinders in file system */ +/* these fields can be computed from the others */ + long fs_cpg; /* cylinders per group */ + long fs_ipg; /* inodes per group */ + long fs_fpg; /* blocks per group * fs_frag */ +/* this data must be re-computed after crashes */ + struct csum fs_cstotal; /* cylinder summary information */ +/* these fields are cleared at mount time */ + char fs_fmod; /* super block modified flag */ + char fs_clean; /* file system is clean flag */ + char fs_ronly; /* mounted read-only flag */ + char fs_flags; /* currently unused flag */ + char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ +/* these fields retain the current block allocation info */ + long fs_cgrotor; /* last cg searched */ + struct csum *fs_csp[MAXCSBUFS];/* list of fs_cs info buffers */ + long fs_cpc; /* cyl per 
cycle in postbl */
+ short fs_opostbl[16][8]; /* old rotation block list head */
+ long fs_sparecon[50]; /* reserved for future constants */
+ long fs_contigsumsize; /* size of cluster summary array */
+ long fs_maxsymlinklen; /* max length of an internal symlink */
+ long fs_inodefmt; /* format of on-disk inodes */
+ u_quad_t fs_maxfilesize; /* maximum representable file size */
+ quad_t fs_qbmask; /* ~fs_bmask - for use with quad size */
+ quad_t fs_qfmask; /* ~fs_fmask - for use with quad size */
+ long fs_state; /* validate fs_clean field */
+ long fs_postblformat; /* format of positional layout tables */
+ long fs_nrpos; /* number of rotational positions */
+ long fs_postbloff; /* (short) rotation block list head */
+ long fs_rotbloff; /* (u_char) blocks for each rotation */
+ long fs_magic; /* magic number */
+ u_char fs_space[1]; /* list of blocks for each rotation */
+/* actually longer */
+};
+/*
+ * Filesystem identification
+ */
+#define FS_MAGIC 0x011954 /* the fast filesystem magic number */
+#define FS_OKAY 0x7c269d38 /* superblock checksum */
+#define FS_42INODEFMT -1 /* 4.2BSD inode format */
+#define FS_44INODEFMT 2 /* 4.4BSD inode format */
+/*
+ * Preference for optimization.
+ */
+#define FS_OPTTIME 0 /* minimize allocation time */
+#define FS_OPTSPACE 1 /* minimize disk fragmentation */
+
+/*
+ * Rotational layout table format types
+ */
+#define FS_42POSTBLFMT -1 /* 4.2BSD rotational table format */
+#define FS_DYNAMICPOSTBLFMT 1 /* dynamic rotational table format */
+/*
+ * Macros for access to superblock array structures
+ */
+#define fs_postbl(fs, cylno) \
+ (((fs)->fs_postblformat == FS_42POSTBLFMT) \
+ ? ((fs)->fs_opostbl[cylno]) \
+ : ((short *)((char *)(fs) + (fs)->fs_postbloff) + (cylno) * (fs)->fs_nrpos))
+#define fs_rotbl(fs) \
+ (((fs)->fs_postblformat == FS_42POSTBLFMT) \
+ ? ((fs)->fs_space) \
+ : ((u_char *)((char *)(fs) + (fs)->fs_rotbloff)))
+
+/*
+ * The size of a cylinder group is calculated by CGSIZE. The maximum size
+ * is limited by the fact that cylinder groups are at most one block.
+ * Its size is derived from the size of the maps maintained in the
+ * cylinder group and the (struct cg) size.
+ */
+#define CGSIZE(fs) \
+ /* base cg */ (sizeof(struct cg) + sizeof(long) + \
+ /* blktot size */ (fs)->fs_cpg * sizeof(long) + \
+ /* blks size */ (fs)->fs_cpg * (fs)->fs_nrpos * sizeof(short) + \
+ /* inode map */ howmany((fs)->fs_ipg, NBBY) + \
+ /* block map */ howmany((fs)->fs_cpg * (fs)->fs_spc / NSPF(fs), NBBY) +\
+ /* if present */ ((fs)->fs_contigsumsize <= 0 ? 0 : \
+ /* cluster sum */ (fs)->fs_contigsumsize * sizeof(long) + \
+ /* cluster map */ howmany((fs)->fs_cpg * (fs)->fs_spc / NSPB(fs), NBBY)))
+
+/*
+ * Convert cylinder group to base address of its global summary info.
+ *
+ * N.B. This macro assumes that sizeof(struct csum) is a power of two.
+ */
+#define fs_cs(fs, indx) \
+ fs_csp[(indx) >> (fs)->fs_csshift][(indx) & ~(fs)->fs_csmask]
+
+/*
+ * Cylinder group block for a file system.
+ */ +#define CG_MAGIC 0x090255 +struct cg { + struct cg *cg_link; /* linked list of cyl groups */ + long cg_magic; /* magic number */ + time_t cg_time; /* time last written */ + long cg_cgx; /* we are the cgx'th cylinder group */ + short cg_ncyl; /* number of cyl's this cg */ + short cg_niblk; /* number of inode blocks this cg */ + long cg_ndblk; /* number of data blocks this cg */ + struct csum cg_cs; /* cylinder summary information */ + long cg_rotor; /* position of last used block */ + long cg_frotor; /* position of last used frag */ + long cg_irotor; /* position of last used inode */ + long cg_frsum[MAXFRAG]; /* counts of available frags */ + long cg_btotoff; /* (long) block totals per cylinder */ + long cg_boff; /* (short) free block positions */ + long cg_iusedoff; /* (char) used inode map */ + long cg_freeoff; /* (u_char) free block map */ + long cg_nextfreeoff; /* (u_char) next available space */ + long cg_clustersumoff; /* (long) counts of avail clusters */ + long cg_clusteroff; /* (char) free cluster map */ + long cg_nclusterblks; /* number of clusters this cg */ + long cg_sparecon[13]; /* reserved for future use */ + u_char cg_space[1]; /* space for cylinder group maps */ +/* actually longer */ +}; +/* + * Macros for access to cylinder group array structures + */ +#define cg_blktot(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_btot) \ + : ((long *)((char *)(cgp) + (cgp)->cg_btotoff))) +#define cg_blks(fs, cgp, cylno) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_b[cylno]) \ + : ((short *)((char *)(cgp) + (cgp)->cg_boff) + (cylno) * (fs)->fs_nrpos)) +#define cg_inosused(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_iused) \ + : ((char *)((char *)(cgp) + (cgp)->cg_iusedoff))) +#define cg_blksfree(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_free) \ + : ((u_char *)((char *)(cgp) + (cgp)->cg_freeoff))) +#define cg_chkmagic(cgp) \ + ((cgp)->cg_magic == CG_MAGIC || ((struct ocg *)(cgp))->cg_magic == CG_MAGIC) +#define cg_clustersfree(cgp) \ + ((u_char *)((char *)(cgp) + (cgp)->cg_clusteroff)) +#define cg_clustersum(cgp) \ + ((long *)((char *)(cgp) + (cgp)->cg_clustersumoff)) + +/* + * The following structure is defined + * for compatibility with old file systems. + */ +struct ocg { + struct ocg *cg_link; /* linked list of cyl groups */ + struct ocg *cg_rlink; /* used for incore cyl groups */ + time_t cg_time; /* time last written */ + long cg_cgx; /* we are the cgx'th cylinder group */ + short cg_ncyl; /* number of cyl's this cg */ + short cg_niblk; /* number of inode blocks this cg */ + long cg_ndblk; /* number of data blocks this cg */ + struct csum cg_cs; /* cylinder summary information */ + long cg_rotor; /* position of last used block */ + long cg_frotor; /* position of last used frag */ + long cg_irotor; /* position of last used inode */ + long cg_frsum[8]; /* counts of available frags */ + long cg_btot[32]; /* block totals per cylinder */ + short cg_b[32][8]; /* positions of free blocks */ + char cg_iused[256]; /* used inode map */ + long cg_magic; /* magic number */ + u_char cg_free[1]; /* free block map */ +/* actually longer */ +}; + +/* + * Turn file system block numbers into disk block addresses. + * This maps file system blocks to device size blocks. + */ +#define fsbtodb(fs, b) ((b) << (fs)->fs_fsbtodb) +#define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb) + +/* + * Cylinder group macros to locate things in cylinder groups. 
+ * They calc file system addresses of cylinder group data structures. + */ +#define cgbase(fs, c) ((daddr_t)((fs)->fs_fpg * (c))) +#define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */ +#define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk */ +#define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk */ +#define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */ +#define cgstart(fs, c) \ + (cgbase(fs, c) + (fs)->fs_cgoffset * ((c) & ~((fs)->fs_cgmask))) + +/* + * Macros for handling inode numbers: + * inode number to file system block offset. + * inode number to cylinder group number. + * inode number to file system block address. + */ +#define ino_to_cg(fs, x) ((x) / (fs)->fs_ipg) +#define ino_to_fsba(fs, x) \ + ((daddr_t)(cgimin(fs, ino_to_cg(fs, x)) + \ + (blkstofrags((fs), (((x) % (fs)->fs_ipg) / INOPB(fs)))))) +#define ino_to_fsbo(fs, x) ((x) % INOPB(fs)) + +/* + * Give cylinder group number for a file system block. + * Give cylinder group block number for a file system block. + */ +#define dtog(fs, d) ((d) / (fs)->fs_fpg) +#define dtogd(fs, d) ((d) % (fs)->fs_fpg) + +/* + * Extract the bits for a block from a map. + * Compute the cylinder and rotational position of a cyl block addr. + */ +#define blkmap(fs, map, loc) \ + (((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag))) +#define cbtocylno(fs, bno) \ + ((bno) * NSPF(fs) / (fs)->fs_spc) +#define cbtorpos(fs, bno) \ + (((bno) * NSPF(fs) % (fs)->fs_spc / (fs)->fs_nsect * (fs)->fs_trackskew + \ + (bno) * NSPF(fs) % (fs)->fs_spc % (fs)->fs_nsect * (fs)->fs_interleave) % \ + (fs)->fs_nsect * (fs)->fs_nrpos / (fs)->fs_npsect) + +/* + * The following macros optimize certain frequently calculated + * quantities by using shifts and masks in place of divisions + * modulos and multiplications. + */ +#define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \ + ((loc) & (fs)->fs_qbmask) +#define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \ + ((loc) & (fs)->fs_qfmask) +#define lblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \ + ((blk) << (fs)->fs_bshift) +#define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \ + ((loc) >> (fs)->fs_bshift) +#define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ + ((loc) >> (fs)->fs_fshift) +#define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \ + (((size) + (fs)->fs_qbmask) & (fs)->fs_bmask) +#define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \ + (((size) + (fs)->fs_qfmask) & (fs)->fs_fmask) +#define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \ + ((frags) >> (fs)->fs_fragshift) +#define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \ + ((blks) << (fs)->fs_fragshift) +#define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \ + ((fsb) & ((fs)->fs_frag - 1)) +#define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \ + ((fsb) &~ ((fs)->fs_frag - 1)) + +/* + * Determine the number of available frags given a + * percentage to hold in reserve + */ +#define freespace(fs, percentreserved) \ + (blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \ + (fs)->fs_cstotal.cs_nffree - ((fs)->fs_dsize * (percentreserved) / 100)) + +/* + * Determining the size of a file block in the file system. + */ +#define blksize(fs, ip, lbn) \ + (((lbn) >= NDADDR || (ip)->i_size >= ((lbn) + 1) << (fs)->fs_bshift) \ + ? 
(fs)->fs_bsize \ + : (fragroundup(fs, blkoff(fs, (ip)->i_size)))) +#define dblksize(fs, dip, lbn) \ + (((lbn) >= NDADDR || (dip)->di_size >= ((lbn) + 1) << (fs)->fs_bshift) \ + ? (fs)->fs_bsize \ + : (fragroundup(fs, blkoff(fs, (dip)->di_size)))) + +/* + * Number of disk sectors per block; assumes DEV_BSIZE byte sector size. + */ +#define NSPB(fs) ((fs)->fs_nspf << (fs)->fs_fragshift) +#define NSPF(fs) ((fs)->fs_nspf) + +/* + * INOPB is the number of inodes in a secondary storage block. + */ +#define INOPB(fs) ((fs)->fs_inopb) +#define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift) + +/* + * NINDIR is the number of indirects in a file system block. + */ +#define NINDIR(fs) ((fs)->fs_nindir) + +extern int inside[], around[]; +extern u_char *fragtbl[]; diff --git a/sys/ufs/lfs/README b/sys/ufs/lfs/README new file mode 100644 index 0000000..724b18f --- /dev/null +++ b/sys/ufs/lfs/README @@ -0,0 +1,139 @@ +# @(#)README 8.1 (Berkeley) 6/11/93 + +The file system is reasonably stable, but incomplete. There are +places where cleaning performance can be improved dramatically (see +comments in lfs_syscalls.c). For details on the implementation, +performance and why garbage collection always wins, see Dr. Margo +Seltzer's thesis available for anonymous ftp from toe.cs.berkeley.edu, +in the directory pub/personal/margo/thesis.ps.Z, or the January 1993 +USENIX paper. + +Missing Functionality: + Multiple block sizes and/or fragments are not yet implemented. + +---------- +The disk is laid out in segments. The first segment starts 8K into the +disk (the first 8K is used for boot information). Each segment is composed +of the following: + + An optional super block + One or more groups of: + segment summary + 0 or more data blocks + 0 or more inode blocks + +The segment summary and inode/data blocks start after the super block (if +present), and grow toward the end of the segment. + + _______________________________________________ + | | | | | + | summary | data/inode | summary | data/inode | + | block | blocks | block | blocks | ... + |_________|____________|_________|____________| + +The data/inode blocks following a summary block are described by the +summary block. In order to permit the segment to be written in any order +and in a forward direction only, a checksum is calculated across the +blocks described by the summary. Additionally, the summary is checksummed +and timestamped. Both of these are intended for recovery; the former is +to make it easy to determine that it *is* a summary block and the latter +is to make it easy to determine when recovery is finished for partially +written segments. These checksums are also used by the cleaner. + + Summary block (detail) + ________________ + | sum cksum | + | data cksum | + | next segment | + | timestamp | + | FINFO count | + | inode count | + | flags | + |______________| + | FINFO-1 | 0 or more file info structures, identifying the + | . | blocks in the segment. + | . | + | . | + | FINFO-N | + | inode-N | + | . | + | . | + | . | 0 or more inode daddr_t's, identifying the inode + | inode-1 | blocks in the segment. + |______________| + +Inode blocks are blocks of on-disk inodes in the same format as those in +the FFS. However, spare[0] contains the inode number of the inode so we +can find a particular inode on a page. They are packed page_size / +sizeof(inode) to a block. Data blocks are exactly as in the FFS. Both +inodes and data blocks move around the file system at will. 
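As a sketch of what that lookup looks like (illustrative only, not part of the
imported sources: the helper name is hypothetical, but di_inumber is the field
the allocation code in lfs_alloc.c fills in, and INOPB(fs) inodes are packed
per block as described above).  The scan runs from the back of the block so
that, if the same inode was written twice into one block, the most recently
written copy is found first:

	/* Assumes <sys/param.h>, <ufs/ufs/dinode.h> and <ufs/lfs/lfs.h> are in scope. */
	struct dinode *
	find_dinode(struct lfs *fs, ino_t ino, struct dinode *page)
	{
		struct dinode *dip;

		/* Search backwards; a later copy supersedes an earlier one. */
		for (dip = page + INOPB(fs) - 1; dip >= page; --dip)
			if (dip->di_inumber == ino)
				return (dip);
		return (NULL);		/* not in this inode block */
	}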
+ +The file system is described by a super-block which is replicated and +occurs as the first block of the first and other segments. (The maximum +number of super-blocks is MAXNUMSB). Each super-block maintains a list +of the disk addresses of all the super-blocks. The super-block maintains +a small amount of checkpoint information, essentially just enough to find +the inode for the IFILE (fs->lfs_idaddr). + +The IFILE is visible in the file system, as inode number IFILE_INUM. It +contains information shared between the kernel and various user processes. + + Ifile (detail) + ________________ + | cleaner info | Cleaner information per file system. (Page + | | granularity.) + |______________| + | segment | Space available and last modified times per + | usage table | segment. (Page granularity.) + |______________| + | IFILE-1 | Per inode status information: current version #, + | . | if currently allocated, last access time and + | . | current disk address of containing inode block. + | . | If current disk address is LFS_UNUSED_DADDR, the + | IFILE-N | inode is not in use, and it's on the free list. + |______________| + + +First Segment at Creation Time: +_____________________________________________________________ +| | | | | | | | +| 8K pad | Super | summary | inode | ifile | root | l + f | +| | block | | block | | dir | dir | +|________|_______|_________|_______|_______|_______|_______| + ^ + Segment starts here. + +Some differences from the Sprite LFS implementation. + +1. The LFS implementation placed the ifile metadata and the super block + at fixed locations. This implementation replicates the super block + and puts each at a fixed location. The checkpoint data is divided into + two parts -- just enough information to find the IFILE is stored in + two of the super blocks, although it is not toggled between them as in + the Sprite implementation. (This was deliberate, to avoid a single + point of failure.) The remaining checkpoint information is treated as + a regular file, which means that the cleaner info, the segment usage + table and the ifile meta-data are stored in normal log segments. + (Tastes great, less filling...) + +2. The segment layout is radically different in Sprite; this implementation + uses something a lot like network framing, where data/inode blocks are + written asynchronously, and a checksum is used to validate any set of + summary and data/inode blocks. Sprite writes summary blocks synchronously + after the data/inode blocks have been written and the existence of the + summary block validates the data/inode blocks. This permits us to write + everything contiguously, even partial segments and their summaries, whereas + Sprite is forced to seek (from the end of the data inode to the summary + which lives at the end of the segment). Additionally, writing the summary + synchronously should cost about 1/2 a rotation per summary. + +3. Sprite LFS distinguishes between different types of blocks in the segment. + Other than inode blocks and data blocks, we don't. + +4. Sprite LFS traverses the IFILE looking for free blocks. We maintain a + free list threaded through the IFILE entries. + +5. The cleaner runs in user space, as opposed to kernel space. It shares + information with the kernel by reading/writing the IFILE and through + cleaner specific system calls. 
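To make the user/kernel split in point 5 concrete, here is a minimal sketch
(illustrative only, not part of the imported sources: it assumes the cleaner
already knows a path by which the ifile can be opened, and that the
cleaner-information counters sit at the very start of the file, as in the
Ifile detail above) of a user-space cleaner peeking at the clean/dirty
segment counts:

	#include <sys/types.h>
	#include <fcntl.h>
	#include <unistd.h>

	struct cleaner_info {		/* mirrors the kernel's CLEANERINFO */
		u_long	clean;		/* number of clean segments */
		u_long	dirty;		/* number of dirty segments */
	};

	/* Fill *ci from the start of the ifile; 0 on success, -1 on error. */
	int
	read_cleaner_info(const char *ifile_path, struct cleaner_info *ci)
	{
		int fd, ret = -1;

		if ((fd = open(ifile_path, O_RDONLY)) < 0)
			return (-1);
		if (read(fd, ci, sizeof(*ci)) == (ssize_t)sizeof(*ci))
			ret = 0;
		(void)close(fd);
		return (ret);
	}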
+ diff --git a/sys/ufs/lfs/TODO b/sys/ufs/lfs/TODO new file mode 100644 index 0000000..ace8f5e --- /dev/null +++ b/sys/ufs/lfs/TODO @@ -0,0 +1,116 @@ +# @(#)TODO 8.1 (Berkeley) 6/11/93 + +NOTE: Changed the lookup on a page of inodes to search from the back +in case the same inode gets written twice on the same page. + +Make sure that if you are writing a file, but not all the blocks +make it into a single segment, that you do not write the inode in +that segment. + +Keith: + Why not delete the lfs_bmapv call, just mark everything dirty + that isn't deleted/truncated? Get some numbers about + what percentage of the stuff that the cleaner thinks + might be live is live. If it's high, get rid of lfs_bmapv. + + There is a nasty problem in that it may take *more* room to write + the data to clean a segment than is returned by the new segment + because of indirect blocks in segment 2 being dirtied by the data + being copied into the log from segment 1. The suggested solution + at this point is to detect it when we have no space left on the + filesystem, write the extra data into the last segment (leaving + no clean ones), make it a checkpoint and shut down the file system + for fixing by a utility reading the raw partition. Argument is + that this should never happen and is practically impossible to fix + since the cleaner would have to theoretically build a model of the + entire filesystem in memory to detect the condition occurring. + A file coalescing cleaner will help avoid the problem, and one + that reads/writes from the raw disk could fix it. + +DONE Currently, inodes are being flushed to disk synchronously upon + creation -- see ufs_makeinode. However, only the inode + is flushed, the directory "name" is written using VOP_BWRITE, + so it's not synchronous. Possible solutions: 1: get some + ordering in the writes so that inode/directory entries get + stuffed into the same segment. 2: do both synchronously + 3: add Mendel's information into the stream so we log + creation/deletion of inodes. 4: do some form of partial + segment when changing the inode (creation/deletion/rename). +DONE Fix i_block increment for indirect blocks. + If the file system is tar'd, extracted on top of another LFS, the + IFILE ain't worth diddly. Is the cleaner writing the IFILE? + If not, let's make it read-only. +DONE Delete unnecessary source from utils in main-line source tree. +DONE Make sure that we're counting meta blocks in the inode i_block count. + Overlap the version and nextfree fields in the IFILE +DONE Vinvalbuf (Kirk): + Why writing blocks that are no longer useful? + Are the semantics of close such that blocks have to be flushed? + How specify in the buf chain the blocks that don't need + to be written? (Different numbering of indirect blocks.) + +Margo: + Change so that only search one sector of inode block file for the + inode by using sector addresses in the ifile instead of + logical disk addresses. + Fix the use of the ifile version field to use the generation + number instead. +DONE Unmount; not doing a bgetvp (VHOLD) in lfs_newbuf call. +DONE Document in the README file where the checkpoint information is + on disk. + Variable block sizes (Margo/Keith). + Switch the byte accounting to sector accounting. +DONE Check lfs.h and make sure that the #defines/structures are all + actually needed. +DONE Add a check in lfs_segment.c so that if the segment is empty, + we don't write it. + Need to keep vnode v_numoutput up to date for pending writes? +DONE USENIX paper (Carl/Margo). 
+ + +Evelyn: + lfsck: If delete a file that's being executed, the version number + isn't updated, and lfsck has to figure this out; case is the same as if have an inode that no directory references, + so the file should be reattached into lost+found. + Recovery/fsck. + +Carl: + Investigate: clustering of reads (if blocks in the segment are ordered, + should read them all) and writes (McVoy paper). + Investigate: should the access time be part of the IFILE: + pro: theoretically, saves disk writes + con: cacheing inodes should obviate this advantage + the IFILE is already humongous + Cleaner. + Port to OSF/1 (Carl/Keith). + Currently there's no notion of write error checking. + + Failed data/inode writes should be rescheduled (kernel level + bad blocking). + + Failed superblock writes should cause selection of new + superblock for checkpointing. + +FUTURE FANTASIES: ============ + ++ unrm, versioning ++ transactions ++ extended cleaner policies (hot/cold data, data placement) + +============================== +Problem with the concept of multiple buffer headers referencing the segment: +Positives: + Don't lock down 1 segment per file system of physical memory. + Don't copy from buffers to segment memory. + Don't tie down the bus to transfer 1M. + Works on controllers supporting less than large transfers. + Disk can start writing immediately instead of waiting 1/2 rotation + and the full transfer. +Negatives: + Have to do segment write then segment summary write, since the latter + is what verifies that the segment is okay. (Is there another way + to do this?) +============================== + +The algorithm for selecting the disk addresses of the super-blocks +has to be available to the user program which checks the file system. + +(Currently in newfs, becomes a common subroutine.) diff --git a/sys/ufs/lfs/lfs.h b/sys/ufs/lfs/lfs.h new file mode 100644 index 0000000..87b8c22 --- /dev/null +++ b/sys/ufs/lfs/lfs.h @@ -0,0 +1,353 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs.h 8.3 (Berkeley) 9/23/93 + */ + +#define LFS_LABELPAD 8192 /* LFS label size */ +#define LFS_SBPAD 8192 /* LFS superblock size */ + +/* + * XXX + * This is a kluge and NEEDS to go away. + * + * Right now, ufs code handles most of the calls for directory operations + * such as create, mkdir, link, etc. As a result VOP_UPDATE is being + * called with waitfor set (since ffs does these things synchronously). + * Since LFS does not want to do these synchronously, we treat the last + * argument to lfs_update as a set of flags. If LFS_SYNC is set, then + * the update should be synchronous, if not, do it asynchronously. + * Unfortunately, this means that LFS won't work with NFS yet because + * NFS goes through paths that will make normal calls to ufs which will + * call lfs with a last argument of 1. + */ +#define LFS_SYNC 0x02 + +/* On-disk and in-memory checkpoint segment usage structure. */ +typedef struct segusage SEGUSE; +struct segusage { + u_long su_nbytes; /* number of live bytes */ + u_long su_lastmod; /* SEGUSE last modified timestamp */ + u_short su_nsums; /* number of summaries in segment */ + u_short su_ninos; /* number of inode blocks in seg */ +#define SEGUSE_ACTIVE 0x1 /* segment is currently being written */ +#define SEGUSE_DIRTY 0x2 /* segment has data in it */ +#define SEGUSE_SUPERBLOCK 0x4 /* segment contains a superblock */ + u_long su_flags; +}; + +#define SEGUPB(fs) (1 << (fs)->lfs_sushift) +#define SEGTABSIZE_SU(fs) \ + (((fs)->lfs_nseg + SEGUPB(fs) - 1) >> (fs)->lfs_sushift) + +/* On-disk file information. One per file with data blocks in the segment. */ +typedef struct finfo FINFO; +struct finfo { + u_long fi_nblocks; /* number of blocks */ + u_long fi_version; /* version number */ + u_long fi_ino; /* inode number */ + long fi_blocks[1]; /* array of logical block numbers */ +}; + +/* On-disk and in-memory super block. */ +struct lfs { +#define LFS_MAGIC 0x070162 + u_long lfs_magic; /* magic number */ +#define LFS_VERSION 1 + u_long lfs_version; /* version number */ + + u_long lfs_size; /* number of blocks in fs */ + u_long lfs_ssize; /* number of blocks per segment */ + u_long lfs_dsize; /* number of disk blocks in fs */ + u_long lfs_bsize; /* file system block size */ + u_long lfs_fsize; /* size of frag blocks in fs */ + u_long lfs_frag; /* number of frags in a block in fs */ + +/* Checkpoint region. 
*/ + ino_t lfs_free; /* start of the free list */ + u_long lfs_bfree; /* number of free disk blocks */ + u_long lfs_nfiles; /* number of allocated inodes */ + long lfs_avail; /* blocks available for writing */ + u_long lfs_uinodes; /* inodes in cache not yet on disk */ + daddr_t lfs_idaddr; /* inode file disk address */ + ino_t lfs_ifile; /* inode file inode number */ + daddr_t lfs_lastseg; /* address of last segment written */ + daddr_t lfs_nextseg; /* address of next segment to write */ + daddr_t lfs_curseg; /* current segment being written */ + daddr_t lfs_offset; /* offset in curseg for next partial */ + daddr_t lfs_lastpseg; /* address of last partial written */ + u_long lfs_tstamp; /* time stamp */ + +/* These are configuration parameters. */ + u_long lfs_minfree; /* minimum percentage of free blocks */ + +/* These fields can be computed from the others. */ + u_quad_t lfs_maxfilesize; /* maximum representable file size */ + u_long lfs_dbpseg; /* disk blocks per segment */ + u_long lfs_inopb; /* inodes per block */ + u_long lfs_ifpb; /* IFILE entries per block */ + u_long lfs_sepb; /* SEGUSE entries per block */ + u_long lfs_nindir; /* indirect pointers per block */ + u_long lfs_nseg; /* number of segments */ + u_long lfs_nspf; /* number of sectors per fragment */ + u_long lfs_cleansz; /* cleaner info size in blocks */ + u_long lfs_segtabsz; /* segment table size in blocks */ + + u_long lfs_segmask; /* calculate offset within a segment */ + u_long lfs_segshift; /* fast mult/div for segments */ + u_long lfs_bmask; /* calc block offset from file offset */ + u_long lfs_bshift; /* calc block number from file offset */ + u_long lfs_ffmask; /* calc frag offset from file offset */ + u_long lfs_ffshift; /* fast mult/div for frag from file */ + u_long lfs_fbmask; /* calc frag offset from block offset */ + u_long lfs_fbshift; /* fast mult/div for frag from block */ + u_long lfs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + u_long lfs_sushift; /* fast mult/div for segusage table */ + +#define LFS_MIN_SBINTERVAL 5 /* minimum superblock segment spacing */ +#define LFS_MAXNUMSB 10 /* superblock disk offsets */ + daddr_t lfs_sboffs[LFS_MAXNUMSB]; + +/* These fields are set at mount time and are meaningless on disk. */ + struct segment *lfs_sp; /* current segment being written */ + struct vnode *lfs_ivnode; /* vnode for the ifile */ + u_long lfs_seglock; /* single-thread the segment writer */ + pid_t lfs_lockpid; /* pid of lock holder */ + u_long lfs_iocount; /* number of ios pending */ + u_long lfs_writer; /* don't allow any dirops to start */ + u_long lfs_dirops; /* count of active directory ops */ + u_long lfs_doifile; /* Write ifile blocks on next write */ + u_long lfs_nactive; /* Number of segments since last ckp */ + u_char lfs_fmod; /* super block modified flag */ + u_char lfs_clean; /* file system is clean flag */ + u_char lfs_ronly; /* mounted read-only flag */ + u_char lfs_flags; /* currently unused flag */ + u_char lfs_fsmnt[MNAMELEN]; /* name mounted on */ + u_char pad[3]; /* long-align */ + +/* Checksum; valid on disk. */ + u_long lfs_cksum; /* checksum for superblock checking */ +}; + +/* + * Inode 0 is the out-of-band inode number, inode 1 is the inode number for + * the IFILE, the root inode is 2 and the lost+found inode is 3. + */ + +/* Fixed inode numbers. 
*/ +#define LFS_UNUSED_INUM 0 /* out of band inode number */ +#define LFS_IFILE_INUM 1 /* IFILE inode number */ +#define LOSTFOUNDINO 3 /* lost+found inode number */ +#define LFS_FIRST_INUM 4 /* first free inode number */ + +/* Address calculations for metadata located in the inode */ +#define S_INDIR(fs) -NDADDR +#define D_INDIR(fs) (S_INDIR(fs) - NINDIR(fs) - 1) +#define T_INDIR(fs) (D_INDIR(fs) - NINDIR(fs) * NINDIR(fs) - 1) + +/* Unassigned disk address. */ +#define UNASSIGNED -1 + +/* Unused logical block number */ +#define LFS_UNUSED_LBN -1 + +typedef struct ifile IFILE; +struct ifile { + u_long if_version; /* inode version number */ +#define LFS_UNUSED_DADDR 0 /* out-of-band daddr */ + daddr_t if_daddr; /* inode disk address */ + ino_t if_nextfree; /* next-unallocated inode */ +}; + +/* + * Cleaner information structure. This resides in the ifile and is used + * to pass information between the cleaner and the kernel. + */ +typedef struct _cleanerinfo { + u_long clean; /* K: number of clean segments */ + u_long dirty; /* K: number of dirty segments */ +} CLEANERINFO; + +#define CLEANSIZE_SU(fs) \ + ((sizeof(CLEANERINFO) + (fs)->lfs_bsize - 1) >> (fs)->lfs_bshift) + +/* + * All summary blocks are the same size, so we can always read a summary + * block easily from a segment. + */ +#define LFS_SUMMARY_SIZE 512 + +/* On-disk segment summary information */ +typedef struct segsum SEGSUM; +struct segsum { + u_long ss_sumsum; /* check sum of summary block */ + u_long ss_datasum; /* check sum of data */ + daddr_t ss_next; /* next segment */ + u_long ss_create; /* creation time stamp */ + u_short ss_nfinfo; /* number of file info structures */ + u_short ss_ninos; /* number of inodes in summary */ +#define SS_DIROP 0x01 /* segment begins a dirop */ +#define SS_CONT 0x02 /* more partials to finish this write*/ + u_short ss_flags; /* used for directory operations */ + u_short ss_pad; /* extra space */ + /* FINFO's and inode daddr's... */ +}; + +/* NINDIR is the number of indirects in a file system block. */ +#define NINDIR(fs) ((fs)->lfs_nindir) + +/* INOPB is the number of inodes in a secondary storage block. */ +#define INOPB(fs) ((fs)->lfs_inopb) + +#define blksize(fs) ((fs)->lfs_bsize) +#define blkoff(fs, loc) ((loc) & (fs)->lfs_bmask) +#define fsbtodb(fs, b) ((b) << (fs)->lfs_fsbtodb) +#define dbtofsb(fs, b) ((b) >> (fs)->lfs_fsbtodb) +#define lblkno(fs, loc) ((loc) >> (fs)->lfs_bshift) +#define lblktosize(fs, blk) ((blk) << (fs)->lfs_bshift) +#define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ + ((loc) >> (fs)->lfs_bshift) + +#define datosn(fs, daddr) /* disk address to segment number */ \ + (((daddr) - (fs)->lfs_sboffs[0]) / fsbtodb((fs), (fs)->lfs_ssize)) +#define sntoda(fs, sn) /* segment number to disk address */ \ + ((daddr_t)((sn) * ((fs)->lfs_ssize << (fs)->lfs_fsbtodb) + \ + (fs)->lfs_sboffs[0])) + +/* Read in the block with the cleaner info from the ifile. */ +#define LFS_CLEANERINFO(CP, F, BP) { \ + VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS; \ + if (bread((F)->lfs_ivnode, \ + (daddr_t)0, (F)->lfs_bsize, NOCRED, &(BP))) \ + panic("lfs: ifile read"); \ + (CP) = (CLEANERINFO *)(BP)->b_data; \ +} + +/* Read in the block with a specific inode from the ifile. 
*/ +#define LFS_IENTRY(IP, F, IN, BP) { \ + int _e; \ + VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS; \ + if (_e = bread((F)->lfs_ivnode, \ + (IN) / (F)->lfs_ifpb + (F)->lfs_cleansz + (F)->lfs_segtabsz,\ + (F)->lfs_bsize, NOCRED, &(BP))) \ + panic("lfs: ifile read %d", _e); \ + (IP) = (IFILE *)(BP)->b_data + (IN) % (F)->lfs_ifpb; \ +} + +/* Read in the block with a specific segment usage entry from the ifile. */ +#define LFS_SEGENTRY(SP, F, IN, BP) { \ + int _e; \ + VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS; \ + if (_e = bread((F)->lfs_ivnode, \ + ((IN) >> (F)->lfs_sushift) + (F)->lfs_cleansz, \ + (F)->lfs_bsize, NOCRED, &(BP))) \ + panic("lfs: ifile read: %d", _e); \ + (SP) = (SEGUSE *)(BP)->b_data + ((IN) & (F)->lfs_sepb - 1); \ +} + +/* + * Determine if there is enough room currently available to write db + * disk blocks. We need enough blocks for the new blocks, the current, + * inode blocks, a summary block, plus potentially the ifile inode and + * the segment usage table, plus an ifile page. + */ +#define LFS_FITS(fs, db) \ + ((long)((db + ((fs)->lfs_uinodes + INOPB((fs))) / INOPB((fs)) + \ + fsbtodb(fs, 1) + LFS_SUMMARY_SIZE / DEV_BSIZE + \ + (fs)->lfs_segtabsz)) < (fs)->lfs_avail) + +/* Determine if a buffer belongs to the ifile */ +#define IS_IFILE(bp) (VTOI(bp->b_vp)->i_number == LFS_IFILE_INUM) + +/* + * Structures used by lfs_bmapv and lfs_markv to communicate information + * about inodes and data blocks. + */ +typedef struct block_info { + ino_t bi_inode; /* inode # */ + daddr_t bi_lbn; /* logical block w/in file */ + daddr_t bi_daddr; /* disk address of block */ + time_t bi_segcreate; /* origin segment create time */ + int bi_version; /* file version number */ + void *bi_bp; /* data buffer */ +} BLOCK_INFO; + +/* In-memory description of a segment about to be written. 
*/ +struct segment { + struct lfs *fs; /* file system pointer */ + struct buf **bpp; /* pointer to buffer array */ + struct buf **cbpp; /* pointer to next available bp */ + struct buf **start_bpp; /* pointer to first bp in this set */ + struct buf *ibp; /* buffer pointer to inode page */ + struct finfo *fip; /* current fileinfo pointer */ + struct vnode *vp; /* vnode being gathered */ + void *segsum; /* segment summary info */ + u_long ninodes; /* number of inodes in this segment */ + u_long seg_bytes_left; /* bytes left in segment */ + u_long sum_bytes_left; /* bytes left in summary block */ + u_long seg_number; /* number of this segment */ + daddr_t *start_lbp; /* beginning lbn for this set */ +#define SEGM_CKP 0x01 /* doing a checkpoint */ +#define SEGM_CLEAN 0x02 /* cleaner call; don't sort */ +#define SEGM_SYNC 0x04 /* wait for segment */ + u_long seg_flags; /* run-time flags for this segment */ +}; + +#define ISSPACE(F, BB, C) \ + (((C)->cr_uid == 0 && (F)->lfs_bfree >= (BB)) || \ + ((C)->cr_uid != 0 && IS_FREESPACE(F, BB))) + +#define IS_FREESPACE(F, BB) \ + ((F)->lfs_bfree > ((F)->lfs_dsize * (F)->lfs_minfree / 100 + (BB))) + +#define ISSPACE_XXX(F, BB) \ + ((F)->lfs_bfree >= (BB)) + +#define DOSTATS +#ifdef DOSTATS +/* Statistics Counters */ +struct lfs_stats { + int segsused; + int psegwrites; + int psyncwrites; + int pcleanwrites; + int blocktot; + int cleanblocks; + int ncheckpoints; + int nwrites; + int nsync_writes; + int wait_exceeded; + int write_exceeded; + int flush_invoked; +}; +extern struct lfs_stats lfs_stats; +#endif diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c new file mode 100644 index 0000000..3f06c81 --- /dev/null +++ b/sys/ufs/lfs/lfs_alloc.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_alloc.c 8.4 (Berkeley) 1/4/94 + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/syslog.h> +#include <sys/mount.h> +#include <sys/malloc.h> + +#include <vm/vm.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> + +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +extern u_long nextgennumber; + +/* Allocate a new inode. */ +/* ARGSUSED */ +int +lfs_valloc(ap) + struct vop_valloc_args /* { + struct vnode *a_pvp; + int a_mode; + struct ucred *a_cred; + struct vnode **a_vpp; + } */ *ap; +{ + struct lfs *fs; + struct buf *bp; + struct ifile *ifp; + struct inode *ip; + struct vnode *vp; + daddr_t blkno; + ino_t new_ino; + u_long i, max; + int error; + + /* Get the head of the freelist. */ + fs = VTOI(ap->a_pvp)->i_lfs; + new_ino = fs->lfs_free; +#ifdef ALLOCPRINT + printf("lfs_ialloc: allocate inode %d\n", new_ino); +#endif + + /* + * Remove the inode from the free list and write the new start + * of the free list into the superblock. + */ + LFS_IENTRY(ifp, fs, new_ino, bp); + if (ifp->if_daddr != LFS_UNUSED_DADDR) + panic("lfs_ialloc: inuse inode on the free list"); + fs->lfs_free = ifp->if_nextfree; + brelse(bp); + + /* Extend IFILE so that the next lfs_valloc will succeed. */ + if (fs->lfs_free == LFS_UNUSED_INUM) { + vp = fs->lfs_ivnode; + ip = VTOI(vp); + blkno = lblkno(fs, ip->i_size); + lfs_balloc(vp, fs->lfs_bsize, blkno, &bp); + ip->i_size += fs->lfs_bsize; + vnode_pager_setsize(vp, (u_long)ip->i_size); + vnode_pager_uncache(vp); + + i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) * + fs->lfs_ifpb; + fs->lfs_free = i; + max = i + fs->lfs_ifpb; + for (ifp = (struct ifile *)bp->b_data; i < max; ++ifp) { + ifp->if_version = 1; + ifp->if_daddr = LFS_UNUSED_DADDR; + ifp->if_nextfree = ++i; + } + ifp--; + ifp->if_nextfree = LFS_UNUSED_INUM; + if (error = VOP_BWRITE(bp)) + return (error); + } + + /* Create a vnode to associate with the inode. */ + if (error = lfs_vcreate(ap->a_pvp->v_mount, new_ino, &vp)) + return (error); + + + ip = VTOI(vp); + /* Zero out the direct and indirect block addresses. */ + bzero(&ip->i_din, sizeof(struct dinode)); + ip->i_din.di_inumber = new_ino; + + /* Set a new generation number for this inode. */ + if (++nextgennumber < (u_long)time.tv_sec) + nextgennumber = time.tv_sec; + ip->i_gen = nextgennumber; + + /* Insert into the inode hash table. */ + ufs_ihashins(ip); + + if (error = ufs_vinit(vp->v_mount, lfs_specop_p, LFS_FIFOOPS, &vp)) { + vput(vp); + *ap->a_vpp = NULL; + return (error); + } + + *ap->a_vpp = vp; + vp->v_flag |= VDIROP; + VREF(ip->i_devvp); + + /* Set superblock modified bit and increment file count. */ + fs->lfs_fmod = 1; + ++fs->lfs_nfiles; + return (0); +} + +/* Create a new vnode/inode pair and initialize what fields we can. 
*/ +int +lfs_vcreate(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + extern int (**lfs_vnodeop_p)(); + struct inode *ip; + struct ufsmount *ump; + int error, i; + + /* Create the vnode. */ + if (error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, vpp)) { + *vpp = NULL; + return (error); + } + + /* Get a pointer to the private mount structure. */ + ump = VFSTOUFS(mp); + + /* Initialize the inode. */ + MALLOC(ip, struct inode *, sizeof(struct inode), M_LFSNODE, M_WAITOK); + (*vpp)->v_data = ip; + ip->i_vnode = *vpp; + ip->i_devvp = ump->um_devvp; + ip->i_flag = IN_MODIFIED; + ip->i_dev = ump->um_dev; + ip->i_number = ip->i_din.di_inumber = ino; +ip->i_din.di_spare[0] = 0xdeadbeef; +ip->i_din.di_spare[1] = 0xdeadbeef; + ip->i_lfs = ump->um_lfs; +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) + ip->i_dquot[i] = NODQUOT; +#endif + ip->i_lockf = 0; + ip->i_diroff = 0; + ip->i_mode = 0; + ip->i_size = 0; + ip->i_blocks = 0; + ++ump->um_lfs->lfs_uinodes; + return (0); +} + +/* Free an inode. */ +/* ARGUSED */ +int +lfs_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + SEGUSE *sup; + struct buf *bp; + struct ifile *ifp; + struct inode *ip; + struct lfs *fs; + daddr_t old_iaddr; + ino_t ino; + + /* Get the inode number and file system. */ + ip = VTOI(ap->a_pvp); + fs = ip->i_lfs; + ino = ip->i_number; + if (ip->i_flag & IN_MODIFIED) { + --fs->lfs_uinodes; + ip->i_flag &= + ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); + } + /* + * Set the ifile's inode entry to unused, increment its version number + * and link it into the free chain. + */ + LFS_IENTRY(ifp, fs, ino, bp); + old_iaddr = ifp->if_daddr; + ifp->if_daddr = LFS_UNUSED_DADDR; + ++ifp->if_version; + ifp->if_nextfree = fs->lfs_free; + fs->lfs_free = ino; + (void) VOP_BWRITE(bp); + + if (old_iaddr != LFS_UNUSED_DADDR) { + LFS_SEGENTRY(sup, fs, datosn(fs, old_iaddr), bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes < sizeof(struct dinode)) + panic("lfs_vfree: negative byte count (segment %d)\n", + datosn(fs, old_iaddr)); +#endif + sup->su_nbytes -= sizeof(struct dinode); + (void) VOP_BWRITE(bp); + } + + /* Set superblock modified bit and decrement file count. */ + fs->lfs_fmod = 1; + --fs->lfs_nfiles; + return (0); +} diff --git a/sys/ufs/lfs/lfs_balloc.c b/sys/ufs/lfs/lfs_balloc.c new file mode 100644 index 0000000..b56bc9e --- /dev/null +++ b/sys/ufs/lfs/lfs_balloc.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_balloc.c 8.1 (Berkeley) 6/11/93 + */ +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/resourcevar.h> +#include <sys/trace.h> + +#include <miscfs/specfs/specdev.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> + +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +int +lfs_balloc(vp, iosize, lbn, bpp) + struct vnode *vp; + u_long iosize; + daddr_t lbn; + struct buf **bpp; +{ + struct buf *ibp, *bp; + struct inode *ip; + struct lfs *fs; + struct indir indirs[NIADDR+2]; + daddr_t daddr; + int bb, error, i, num; + + ip = VTOI(vp); + fs = ip->i_lfs; + + /* + * Three cases: it's a block beyond the end of file, it's a block in + * the file that may or may not have been assigned a disk address or + * we're writing an entire block. Note, if the daddr is unassigned, + * the block might still have existed in the cache (if it was read + * or written earlier). If it did, make sure we don't count it as a + * new block or zero out its contents. If it did not, make sure + * we allocate any necessary indirect blocks. + */ + + *bpp = NULL; + if (error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL )) + return (error); + + *bpp = bp = getblk(vp, lbn, fs->lfs_bsize, 0, 0); + bb = VFSTOUFS(vp->v_mount)->um_seqinc; + if (daddr == UNASSIGNED) + /* May need to allocate indirect blocks */ + for (i = 1; i < num; ++i) + if (!indirs[i].in_exists) { + ibp = + getblk(vp, indirs[i].in_lbn, fs->lfs_bsize, + 0, 0); + if (!(ibp->b_flags & (B_DONE | B_DELWRI))) { + if (!ISSPACE(fs, bb, curproc->p_ucred)){ + ibp->b_flags |= B_INVAL; + brelse(ibp); + error = ENOSPC; + } else { + ip->i_blocks += bb; + ip->i_lfs->lfs_bfree -= bb; + clrbuf(ibp); + error = VOP_BWRITE(ibp); + } + } else + panic ("Indirect block should not exist"); + } + if (error) { + if (bp) + brelse(bp); + return(error); + } + + + /* Now, we may need to allocate the data block */ + if (!(bp->b_flags & (B_CACHE | B_DONE | B_DELWRI))) { + if (daddr == UNASSIGNED) + if (!ISSPACE(fs, bb, curproc->p_ucred)) { + bp->b_flags |= B_INVAL; + brelse(bp); + return(ENOSPC); + } else { + ip->i_blocks += bb; + ip->i_lfs->lfs_bfree -= bb; + if (iosize != fs->lfs_bsize) + clrbuf(bp); + } + else if (iosize == fs->lfs_bsize) + bp->b_blkno = daddr; /* Skip the I/O */ + else { + bp->b_blkno = daddr; + bp->b_flags |= B_READ; + VOP_STRATEGY(bp); + return(biowait(bp)); + } + } + return (error); +} diff --git a/sys/ufs/lfs/lfs_bio.c b/sys/ufs/lfs/lfs_bio.c new file mode 100644 index 0000000..0f021f1 --- /dev/null +++ b/sys/ufs/lfs/lfs_bio.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_bio.c 8.4 (Berkeley) 12/30/93 + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/resourcevar.h> +#include <sys/mount.h> +#include <sys/kernel.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> + +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +/* + * LFS block write function. + * + * XXX + * No write cost accounting is done. + * This is almost certainly wrong for synchronous operations and NFS. + */ +int lfs_allclean_wakeup; /* Cleaner wakeup address. */ +int locked_queue_count; /* XXX Count of locked-down buffers. */ +int lfs_writing; /* Set if already kicked off a writer + because of buffer space */ +/* +#define WRITE_THRESHHOLD ((nbuf >> 2) - 10) +#define WAIT_THRESHHOLD ((nbuf >> 1) - 10) +*/ +#define WAIT_THRESHHOLD (nbuf - (nbuf >> 2) - 10) +#define WRITE_THRESHHOLD ((nbuf >> 1) - 10) +#define LFS_BUFWAIT 2 + +int +lfs_bwrite(ap) + struct vop_bwrite_args /* { + struct buf *a_bp; + } */ *ap; +{ + register struct buf *bp = ap->a_bp; + struct lfs *fs; + struct inode *ip; + int error, s; + + /* + * Set the delayed write flag and use reassignbuf to move the buffer + * from the clean list to the dirty one. + * + * Set the B_LOCKED flag and unlock the buffer, causing brelse to move + * the buffer onto the LOCKED free list. This is necessary, otherwise + * getnewbuf() would try to reclaim the buffers using bawrite, which + * isn't going to work. + * + * XXX we don't let meta-data writes run out of space because they can + * come from the segment writer. We need to make sure that there is + * enough space reserved so that there's room to write meta-data + * blocks. 
+ */ + if (!(bp->b_flags & B_LOCKED)) { + fs = VFSTOUFS(bp->b_vp->v_mount)->um_lfs; + while (!LFS_FITS(fs, fsbtodb(fs, 1)) && !IS_IFILE(bp) && + bp->b_lblkno > 0) { + /* Out of space, need cleaner to run */ + wakeup(&lfs_allclean_wakeup); + if (error = tsleep(&fs->lfs_avail, PCATCH | PUSER, + "cleaner", NULL)) { + brelse(bp); + return (error); + } + } + ip = VTOI((bp)->b_vp); + if (!(ip->i_flag & IN_MODIFIED)) + ++fs->lfs_uinodes; + ip->i_flag |= IN_CHANGE | IN_MODIFIED | IN_UPDATE; + fs->lfs_avail -= fsbtodb(fs, 1); + ++locked_queue_count; + bp->b_flags |= B_DELWRI | B_LOCKED; + bp->b_flags &= ~(B_READ | B_ERROR); + s = splbio(); + reassignbuf(bp, bp->b_vp); + splx(s); + } + brelse(bp); + return (0); +} + +/* + * XXX + * This routine flushes buffers out of the B_LOCKED queue when LFS has too + * many locked down. Eventually the pageout daemon will simply call LFS + * when pages need to be reclaimed. Note, we have one static count of locked + * buffers, so we can't have more than a single file system. To make this + * work for multiple file systems, put the count into the mount structure. + */ +void +lfs_flush() +{ + register struct mount *mp; + +#ifdef DOSTATS + ++lfs_stats.write_exceeded; +#endif + if (lfs_writing) + return; + lfs_writing = 1; + for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) { + /* The lock check below is to avoid races with unmount. */ + if (mp->mnt_stat.f_type == MOUNT_LFS && + (mp->mnt_flag & (MNT_MLOCK|MNT_RDONLY|MNT_UNMOUNT)) == 0 && + !((((struct ufsmount *)mp->mnt_data))->ufsmount_u.lfs)->lfs_dirops ) { + /* + * We set the queue to 0 here because we are about to + * write all the dirty buffers we have. If more come + * in while we're writing the segment, they may not + * get written, so we want the count to reflect these + * new writes after the segwrite completes. + */ +#ifdef DOSTATS + ++lfs_stats.flush_invoked; +#endif + lfs_segwrite(mp, 0); + } + } + lfs_writing = 0; +} + +int +lfs_check(vp, blkno) + struct vnode *vp; + daddr_t blkno; +{ + extern int lfs_allclean_wakeup; + int error; + + error = 0; + if (incore(vp, blkno)) + return (0); + if (locked_queue_count > WRITE_THRESHHOLD) + lfs_flush(); + + /* If out of buffers, wait on writer */ + while (locked_queue_count > WAIT_THRESHHOLD) { +#ifdef DOSTATS + ++lfs_stats.wait_exceeded; +#endif + error = tsleep(&locked_queue_count, PCATCH | PUSER, "buffers", + hz * LFS_BUFWAIT); + } + + return (error); +} diff --git a/sys/ufs/lfs/lfs_cksum.c b/sys/ufs/lfs/lfs_cksum.c new file mode 100644 index 0000000..77b011a --- /dev/null +++ b/sys/ufs/lfs/lfs_cksum.c @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_cksum.c 8.1 (Berkeley) 6/11/93 + */ + +#include <sys/types.h> + +/* + * Simple, general purpose, fast checksum. Data must be short-aligned. + * Returns a u_long in case we ever want to do something more rigorous. + * + * XXX + * Use the TCP/IP checksum instead. + */ +u_long +cksum(str, len) + register void *str; + register size_t len; +{ + register u_long sum; + + len &= ~(sizeof(u_short) - 1); + for (sum = 0; len; len -= sizeof(u_short)) { + sum ^= *(u_short *)str; + ++(u_short *)str; + } + return (sum); +} diff --git a/sys/ufs/lfs/lfs_debug.c b/sys/ufs/lfs/lfs_debug.c new file mode 100644 index 0000000..cc28d60 --- /dev/null +++ b/sys/ufs/lfs/lfs_debug.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_debug.c 8.1 (Berkeley) 6/11/93 + */ + +#ifdef DEBUG +#include <sys/param.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/mount.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +void +lfs_dump_super(lfsp) + struct lfs *lfsp; +{ + int i; + + (void)printf("%s%lx\t%s%lx\t%s%d\t%s%d\n", + "magic ", lfsp->lfs_magic, + "version ", lfsp->lfs_version, + "size ", lfsp->lfs_size, + "ssize ", lfsp->lfs_ssize); + (void)printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "dsize ", lfsp->lfs_dsize, + "bsize ", lfsp->lfs_bsize, + "fsize ", lfsp->lfs_fsize, + "frag ", lfsp->lfs_frag); + + (void)printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "minfree ", lfsp->lfs_minfree, + "inopb ", lfsp->lfs_inopb, + "ifpb ", lfsp->lfs_ifpb, + "nindir ", lfsp->lfs_nindir); + + (void)printf("%s%d\t%s%d\t%s%d\t%s%d\n", + "nseg ", lfsp->lfs_nseg, + "nspf ", lfsp->lfs_nspf, + "cleansz ", lfsp->lfs_cleansz, + "segtabsz ", lfsp->lfs_segtabsz); + + (void)printf("%s%lx\t%s%d\t%s%lx\t%s%d\n", + "segmask ", lfsp->lfs_segmask, + "segshift ", lfsp->lfs_segshift, + "bmask ", lfsp->lfs_bmask, + "bshift ", lfsp->lfs_bshift); + + (void)printf("%s%lx\t%s%d\t%s%lx\t%s%d\n", + "ffmask ", lfsp->lfs_ffmask, + "ffshift ", lfsp->lfs_ffshift, + "fbmask ", lfsp->lfs_fbmask, + "fbshift ", lfsp->lfs_fbshift); + + (void)printf("%s%d\t%s%d\t%s%lx\t%s%qx\n", + "sushift ", lfsp->lfs_sushift, + "fsbtodb ", lfsp->lfs_fsbtodb, + "cksum ", lfsp->lfs_cksum, + "maxfilesize ", lfsp->lfs_maxfilesize); + + (void)printf("Superblock disk addresses:"); + for (i = 0; i < LFS_MAXNUMSB; i++) + (void)printf(" %lx", lfsp->lfs_sboffs[i]); + (void)printf("\n"); + + (void)printf("Checkpoint Info\n"); + (void)printf("%s%d\t%s%lx\t%s%d\n", + "free ", lfsp->lfs_free, + "idaddr ", lfsp->lfs_idaddr, + "ifile ", lfsp->lfs_ifile); + (void)printf("%s%lx\t%s%d\t%s%lx\t%s%lx\t%s%lx\t%s%lx\n", + "bfree ", lfsp->lfs_bfree, + "nfiles ", lfsp->lfs_nfiles, + "lastseg ", lfsp->lfs_lastseg, + "nextseg ", lfsp->lfs_nextseg, + "curseg ", lfsp->lfs_curseg, + "offset ", lfsp->lfs_offset); + (void)printf("tstamp %lx\n", lfsp->lfs_tstamp); +} + +void +lfs_dump_dinode(dip) + struct dinode *dip; +{ + int i; + + (void)printf("%s%u\t%s%d\t%s%u\t%s%u\t%s%lu\n", + "mode ", dip->di_mode, + "nlink ", dip->di_nlink, + "uid ", dip->di_uid, + "gid ", dip->di_gid, + "size ", dip->di_size); + (void)printf("inum %ld\n", dip->di_inumber); + (void)printf("Direct Addresses\n"); + for (i = 0; i < NDADDR; i++) { + (void)printf("\t%lx", dip->di_db[i]); + if ((i % 6) == 5) + (void)printf("\n"); + } + for (i = 0; i < NIADDR; i++) + (void)printf("\t%lx", dip->di_ib[i]); + (void)printf("\n"); +} +#endif /* DEBUG */ diff --git a/sys/ufs/lfs/lfs_extern.h b/sys/ufs/lfs/lfs_extern.h new file mode 100644 index 0000000..c1157ad --- /dev/null +++ b/sys/ufs/lfs/lfs_extern.h @@ -0,0 +1,106 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_extern.h 8.2 (Berkeley) 4/16/94 + */ + +struct fid; +struct mount; +struct nameidata; +struct proc; +struct statfs; +struct timeval; +struct inode; +struct uio; +struct mbuf; + +__BEGIN_DECLS +u_long cksum __P((void *, size_t)); /* XXX */ +int lfs_balloc __P((struct vnode *, u_long, daddr_t, struct buf **)); +int lfs_blkatoff __P((struct vop_blkatoff_args *)); +int lfs_bwrite __P((struct vop_bwrite_args *)); +int lfs_check __P((struct vnode *, daddr_t)); +int lfs_close __P((struct vop_close_args *)); +int lfs_create __P((struct vop_create_args *)); +int lfs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, + struct vnode **, int *, struct ucred **)); +int lfs_fsync __P((struct vop_fsync_args *)); +int lfs_getattr __P((struct vop_getattr_args *)); +struct dinode * + lfs_ifind __P((struct lfs *, ino_t, struct dinode *)); +int lfs_inactive __P((struct vop_inactive_args *)); +int lfs_init __P((void)); +int lfs_initseg __P((struct lfs *)); +int lfs_link __P((struct vop_link_args *)); +int lfs_makeinode __P((int, struct nameidata *, struct inode **)); +int lfs_mkdir __P((struct vop_mkdir_args *)); +int lfs_mknod __P((struct vop_mknod_args *)); +int lfs_mount __P((struct mount *, + char *, caddr_t, struct nameidata *, struct proc *)); +int lfs_mountroot __P((void)); +struct buf * + lfs_newbuf __P((struct vnode *, daddr_t, size_t)); +int lfs_read __P((struct vop_read_args *)); +int lfs_remove __P((struct vop_remove_args *)); +int lfs_rmdir __P((struct vop_rmdir_args *)); +int lfs_rename __P((struct vop_rename_args *)); +void lfs_seglock __P((struct lfs *, unsigned long flags)); +void lfs_segunlock __P((struct lfs *)); +int lfs_segwrite __P((struct mount *, int)); +int lfs_statfs __P((struct mount *, struct statfs *, struct proc *)); +int lfs_symlink __P((struct vop_symlink_args *)); +int lfs_sync __P((struct mount *, int, struct ucred *, struct proc *)); +int lfs_truncate __P((struct vop_truncate_args *)); +int lfs_unmount __P((struct mount *, int, struct proc *)); +int lfs_update __P((struct vop_update_args *)); +int lfs_valloc __P((struct vop_valloc_args *)); +int lfs_vcreate __P((struct mount *, ino_t, struct vnode **)); +int lfs_vfree __P((struct vop_vfree_args *)); +int lfs_vflush __P((struct vnode *)); +int lfs_vget __P((struct mount *, ino_t, struct vnode **)); +int lfs_vptofh 
__P((struct vnode *, struct fid *)); +int lfs_vref __P((struct vnode *)); +void lfs_vunref __P((struct vnode *)); +int lfs_write __P((struct vop_write_args *)); +#ifdef DEBUG +void lfs_dump_dinode __P((struct dinode *)); +void lfs_dump_super __P((struct lfs *)); +#endif +__END_DECLS +extern int (**lfs_vnodeop_p)(); +extern int (**lfs_specop_p)(); +#ifdef FIFO +extern int (**lfs_fifoop_p)(); +#define LFS_FIFOOPS lfs_fifoop_p +#else +#define LFS_FIFOOPS NULL +#endif diff --git a/sys/ufs/lfs/lfs_inode.c b/sys/ufs/lfs/lfs_inode.c new file mode 100644 index 0000000..1a06aa2 --- /dev/null +++ b/sys/ufs/lfs/lfs_inode.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_inode.c 8.5 (Berkeley) 12/30/93 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/kernel.h> +#include <sys/malloc.h> + +#include <vm/vm.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +int +lfs_init() +{ + return (ufs_init()); +} + +/* Search a block for a specific dinode. 
*/ +struct dinode * +lfs_ifind(fs, ino, dip) + struct lfs *fs; + ino_t ino; + register struct dinode *dip; +{ + register int cnt; + register struct dinode *ldip; + + for (cnt = INOPB(fs), ldip = dip + (cnt - 1); cnt--; --ldip) + if (ldip->di_inumber == ino) + return (ldip); + + panic("lfs_ifind: dinode %u not found", ino); + /* NOTREACHED */ +} + +int +lfs_update(ap) + struct vop_update_args /* { + struct vnode *a_vp; + struct timeval *a_access; + struct timeval *a_modify; + int a_waitfor; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (0); + ip = VTOI(vp); + if ((ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) + return (0); + if (ip->i_flag & IN_ACCESS) + ip->i_atime.ts_sec = ap->a_access->tv_sec; + if (ip->i_flag & IN_UPDATE) { + ip->i_mtime.ts_sec = ap->a_modify->tv_sec; + (ip)->i_modrev++; + } + if (ip->i_flag & IN_CHANGE) + ip->i_ctime.ts_sec = time.tv_sec; + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); + + if (!(ip->i_flag & IN_MODIFIED)) + ++(VFSTOUFS(vp->v_mount)->um_lfs->lfs_uinodes); + ip->i_flag |= IN_MODIFIED; + + /* If sync, push back the vnode and any dirty blocks it may have. */ + return (ap->a_waitfor & LFS_SYNC ? lfs_vflush(vp) : 0); +} + +/* Update segment usage information when removing a block. */ +#define UPDATE_SEGUSE \ + if (lastseg != -1) { \ + LFS_SEGENTRY(sup, fs, lastseg, sup_bp); \ + if ((num << fs->lfs_bshift) > sup->su_nbytes) \ + panic("lfs_truncate: negative bytes in segment %d\n", \ + lastseg); \ + sup->su_nbytes -= num << fs->lfs_bshift; \ + e1 = VOP_BWRITE(sup_bp); \ + blocksreleased += num; \ + } + +#define SEGDEC { \ + if (daddr != 0) { \ + if (lastseg != (seg = datosn(fs, daddr))) { \ + UPDATE_SEGUSE; \ + num = 1; \ + lastseg = seg; \ + } else \ + ++num; \ + } \ +} + +/* + * Truncate the inode ip to at most length size. Update segment usage + * table information. + */ +/* ARGSUSED */ +int +lfs_truncate(ap) + struct vop_truncate_args /* { + struct vnode *a_vp; + off_t a_length; + int a_flags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct indir *inp; + register int i; + register daddr_t *daddrp; + register struct vnode *vp = ap->a_vp; + off_t length = ap->a_length; + struct buf *bp, *sup_bp; + struct timeval tv; + struct ifile *ifp; + struct inode *ip; + struct lfs *fs; + struct indir a[NIADDR + 2], a_end[NIADDR + 2]; + SEGUSE *sup; + daddr_t daddr, lastblock, lbn, olastblock; + long off, a_released, blocksreleased, i_released; + int e1, e2, depth, lastseg, num, offset, seg, size; + + ip = VTOI(vp); + tv = time; + if (vp->v_type == VLNK && vp->v_mount->mnt_maxsymlinklen > 0) { +#ifdef DIAGNOSTIC + if (length != 0) + panic("lfs_truncate: partial truncate of symlink"); +#endif + bzero((char *)&ip->i_shortlink, (u_int)ip->i_size); + ip->i_size = 0; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(vp, &tv, &tv, 0)); + } + vnode_pager_setsize(vp, (u_long)length); + + fs = ip->i_lfs; + + /* If length is larger than the file, just update the times. */ + if (ip->i_size <= length) { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(vp, &tv, &tv, 0)); + } + + /* + * Calculate index into inode's block list of last direct and indirect + * blocks (if any) which we want to keep. Lastblock is 0 when the + * file is truncated to 0. + */ + lastblock = lblkno(fs, length + fs->lfs_bsize - 1); + olastblock = lblkno(fs, ip->i_size + fs->lfs_bsize - 1) - 1; + + /* + * Update the size of the file. 
If the file is not being truncated to + * a block boundry, the contents of the partial block following the end + * of the file must be zero'ed in case it ever become accessable again + * because of subsequent file growth. + */ + offset = blkoff(fs, length); + if (offset == 0) + ip->i_size = length; + else { + lbn = lblkno(fs, length); +#ifdef QUOTA + if (e1 = getinoquota(ip)) + return (e1); +#endif + if (e1 = bread(vp, lbn, fs->lfs_bsize, NOCRED, &bp)) + return (e1); + ip->i_size = length; + size = blksize(fs); + (void)vnode_pager_uncache(vp); + bzero((char *)bp->b_data + offset, (u_int)(size - offset)); + allocbuf(bp, size); + if (e1 = VOP_BWRITE(bp)) + return (e1); + } + /* + * Modify sup->su_nbyte counters for each deleted block; keep track + * of number of blocks removed for ip->i_blocks. + */ + blocksreleased = 0; + num = 0; + lastseg = -1; + + for (lbn = olastblock; lbn >= lastblock;) { + /* XXX use run length from bmap array to make this faster */ + ufs_bmaparray(vp, lbn, &daddr, a, &depth, NULL); + if (lbn == olastblock) + for (i = NIADDR + 2; i--;) + a_end[i] = a[i]; + switch (depth) { + case 0: /* Direct block. */ + daddr = ip->i_db[lbn]; + SEGDEC; + ip->i_db[lbn] = 0; + --lbn; + break; +#ifdef DIAGNOSTIC + case 1: /* An indirect block. */ + panic("lfs_truncate: ufs_bmaparray returned depth 1"); + /* NOTREACHED */ +#endif + default: /* Chain of indirect blocks. */ + inp = a + --depth; + if (inp->in_off > 0 && lbn != lastblock) { + lbn -= inp->in_off < lbn - lastblock ? + inp->in_off : lbn - lastblock; + break; + } + for (; depth && (inp->in_off == 0 || lbn == lastblock); + --inp, --depth) { + if (bread(vp, + inp->in_lbn, fs->lfs_bsize, NOCRED, &bp)) + panic("lfs_truncate: bread bno %d", + inp->in_lbn); + daddrp = (daddr_t *)bp->b_data + inp->in_off; + for (i = inp->in_off; + i++ <= a_end[depth].in_off;) { + daddr = *daddrp++; + SEGDEC; + } + a_end[depth].in_off = NINDIR(fs) - 1; + if (inp->in_off == 0) + brelse (bp); + else { + bzero((daddr_t *)bp->b_data + + inp->in_off, fs->lfs_bsize - + inp->in_off * sizeof(daddr_t)); + if (e1 = VOP_BWRITE(bp)) + return (e1); + } + } + if (depth == 0 && a[1].in_off == 0) { + off = a[0].in_off; + daddr = ip->i_ib[off]; + SEGDEC; + ip->i_ib[off] = 0; + } + if (lbn == lastblock || lbn <= NDADDR) + --lbn; + else { + lbn -= NINDIR(fs); + if (lbn < lastblock) + lbn = lastblock; + } + } + } + UPDATE_SEGUSE; + + /* If truncating the file to 0, update the version number. */ + if (length == 0) { + LFS_IENTRY(ifp, fs, ip->i_number, bp); + ++ifp->if_version; + (void) VOP_BWRITE(bp); + } + +#ifdef DIAGNOSTIC + if (ip->i_blocks < fsbtodb(fs, blocksreleased)) { + printf("lfs_truncate: block count < 0\n"); + blocksreleased = ip->i_blocks; + } +#endif + ip->i_blocks -= fsbtodb(fs, blocksreleased); + fs->lfs_bfree += fsbtodb(fs, blocksreleased); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * Traverse dirty block list counting number of dirty buffers + * that are being deleted out of the cache, so that the lfs_avail + * field can be updated. + */ + a_released = 0; + i_released = 0; + for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) + if (bp->b_flags & B_LOCKED) { + ++a_released; + /* + * XXX + * When buffers are created in the cache, their block + * number is set equal to their logical block number. + * If that is still true, we are assuming that the + * blocks are new (not yet on disk) and weren't + * counted above. 
However, there is a slight chance + * that a block's disk address is equal to its logical + * block number in which case, we'll get an overcounting + * here. + */ + if (bp->b_blkno == bp->b_lblkno) + ++i_released; + } + blocksreleased = fsbtodb(fs, i_released); +#ifdef DIAGNOSTIC + if (blocksreleased > ip->i_blocks) { + printf("lfs_inode: Warning! %s\n", + "more blocks released from inode than are in inode"); + blocksreleased = ip->i_blocks; + } +#endif + fs->lfs_bfree += blocksreleased; + ip->i_blocks -= blocksreleased; +#ifdef DIAGNOSTIC + if (length == 0 && ip->i_blocks != 0) + printf("lfs_inode: Warning! %s%d%s\n", + "Truncation to zero, but ", ip->i_blocks, + " blocks left on inode"); +#endif + fs->lfs_avail += fsbtodb(fs, a_released); + e1 = vinvalbuf(vp, (length > 0) ? V_SAVE : 0, ap->a_cred, ap->a_p, + 0, 0); + e2 = VOP_UPDATE(vp, &tv, &tv, 0); + return (e1 ? e1 : e2 ? e2 : 0); +} diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c new file mode 100644 index 0000000..249d59d --- /dev/null +++ b/sys/ufs/lfs/lfs_segment.c @@ -0,0 +1,1111 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_segment.c 8.5 (Berkeley) 1/4/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/kernel.h> +#include <sys/resourcevar.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/conf.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> + +#include <miscfs/specfs/specdev.h> +#include <miscfs/fifofs/fifo.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/dir.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +extern int count_lock_queue __P((void)); + +#define MAX_ACTIVE 10 +/* + * Determine if it's OK to start a partial in this segment, or if we need + * to go on to a new segment. + */ +#define LFS_PARTIAL_FITS(fs) \ + ((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \ + 1 << (fs)->lfs_fsbtodb) + +void lfs_callback __P((struct buf *)); +void lfs_gather __P((struct lfs *, struct segment *, + struct vnode *, int (*) __P((struct lfs *, struct buf *)))); +int lfs_gatherblock __P((struct segment *, struct buf *, int *)); +void lfs_iset __P((struct inode *, daddr_t, time_t)); +int lfs_match_data __P((struct lfs *, struct buf *)); +int lfs_match_dindir __P((struct lfs *, struct buf *)); +int lfs_match_indir __P((struct lfs *, struct buf *)); +int lfs_match_tindir __P((struct lfs *, struct buf *)); +void lfs_newseg __P((struct lfs *)); +void lfs_shellsort __P((struct buf **, daddr_t *, register int)); +void lfs_supercallback __P((struct buf *)); +void lfs_updatemeta __P((struct segment *)); +int lfs_vref __P((struct vnode *)); +void lfs_vunref __P((struct vnode *)); +void lfs_writefile __P((struct lfs *, struct segment *, struct vnode *)); +int lfs_writeinode __P((struct lfs *, struct segment *, struct inode *)); +int lfs_writeseg __P((struct lfs *, struct segment *)); +void lfs_writesuper __P((struct lfs *)); +void lfs_writevnodes __P((struct lfs *fs, struct mount *mp, + struct segment *sp, int dirops)); + +int lfs_allclean_wakeup; /* Cleaner wakeup address. */ + +/* Statistics Counters */ +#define DOSTATS +struct lfs_stats lfs_stats; + +/* op values to lfs_writevnodes */ +#define VN_REG 0 +#define VN_DIROP 1 +#define VN_EMPTY 2 + +/* + * Ifile and meta data blocks are not marked busy, so segment writes MUST be + * single threaded. Currently, there are two paths into lfs_segwrite, sync() + * and getnewbuf(). They both mark the file system busy. Lfs_vflush() + * explicitly marks the file system busy. So lfs_segwrite is safe. I think. 
+ */ + +int +lfs_vflush(vp) + struct vnode *vp; +{ + struct inode *ip; + struct lfs *fs; + struct segment *sp; + + fs = VFSTOUFS(vp->v_mount)->um_lfs; + if (fs->lfs_nactive > MAX_ACTIVE) + return(lfs_segwrite(vp->v_mount, SEGM_SYNC|SEGM_CKP)); + lfs_seglock(fs, SEGM_SYNC); + sp = fs->lfs_sp; + + + ip = VTOI(vp); + if (vp->v_dirtyblkhd.lh_first == NULL) + lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY); + + do { + do { + if (vp->v_dirtyblkhd.lh_first != NULL) + lfs_writefile(fs, sp, vp); + } while (lfs_writeinode(fs, sp, ip)); + + } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM); + +#ifdef DOSTATS + ++lfs_stats.nwrites; + if (sp->seg_flags & SEGM_SYNC) + ++lfs_stats.nsync_writes; + if (sp->seg_flags & SEGM_CKP) + ++lfs_stats.ncheckpoints; +#endif + lfs_segunlock(fs); + return (0); +} + +void +lfs_writevnodes(fs, mp, sp, op) + struct lfs *fs; + struct mount *mp; + struct segment *sp; + int op; +{ + struct inode *ip; + struct vnode *vp; + +loop: + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + + /* XXX ignore dirops for now + if (op == VN_DIROP && !(vp->v_flag & VDIROP) || + op != VN_DIROP && (vp->v_flag & VDIROP)) + continue; + */ + + if (op == VN_EMPTY && vp->v_dirtyblkhd.lh_first) + continue; + + if (vp->v_type == VNON) + continue; + + if (lfs_vref(vp)) + continue; + + /* + * Write the inode/file if dirty and it's not the + * the IFILE. + */ + ip = VTOI(vp); + if ((ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE) || + vp->v_dirtyblkhd.lh_first != NULL) && + ip->i_number != LFS_IFILE_INUM) { + if (vp->v_dirtyblkhd.lh_first != NULL) + lfs_writefile(fs, sp, vp); + (void) lfs_writeinode(fs, sp, ip); + } + vp->v_flag &= ~VDIROP; + lfs_vunref(vp); + } +} + +int +lfs_segwrite(mp, flags) + struct mount *mp; + int flags; /* Do a checkpoint. */ +{ + struct buf *bp; + struct inode *ip; + struct lfs *fs; + struct segment *sp; + struct vnode *vp; + SEGUSE *segusep; + daddr_t ibno; + CLEANERINFO *cip; + int clean, do_ckp, error, i; + + fs = VFSTOUFS(mp)->um_lfs; + + /* + * If we have fewer than 2 clean segments, wait until cleaner + * writes. + */ + do { + LFS_CLEANERINFO(cip, fs, bp); + clean = cip->clean; + brelse(bp); + if (clean <= 2) { + printf ("segs clean: %d\n", clean); + wakeup(&lfs_allclean_wakeup); + if (error = tsleep(&fs->lfs_avail, PRIBIO + 1, + "lfs writer", 0)) + return (error); + } + } while (clean <= 2 ); + + /* + * Allocate a segment structure and enough space to hold pointers to + * the maximum possible number of buffers which can be described in a + * single summary block. + */ + do_ckp = flags & SEGM_CKP || fs->lfs_nactive > MAX_ACTIVE; + lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0)); + sp = fs->lfs_sp; + + lfs_writevnodes(fs, mp, sp, VN_REG); + + /* XXX ignore ordering of dirops for now */ + /* XXX + fs->lfs_writer = 1; + if (fs->lfs_dirops && (error = + tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) { + free(sp->bpp, M_SEGMENT); + free(sp, M_SEGMENT); + fs->lfs_writer = 0; + return (error); + } + + lfs_writevnodes(fs, mp, sp, VN_DIROP); + */ + + /* + * If we are doing a checkpoint, mark everything since the + * last checkpoint as no longer ACTIVE. 
+ */ + if (do_ckp) + for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz; + --ibno >= fs->lfs_cleansz; ) { + if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize, + NOCRED, &bp)) + + panic("lfs: ifile read"); + segusep = (SEGUSE *)bp->b_data; + for (i = fs->lfs_sepb; i--; segusep++) + segusep->su_flags &= ~SEGUSE_ACTIVE; + + error = VOP_BWRITE(bp); + } + + if (do_ckp || fs->lfs_doifile) { +redo: + vp = fs->lfs_ivnode; + while (vget(vp, 1)); + ip = VTOI(vp); + if (vp->v_dirtyblkhd.lh_first != NULL) + lfs_writefile(fs, sp, vp); + (void)lfs_writeinode(fs, sp, ip); + vput(vp); + if (lfs_writeseg(fs, sp) && do_ckp) + goto redo; + } else + (void) lfs_writeseg(fs, sp); + + /* + * If the I/O count is non-zero, sleep until it reaches zero. At the + * moment, the user's process hangs around so we can sleep. + */ + /* XXX ignore dirops for now + fs->lfs_writer = 0; + fs->lfs_doifile = 0; + wakeup(&fs->lfs_dirops); + */ + +#ifdef DOSTATS + ++lfs_stats.nwrites; + if (sp->seg_flags & SEGM_SYNC) + ++lfs_stats.nsync_writes; + if (sp->seg_flags & SEGM_CKP) + ++lfs_stats.ncheckpoints; +#endif + lfs_segunlock(fs); + return (0); +} + +/* + * Write the dirty blocks associated with a vnode. + */ +void +lfs_writefile(fs, sp, vp) + struct lfs *fs; + struct segment *sp; + struct vnode *vp; +{ + struct buf *bp; + struct finfo *fip; + IFILE *ifp; + + if (sp->seg_bytes_left < fs->lfs_bsize || + sp->sum_bytes_left < sizeof(struct finfo)) + (void) lfs_writeseg(fs, sp); + + sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(daddr_t); + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + + fip = sp->fip; + fip->fi_nblocks = 0; + fip->fi_ino = VTOI(vp)->i_number; + LFS_IENTRY(ifp, fs, fip->fi_ino, bp); + fip->fi_version = ifp->if_version; + brelse(bp); + + /* + * It may not be necessary to write the meta-data blocks at this point, + * as the roll-forward recovery code should be able to reconstruct the + * list. + */ + lfs_gather(fs, sp, vp, lfs_match_data); + lfs_gather(fs, sp, vp, lfs_match_indir); + lfs_gather(fs, sp, vp, lfs_match_dindir); +#ifdef TRIPLE + lfs_gather(fs, sp, vp, lfs_match_tindir); +#endif + + fip = sp->fip; + if (fip->fi_nblocks != 0) { + sp->fip = + (struct finfo *)((caddr_t)fip + sizeof(struct finfo) + + sizeof(daddr_t) * (fip->fi_nblocks - 1)); + sp->start_lbp = &sp->fip->fi_blocks[0]; + } else { + sp->sum_bytes_left += sizeof(struct finfo) - sizeof(daddr_t); + --((SEGSUM *)(sp->segsum))->ss_nfinfo; + } +} + +int +lfs_writeinode(fs, sp, ip) + struct lfs *fs; + struct segment *sp; + struct inode *ip; +{ + struct buf *bp, *ibp; + IFILE *ifp; + SEGUSE *sup; + daddr_t daddr; + ino_t ino; + int error, i, ndx; + int redo_ifile = 0; + + if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE))) + return(0); + + /* Allocate a new inode block if necessary. */ + if (sp->ibp == NULL) { + /* Allocate a new segment if necessary. */ + if (sp->seg_bytes_left < fs->lfs_bsize || + sp->sum_bytes_left < sizeof(daddr_t)) + (void) lfs_writeseg(fs, sp); + + /* Get next inode block. */ + daddr = fs->lfs_offset; + fs->lfs_offset += fsbtodb(fs, 1); + sp->ibp = *sp->cbpp++ = + lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, daddr, + fs->lfs_bsize); + /* Zero out inode numbers */ + for (i = 0; i < INOPB(fs); ++i) + ((struct dinode *)sp->ibp->b_data)[i].di_inumber = 0; + ++sp->start_bpp; + fs->lfs_avail -= fsbtodb(fs, 1); + /* Set remaining space counters. 
*/ + sp->seg_bytes_left -= fs->lfs_bsize; + sp->sum_bytes_left -= sizeof(daddr_t); + ndx = LFS_SUMMARY_SIZE / sizeof(daddr_t) - + sp->ninodes / INOPB(fs) - 1; + ((daddr_t *)(sp->segsum))[ndx] = daddr; + } + + /* Update the inode times and copy the inode onto the inode page. */ + if (ip->i_flag & IN_MODIFIED) + --fs->lfs_uinodes; + ITIMES(ip, &time, &time); + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); + bp = sp->ibp; + ((struct dinode *)bp->b_data)[sp->ninodes % INOPB(fs)] = ip->i_din; + /* Increment inode count in segment summary block. */ + ++((SEGSUM *)(sp->segsum))->ss_ninos; + + /* If this page is full, set flag to allocate a new page. */ + if (++sp->ninodes % INOPB(fs) == 0) + sp->ibp = NULL; + + /* + * If updating the ifile, update the super-block. Update the disk + * address and access times for this inode in the ifile. + */ + ino = ip->i_number; + if (ino == LFS_IFILE_INUM) { + daddr = fs->lfs_idaddr; + fs->lfs_idaddr = bp->b_blkno; + } else { + LFS_IENTRY(ifp, fs, ino, ibp); + daddr = ifp->if_daddr; + ifp->if_daddr = bp->b_blkno; + error = VOP_BWRITE(ibp); + } + + /* + * No need to update segment usage if there was no former inode address + * or if the last inode address is in the current partial segment. + */ + if (daddr != LFS_UNUSED_DADDR && + !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) { + LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes < sizeof(struct dinode)) { + /* XXX -- Change to a panic. */ + printf("lfs: negative bytes (segment %d)\n", + datosn(fs, daddr)); + panic("negative bytes"); + } +#endif + sup->su_nbytes -= sizeof(struct dinode); + redo_ifile = + (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED)); + error = VOP_BWRITE(bp); + } + return (redo_ifile); +} + +int +lfs_gatherblock(sp, bp, sptr) + struct segment *sp; + struct buf *bp; + int *sptr; +{ + struct lfs *fs; + int version; + + /* + * If full, finish this segment. We may be doing I/O, so + * release and reacquire the splbio(). + */ +#ifdef DIAGNOSTIC + if (sp->vp == NULL) + panic ("lfs_gatherblock: Null vp in segment"); +#endif + fs = sp->fs; + if (sp->sum_bytes_left < sizeof(daddr_t) || + sp->seg_bytes_left < fs->lfs_bsize) { + if (sptr) + splx(*sptr); + lfs_updatemeta(sp); + + version = sp->fip->fi_version; + (void) lfs_writeseg(fs, sp); + + sp->fip->fi_version = version; + sp->fip->fi_ino = VTOI(sp->vp)->i_number; + /* Add the current file to the segment summary. */ + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + sp->sum_bytes_left -= + sizeof(struct finfo) - sizeof(daddr_t); + + if (sptr) + *sptr = splbio(); + return(1); + } + + /* Insert into the buffer list, update the FINFO block. 
*/ + bp->b_flags |= B_GATHERED; + *sp->cbpp++ = bp; + sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno; + + sp->sum_bytes_left -= sizeof(daddr_t); + sp->seg_bytes_left -= fs->lfs_bsize; + return(0); +} + +void +lfs_gather(fs, sp, vp, match) + struct lfs *fs; + struct segment *sp; + struct vnode *vp; + int (*match) __P((struct lfs *, struct buf *)); +{ + struct buf *bp; + int s; + + sp->vp = vp; + s = splbio(); +loop: for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) { + if (bp->b_flags & B_BUSY || !match(fs, bp) || + bp->b_flags & B_GATHERED) + continue; +#ifdef DIAGNOSTIC + if (!(bp->b_flags & B_DELWRI)) + panic("lfs_gather: bp not B_DELWRI"); + if (!(bp->b_flags & B_LOCKED)) + panic("lfs_gather: bp not B_LOCKED"); +#endif + if (lfs_gatherblock(sp, bp, &s)) + goto loop; + } + splx(s); + lfs_updatemeta(sp); + sp->vp = NULL; +} + + +/* + * Update the metadata that points to the blocks listed in the FINFO + * array. + */ +void +lfs_updatemeta(sp) + struct segment *sp; +{ + SEGUSE *sup; + struct buf *bp; + struct lfs *fs; + struct vnode *vp; + struct indir a[NIADDR + 2], *ap; + struct inode *ip; + daddr_t daddr, lbn, off; + int db_per_fsb, error, i, nblocks, num; + + vp = sp->vp; + nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp; + if (vp == NULL || nblocks == 0) + return; + + /* Sort the blocks. */ + if (!(sp->seg_flags & SEGM_CLEAN)) + lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks); + + /* + * Assign disk addresses, and update references to the logical + * block and the segment usage information. + */ + fs = sp->fs; + db_per_fsb = fsbtodb(fs, 1); + for (i = nblocks; i--; ++sp->start_bpp) { + lbn = *sp->start_lbp++; + (*sp->start_bpp)->b_blkno = off = fs->lfs_offset; + fs->lfs_offset += db_per_fsb; + + if (error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL)) + panic("lfs_updatemeta: ufs_bmaparray %d", error); + ip = VTOI(vp); + switch (num) { + case 0: + ip->i_db[lbn] = off; + break; + case 1: + ip->i_ib[a[0].in_off] = off; + break; + default: + ap = &a[num - 1]; + if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp)) + panic("lfs_updatemeta: bread bno %d", + ap->in_lbn); + /* + * Bread may create a new indirect block which needs + * to get counted for the inode. + */ + if (bp->b_blkno == -1 && !(bp->b_flags & B_CACHE)) { +printf ("Updatemeta allocating indirect block: shouldn't happen\n"); + ip->i_blocks += btodb(fs->lfs_bsize); + fs->lfs_bfree -= btodb(fs->lfs_bsize); + } + ((daddr_t *)bp->b_data)[ap->in_off] = off; + VOP_BWRITE(bp); + } + + /* Update segment usage information. */ + if (daddr != UNASSIGNED && + !(daddr >= fs->lfs_lastpseg && daddr <= off)) { + LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes < fs->lfs_bsize) { + /* XXX -- Change to a panic. */ + printf("lfs: negative bytes (segment %d)\n", + datosn(fs, daddr)); + panic ("Negative Bytes"); + } +#endif + sup->su_nbytes -= fs->lfs_bsize; + error = VOP_BWRITE(bp); + } + } +} + +/* + * Start a new segment. + */ +int +lfs_initseg(fs) + struct lfs *fs; +{ + struct segment *sp; + SEGUSE *sup; + SEGSUM *ssp; + struct buf *bp; + int repeat; + + sp = fs->lfs_sp; + + repeat = 0; + /* Advance to the next segment. */ + if (!LFS_PARTIAL_FITS(fs)) { + /* Wake up any cleaning procs waiting on this file system. 
*/ + wakeup(&lfs_allclean_wakeup); + + lfs_newseg(fs); + repeat = 1; + fs->lfs_offset = fs->lfs_curseg; + sp->seg_number = datosn(fs, fs->lfs_curseg); + sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE; + + /* + * If the segment contains a superblock, update the offset + * and summary address to skip over it. + */ + LFS_SEGENTRY(sup, fs, sp->seg_number, bp); + if (sup->su_flags & SEGUSE_SUPERBLOCK) { + fs->lfs_offset += LFS_SBPAD / DEV_BSIZE; + sp->seg_bytes_left -= LFS_SBPAD; + } + brelse(bp); + } else { + sp->seg_number = datosn(fs, fs->lfs_curseg); + sp->seg_bytes_left = (fs->lfs_dbpseg - + (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE; + } + fs->lfs_lastpseg = fs->lfs_offset; + + sp->fs = fs; + sp->ibp = NULL; + sp->ninodes = 0; + + /* Get a new buffer for SEGSUM and enter it into the buffer list. */ + sp->cbpp = sp->bpp; + *sp->cbpp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_offset, + LFS_SUMMARY_SIZE); + sp->segsum = (*sp->cbpp)->b_data; + bzero(sp->segsum, LFS_SUMMARY_SIZE); + sp->start_bpp = ++sp->cbpp; + fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE; + + /* Set point to SEGSUM, initialize it. */ + ssp = sp->segsum; + ssp->ss_next = fs->lfs_nextseg; + ssp->ss_nfinfo = ssp->ss_ninos = 0; + + /* Set pointer to first FINFO, initialize it. */ + sp->fip = (struct finfo *)(sp->segsum + sizeof(SEGSUM)); + sp->fip->fi_nblocks = 0; + sp->start_lbp = &sp->fip->fi_blocks[0]; + + sp->seg_bytes_left -= LFS_SUMMARY_SIZE; + sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM); + + return(repeat); +} + +/* + * Return the next segment to write. + */ +void +lfs_newseg(fs) + struct lfs *fs; +{ + CLEANERINFO *cip; + SEGUSE *sup; + struct buf *bp; + int curseg, isdirty, sn; + + LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp); + sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; + sup->su_nbytes = 0; + sup->su_nsums = 0; + sup->su_ninos = 0; + (void) VOP_BWRITE(bp); + + LFS_CLEANERINFO(cip, fs, bp); + --cip->clean; + ++cip->dirty; + (void) VOP_BWRITE(bp); + + fs->lfs_lastseg = fs->lfs_curseg; + fs->lfs_curseg = fs->lfs_nextseg; + for (sn = curseg = datosn(fs, fs->lfs_curseg);;) { + sn = (sn + 1) % fs->lfs_nseg; + if (sn == curseg) + panic("lfs_nextseg: no clean segments"); + LFS_SEGENTRY(sup, fs, sn, bp); + isdirty = sup->su_flags & SEGUSE_DIRTY; + brelse(bp); + if (!isdirty) + break; + } + + ++fs->lfs_nactive; + fs->lfs_nextseg = sntoda(fs, sn); +#ifdef DOSTATS + ++lfs_stats.segsused; +#endif +} + +int +lfs_writeseg(fs, sp) + struct lfs *fs; + struct segment *sp; +{ + extern int locked_queue_count; + struct buf **bpp, *bp, *cbp; + SEGUSE *sup; + SEGSUM *ssp; + dev_t i_dev; + size_t size; + u_long *datap, *dp; + int ch_per_blk, do_again, i, nblocks, num, s; + int (*strategy)__P((struct vop_strategy_args *)); + struct vop_strategy_args vop_strategy_a; + u_short ninos; + char *p; + + /* + * If there are no buffers other than the segment summary to write + * and it is not a checkpoint, don't do anything. On a checkpoint, + * even if there aren't any buffers, you need to write the superblock. + */ + if ((nblocks = sp->cbpp - sp->bpp) == 1) + return (0); + + ssp = (SEGSUM *)sp->segsum; + + /* Update the segment usage information. 
*/ + LFS_SEGENTRY(sup, fs, sp->seg_number, bp); + ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs); + sup->su_nbytes += nblocks - 1 - ninos << fs->lfs_bshift; + sup->su_nbytes += ssp->ss_ninos * sizeof(struct dinode); + sup->su_nbytes += LFS_SUMMARY_SIZE; + sup->su_lastmod = time.tv_sec; + sup->su_ninos += ninos; + ++sup->su_nsums; + do_again = !(bp->b_flags & B_GATHERED); + (void)VOP_BWRITE(bp); + /* + * Compute checksum across data and then across summary; the first + * block (the summary block) is skipped. Set the create time here + * so that it's guaranteed to be later than the inode mod times. + * + * XXX + * Fix this to do it inline, instead of malloc/copy. + */ + datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK); + for (bpp = sp->bpp, i = nblocks - 1; i--;) { + if ((*++bpp)->b_flags & B_INVAL) { + if (copyin((*bpp)->b_saveaddr, dp++, sizeof(u_long))) + panic("lfs_writeseg: copyin failed"); + } else + *dp++ = ((u_long *)(*bpp)->b_data)[0]; + } + ssp->ss_create = time.tv_sec; + ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long)); + ssp->ss_sumsum = + cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum)); + free(datap, M_SEGMENT); +#ifdef DIAGNOSTIC + if (fs->lfs_bfree < fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE) + panic("lfs_writeseg: No diskspace for summary"); +#endif + fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE); + + i_dev = VTOI(fs->lfs_ivnode)->i_dev; + strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)]; + + /* + * When we simply write the blocks we lose a rotation for every block + * written. To avoid this problem, we allocate memory in chunks, copy + * the buffers into the chunk and write the chunk. MAXPHYS is the + * largest size I/O devices can handle. + * When the data is copied to the chunk, turn off the the B_LOCKED bit + * and brelse the buffer (which will move them to the LRU list). Add + * the B_CALL flag to the buffer header so we can count I/O's for the + * checkpoints and so we can release the allocated memory. + * + * XXX + * This should be removed if the new virtual memory system allows us to + * easily make the buffers contiguous in kernel memory and if that's + * fast enough. + */ + ch_per_blk = MAXPHYS / fs->lfs_bsize; + for (bpp = sp->bpp, i = nblocks; i;) { + num = ch_per_blk; + if (num > i) + num = i; + i -= num; + size = num * fs->lfs_bsize; + + cbp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, + (*bpp)->b_blkno, size); + cbp->b_dev = i_dev; + cbp->b_flags |= B_ASYNC | B_BUSY; + + s = splbio(); + ++fs->lfs_iocount; + for (p = cbp->b_data; num--;) { + bp = *bpp++; + /* + * Fake buffers from the cleaner are marked as B_INVAL. + * We need to copy the data from user space rather than + * from the buffer indicated. + * XXX == what do I do on an error? + */ + if (bp->b_flags & B_INVAL) { + if (copyin(bp->b_saveaddr, p, bp->b_bcount)) + panic("lfs_writeseg: copyin failed"); + } else + bcopy(bp->b_data, p, bp->b_bcount); + p += bp->b_bcount; + if (bp->b_flags & B_LOCKED) + --locked_queue_count; + bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI | + B_LOCKED | B_GATHERED); + if (bp->b_flags & B_CALL) { + /* if B_CALL, it was created with newbuf */ + brelvp(bp); + if (!(bp->b_flags & B_INVAL)) + free(bp->b_data, M_SEGMENT); + free(bp, M_SEGMENT); + } else { + bremfree(bp); + bp->b_flags |= B_DONE; + reassignbuf(bp, bp->b_vp); + brelse(bp); + } + } + ++cbp->b_vp->v_numoutput; + splx(s); + cbp->b_bcount = p - (char *)cbp->b_data; + /* + * XXXX This is a gross and disgusting hack. 
Since these + * buffers are physically addressed, they hang off the + * device vnode (devvp). As a result, they have no way + * of getting to the LFS superblock or lfs structure to + * keep track of the number of I/O's pending. So, I am + * going to stuff the fs into the saveaddr field of + * the buffer (yuk). + */ + cbp->b_saveaddr = (caddr_t)fs; + vop_strategy_a.a_desc = VDESC(vop_strategy); + vop_strategy_a.a_bp = cbp; + (strategy)(&vop_strategy_a); + } + /* + * XXX + * Vinvalbuf can move locked buffers off the locked queue + * and we have no way of knowing about this. So, after + * doing a big write, we recalculate how many bufers are + * really still left on the locked queue. + */ + locked_queue_count = count_lock_queue(); + wakeup(&locked_queue_count); +#ifdef DOSTATS + ++lfs_stats.psegwrites; + lfs_stats.blocktot += nblocks - 1; + if (fs->lfs_sp->seg_flags & SEGM_SYNC) + ++lfs_stats.psyncwrites; + if (fs->lfs_sp->seg_flags & SEGM_CLEAN) { + ++lfs_stats.pcleanwrites; + lfs_stats.cleanblocks += nblocks - 1; + } +#endif + return (lfs_initseg(fs) || do_again); +} + +void +lfs_writesuper(fs) + struct lfs *fs; +{ + struct buf *bp; + dev_t i_dev; + int (*strategy) __P((struct vop_strategy_args *)); + int s; + struct vop_strategy_args vop_strategy_a; + + i_dev = VTOI(fs->lfs_ivnode)->i_dev; + strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)]; + + /* Checksum the superblock and copy it into a buffer. */ + fs->lfs_cksum = cksum(fs, sizeof(struct lfs) - sizeof(fs->lfs_cksum)); + bp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_sboffs[0], + LFS_SBPAD); + *(struct lfs *)bp->b_data = *fs; + + /* XXX Toggle between first two superblocks; for now just write first */ + bp->b_dev = i_dev; + bp->b_flags |= B_BUSY | B_CALL | B_ASYNC; + bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI); + bp->b_iodone = lfs_supercallback; + vop_strategy_a.a_desc = VDESC(vop_strategy); + vop_strategy_a.a_bp = bp; + s = splbio(); + ++bp->b_vp->v_numoutput; + splx(s); + (strategy)(&vop_strategy_a); +} + +/* + * Logical block number match routines used when traversing the dirty block + * chain. + */ +int +lfs_match_data(fs, bp) + struct lfs *fs; + struct buf *bp; +{ + return (bp->b_lblkno >= 0); +} + +int +lfs_match_indir(fs, bp) + struct lfs *fs; + struct buf *bp; +{ + int lbn; + + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0); +} + +int +lfs_match_dindir(fs, bp) + struct lfs *fs; + struct buf *bp; +{ + int lbn; + + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1); +} + +int +lfs_match_tindir(fs, bp) + struct lfs *fs; + struct buf *bp; +{ + int lbn; + + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2); +} + +/* + * Allocate a new buffer header. 
+ */ +struct buf * +lfs_newbuf(vp, daddr, size) + struct vnode *vp; + daddr_t daddr; + size_t size; +{ + struct buf *bp; + size_t nbytes; + + nbytes = roundup(size, DEV_BSIZE); + bp = malloc(sizeof(struct buf), M_SEGMENT, M_WAITOK); + bzero(bp, sizeof(struct buf)); + if (nbytes) + bp->b_data = malloc(nbytes, M_SEGMENT, M_WAITOK); + bgetvp(vp, bp); + bp->b_bufsize = size; + bp->b_bcount = size; + bp->b_lblkno = daddr; + bp->b_blkno = daddr; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_iodone = lfs_callback; + bp->b_flags |= B_BUSY | B_CALL | B_NOCACHE; + return (bp); +} + +void +lfs_callback(bp) + struct buf *bp; +{ + struct lfs *fs; + + fs = (struct lfs *)bp->b_saveaddr; +#ifdef DIAGNOSTIC + if (fs->lfs_iocount == 0) + panic("lfs_callback: zero iocount\n"); +#endif + if (--fs->lfs_iocount == 0) + wakeup(&fs->lfs_iocount); + + brelvp(bp); + free(bp->b_data, M_SEGMENT); + free(bp, M_SEGMENT); +} + +void +lfs_supercallback(bp) + struct buf *bp; +{ + brelvp(bp); + free(bp->b_data, M_SEGMENT); + free(bp, M_SEGMENT); +} + +/* + * Shellsort (diminishing increment sort) from Data Structures and + * Algorithms, Aho, Hopcraft and Ullman, 1983 Edition, page 290; + * see also Knuth Vol. 3, page 84. The increments are selected from + * formula (8), page 95. Roughly O(N^3/2). + */ +/* + * This is our own private copy of shellsort because we want to sort + * two parallel arrays (the array of buffer pointers and the array of + * logical block numbers) simultaneously. Note that we cast the array + * of logical block numbers to a unsigned in this routine so that the + * negative block numbers (meta data blocks) sort AFTER the data blocks. + */ +void +lfs_shellsort(bp_array, lb_array, nmemb) + struct buf **bp_array; + daddr_t *lb_array; + register int nmemb; +{ + static int __rsshell_increments[] = { 4, 1, 0 }; + register int incr, *incrp, t1, t2; + struct buf *bp_temp; + u_long lb_temp; + + for (incrp = __rsshell_increments; incr = *incrp++;) + for (t1 = incr; t1 < nmemb; ++t1) + for (t2 = t1 - incr; t2 >= 0;) + if (lb_array[t2] > lb_array[t2 + incr]) { + lb_temp = lb_array[t2]; + lb_array[t2] = lb_array[t2 + incr]; + lb_array[t2 + incr] = lb_temp; + bp_temp = bp_array[t2]; + bp_array[t2] = bp_array[t2 + incr]; + bp_array[t2 + incr] = bp_temp; + t2 -= incr; + } else + break; +} + +/* + * Check VXLOCK. Return 1 if the vnode is locked. Otherwise, vget it. + */ +lfs_vref(vp) + register struct vnode *vp; +{ + + if (vp->v_flag & VXLOCK) + return(1); + return (vget(vp, 0)); +} + +void +lfs_vunref(vp) + register struct vnode *vp; +{ + extern int lfs_no_inactive; + + /* + * This is vrele except that we do not want to VOP_INACTIVE + * this vnode. Rather than inline vrele here, we use a global + * flag to tell lfs_inactive not to run. Yes, its gross. + */ + lfs_no_inactive = 1; + vrele(vp); + lfs_no_inactive = 0; +} diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c new file mode 100644 index 0000000..afcd8c2 --- /dev/null +++ b/sys/ufs/lfs/lfs_subr.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_subr.c 8.2 (Berkeley) 9/21/93 + */ + +#include <sys/param.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/buf.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/proc.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +/* + * Return buffer with the contents of block "offset" from the beginning of + * directory "ip". If "res" is non-zero, fill it in with a pointer to the + * remaining space in the directory. + */ +int +lfs_blkatoff(ap) + struct vop_blkatoff_args /* { + struct vnode *a_vp; + off_t a_offset; + char **a_res; + struct buf **a_bpp; + } */ *ap; +{ + register struct lfs *fs; + struct inode *ip; + struct buf *bp; + daddr_t lbn; + int bsize, error; + + ip = VTOI(ap->a_vp); + fs = ip->i_lfs; + lbn = lblkno(fs, ap->a_offset); + bsize = blksize(fs); + + *ap->a_bpp = NULL; + if (error = bread(ap->a_vp, lbn, bsize, NOCRED, &bp)) { + brelse(bp); + return (error); + } + if (ap->a_res) + *ap->a_res = (char *)bp->b_data + blkoff(fs, ap->a_offset); + *ap->a_bpp = bp; + return (0); +} + + +/* + * lfs_seglock -- + * Single thread the segment writer. + */ +void +lfs_seglock(fs, flags) + struct lfs *fs; + unsigned long flags; +{ + struct segment *sp; + int s; + + if (fs->lfs_seglock) + if (fs->lfs_lockpid == curproc->p_pid) { + ++fs->lfs_seglock; + fs->lfs_sp->seg_flags |= flags; + return; + } else while (fs->lfs_seglock) + (void)tsleep(&fs->lfs_seglock, PRIBIO + 1, + "lfs seglock", 0); + + fs->lfs_seglock = 1; + fs->lfs_lockpid = curproc->p_pid; + + sp = fs->lfs_sp = malloc(sizeof(struct segment), M_SEGMENT, M_WAITOK); + sp->bpp = malloc(((LFS_SUMMARY_SIZE - sizeof(SEGSUM)) / + sizeof(daddr_t) + 1) * sizeof(struct buf *), M_SEGMENT, M_WAITOK); + sp->seg_flags = flags; + sp->vp = NULL; + (void) lfs_initseg(fs); + + /* + * Keep a cumulative count of the outstanding I/O operations. If the + * disk drive catches up with us it could go to zero before we finish, + * so we artificially increment it by one until we've scheduled all of + * the writes we intend to do. 
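+ *
+ * (Summary for the reader: lfs_seglock() takes this artificial hold,
+ * each completed write drops one count via lfs_callback() in
+ * lfs_segment.c, and lfs_segunlock() below releases the hold, sleeps
+ * on lfs_iocount for SEGM_SYNC segments until the writes drain, and
+ * only then writes the superblock when this was a checkpoint.)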
+ */ + s = splbio(); + ++fs->lfs_iocount; + splx(s); +} +/* + * lfs_segunlock -- + * Single thread the segment writer. + */ +void +lfs_segunlock(fs) + struct lfs *fs; +{ + struct segment *sp; + unsigned long sync, ckp; + int s; + + if (fs->lfs_seglock == 1) { + + sp = fs->lfs_sp; + sync = sp->seg_flags & SEGM_SYNC; + ckp = sp->seg_flags & SEGM_CKP; + if (sp->bpp != sp->cbpp) { + /* Free allocated segment summary */ + fs->lfs_offset -= LFS_SUMMARY_SIZE / DEV_BSIZE; + brelvp(*sp->bpp); + free((*sp->bpp)->b_data, M_SEGMENT); + free(*sp->bpp, M_SEGMENT); + } else + printf ("unlock to 0 with no summary"); + free(sp->bpp, M_SEGMENT); + free(sp, M_SEGMENT); + + /* + * If the I/O count is non-zero, sleep until it reaches zero. + * At the moment, the user's process hangs around so we can + * sleep. + */ + s = splbio(); + --fs->lfs_iocount; + /* + * We let checkpoints happen asynchronously. That means + * that during recovery, we have to roll forward between + * the two segments described by the first and second + * superblocks to make sure that the checkpoint described + * by a superblock completed. + */ + if (sync && fs->lfs_iocount) + (void)tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs vflush", 0); + splx(s); + if (ckp) { + fs->lfs_nactive = 0; + lfs_writesuper(fs); + } + --fs->lfs_seglock; + fs->lfs_lockpid = 0; + wakeup(&fs->lfs_seglock); + } else if (fs->lfs_seglock == 0) { + panic ("Seglock not held"); + } else { + --fs->lfs_seglock; + } +} diff --git a/sys/ufs/lfs/lfs_syscalls.c b/sys/ufs/lfs/lfs_syscalls.c new file mode 100644 index 0000000..666595e --- /dev/null +++ b/sys/ufs/lfs/lfs_syscalls.c @@ -0,0 +1,562 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_syscalls.c 8.5 (Berkeley) 4/20/94 + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/kernel.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> +#define BUMP_FIP(SP) \ + (SP)->fip = (FINFO *) (&(SP)->fip->fi_blocks[(SP)->fip->fi_nblocks]) + +#define INC_FINFO(SP) ++((SEGSUM *)((SP)->segsum))->ss_nfinfo +#define DEC_FINFO(SP) --((SEGSUM *)((SP)->segsum))->ss_nfinfo + +/* + * Before committing to add something to a segment summary, make sure there + * is enough room. S is the bytes added to the summary. + */ +#define CHECK_SEG(s) \ +if (sp->sum_bytes_left < (s)) { \ + (void) lfs_writeseg(fs, sp); \ +} +struct buf *lfs_fakebuf __P((struct vnode *, int, size_t, caddr_t)); + +/* + * lfs_markv: + * + * This will mark inodes and blocks dirty, so they are written into the log. + * It will block until all the blocks have been written. The segment create + * time passed in the block_info and inode_info structures is used to decide + * if the data is valid for each block (in case some process dirtied a block + * or inode that is being cleaned between the determination that a block is + * live and the lfs_markv call). + * + * 0 on success + * -1/errno is return on error. + */ +struct lfs_markv_args { + fsid_t *fsidp; /* file system */ + BLOCK_INFO *blkiov; /* block array */ + int blkcnt; /* count of block array entries */ +}; +int +lfs_markv(p, uap, retval) + struct proc *p; + struct lfs_markv_args *uap; + int *retval; +{ + struct segment *sp; + BLOCK_INFO *blkp; + IFILE *ifp; + struct buf *bp, **bpp; + struct inode *ip; + struct lfs *fs; + struct mount *mntp; + struct vnode *vp; + fsid_t fsid; + void *start; + ino_t lastino; + daddr_t b_daddr, v_daddr; + u_long bsize; + int cnt, error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + + if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t))) + return (error); + if ((mntp = getvfs(&fsid)) == NULL) + return (EINVAL); + + cnt = uap->blkcnt; + start = malloc(cnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK); + if (error = copyin(uap->blkiov, start, cnt * sizeof(BLOCK_INFO))) + goto err1; + + /* Mark blocks/inodes dirty. */ + fs = VFSTOUFS(mntp)->um_lfs; + bsize = fs->lfs_bsize; + error = 0; + + lfs_seglock(fs, SEGM_SYNC | SEGM_CLEAN); + sp = fs->lfs_sp; + for (v_daddr = LFS_UNUSED_DADDR, lastino = LFS_UNUSED_INUM, + blkp = start; cnt--; ++blkp) { + /* + * Get the IFILE entry (only once) and see if the file still + * exists. 
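+ *
+ * (That is: the IFILE entry gives the inode's current disk address; if
+ * it reads LFS_UNUSED_DADDR the file has since been removed and the
+ * cleaner's blocks for it are skipped. Individual blocks are likewise
+ * dropped further down when VOP_BMAP no longer returns the bi_daddr
+ * the cleaner recorded, i.e. the block has already been rewritten.)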
+ */ + if (lastino != blkp->bi_inode) { + if (lastino != LFS_UNUSED_INUM) { + /* Finish up last file */ + if (sp->fip->fi_nblocks == 0) { + DEC_FINFO(sp); + sp->sum_bytes_left += + sizeof(FINFO) - sizeof(daddr_t); + } else { + lfs_updatemeta(sp); + BUMP_FIP(sp); + } + + lfs_writeinode(fs, sp, ip); + lfs_vunref(vp); + } + + /* Start a new file */ + CHECK_SEG(sizeof(FINFO)); + sp->sum_bytes_left -= sizeof(FINFO) - sizeof(daddr_t); + INC_FINFO(sp); + sp->start_lbp = &sp->fip->fi_blocks[0]; + sp->vp = NULL; + sp->fip->fi_version = blkp->bi_version; + sp->fip->fi_nblocks = 0; + sp->fip->fi_ino = blkp->bi_inode; + lastino = blkp->bi_inode; + if (blkp->bi_inode == LFS_IFILE_INUM) + v_daddr = fs->lfs_idaddr; + else { + LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); + v_daddr = ifp->if_daddr; + brelse(bp); + } + if (v_daddr == LFS_UNUSED_DADDR) + continue; + + /* Get the vnode/inode. */ + if (lfs_fastvget(mntp, blkp->bi_inode, v_daddr, &vp, + blkp->bi_lbn == LFS_UNUSED_LBN ? + blkp->bi_bp : NULL)) { +#ifdef DIAGNOSTIC + printf("lfs_markv: VFS_VGET failed (%d)\n", + blkp->bi_inode); +#endif + lastino = LFS_UNUSED_INUM; + v_daddr = LFS_UNUSED_DADDR; + continue; + } + sp->vp = vp; + ip = VTOI(vp); + } else if (v_daddr == LFS_UNUSED_DADDR) + continue; + + /* If this BLOCK_INFO didn't contain a block, keep going. */ + if (blkp->bi_lbn == LFS_UNUSED_LBN) + continue; + if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) || + b_daddr != blkp->bi_daddr) + continue; + /* + * If we got to here, then we are keeping the block. If it + * is an indirect block, we want to actually put it in the + * buffer cache so that it can be updated in the finish_meta + * section. If it's not, we need to allocate a fake buffer + * so that writeseg can perform the copyin and write the buffer. + */ + if (blkp->bi_lbn >= 0) /* Data Block */ + bp = lfs_fakebuf(vp, blkp->bi_lbn, bsize, + blkp->bi_bp); + else { + bp = getblk(vp, blkp->bi_lbn, bsize, 0, 0); + if (!(bp->b_flags & (B_DELWRI | B_DONE | B_CACHE)) && + (error = copyin(blkp->bi_bp, bp->b_data, + bsize))) + goto err2; + if (error = VOP_BWRITE(bp)) + goto err2; + } + while (lfs_gatherblock(sp, bp, NULL)); + } + if (sp->vp) { + if (sp->fip->fi_nblocks == 0) { + DEC_FINFO(sp); + sp->sum_bytes_left += + sizeof(FINFO) - sizeof(daddr_t); + } else + lfs_updatemeta(sp); + + lfs_writeinode(fs, sp, ip); + lfs_vunref(vp); + } + (void) lfs_writeseg(fs, sp); + lfs_segunlock(fs); + free(start, M_SEGMENT); + return (error); + +/* + * XXX + * If we come in to error 2, we might have indirect blocks that were + * updated and now have bad block pointers. I don't know what to do + * about this. + */ + +err2: lfs_vunref(vp); + /* Free up fakebuffers */ + for (bpp = --sp->cbpp; bpp >= sp->bpp; --bpp) + if ((*bpp)->b_flags & B_CALL) { + brelvp(*bpp); + free(*bpp, M_SEGMENT); + } else + brelse(*bpp); + lfs_segunlock(fs); +err1: + free(start, M_SEGMENT); + return (error); +} + +/* + * lfs_bmapv: + * + * This will fill in the current disk address for arrays of blocks. + * + * 0 on success + * -1/errno is return on error. 
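+ *
+ * A rough caller sketch (illustrative only, not code from this file;
+ * the user-level lfs_bmapv wrapper and the fsid variable are assumed):
+ *
+ *	BLOCK_INFO bi[NB];
+ *	for (i = 0; i < NB; i++) {
+ *		bi[i].bi_inode = ino;
+ *		bi[i].bi_lbn = lbns[i];
+ *	}
+ *	if (lfs_bmapv(&fsid, bi, NB) == 0)
+ *		...each bi[i].bi_daddr now holds the current disk address,
+ *		or LFS_UNUSED_DADDR if the block could not be mapped.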
+ */ +struct lfs_bmapv_args { + fsid_t *fsidp; /* file system */ + BLOCK_INFO *blkiov; /* block array */ + int blkcnt; /* count of block array entries */ +}; +int +lfs_bmapv(p, uap, retval) + struct proc *p; + struct lfs_bmapv_args *uap; + int *retval; +{ + BLOCK_INFO *blkp; + struct mount *mntp; + struct vnode *vp; + fsid_t fsid; + void *start; + daddr_t daddr; + int cnt, error, step; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + + if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t))) + return (error); + if ((mntp = getvfs(&fsid)) == NULL) + return (EINVAL); + + cnt = uap->blkcnt; + start = blkp = malloc(cnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK); + if (error = copyin(uap->blkiov, blkp, cnt * sizeof(BLOCK_INFO))) { + free(blkp, M_SEGMENT); + return (error); + } + + for (step = cnt; step--; ++blkp) { + if (blkp->bi_lbn == LFS_UNUSED_LBN) + continue; + /* Could be a deadlock ? */ + if (VFS_VGET(mntp, blkp->bi_inode, &vp)) + daddr = LFS_UNUSED_DADDR; + else { + if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &daddr, NULL)) + daddr = LFS_UNUSED_DADDR; + vput(vp); + } + blkp->bi_daddr = daddr; + } + copyout(start, uap->blkiov, cnt * sizeof(BLOCK_INFO)); + free(start, M_SEGMENT); + return (0); +} + +/* + * lfs_segclean: + * + * Mark the segment clean. + * + * 0 on success + * -1/errno is return on error. + */ +struct lfs_segclean_args { + fsid_t *fsidp; /* file system */ + u_long segment; /* segment number */ +}; +int +lfs_segclean(p, uap, retval) + struct proc *p; + struct lfs_segclean_args *uap; + int *retval; +{ + CLEANERINFO *cip; + SEGUSE *sup; + struct buf *bp; + struct mount *mntp; + struct lfs *fs; + fsid_t fsid; + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + + if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t))) + return (error); + if ((mntp = getvfs(&fsid)) == NULL) + return (EINVAL); + + fs = VFSTOUFS(mntp)->um_lfs; + + if (datosn(fs, fs->lfs_curseg) == uap->segment) + return (EBUSY); + + LFS_SEGENTRY(sup, fs, uap->segment, bp); + if (sup->su_flags & SEGUSE_ACTIVE) { + brelse(bp); + return (EBUSY); + } + fs->lfs_avail += fsbtodb(fs, fs->lfs_ssize) - 1; + fs->lfs_bfree += (sup->su_nsums * LFS_SUMMARY_SIZE / DEV_BSIZE) + + sup->su_ninos * btodb(fs->lfs_bsize); + sup->su_flags &= ~SEGUSE_DIRTY; + (void) VOP_BWRITE(bp); + + LFS_CLEANERINFO(cip, fs, bp); + ++cip->clean; + --cip->dirty; + (void) VOP_BWRITE(bp); + wakeup(&fs->lfs_avail); + return (0); +} + +/* + * lfs_segwait: + * + * This will block until a segment in file system fsid is written. A timeout + * in milliseconds may be specified which will awake the cleaner automatically. + * An fsid of -1 means any file system, and a timeout of 0 means forever. + * + * 0 on success + * 1 on timeout + * -1/errno is return on error. 
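+ *
+ * Typical use (an illustrative sketch, not code from this file): the
+ * cleaner blocks here between passes, for example
+ *
+ *	struct timeval tv;
+ *	tv.tv_sec = 5 * 60;	(a five-minute safety timeout)
+ *	tv.tv_usec = 0;
+ *	lfs_segwait(&fsid, &tv);
+ *
+ * and rescans the segment usage table when the call returns.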
+ */ +struct lfs_segwait_args { + fsid_t *fsidp; /* file system */ + struct timeval *tv; /* timeout */ +}; +int +lfs_segwait(p, uap, retval) + struct proc *p; + struct lfs_segwait_args *uap; + int *retval; +{ + extern int lfs_allclean_wakeup; + struct mount *mntp; + struct timeval atv; + fsid_t fsid; + void *addr; + u_long timeout; + int error, s; + + if (error = suser(p->p_ucred, &p->p_acflag)) { + return (error); +} +#ifdef WHEN_QUADS_WORK + if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t))) + return (error); + if (fsid == (fsid_t)-1) + addr = &lfs_allclean_wakeup; + else { + if ((mntp = getvfs(&fsid)) == NULL) + return (EINVAL); + addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg; + } +#else + if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t))) + return (error); + if ((mntp = getvfs(&fsid)) == NULL) + addr = &lfs_allclean_wakeup; + else + addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg; +#endif + + if (uap->tv) { + if (error = copyin(uap->tv, &atv, sizeof(struct timeval))) + return (error); + if (itimerfix(&atv)) + return (EINVAL); + s = splclock(); + timevaladd(&atv, (struct timeval *)&time); + timeout = hzto(&atv); + splx(s); + } else + timeout = 0; + + error = tsleep(addr, PCATCH | PUSER, "segment", timeout); + return (error == ERESTART ? EINTR : 0); +} + +/* + * VFS_VGET call specialized for the cleaner. The cleaner already knows the + * daddr from the ifile, so don't look it up again. If the cleaner is + * processing IINFO structures, it may have the ondisk inode already, so + * don't go retrieving it again. + */ +int +lfs_fastvget(mp, ino, daddr, vpp, dinp) + struct mount *mp; + ino_t ino; + daddr_t daddr; + struct vnode **vpp; + struct dinode *dinp; +{ + register struct inode *ip; + struct vnode *vp; + struct ufsmount *ump; + struct buf *bp; + dev_t dev; + int error; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + /* + * This is playing fast and loose. Someone may have the inode + * locked, in which case they are going to be distinctly unhappy + * if we trash something. + */ + if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) { + lfs_vref(*vpp); + if ((*vpp)->v_flag & VXLOCK) + printf ("Cleaned vnode VXLOCKED\n"); + ip = VTOI(*vpp); + if (ip->i_flags & IN_LOCKED) + printf("cleaned vnode locked\n"); + if (!(ip->i_flag & IN_MODIFIED)) { + ++ump->um_lfs->lfs_uinodes; + ip->i_flag |= IN_MODIFIED; + } + ip->i_flag |= IN_MODIFIED; + return (0); + } + + /* Allocate new vnode/inode. */ + if (error = lfs_vcreate(mp, ino, &vp)) { + *vpp = NULL; + return (error); + } + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip = VTOI(vp); + ufs_ihashins(ip); + + /* + * XXX + * This may not need to be here, logically it should go down with + * the i_devvp initialization. + * Ask Kirk. + */ + ip->i_lfs = ump->um_lfs; + + /* Read in the disk contents for the inode, copy into the inode. */ + if (dinp) + if (error = copyin(dinp, &ip->i_din, sizeof(struct dinode))) + return (error); + else { + if (error = bread(ump->um_devvp, daddr, + (int)ump->um_lfs->lfs_bsize, NOCRED, &bp)) { + /* + * The inode does not contain anything useful, so it + * would be misleading to leave it on its hash chain. + * Iput() will return it to the free list. + */ + ufs_ihashrem(ip); + + /* Unlock and discard unneeded inode. 
*/ + lfs_vunref(vp); + brelse(bp); + *vpp = NULL; + return (error); + } + ip->i_din = + *lfs_ifind(ump->um_lfs, ino, (struct dinode *)bp->b_data); + brelse(bp); + } + + /* Inode was just read from user space or disk, make sure it's locked */ + ip->i_flag |= IN_LOCKED; + + /* + * Initialize the vnode from the inode, check for aliases. In all + * cases re-init ip, the underlying vnode/inode may have changed. + */ + if (error = ufs_vinit(mp, lfs_specop_p, LFS_FIFOOPS, &vp)) { + lfs_vunref(vp); + *vpp = NULL; + return (error); + } + /* + * Finish inode initialization now that aliasing has been resolved. + */ + ip->i_devvp = ump->um_devvp; + ip->i_flag |= IN_MODIFIED; + ++ump->um_lfs->lfs_uinodes; + VREF(ip->i_devvp); + *vpp = vp; + return (0); +} +struct buf * +lfs_fakebuf(vp, lbn, size, uaddr) + struct vnode *vp; + int lbn; + size_t size; + caddr_t uaddr; +{ + struct buf *bp; + + bp = lfs_newbuf(vp, lbn, 0); + bp->b_saveaddr = uaddr; + bp->b_bufsize = size; + bp->b_bcount = size; + bp->b_flags |= B_INVAL; + return (bp); +} diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c new file mode 100644 index 0000000..0c8186e --- /dev/null +++ b/sys/ufs/lfs/lfs_vfsops.c @@ -0,0 +1,573 @@ +/* + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_vfsops.c 8.7 (Berkeley) 4/16/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/buf.h> +#include <sys/mbuf.h> +#include <sys/file.h> +#include <sys/disklabel.h> +#include <sys/ioctl.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/socket.h> + +#include <miscfs/specfs/specdev.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +int lfs_mountfs __P((struct vnode *, struct mount *, struct proc *)); + +struct vfsops lfs_vfsops = { + lfs_mount, + ufs_start, + lfs_unmount, + ufs_root, + ufs_quotactl, + lfs_statfs, + lfs_sync, + lfs_vget, + lfs_fhtovp, + lfs_vptofh, + lfs_init, +}; + +int +lfs_mountroot() +{ + panic("lfs_mountroot"); /* XXX -- implement */ +} + +/* + * VFS Operations. + * + * mount system call + */ +lfs_mount(mp, path, data, ndp, p) + register struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct vnode *devvp; + struct ufs_args args; + struct ufsmount *ump; + register struct lfs *fs; /* LFS */ + u_int size; + int error; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct ufs_args))) + return (error); + + /* Until LFS can do NFS right. XXX */ + if (args.export.ex_flags & MNT_EXPORTED) + return (EINVAL); + + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. + */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); +#ifdef NOTLFS /* LFS */ + fs = ump->um_fs; + if (fs->fs_ronly && (mp->mnt_flag & MNT_RDONLY) == 0) + fs->fs_ronly = 0; +#else + fs = ump->um_lfs; + if (fs->lfs_ronly && (mp->mnt_flag & MNT_RDONLY) == 0) + fs->lfs_ronly = 0; +#endif + if (args.fspec == 0) { + /* + * Process export requests. + */ + return (vfs_export(mp, &ump->um_export, &args.export)); + } + } + /* + * Not an update, or updating the name: look up the name + * and verify that it refers to a sensible block device. 
+ */ + NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); + if (error = namei(ndp)) + return (error); + devvp = ndp->ni_vp; + if (devvp->v_type != VBLK) { + vrele(devvp); + return (ENOTBLK); + } + if (major(devvp->v_rdev) >= nblkdev) { + vrele(devvp); + return (ENXIO); + } + if ((mp->mnt_flag & MNT_UPDATE) == 0) + error = lfs_mountfs(devvp, mp, p); /* LFS */ + else { + if (devvp != ump->um_devvp) + error = EINVAL; /* needs translation */ + else + vrele(devvp); + } + if (error) { + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_lfs; /* LFS */ +#ifdef NOTLFS /* LFS */ + (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); + bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) ufs_statfs(mp, &mp->mnt_stat, p); +#else + (void)copyinstr(path, fs->lfs_fsmnt, sizeof(fs->lfs_fsmnt) - 1, &size); + bzero(fs->lfs_fsmnt + size, sizeof(fs->lfs_fsmnt) - size); + bcopy((caddr_t)fs->lfs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) lfs_statfs(mp, &mp->mnt_stat, p); +#endif + return (0); +} + +/* + * Common code for mount and mountroot + * LFS specific + */ +int +lfs_mountfs(devvp, mp, p) + register struct vnode *devvp; + struct mount *mp; + struct proc *p; +{ + extern struct vnode *rootvp; + register struct lfs *fs; + register struct ufsmount *ump; + struct vnode *vp; + struct buf *bp; + struct partinfo dpart; + dev_t dev; + int error, i, ronly, size; + + /* + * Disallow multiple mounts of the same device. + * Disallow mounting of a device that is currently in use + * (except for root, which might share swap device for miniroot). + * Flush out any old buffers remaining from a previous use. + */ + if (error = vfs_mountedon(devvp)) + return (error); + if (vcount(devvp) > 1 && devvp != rootvp) + return (EBUSY); + if (error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + if (error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p)) + return (error); + + if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) + size = DEV_BSIZE; + else { + size = dpart.disklab->d_secsize; +#ifdef NEVER_USED + dpart.part->p_fstype = FS_LFS; + dpart.part->p_fsize = fs->lfs_fsize; /* frag size */ + dpart.part->p_frag = fs->lfs_frag; /* frags per block */ + dpart.part->p_cpg = fs->lfs_segshift; /* segment shift */ +#endif + } + + /* Don't free random space on error. */ + bp = NULL; + ump = NULL; + + /* Read in the superblock. */ + if (error = bread(devvp, LFS_LABELPAD / size, LFS_SBPAD, NOCRED, &bp)) + goto out; + fs = (struct lfs *)bp->b_data; + + /* Check the basics. */ + if (fs->lfs_magic != LFS_MAGIC || fs->lfs_bsize > MAXBSIZE || + fs->lfs_bsize < sizeof(struct lfs)) { + error = EINVAL; /* XXX needs translation */ + goto out; + } + + /* Allocate the mount structure, copy the superblock into it. */ + ump = (struct ufsmount *)malloc(sizeof *ump, M_UFSMNT, M_WAITOK); + fs = ump->um_lfs = malloc(sizeof(struct lfs), M_UFSMNT, M_WAITOK); + bcopy(bp->b_data, fs, sizeof(struct lfs)); + if (sizeof(struct lfs) < LFS_SBPAD) /* XXX why? 
*/ + bp->b_flags |= B_INVAL; + brelse(bp); + bp = NULL; + + /* Set up the I/O information */ + fs->lfs_iocount = 0; + + /* Set up the ifile and lock aflags */ + fs->lfs_doifile = 0; + fs->lfs_writer = 0; + fs->lfs_dirops = 0; + fs->lfs_seglock = 0; + + /* Set the file system readonly/modify bits. */ + fs->lfs_ronly = ronly; + if (ronly == 0) + fs->lfs_fmod = 1; + + /* Initialize the mount structure. */ + dev = devvp->v_rdev; + mp->mnt_data = (qaddr_t)ump; + mp->mnt_stat.f_fsid.val[0] = (long)dev; + mp->mnt_stat.f_fsid.val[1] = MOUNT_LFS; + mp->mnt_flag |= MNT_LOCAL; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_bptrtodb = 0; + ump->um_seqinc = 1 << fs->lfs_fsbtodb; + ump->um_nindir = fs->lfs_nindir; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; + devvp->v_specflags |= SI_MOUNTEDON; + + /* + * We use the ifile vnode for almost every operation. Instead of + * retrieving it from the hash table each time we retrieve it here, + * artificially increment the reference count and keep a pointer + * to it in the incore copy of the superblock. + */ + if (error = VFS_VGET(mp, LFS_IFILE_INUM, &vp)) + goto out; + fs->lfs_ivnode = vp; + VREF(vp); + vput(vp); + + return (0); +out: + if (bp) + brelse(bp); + (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); + if (ump) { + free(ump->um_lfs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = (qaddr_t)0; + } + return (error); +} + +/* + * unmount system call + */ +lfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + extern int doforce; + register struct ufsmount *ump; + register struct lfs *fs; + int i, error, flags, ronly; + + flags = 0; + if (mntflags & MNT_FORCE) { + if (!doforce || (mp->mnt_flag & MNT_ROOTFS)) + return (EINVAL); + flags |= FORCECLOSE; + } + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; +#ifdef QUOTA + if (mp->mnt_flag & MNT_QUOTA) { + if (error = vflush(mp, fs->lfs_ivnode, SKIPSYSTEM|flags)) + return (error); + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] == NULLVP) + continue; + quotaoff(p, mp, i); + } + /* + * Here we fall through to vflush again to ensure + * that we have gotten rid of all the system vnodes. + */ + } +#endif + if (error = vflush(mp, fs->lfs_ivnode, flags)) + return (error); + fs->lfs_clean = 1; + if (error = VFS_SYNC(mp, 1, p->p_ucred, p)) + return (error); + if (fs->lfs_ivnode->v_dirtyblkhd.lh_first) + panic("lfs_unmount: still dirty blocks on ifile vnode\n"); + vrele(fs->lfs_ivnode); + vgone(fs->lfs_ivnode); + + ronly = !fs->lfs_ronly; + ump->um_devvp->v_specflags &= ~SI_MOUNTEDON; + error = VOP_CLOSE(ump->um_devvp, + ronly ? FREAD : FREAD|FWRITE, NOCRED, p); + vrele(ump->um_devvp); + free(fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = (qaddr_t)0; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Get file system statistics. 
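+ *
+ * (Note on f_bavail below: lfs_minfree percent of lfs_dsize is treated
+ * as reserved space, so the "available" figure reported is
+ *
+ *	dsize * (100 - minfree) / 100 - (dsize - bfree)
+ *
+ * converted with dbtofsb(); used blocks are charged against the
+ * unreserved portion of the file system only.)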
+ */ +lfs_statfs(mp, sbp, p) + struct mount *mp; + register struct statfs *sbp; + struct proc *p; +{ + register struct lfs *fs; + register struct ufsmount *ump; + + ump = VFSTOUFS(mp); + fs = ump->um_lfs; + if (fs->lfs_magic != LFS_MAGIC) + panic("lfs_statfs: magic"); + sbp->f_type = MOUNT_LFS; + sbp->f_bsize = fs->lfs_bsize; + sbp->f_iosize = fs->lfs_bsize; + sbp->f_blocks = dbtofsb(fs,fs->lfs_dsize); + sbp->f_bfree = dbtofsb(fs, fs->lfs_bfree); + sbp->f_bavail = (fs->lfs_dsize * (100 - fs->lfs_minfree) / 100) - + (fs->lfs_dsize - fs->lfs_bfree); + sbp->f_bavail = dbtofsb(fs, sbp->f_bavail); + sbp->f_files = fs->lfs_nfiles; + sbp->f_ffree = sbp->f_bfree * INOPB(fs); + if (sbp != &mp->mnt_stat) { + bcopy((caddr_t)mp->mnt_stat.f_mntonname, + (caddr_t)&sbp->f_mntonname[0], MNAMELEN); + bcopy((caddr_t)mp->mnt_stat.f_mntfromname, + (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); + } + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. + */ +lfs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + int error; + + /* All syncs must be checkpoints until roll-forward is implemented. */ + error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0)); +#ifdef QUOTA + qsync(mp); +#endif + return (error); +} + +/* + * Look up an LFS dinode number to find its incore vnode. If not already + * in core, read it in from the specified device. Return the inode locked. + * Detection and handling of mount points must be done by the calling routine. + */ +int +lfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + register struct lfs *fs; + register struct inode *ip; + struct buf *bp; + struct ifile *ifp; + struct vnode *vp; + struct ufsmount *ump; + daddr_t daddr; + dev_t dev; + int error; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + if ((*vpp = ufs_ihashget(dev, ino)) != NULL) + return (0); + + /* Translate the inode number to a disk address. */ + fs = ump->um_lfs; + if (ino == LFS_IFILE_INUM) + daddr = fs->lfs_idaddr; + else { + LFS_IENTRY(ifp, fs, ino, bp); + daddr = ifp->if_daddr; + brelse(bp); + if (daddr == LFS_UNUSED_DADDR) + return (ENOENT); + } + + /* Allocate new vnode/inode. */ + if (error = lfs_vcreate(mp, ino, &vp)) { + *vpp = NULL; + return (error); + } + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip = VTOI(vp); + ufs_ihashins(ip); + + /* + * XXX + * This may not need to be here, logically it should go down with + * the i_devvp initialization. + * Ask Kirk. + */ + ip->i_lfs = ump->um_lfs; + + /* Read in the disk contents for the inode, copy into the inode. */ + if (error = + bread(ump->um_devvp, daddr, (int)fs->lfs_bsize, NOCRED, &bp)) { + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + vput(vp); + brelse(bp); + *vpp = NULL; + return (error); + } + ip->i_din = *lfs_ifind(fs, ino, (struct dinode *)bp->b_data); + brelse(bp); + + /* + * Initialize the vnode from the inode, check for aliases. 
In all + * cases re-init ip, the underlying vnode/inode may have changed. + */ + if (error = ufs_vinit(mp, lfs_specop_p, LFS_FIFOOPS, &vp)) { + vput(vp); + *vpp = NULL; + return (error); + } + /* + * Finish inode initialization now that aliasing has been resolved. + */ + ip->i_devvp = ump->um_devvp; + VREF(ip->i_devvp); + *vpp = vp; + return (0); +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is valid + * - call lfs_vget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + * - check that the given client host has export rights and return + * those rights via. exflagsp and credanonp + * + * XXX + * use ifile to see if inode is allocated instead of reading off disk + * what is the relationship between my generational number and the NFS + * generational number. + */ +int +lfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) + register struct mount *mp; + struct fid *fhp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + register struct ufid *ufhp; + + ufhp = (struct ufid *)fhp; + if (ufhp->ufid_ino < ROOTINO) + return (ESTALE); + return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +lfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + register struct inode *ip; + register struct ufid *ufhp; + + ip = VTOI(vp); + ufhp = (struct ufid *)fhp; + ufhp->ufid_len = sizeof(struct ufid); + ufhp->ufid_ino = ip->i_number; + ufhp->ufid_gen = ip->i_gen; + return (0); +} diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c new file mode 100644 index 0000000..fc6bd48 --- /dev/null +++ b/sys/ufs/lfs/lfs_vnops.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_vnops.c 8.5 (Berkeley) 12/30/93 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/resourcevar.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/conf.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/malloc.h> + +#include <vm/vm.h> + +#include <miscfs/specfs/specdev.h> +#include <miscfs/fifofs/fifo.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/dir.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/lfs/lfs.h> +#include <ufs/lfs/lfs_extern.h> + +/* Global vfs data structures for lfs. */ +int (**lfs_vnodeop_p)(); +struct vnodeopv_entry_desc lfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ufs_lookup }, /* lookup */ + { &vop_create_desc, ufs_create }, /* create */ + { &vop_mknod_desc, ufs_mknod }, /* mknod */ + { &vop_open_desc, ufs_open }, /* open */ + { &vop_close_desc, lfs_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, lfs_read }, /* read */ + { &vop_write_desc, lfs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_select_desc, ufs_select }, /* select */ + { &vop_mmap_desc, ufs_mmap }, /* mmap */ + { &vop_fsync_desc, lfs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, ufs_remove }, /* remove */ + { &vop_link_desc, ufs_link }, /* link */ + { &vop_rename_desc, ufs_rename }, /* rename */ + { &vop_mkdir_desc, ufs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, ufs_rmdir }, /* rmdir */ + { &vop_symlink_desc, ufs_symlink }, /* symlink */ + { &vop_readdir_desc, ufs_readdir }, /* readdir */ + { &vop_readlink_desc, ufs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { &vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ufs_bmap }, /* bmap */ + { &vop_strategy_desc, ufs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ufs_advlock }, /* advlock */ + { &vop_blkatoff_desc, lfs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, lfs_valloc }, /* valloc */ + { &vop_vfree_desc, lfs_vfree }, /* vfree */ + { &vop_truncate_desc, lfs_truncate }, /* truncate */ + { &vop_update_desc, lfs_update }, /* update */ + { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc lfs_vnodeop_opv_desc = + { &lfs_vnodeop_p, lfs_vnodeop_entries }; + +int (**lfs_specop_p)(); +struct vnodeopv_entry_desc lfs_specop_entries[] = { + { 
&vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, ufsspec_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { &vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_vfree_desc, lfs_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, lfs_update }, /* update */ + { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc lfs_specop_opv_desc = + { &lfs_specop_p, lfs_specop_entries }; + +#ifdef FIFO +int (**lfs_fifoop_p)(); +struct vnodeopv_entry_desc lfs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, fifo_create }, /* create */ + { &vop_mknod_desc, fifo_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, ufsfifo_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, lfs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, fifo_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek */ + { &vop_remove_desc, fifo_remove }, /* remove */ + { &vop_link_desc, fifo_link }, /* link */ + { &vop_rename_desc, fifo_rename }, /* rename */ + { &vop_mkdir_desc, fifo_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fifo_rmdir }, /* rmdir */ + { &vop_symlink_desc, fifo_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop */ + { &vop_inactive_desc, lfs_inactive }, /* inactive */ + { 
&vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_vfree_desc, lfs_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, lfs_update }, /* update */ + { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc lfs_fifoop_opv_desc = + { &lfs_fifoop_p, lfs_fifoop_entries }; +#endif /* FIFO */ + +#define LFS_READWRITE +#include <ufs/ufs/ufs_readwrite.c> +#undef LFS_READWRITE + +/* + * Synch an open file. + */ +/* ARGSUSED */ +lfs_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + struct timeval tv; + + tv = time; + return (VOP_UPDATE(ap->a_vp, &tv, &tv, + ap->a_waitfor == MNT_WAIT ? LFS_SYNC : 0)); +} + +/* + * These macros are used to bracket UFS directory ops, so that we can + * identify all the pages touched during directory ops which need to + * be ordered and flushed atomically, so that they may be recovered. + */ +#define SET_DIROP(fs) { \ + if ((fs)->lfs_writer) \ + tsleep(&(fs)->lfs_dirops, PRIBIO + 1, "lfs_dirop", 0); \ + ++(fs)->lfs_dirops; \ + (fs)->lfs_doifile = 1; \ +} + +#define SET_ENDOP(fs) { \ + --(fs)->lfs_dirops; \ + if (!(fs)->lfs_dirops) \ + wakeup(&(fs)->lfs_writer); \ +} + +#define MARK_VNODE(dvp) (dvp)->v_flag |= VDIROP + +int +lfs_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + ret = ufs_symlink(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + ret = ufs_mknod(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + ret = ufs_create(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + ret = ufs_mkdir(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_remove(ap) + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + MARK_VNODE(ap->a_vp); + ret = ufs_remove(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_rmdir(ap) + struct vop_rmdir_args /* { + 
struct vnodeop_desc *a_desc; + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + MARK_VNODE(ap->a_vp); + ret = ufs_rmdir(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_link(ap) + struct vop_link_args /* { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_vp)->i_lfs); + MARK_VNODE(ap->a_vp); + ret = ufs_link(ap); + SET_ENDOP(VTOI(ap->a_vp)->i_lfs); + return (ret); +} + +int +lfs_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_fdvp)->i_lfs); + MARK_VNODE(ap->a_fdvp); + MARK_VNODE(ap->a_tdvp); + ret = ufs_rename(ap); + SET_ENDOP(VTOI(ap->a_fdvp)->i_lfs); + return (ret); +} +/* XXX hack to avoid calling ITIMES in getattr */ +int +lfs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + register struct vattr *vap = ap->a_vap; + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)ip->i_rdev; + vap->va_size = ip->i_din.di_size; + vap->va_atime = ip->i_atime; + vap->va_mtime = ip->i_mtime; + vap->va_ctime = ip->i_ctime; + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_bytes = dbtob(ip->i_blocks); + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + return (0); +} +/* + * Close called + * + * XXX -- we were using ufs_close, but since it updates the + * times on the inode, we might need to bump the uinodes + * count. + */ +/* ARGSUSED */ +int +lfs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + int mod; + + if (vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) { + mod = ip->i_flag & IN_MODIFIED; + ITIMES(ip, &time, &time); + if (!mod && ip->i_flag & IN_MODIFIED) + ip->i_lfs->lfs_uinodes++; + } + return (0); +} + +/* + * Stub inactive routine that avoid calling ufs_inactive in some cases. + */ +int lfs_no_inactive = 0; + +int +lfs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + if (lfs_no_inactive) + return (0); + return (ufs_inactive(ap)); +} diff --git a/sys/ufs/mfs/mfs_extern.h b/sys/ufs/mfs/mfs_extern.h new file mode 100644 index 0000000..e357faf6 --- /dev/null +++ b/sys/ufs/mfs/mfs_extern.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_extern.h 8.1 (Berkeley) 6/11/93 + */ + +struct buf; +struct mount; +struct nameidata; +struct proc; +struct statfs; +struct ucred; +struct vnode; + +__BEGIN_DECLS +int mfs_badop __P((void)); +int mfs_bmap __P((struct vop_bmap_args *)); +int mfs_close __P((struct vop_close_args *)); +void mfs_doio __P((struct buf *bp, caddr_t base)); +int mfs_inactive __P((struct vop_inactive_args *)); /* XXX */ +int mfs_reclaim __P((struct vop_reclaim_args *)); /* XXX */ +int mfs_init __P((void)); +int mfs_ioctl __P((struct vop_ioctl_args *)); +int mfs_mount __P((struct mount *mp, + char *path, caddr_t data, struct nameidata *ndp, struct proc *p)); +int mfs_open __P((struct vop_open_args *)); +int mfs_print __P((struct vop_print_args *)); /* XXX */ +int mfs_start __P((struct mount *mp, int flags, struct proc *p)); +int mfs_statfs __P((struct mount *mp, struct statfs *sbp, struct proc *p)); +int mfs_strategy __P((struct vop_strategy_args *)); /* XXX */ +__END_DECLS diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c new file mode 100644 index 0000000..3fcbdf3 --- /dev/null +++ b/sys/ufs/mfs/mfs_vfsops.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 1989, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_vfsops.c 8.4 (Berkeley) 4/16/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/mount.h> +#include <sys/signalvar.h> +#include <sys/vnode.h> +#include <sys/malloc.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +#include <ufs/ffs/fs.h> +#include <ufs/ffs/ffs_extern.h> + +#include <ufs/mfs/mfsnode.h> +#include <ufs/mfs/mfs_extern.h> + +caddr_t mfs_rootbase; /* address of mini-root in kernel virtual memory */ +u_long mfs_rootsize; /* size of mini-root in bytes */ + +static int mfs_minor; /* used for building internal dev_t */ + +extern int (**mfs_vnodeop_p)(); + +/* + * mfs vfs operations. + */ +struct vfsops mfs_vfsops = { + mfs_mount, + mfs_start, + ffs_unmount, + ufs_root, + ufs_quotactl, + mfs_statfs, + ffs_sync, + ffs_vget, + ffs_fhtovp, + ffs_vptofh, + mfs_init, +}; + +/* + * Called by main() when mfs is going to be mounted as root. + * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "mfs_root" + +mfs_mountroot() +{ + extern struct vnode *rootvp; + register struct fs *fs; + register struct mount *mp; + struct proc *p = curproc; /* XXX */ + struct ufsmount *ump; + struct mfsnode *mfsp; + u_int size; + int error; + + /* + * Get vnodes for swapdev and rootdev. 
+ */ + if (bdevvp(swapdev, &swapdev_vp) || bdevvp(rootdev, &rootvp)) + panic("mfs_mountroot: can't setup bdevvp's"); + + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = &mfs_vfsops; + mp->mnt_flag = MNT_RDONLY; + mfsp = malloc(sizeof *mfsp, M_MFSNODE, M_WAITOK); + rootvp->v_data = mfsp; + rootvp->v_op = mfs_vnodeop_p; + rootvp->v_tag = VT_MFS; + mfsp->mfs_baseoff = mfs_rootbase; + mfsp->mfs_size = mfs_rootsize; + mfsp->mfs_vnode = rootvp; + mfsp->mfs_pid = p->p_pid; + mfsp->mfs_buflist = (struct buf *)0; + if (error = ffs_mountfs(rootvp, mp, p)) { + free(mp, M_MOUNT); + free(mfsp, M_MFSNODE); + return (error); + } + if (error = vfs_lock(mp)) { + (void)ffs_unmount(mp, 0, p); + free(mp, M_MOUNT); + free(mfsp, M_MFSNODE); + return (error); + } + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mp->mnt_flag |= MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + bzero(fs->fs_fsmnt, sizeof(fs->fs_fsmnt)); + fs->fs_fsmnt[0] = '/'; + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void)ffs_statfs(mp, &mp->mnt_stat, p); + vfs_unlock(mp); + inittodr((time_t)0); + return (0); +} + +/* + * This is called early in boot to set the base address and size + * of the mini-root. + */ +mfs_initminiroot(base) + caddr_t base; +{ + struct fs *fs = (struct fs *)(base + SBOFF); + extern int (*mountroot)(); + + /* check for valid super block */ + if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) + return (0); + mountroot = mfs_mountroot; + mfs_rootbase = base; + mfs_rootsize = fs->fs_fsize * fs->fs_size; + rootdev = makedev(255, mfs_minor++); + return (mfs_rootsize); +} + +/* + * VFS Operations. + * + * mount system call + */ +/* ARGSUSED */ +int +mfs_mount(mp, path, data, ndp, p) + register struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct vnode *devvp; + struct mfs_args args; + struct ufsmount *ump; + register struct fs *fs; + register struct mfsnode *mfsp; + u_int size; + int flags, error; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct mfs_args))) + return (error); + + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. 
+ */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + if (vfs_busy(mp)) + return (EBUSY); + error = ffs_flushfiles(mp, flags, p); + vfs_unbusy(mp); + if (error) + return (error); + } + if (fs->fs_ronly && (mp->mnt_flag & MNT_WANTRDWR)) + fs->fs_ronly = 0; +#ifdef EXPORTMFS + if (args.fspec == 0) + return (vfs_export(mp, &ump->um_export, &args.export)); +#endif + return (0); + } + error = getnewvnode(VT_MFS, (struct mount *)0, mfs_vnodeop_p, &devvp); + if (error) + return (error); + devvp->v_type = VBLK; + if (checkalias(devvp, makedev(255, mfs_minor++), (struct mount *)0)) + panic("mfs_mount: dup dev"); + mfsp = (struct mfsnode *)malloc(sizeof *mfsp, M_MFSNODE, M_WAITOK); + devvp->v_data = mfsp; + mfsp->mfs_baseoff = args.base; + mfsp->mfs_size = args.size; + mfsp->mfs_vnode = devvp; + mfsp->mfs_pid = p->p_pid; + mfsp->mfs_buflist = (struct buf *)0; + if (error = ffs_mountfs(devvp, mp, p)) { + mfsp->mfs_buflist = (struct buf *)-1; + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_fs; + (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); + bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) mfs_statfs(mp, &mp->mnt_stat, p); + return (0); +} + +int mfs_pri = PWAIT | PCATCH; /* XXX prob. temp */ + +/* + * Used to grab the process and keep it in the kernel to service + * memory filesystem I/O requests. + * + * Loop servicing I/O requests. + * Copy the requested data into or out of the memory filesystem + * address space. + */ +/* ARGSUSED */ +int +mfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + register struct vnode *vp = VFSTOUFS(mp)->um_devvp; + register struct mfsnode *mfsp = VTOMFS(vp); + register struct buf *bp; + register caddr_t base; + int error = 0; + + base = mfsp->mfs_baseoff; + while (mfsp->mfs_buflist != (struct buf *)(-1)) { + while (bp = mfsp->mfs_buflist) { + mfsp->mfs_buflist = bp->b_actf; + mfs_doio(bp, base); + wakeup((caddr_t)bp); + } + /* + * If a non-ignored signal is received, try to unmount. + * If that fails, clear the signal (it has been "processed"), + * otherwise we will loop here, as tsleep will always return + * EINTR/ERESTART. + */ + if (error = tsleep((caddr_t)vp, mfs_pri, "mfsidl", 0)) + if (dounmount(mp, 0, p) != 0) + CLRSIG(p, CURSIG(p)); + } + return (error); +} + +/* + * Get file system statistics. + */ +mfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + + error = ffs_statfs(mp, sbp, p); + sbp->f_type = MOUNT_MFS; + return (error); +} diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c new file mode 100644 index 0000000..71adf06 --- /dev/null +++ b/sys/ufs/mfs/mfs_vnops.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_vnops.c 8.3 (Berkeley) 9/21/93 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/map.h> +#include <sys/vnode.h> +#include <sys/malloc.h> + +#include <miscfs/specfs/specdev.h> + +#include <machine/vmparam.h> + +#include <ufs/mfs/mfsnode.h> +#include <ufs/mfs/mfsiom.h> +#include <ufs/mfs/mfs_extern.h> + +#if !defined(hp300) && !defined(i386) && !defined(mips) && !defined(sparc) && !defined(luna68k) +static int mfsmap_want; /* 1 => need kernel I/O resources */ +struct map mfsmap[MFS_MAPSIZE]; +extern char mfsiobuf[]; +#endif + +/* + * mfs vnode operations. 
+ */ +int (**mfs_vnodeop_p)(); +struct vnodeopv_entry_desc mfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, mfs_lookup }, /* lookup */ + { &vop_create_desc, mfs_create }, /* create */ + { &vop_mknod_desc, mfs_mknod }, /* mknod */ + { &vop_open_desc, mfs_open }, /* open */ + { &vop_close_desc, mfs_close }, /* close */ + { &vop_access_desc, mfs_access }, /* access */ + { &vop_getattr_desc, mfs_getattr }, /* getattr */ + { &vop_setattr_desc, mfs_setattr }, /* setattr */ + { &vop_read_desc, mfs_read }, /* read */ + { &vop_write_desc, mfs_write }, /* write */ + { &vop_ioctl_desc, mfs_ioctl }, /* ioctl */ + { &vop_select_desc, mfs_select }, /* select */ + { &vop_mmap_desc, mfs_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, mfs_seek }, /* seek */ + { &vop_remove_desc, mfs_remove }, /* remove */ + { &vop_link_desc, mfs_link }, /* link */ + { &vop_rename_desc, mfs_rename }, /* rename */ + { &vop_mkdir_desc, mfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, mfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, mfs_symlink }, /* symlink */ + { &vop_readdir_desc, mfs_readdir }, /* readdir */ + { &vop_readlink_desc, mfs_readlink }, /* readlink */ + { &vop_abortop_desc, mfs_abortop }, /* abortop */ + { &vop_inactive_desc, mfs_inactive }, /* inactive */ + { &vop_reclaim_desc, mfs_reclaim }, /* reclaim */ + { &vop_lock_desc, mfs_lock }, /* lock */ + { &vop_unlock_desc, mfs_unlock }, /* unlock */ + { &vop_bmap_desc, mfs_bmap }, /* bmap */ + { &vop_strategy_desc, mfs_strategy }, /* strategy */ + { &vop_print_desc, mfs_print }, /* print */ + { &vop_islocked_desc, mfs_islocked }, /* islocked */ + { &vop_pathconf_desc, mfs_pathconf }, /* pathconf */ + { &vop_advlock_desc, mfs_advlock }, /* advlock */ + { &vop_blkatoff_desc, mfs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, mfs_valloc }, /* valloc */ + { &vop_vfree_desc, mfs_vfree }, /* vfree */ + { &vop_truncate_desc, mfs_truncate }, /* truncate */ + { &vop_update_desc, mfs_update }, /* update */ + { &vop_bwrite_desc, mfs_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc mfs_vnodeop_opv_desc = + { &mfs_vnodeop_p, mfs_vnodeop_entries }; + +/* + * Vnode Operations. + * + * Open called to allow memory filesystem to initialize and + * validate before actual IO. Record our process identifier + * so we can tell when we are doing I/O to ourself. + */ +/* ARGSUSED */ +int +mfs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + if (ap->a_vp->v_type != VBLK) { + panic("mfs_ioctl not VBLK"); + /* NOTREACHED */ + } + return (0); +} + +/* + * Ioctl operation. + */ +/* ARGSUSED */ +int +mfs_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (ENOTTY); +} + +/* + * Pass I/O requests to the memory filesystem process. 
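+ * (In outline: requests against the mini-root, which has no serving + * process, are copied directly; requests issued by the serving process + * itself are handled inline with mfs_doio(); all other requests are + * queued on mfs_buflist and the idle server is woken up.)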
+ */ +int +mfs_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + register struct buf *bp = ap->a_bp; + register struct mfsnode *mfsp; + struct vnode *vp; + struct proc *p = curproc; /* XXX */ + + if (!vfinddev(bp->b_dev, VBLK, &vp) || vp->v_usecount == 0) + panic("mfs_strategy: bad dev"); + mfsp = VTOMFS(vp); + /* check for mini-root access */ + if (mfsp->mfs_pid == 0) { + caddr_t base; + + base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + bcopy(base, bp->b_data, bp->b_bcount); + else + bcopy(bp->b_data, base, bp->b_bcount); + biodone(bp); + } else if (mfsp->mfs_pid == p->p_pid) { + mfs_doio(bp, mfsp->mfs_baseoff); + } else { + bp->b_actf = mfsp->mfs_buflist; + mfsp->mfs_buflist = bp; + wakeup((caddr_t)vp); + } + return (0); +} + +#if defined(vax) || defined(tahoe) +/* + * Memory file system I/O. + * + * Essentially play ubasetup() and disk interrupt service routine by + * doing the copies to or from the memfs process. If doing physio + * (i.e. pagein), we must map the I/O through the kernel virtual + * address space. + */ +void +mfs_doio(bp, base) + register struct buf *bp; + caddr_t base; +{ + register struct pte *pte, *ppte; + register caddr_t vaddr; + int off, npf, npf2, reg; + caddr_t kernaddr, offset; + + /* + * For phys I/O, map the b_data into kernel virtual space using + * the Mfsiomap pte's. + */ + if ((bp->b_flags & B_PHYS) == 0) { + kernaddr = bp->b_data; + } else { + if (bp->b_flags & (B_PAGET | B_UAREA | B_DIRTY)) + panic("swap on memfs?"); + off = (int)bp->b_data & PGOFSET; + npf = btoc(bp->b_bcount + off); + /* + * Get some mapping page table entries + */ + while ((reg = rmalloc(mfsmap, (long)npf)) == 0) { + mfsmap_want++; + sleep((caddr_t)&mfsmap_want, PZERO-1); + } + reg--; + pte = vtopte(bp->b_proc, btop(bp->b_data)); + /* + * Do vmaccess() but with the Mfsiomap page table. + */ + ppte = &Mfsiomap[reg]; + vaddr = &mfsiobuf[reg * NBPG]; + kernaddr = vaddr + off; + for (npf2 = npf; npf2; npf2--) { + mapin(ppte, (u_int)vaddr, pte->pg_pfnum, + (int)(PG_V|PG_KW)); +#if defined(tahoe) + if ((bp->b_flags & B_READ) == 0) + mtpr(P1DC, vaddr); +#endif + ppte++; + pte++; + vaddr += NBPG; + } + } + offset = base + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + bp->b_error = copyin(offset, kernaddr, bp->b_bcount); + else + bp->b_error = copyout(kernaddr, offset, bp->b_bcount); + if (bp->b_error) + bp->b_flags |= B_ERROR; + /* + * Release pte's used by physical I/O. + */ + if (bp->b_flags & B_PHYS) { + rmfree(mfsmap, (long)npf, (long)++reg); + if (mfsmap_want) { + mfsmap_want = 0; + wakeup((caddr_t)&mfsmap_want); + } + } + biodone(bp); +} +#endif /* vax || tahoe */ + +#if defined(hp300) || defined(i386) || defined(mips) || defined(sparc) || defined(luna68k) +/* + * Memory file system I/O. + * + * Trivial on the HP since buffer has already been mapped into KVA space. + */ +void +mfs_doio(bp, base) + register struct buf *bp; + caddr_t base; +{ + + base += (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + bp->b_error = copyin(base, bp->b_data, bp->b_bcount); + else + bp->b_error = copyout(bp->b_data, base, bp->b_bcount); + if (bp->b_error) + bp->b_flags |= B_ERROR; + biodone(bp); +} +#endif + +/* + * This is a noop, simply returning what one has been given. 
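+ * (An MFS logical block already identifies an offset in the backing + * memory; mfs_doio() uses b_blkno directly, so no translation is done + * here.)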
+ */ +int +mfs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + return (0); +} + +/* + * Memory filesystem close routine + */ +/* ARGSUSED */ +int +mfs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct mfsnode *mfsp = VTOMFS(vp); + register struct buf *bp; + int error; + + /* + * Finish any pending I/O requests. + */ + while (bp = mfsp->mfs_buflist) { + mfsp->mfs_buflist = bp->b_actf; + mfs_doio(bp, mfsp->mfs_baseoff); + wakeup((caddr_t)bp); + } + /* + * On last close of a memory filesystem + * we must invalidate any in core blocks, so that + * we can, free up its vnode. + */ + if (error = vinvalbuf(vp, 1, ap->a_cred, ap->a_p, 0, 0)) + return (error); + /* + * There should be no way to have any more uses of this + * vnode, so if we find any other uses, it is a panic. + */ + if (vp->v_usecount > 1) + printf("mfs_close: ref count %d > 1\n", vp->v_usecount); + if (vp->v_usecount > 1 || mfsp->mfs_buflist) + panic("mfs_close"); + /* + * Send a request to the filesystem server to exit. + */ + mfsp->mfs_buflist = (struct buf *)(-1); + wakeup((caddr_t)vp); + return (0); +} + +/* + * Memory filesystem inactive routine + */ +/* ARGSUSED */ +int +mfs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct mfsnode *mfsp = VTOMFS(ap->a_vp); + + if (mfsp->mfs_buflist && mfsp->mfs_buflist != (struct buf *)(-1)) + panic("mfs_inactive: not inactive (mfs_buflist %x)", + mfsp->mfs_buflist); + return (0); +} + +/* + * Reclaim a memory filesystem devvp so that it can be reused. + */ +int +mfs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + FREE(ap->a_vp->v_data, M_MFSNODE); + ap->a_vp->v_data = NULL; + return (0); +} + +/* + * Print out the contents of an mfsnode. + */ +int +mfs_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct mfsnode *mfsp = VTOMFS(ap->a_vp); + + printf("tag VT_MFS, pid %d, base %d, size %d\n", mfsp->mfs_pid, + mfsp->mfs_baseoff, mfsp->mfs_size); + return (0); +} + +/* + * Block device bad operation + */ +int +mfs_badop() +{ + + panic("mfs_badop called\n"); + /* NOTREACHED */ +} + +/* + * Memory based filesystem initialization. + */ +mfs_init() +{ + +#if !defined(hp300) && !defined(i386) && !defined(mips) && !defined(sparc) && !defined(luna68k) + rminit(mfsmap, (long)MFS_MAPREG, (long)1, "mfs mapreg", MFS_MAPSIZE); +#endif +} diff --git a/sys/ufs/mfs/mfsiom.h b/sys/ufs/mfs/mfsiom.h new file mode 100644 index 0000000..98aca85 --- /dev/null +++ b/sys/ufs/mfs/mfsiom.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfsiom.h 8.1 (Berkeley) 6/11/93 + */ + +#define MFS_MAPREG (MAXPHYS/NBPG + 2) /* Kernel mapping pte's */ +#define MFS_MAPSIZE 10 /* Size of alloc map for pte's */ diff --git a/sys/ufs/mfs/mfsnode.h b/sys/ufs/mfs/mfsnode.h new file mode 100644 index 0000000..4480ab0 --- /dev/null +++ b/sys/ufs/mfs/mfsnode.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfsnode.h 8.2 (Berkeley) 8/11/93 + */ + +/* + * This structure defines the control data for the memory based file system. 
+ */ + +struct mfsnode { + struct vnode *mfs_vnode; /* vnode associated with this mfsnode */ + caddr_t mfs_baseoff; /* base of file system in memory */ + long mfs_size; /* size of memory file system */ + pid_t mfs_pid; /* supporting process pid */ + struct buf *mfs_buflist; /* list of I/O requests */ + long mfs_spare[4]; +}; + +/* + * Convert between mfsnode pointers and vnode pointers + */ +#define VTOMFS(vp) ((struct mfsnode *)(vp)->v_data) +#define MFSTOV(mfsp) ((mfsp)->mfs_vnode) + +/* Prototypes for MFS operations on vnodes. */ +#define mfs_lookup ((int (*) __P((struct vop_lookup_args *)))mfs_badop) +#define mfs_create ((int (*) __P((struct vop_create_args *)))mfs_badop) +#define mfs_mknod ((int (*) __P((struct vop_mknod_args *)))mfs_badop) +#define mfs_access ((int (*) __P((struct vop_access_args *)))mfs_badop) +#define mfs_getattr ((int (*) __P((struct vop_getattr_args *)))mfs_badop) +#define mfs_setattr ((int (*) __P((struct vop_setattr_args *)))mfs_badop) +#define mfs_read ((int (*) __P((struct vop_read_args *)))mfs_badop) +#define mfs_write ((int (*) __P((struct vop_write_args *)))mfs_badop) +#define mfs_select ((int (*) __P((struct vop_select_args *)))mfs_badop) +#define mfs_mmap ((int (*) __P((struct vop_mmap_args *)))mfs_badop) +#define mfs_seek ((int (*) __P((struct vop_seek_args *)))mfs_badop) +#define mfs_remove ((int (*) __P((struct vop_remove_args *)))mfs_badop) +#define mfs_link ((int (*) __P((struct vop_link_args *)))mfs_badop) +#define mfs_rename ((int (*) __P((struct vop_rename_args *)))mfs_badop) +#define mfs_mkdir ((int (*) __P((struct vop_mkdir_args *)))mfs_badop) +#define mfs_rmdir ((int (*) __P((struct vop_rmdir_args *)))mfs_badop) +#define mfs_symlink ((int (*) __P((struct vop_symlink_args *)))mfs_badop) +#define mfs_readdir ((int (*) __P((struct vop_readdir_args *)))mfs_badop) +#define mfs_readlink ((int (*) __P((struct vop_readlink_args *)))mfs_badop) +#define mfs_abortop ((int (*) __P((struct vop_abortop_args *)))mfs_badop) +#define mfs_lock ((int (*) __P((struct vop_lock_args *)))nullop) +#define mfs_unlock ((int (*) __P((struct vop_unlock_args *)))nullop) +#define mfs_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +#define mfs_pathconf ((int (*) __P((struct vop_pathconf_args *)))mfs_badop) +#define mfs_advlock ((int (*) __P((struct vop_advlock_args *)))mfs_badop) +#define mfs_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))mfs_badop) +#define mfs_valloc ((int (*) __P((struct vop_valloc_args *)))mfs_badop) +#define mfs_vfree ((int (*) __P((struct vop_vfree_args *)))mfs_badop) +#define mfs_truncate ((int (*) __P((struct vop_truncate_args *)))mfs_badop) +#define mfs_update ((int (*) __P((struct vop_update_args *)))mfs_badop) +#define mfs_bwrite ((int (*) __P((struct vop_bwrite_args *)))vn_bwrite) diff --git a/sys/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h new file mode 100644 index 0000000..5b9915d --- /dev/null +++ b/sys/ufs/ufs/dinode.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dinode.h 8.3 (Berkeley) 1/21/94 + */ + +/* + * The root inode is the root of the file system. Inode 0 can't be used for + * normal purposes and historically bad blocks were linked to inode 1, thus + * the root inode is 2. (Inode 1 is no longer used for this purpose, however + * numerous dump tapes make this assumption, so we are stuck with it). + */ +#define ROOTINO ((ino_t)2) + +/* + * A dinode contains all the meta-data associated with a UFS file. + * This structure defines the on-disk format of a dinode. + */ + +#define NDADDR 12 /* Direct addresses in inode. */ +#define NIADDR 3 /* Indirect addresses in inode. */ + +struct dinode { + u_short di_mode; /* 0: IFMT and permissions. */ + short di_nlink; /* 2: File link count. */ + union { + u_short oldids[2]; /* 4: Ffs: old user and group ids. */ + ino_t inumber; /* 4: Lfs: inode number. */ + } di_u; + u_quad_t di_size; /* 8: File byte count. */ + struct timespec di_atime; /* 16: Last access time. */ + struct timespec di_mtime; /* 24: Last modified time. */ + struct timespec di_ctime; /* 32: Last inode change time. */ + daddr_t di_db[NDADDR]; /* 40: Direct disk blocks. */ + daddr_t di_ib[NIADDR]; /* 88: Indirect disk blocks. */ + u_long di_flags; /* 100: Status flags (chflags). */ + long di_blocks; /* 104: Blocks actually held. */ + long di_gen; /* 108: Generation number. */ + u_long di_uid; /* 112: File owner. */ + u_long di_gid; /* 116: File group. */ + long di_spare[2]; /* 120: Reserved; currently unused */ +}; + +/* + * The di_db fields may be overlaid with other information for + * file types that do not have associated disk storage. Block + * and character devices overlay the first data block with their + * dev_t value. Short symbolic links place their path in the + * di_db area. 
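+ * For example, a symbolic link whose path fits in MAXSYMLINKLEN bytes + * (the combined di_db and di_ib space, defined below) is stored entirely + * in the inode and needs no separate data blocks.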
+ */ +#define di_inumber di_u.inumber +#define di_ogid di_u.oldids[1] +#define di_ouid di_u.oldids[0] +#define di_rdev di_db[0] +#define di_shortlink di_db +#define MAXSYMLINKLEN ((NDADDR + NIADDR) * sizeof(daddr_t)) + +/* File modes. */ +#define IEXEC 0000100 /* Executable. */ +#define IWRITE 0000200 /* Writeable. */ +#define IREAD 0000400 /* Readable. */ +#define ISVTX 0001000 /* Sticky bit. */ +#define ISGID 0002000 /* Set-gid. */ +#define ISUID 0004000 /* Set-uid. */ + +/* File types. */ +#define IFMT 0170000 /* Mask of file type. */ +#define IFIFO 0010000 /* Named pipe (fifo). */ +#define IFCHR 0020000 /* Character device. */ +#define IFDIR 0040000 /* Directory file. */ +#define IFBLK 0060000 /* Block device. */ +#define IFREG 0100000 /* Regular file. */ +#define IFLNK 0120000 /* Symbolic link. */ +#define IFSOCK 0140000 /* UNIX domain socket. */ diff --git a/sys/ufs/ufs/dir.h b/sys/ufs/ufs/dir.h new file mode 100644 index 0000000..c51bd1c --- /dev/null +++ b/sys/ufs/ufs/dir.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dir.h 8.2 (Berkeley) 1/21/94 + */ + +#ifndef _DIR_H_ +#define _DIR_H_ + +/* + * A directory consists of some number of blocks of DIRBLKSIZ + * bytes, where DIRBLKSIZ is chosen such that it can be transferred + * to disk in a single atomic operation (e.g. 512 bytes on most machines). 
+ * + * Each DIRBLKSIZ byte block contains some number of directory entry + * structures, which are of variable length. Each directory entry has + * a struct direct at the front of it, containing its inode number, + * the length of the entry, and the length of the name contained in + * the entry. These are followed by the name padded to a 4 byte boundary + * with null bytes. All names are guaranteed null terminated. + * The maximum length of a name in a directory is MAXNAMLEN. + * + * The macro DIRSIZ(fmt, dp) gives the amount of space required to represent + * a directory entry. Free space in a directory is represented by + * entries which have dp->d_reclen > DIRSIZ(fmt, dp). All DIRBLKSIZ bytes + * in a directory block are claimed by the directory entries. This + * usually results in the last entry in a directory having a large + * dp->d_reclen. When entries are deleted from a directory, the + * space is returned to the previous entry in the same directory + * block by increasing its dp->d_reclen. If the first entry of + * a directory block is free, then its dp->d_ino is set to 0. + * Entries other than the first in a directory do not normally have + * dp->d_ino set to 0. + */ +#define DIRBLKSIZ DEV_BSIZE +#define MAXNAMLEN 255 + +struct direct { + u_long d_ino; /* inode number of entry */ + u_short d_reclen; /* length of this record */ + u_char d_type; /* file type, see below */ + u_char d_namlen; /* length of string in d_name */ + char d_name[MAXNAMLEN + 1]; /* name with length <= MAXNAMLEN */ +}; + +/* + * File types + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 + +/* + * Convert between stat structure types and directory types. + */ +#define IFTODT(mode) (((mode) & 0170000) >> 12) +#define DTTOIF(dirtype) ((dirtype) << 12) + +/* + * The DIRSIZ macro gives the minimum record length which will hold + * the directory entry. This requires the amount of space in struct direct + * without the d_name field, plus enough space for the name with a terminating + * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary. + */ +#if (BYTE_ORDER == LITTLE_ENDIAN) +#define DIRSIZ(oldfmt, dp) \ + ((oldfmt) ? \ + ((sizeof (struct direct) - (MAXNAMLEN+1)) + (((dp)->d_type+1 + 3) &~ 3)) : \ + ((sizeof (struct direct) - (MAXNAMLEN+1)) + (((dp)->d_namlen+1 + 3) &~ 3))) +#else +#define DIRSIZ(oldfmt, dp) \ + ((sizeof (struct direct) - (MAXNAMLEN+1)) + (((dp)->d_namlen+1 + 3) &~ 3)) +#endif +#define OLDDIRFMT 1 +#define NEWDIRFMT 0 + +/* + * Template for manipulating directories. + * Should use struct direct's, but the name field + * is MAXNAMLEN - 1, and this just won't do. + */ +struct dirtemplate { + u_long dot_ino; + short dot_reclen; + u_char dot_type; + u_char dot_namlen; + char dot_name[4]; /* must be multiple of 4 */ + u_long dotdot_ino; + short dotdot_reclen; + u_char dotdot_type; + u_char dotdot_namlen; + char dotdot_name[4]; /* ditto */ +}; + +/* + * This is the old format of directories, sans type element. 
+ */ +struct odirtemplate { + u_long dot_ino; + short dot_reclen; + u_short dot_namlen; + char dot_name[4]; /* must be multiple of 4 */ + u_long dotdot_ino; + short dotdot_reclen; + u_short dotdot_namlen; + char dotdot_name[4]; /* ditto */ +}; +#endif /* !_DIR_H_ */ diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h new file mode 100644 index 0000000..df15596 --- /dev/null +++ b/sys/ufs/ufs/inode.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)inode.h 8.4 (Berkeley) 1/21/94 + */ + +#include <ufs/ufs/dinode.h> + +/* + * Theoretically, directories can be more than 2Gb in length, however, in + * practice this seems unlikely. So, we define the type doff_t as a long + * to keep down the cost of doing lookup on a 32-bit machine. If you are + * porting to a 64-bit architecture, you should make doff_t the same as off_t. + */ +#define doff_t long + +/* + * The inode is used to describe each active (or recently active) + * file in the UFS filesystem. It is composed of two types of + * information. The first part is the information that is needed + * only while the file is active (such as the identity of the file + * and linkage to speed its lookup). The second part is the + * permanent meta-data associated with the file which is read + * in from the permanent dinode from long term storage when the + * file becomes active, and is put back when the file is no longer + * being used. 
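+ * (The permanent part is the struct dinode embedded below as i_din; the + * i_atime, i_mode, and related macros that follow simply select fields + * of i_din.)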
+ */ +struct inode { + struct inode *i_next; /* Hash chain forward. */ + struct inode **i_prev; /* Hash chain back. */ + struct vnode *i_vnode; /* Vnode associated with this inode. */ + struct vnode *i_devvp; /* Vnode for block I/O. */ + u_long i_flag; /* I* flags. */ + dev_t i_dev; /* Device associated with the inode. */ + ino_t i_number; /* The identity of the inode. */ + union { /* Associated filesystem. */ + struct fs *fs; /* FFS */ + struct lfs *lfs; /* LFS */ + } inode_u; +#define i_fs inode_u.fs +#define i_lfs inode_u.lfs + struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ + u_quad_t i_modrev; /* Revision level for lease. */ + struct lockf *i_lockf; /* Head of byte-level lock list. */ + pid_t i_lockholder; /* DEBUG: holder of inode lock. */ + pid_t i_lockwaiter; /* DEBUG: latest blocked for inode lock. */ + /* + * Side effects; used during directory lookup. + */ + long i_count; /* Size of free slot in directory. */ + doff_t i_endoff; /* End of useful stuff in directory. */ + doff_t i_diroff; /* Offset in dir, where we found last entry. */ + doff_t i_offset; /* Offset of free space in directory. */ + ino_t i_ino; /* Inode number of found directory. */ + u_long i_reclen; /* Size of found directory entry. */ + long i_spare[11]; /* Spares to round up to 128 bytes. */ + /* + * The on-disk dinode itself. + */ + struct dinode i_din; /* 128 bytes of the on-disk dinode. */ +}; + +#define i_atime i_din.di_atime +#define i_blocks i_din.di_blocks +#define i_ctime i_din.di_ctime +#define i_db i_din.di_db +#define i_flags i_din.di_flags +#define i_gen i_din.di_gen +#define i_gid i_din.di_gid +#define i_ib i_din.di_ib +#define i_mode i_din.di_mode +#define i_mtime i_din.di_mtime +#define i_nlink i_din.di_nlink +#define i_rdev i_din.di_rdev +#define i_shortlink i_din.di_shortlink +#define i_size i_din.di_size +#define i_uid i_din.di_uid + +/* These flags are kept in i_flag. */ +#define IN_ACCESS 0x0001 /* Access time update request. */ +#define IN_CHANGE 0x0002 /* Inode change time update request. */ +#define IN_EXLOCK 0x0004 /* File has exclusive lock. */ +#define IN_LOCKED 0x0008 /* Inode lock. */ +#define IN_LWAIT 0x0010 /* Process waiting on file lock. */ +#define IN_MODIFIED 0x0020 /* Inode has been modified. */ +#define IN_RENAME 0x0040 /* Inode is being renamed. */ +#define IN_SHLOCK 0x0080 /* File has shared lock. */ +#define IN_UPDATE 0x0100 /* Modification time update request. */ +#define IN_WANTED 0x0200 /* Inode is wanted by a process. */ + +#ifdef KERNEL +/* + * Structure used to pass around logical block paths generated by + * ufs_getlbns and used by truncate and bmap code. + */ +struct indir { + daddr_t in_lbn; /* Logical block number. */ + int in_off; /* Offset in buffer. */ + int in_exists; /* Flag if the block exists. */ +}; + +/* Convert between inode pointers and vnode pointers. */ +#define VTOI(vp) ((struct inode *)(vp)->v_data) +#define ITOV(ip) ((ip)->i_vnode) + +#define ITIMES(ip, t1, t2) { \ + if ((ip)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) { \ + (ip)->i_flag |= IN_MODIFIED; \ + if ((ip)->i_flag & IN_ACCESS) \ + (ip)->i_atime.ts_sec = (t1)->tv_sec; \ + if ((ip)->i_flag & IN_UPDATE) { \ + (ip)->i_mtime.ts_sec = (t2)->tv_sec; \ + (ip)->i_modrev++; \ + } \ + if ((ip)->i_flag & IN_CHANGE) \ + (ip)->i_ctime.ts_sec = time.tv_sec; \ + (ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); \ + } \ +} + +/* This overlays the fid structure (see mount.h). */ +struct ufid { + u_short ufid_len; /* Length of structure. */ + u_short ufid_pad; /* Force long alignment. 
*/ + ino_t ufid_ino; /* File number (ino). */ + long ufid_gen; /* Generation number. */ +}; +#endif /* KERNEL */ diff --git a/sys/ufs/ufs/lockf.h b/sys/ufs/ufs/lockf.h new file mode 100644 index 0000000..0ec61db --- /dev/null +++ b/sys/ufs/ufs/lockf.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lockf.h 8.1 (Berkeley) 6/11/93 + */ + +/* + * The lockf structure is a kernel structure which contains the information + * associated with a byte range lock. The lockf structures are linked into + * the inode structure. Locks are sorted by the starting byte of the lock for + * efficiency. + */ +struct lockf { + short lf_flags; /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */ + short lf_type; /* Lock type: F_RDLCK, F_WRLCK */ + off_t lf_start; /* The byte # of the start of the lock */ + off_t lf_end; /* The byte # of the end of the lock (-1=EOF)*/ + caddr_t lf_id; /* The id of the resource holding the lock */ + struct inode *lf_inode; /* Back pointer to the inode */ + struct lockf *lf_next; /* A pointer to the next lock on this inode */ + struct lockf *lf_block; /* The list of blocked locks */ +}; + +/* Maximum length of sleep chains to traverse to try and detect deadlock. 
*/ +#define MAXDEPTH 50 + +__BEGIN_DECLS +void lf_addblock __P((struct lockf *, struct lockf *)); +int lf_clearlock __P((struct lockf *)); +int lf_findoverlap __P((struct lockf *, + struct lockf *, int, struct lockf ***, struct lockf **)); +struct lockf * + lf_getblock __P((struct lockf *)); +int lf_getlock __P((struct lockf *, struct flock *)); +int lf_setlock __P((struct lockf *)); +void lf_split __P((struct lockf *, struct lockf *)); +void lf_wakelock __P((struct lockf *)); +__END_DECLS + +#ifdef LOCKF_DEBUG +extern int lockf_debug; + +__BEGIN_DECLS +void lf_print __P((char *, struct lockf *)); +void lf_printlist __P((char *, struct lockf *)); +__END_DECLS +#endif diff --git a/sys/ufs/ufs/quota.h b/sys/ufs/ufs/quota.h new file mode 100644 index 0000000..11efb40 --- /dev/null +++ b/sys/ufs/ufs/quota.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)quota.h 8.1 (Berkeley) 6/11/93 + */ + +#ifndef _QUOTA_ +#define _QUOTA_ + +/* + * Definitions for disk quotas imposed on the average user + * (big brother finally hits UNIX). + * + * The following constants define the amount of time given a user before the + * soft limits are treated as hard limits (usually resulting in an allocation + * failure). The timer is started when the user crosses their soft limit, it + * is reset when they go below their soft limit. + */ +#define MAX_IQ_TIME (7*24*60*60) /* 1 week */ +#define MAX_DQ_TIME (7*24*60*60) /* 1 week */ + +/* + * The following constants define the usage of the quota file array in the + * ufsmount structure and dquot array in the inode structure. 
The semantics + * of the elements of these arrays are defined in the routine getinoquota; + * the remainder of the quota code treats them generically and need not be + * inspected when changing the size of the array. + */ +#define MAXQUOTAS 2 +#define USRQUOTA 0 /* element used for user quotas */ +#define GRPQUOTA 1 /* element used for group quotas */ + +/* + * Definitions for the default names of the quotas files. + */ +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "undefined", \ +}; +#define QUOTAFILENAME "quota" +#define QUOTAGROUP "operator" + +/* + * Command definitions for the 'quotactl' system call. The commands are + * broken into a main command defined below and a subcommand that is used + * to convey the type of quota that is being manipulated (see above). + */ +#define SUBCMDMASK 0x00ff +#define SUBCMDSHIFT 8 +#define QCMD(cmd, type) (((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK)) + +#define Q_QUOTAON 0x0100 /* enable quotas */ +#define Q_QUOTAOFF 0x0200 /* disable quotas */ +#define Q_GETQUOTA 0x0300 /* get limits and usage */ +#define Q_SETQUOTA 0x0400 /* set limits and usage */ +#define Q_SETUSE 0x0500 /* set usage */ +#define Q_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. The setquota system call establishes + * the vnode for each quota file (a pointer is retained in the ufsmount + * structure). + */ +struct dqblk { + u_long dqb_bhardlimit; /* absolute limit on disk blks alloc */ + u_long dqb_bsoftlimit; /* preferred limit on disk blks */ + u_long dqb_curblocks; /* current block count */ + u_long dqb_ihardlimit; /* maximum # allocated inodes + 1 */ + u_long dqb_isoftlimit; /* preferred inode limit */ + u_long dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive files */ +}; + +/* + * The following structure records disk usage for a user or group on a + * filesystem. There is one allocated for each quota that exists on any + * filesystem for the current user or group. A cache is kept of recently + * used entries. + */ +struct dquot { + struct dquot *dq_forw, **dq_back; /* hash list */ + struct dquot *dq_freef, **dq_freeb; /* free list */ + short dq_flags; /* flags, see below */ + short dq_cnt; /* count of active references */ + short dq_spare; /* unused spare padding */ + short dq_type; /* quota type of this dquot */ + u_long dq_id; /* identifier this applies to */ + struct ufsmount *dq_ump; /* filesystem that this is taken from */ + struct dqblk dq_dqb; /* actual usage & quotas */ +}; +/* + * Flag values. + */ +#define DQ_LOCK 0x01 /* this quota locked (no MODS) */ +#define DQ_WANT 0x02 /* wakeup on unlock */ +#define DQ_MOD 0x04 /* this quota modified since read */ +#define DQ_FAKE 0x08 /* no limits here, just usage */ +#define DQ_BLKS 0x10 /* has been warned about blk limit */ +#define DQ_INODS 0x20 /* has been warned about inode limit */ +/* + * Shorthand notation. 
+ */ +#define dq_bhardlimit dq_dqb.dqb_bhardlimit +#define dq_bsoftlimit dq_dqb.dqb_bsoftlimit +#define dq_curblocks dq_dqb.dqb_curblocks +#define dq_ihardlimit dq_dqb.dqb_ihardlimit +#define dq_isoftlimit dq_dqb.dqb_isoftlimit +#define dq_curinodes dq_dqb.dqb_curinodes +#define dq_btime dq_dqb.dqb_btime +#define dq_itime dq_dqb.dqb_itime + +/* + * If the system has never checked for a quota for this file, then it is set + * to NODQUOT. Once a write attempt is made the inode pointer is set to + * reference a dquot structure. + */ +#define NODQUOT ((struct dquot *) 0) + +/* + * Flags to chkdq() and chkiq() + */ +#define FORCE 0x01 /* force usage changes independent of limits */ +#define CHOWN 0x02 /* (advisory) change initiated by chown */ + +/* + * Macros to avoid subroutine calls to trivial functions. + */ +#ifdef DIAGNOSTIC +#define DQREF(dq) dqref(dq) +#else +#define DQREF(dq) (dq)->dq_cnt++ +#endif + +#include <sys/cdefs.h> + +struct dquot; +struct inode; +struct mount; +struct proc; +struct ucred; +struct ufsmount; +struct vnode; +__BEGIN_DECLS +int chkdq __P((struct inode *, long, struct ucred *, int)); +int chkdqchg __P((struct inode *, long, struct ucred *, int)); +int chkiq __P((struct inode *, long, struct ucred *, int)); +int chkiqchg __P((struct inode *, long, struct ucred *, int)); +void dqflush __P((struct vnode *)); +int dqget __P((struct vnode *, + u_long, struct ufsmount *, int, struct dquot **)); +void dqinit __P((void)); +void dqref __P((struct dquot *)); +void dqrele __P((struct vnode *, struct dquot *)); +int dqsync __P((struct vnode *, struct dquot *)); +int getinoquota __P((struct inode *)); +int getquota __P((struct mount *, u_long, int, caddr_t)); +int qsync __P((struct mount *mp)); +int quotaoff __P((struct proc *, struct mount *, int)); +int quotaon __P((struct proc *, struct mount *, int, caddr_t)); +int setquota __P((struct mount *, u_long, int, caddr_t)); +int setuse __P((struct mount *, u_long, int, caddr_t)); +int ufs_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); +__END_DECLS + +#ifdef DIAGNOSTIC +__BEGIN_DECLS +void chkdquot __P((struct inode *)); +__END_DECLS +#endif + +#endif /* _QUOTA_ */ diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c new file mode 100644 index 0000000..bcd838d --- /dev/null +++ b/sys/ufs/ufs/ufs_bmap.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 + */ + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/resourcevar.h> +#include <sys/trace.h> + +#include <miscfs/specfs/specdev.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +/* + * Bmap converts the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ufs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_vpp != NULL) + *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, + ap->a_runp)); +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. Indirect blocks are addressed by the negative + * address of the first data block to which they point. Double indirect blocks + * are addressed by one less than the address of the first indirect block to + * which they point. Triple indirect blocks are addressed by one less than + * the address of the first double indirect block to which they point. + * + * ufs_bmaparray does the bmap conversion, and if requested returns the + * array of logical blocks which must be traversed to get to a block. + * Each entry contains the offset into that block that gets you to the + * next block and the disk address of the block (if it is assigned). 
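+ * + * For example, with NDADDR direct blocks and MNINDIR(ump) pointers per + * indirect block, the single indirect block is logical block -NDADDR and + * the double indirect block is -(NDADDR + MNINDIR(ump) + 1).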
+ */ + +int +ufs_bmaparray(vp, bn, bnp, ap, nump, runp) + struct vnode *vp; + register daddr_t bn; + daddr_t *bnp; + struct indir *ap; + int *nump; + int *runp; +{ + register struct inode *ip; + struct buf *bp; + struct ufsmount *ump; + struct mount *mp; + struct vnode *devvp; + struct indir a[NIADDR], *xap; + daddr_t daddr; + long metalbn; + int error, maxrun, num; + + ip = VTOI(vp); + mp = vp->v_mount; + ump = VFSTOUFS(mp); +#ifdef DIAGNOSTIC + if (ap != NULL && nump == NULL || ap == NULL && nump != NULL) + panic("ufs_bmaparray: invalid arguments"); +#endif + + if (runp) { + /* + * XXX + * If MAXBSIZE is the largest transfer the disks can handle, + * we probably want maxrun to be 1 block less so that we + * don't create a block larger than the device can handle. + */ + *runp = 0; + maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1; + } + + xap = ap == NULL ? a : ap; + if (!nump) + nump = # + if (error = ufs_getlbns(vp, bn, xap, nump)) + return (error); + + num = *nump; + if (num == 0) { + *bnp = blkptrtodb(ump, ip->i_db[bn]); + if (*bnp == 0) + *bnp = -1; + else if (runp) + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); + ++bn, ++*runp); + return (0); + } + + + /* Get disk address out of indirect block array */ + daddr = ip->i_ib[xap->in_off]; + + devvp = VFSTOUFS(vp->v_mount)->um_devvp; + for (bp = NULL, ++xap; --num; ++xap) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache, or if we were + * looking for an indirect block and we've found it. + */ + + metalbn = xap->in_lbn; + if (daddr == 0 && !incore(vp, metalbn) || metalbn == bn) + break; + /* + * If we get here, we've either got the block in the cache + * or we have a disk address for it, go fetch it. + */ + if (bp) + brelse(bp); + + xap->in_exists = 1; + bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); + if (bp->b_flags & (B_DONE | B_DELWRI)) { + trace(TR_BREADHIT, pack(vp, size), metalbn); + } +#ifdef DIAGNOSTIC + else if (!daddr) + panic("ufs_bmaparry: indirect block not in cache"); +#endif + else { + trace(TR_BREADMISS, pack(vp, size), metalbn); + bp->b_blkno = blkptrtodb(ump, daddr); + bp->b_flags |= B_READ; + VOP_STRATEGY(bp); + curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + if (error = biowait(bp)) { + brelse(bp); + return (error); + } + } + + daddr = ((daddr_t *)bp->b_data)[xap->in_off]; + if (num == 1 && daddr && runp) + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, ((daddr_t *)bp->b_data)[bn - 1], + ((daddr_t *)bp->b_data)[bn]); + ++bn, ++*runp); + } + if (bp) + brelse(bp); + + daddr = blkptrtodb(ump, daddr); + *bnp = daddr == 0 ? -1 : daddr; + return (0); +} + +/* + * Create an array of logical block number/offset pairs which represent the + * path of indirect blocks required to access a data block. The first "pair" + * contains the logical block number of the appropriate single, double or + * triple indirect block and the offset into the inode indirect block array. + * Note, the logical block number of the inode single/double/triple indirect + * block appears twice in the array, once with the offset into the i_ib and + * once with the offset into the page itself. 
+ */ +int +ufs_getlbns(vp, bn, ap, nump) + struct vnode *vp; + register daddr_t bn; + struct indir *ap; + int *nump; +{ + long metalbn, realbn; + struct ufsmount *ump; + int blockcnt, i, numlevels, off; + + ump = VFSTOUFS(vp->v_mount); + if (nump) + *nump = 0; + numlevels = 0; + realbn = bn; + if ((long)bn < 0) + bn = -(long)bn; + + /* The first NDADDR blocks are direct blocks. */ + if (bn < NDADDR) + return (0); + + /* + * Determine the number of levels of indirection. After this loop + * is done, blockcnt indicates the number of data blocks possible + * at the given level of indirection, and NIADDR - i is the number + * of levels of indirection needed to locate the requested block. + */ + for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + if (i == 0) + return (EFBIG); + blockcnt *= MNINDIR(ump); + if (bn < blockcnt) + break; + } + + /* Calculate the address of the first meta-block. */ + if (realbn >= 0) + metalbn = -(realbn - bn + NIADDR - i); + else + metalbn = -(-realbn - bn + NIADDR - i); + + /* + * At each iteration, off is the offset into the bap array which is + * an array of disk addresses at the current level of indirection. + * The logical block number and the offset in that block are stored + * into the argument array. + */ + ap->in_lbn = metalbn; + ap->in_off = off = NIADDR - i; + ap->in_exists = 0; + ap++; + for (++numlevels; i <= NIADDR; i++) { + /* If searching for a meta-data block, quit when found. */ + if (metalbn == realbn) + break; + + blockcnt /= MNINDIR(ump); + off = (bn / blockcnt) % MNINDIR(ump); + + ++numlevels; + ap->in_lbn = metalbn; + ap->in_off = off; + ap->in_exists = 0; + ++ap; + + metalbn -= -1 + off * blockcnt; + } + if (nump) + *nump = numlevels; + return (0); +} diff --git a/sys/ufs/ufs/ufs_disksubr.c b/sys/ufs/ufs/ufs_disksubr.c new file mode 100644 index 0000000..78dede4 --- /dev/null +++ b/sys/ufs/ufs/ufs_disksubr.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/disklabel.h> +#include <sys/syslog.h> + +/* + * Seek sort for disks. We depend on the driver which calls us using b_resid + * as the current cylinder number. + * + * The argument ap structure holds a b_actf activity chain pointer on which we + * keep two queues, sorted in ascending cylinder order. The first queue holds + * those requests which are positioned after the current cylinder (in the first + * request); the second holds requests which came in after their cylinder number + * was passed. Thus we implement a one way scan, retracting after reaching the + * end of the drive to the first request on the second queue, at which time it + * becomes the first queue. + * + * A one-way scan is natural because of the way UNIX read-ahead blocks are + * allocated. + */ + +/* + * For portability with historic industry practice, the + * cylinder number has to be maintained in the `b_resid' + * field. + */ +#define b_cylinder b_resid + +void +disksort(ap, bp) + register struct buf *ap, *bp; +{ + register struct buf *bq; + + /* If the queue is empty, then it's easy. */ + if (ap->b_actf == NULL) { + bp->b_actf = NULL; + ap->b_actf = bp; + return; + } + + /* + * If we lie after the first (currently active) request, then we + * must locate the second request list and add ourselves to it. + */ + bq = ap->b_actf; + if (bp->b_cylinder < bq->b_cylinder) { + while (bq->b_actf) { + /* + * Check for an ``inversion'' in the normally ascending + * cylinder numbers, indicating the start of the second + * request list. + */ + if (bq->b_actf->b_cylinder < bq->b_cylinder) { + /* + * Search the second request list for the first + * request at a larger cylinder number. We go + * before that; if there is no such request, we + * go at end. + */ + do { + if (bp->b_cylinder < + bq->b_actf->b_cylinder) + goto insert; + if (bp->b_cylinder == + bq->b_actf->b_cylinder && + bp->b_blkno < bq->b_actf->b_blkno) + goto insert; + bq = bq->b_actf; + } while (bq->b_actf); + goto insert; /* after last */ + } + bq = bq->b_actf; + } + /* + * No inversions... we will go after the last, and + * be the first request in the second request list. + */ + goto insert; + } + /* + * Request is at/after the current request... + * sort in the first request list. + */ + while (bq->b_actf) { + /* + * We want to go after the current request if there is an + * inversion after it (i.e. it is the end of the first + * request list), or if the next request is a larger cylinder + * than our request. + */ + if (bq->b_actf->b_cylinder < bq->b_cylinder || + bp->b_cylinder < bq->b_actf->b_cylinder || + (bp->b_cylinder == bq->b_actf->b_cylinder && + bp->b_blkno < bq->b_actf->b_blkno)) + goto insert; + bq = bq->b_actf; + } + /* + * Neither a second list nor a larger request... we go at the end of + * the first list, which is the same as the end of the whole schebang. 
+ */ +insert: bp->b_actf = bq->b_actf; + bq->b_actf = bp; +} + +/* + * Attempt to read a disk label from a device using the indicated strategy + * routine. The label must be partly set up before this: secpercyl and + * anything required in the strategy routine (e.g., sector size) must be + * filled in before calling us. Returns NULL on success and an error + * string on failure. + */ +char * +readdisklabel(dev, strat, lp) + dev_t dev; + int (*strat)(); + register struct disklabel *lp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + + if (lp->d_secperunit == 0) + lp->d_secperunit = 0x1fffffff; + lp->d_npartitions = 1; + if (lp->d_partitions[0].p_size == 0) + lp->d_partitions[0].p_size = 0x1fffffff; + lp->d_partitions[0].p_offset = 0; + + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = LABELSECTOR; + bp->b_bcount = lp->d_secsize; + bp->b_flags = B_BUSY | B_READ; + bp->b_cylinder = LABELSECTOR / lp->d_secpercyl; + (*strat)(bp); + if (biowait(bp)) + msg = "I/O error"; + else for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *)((char *)bp->b_data + + DEV_BSIZE - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + bp->b_flags = B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility before setting it. + */ +int +setdisklabel(olp, nlp, openmask) + register struct disklabel *olp, *nlp; + u_long openmask; +{ + register i; + register struct partition *opp, *npp; + + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + while ((i = ffs((long)openmask)) != 0) { + i--; + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. XXX + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + +/* encoding of disk minor numbers, should be elsewhere... */ +#define dkunit(dev) (minor(dev) >> 3) +#define dkpart(dev) (minor(dev) & 07) +#define dkminor(unit, part) (((unit) << 3) | (part)) + +/* + * Write disk label back to device after modification. 
+ */ +int +writedisklabel(dev, strat, lp) + dev_t dev; + int (*strat)(); + register struct disklabel *lp; +{ + struct buf *bp; + struct disklabel *dlp; + int labelpart; + int error = 0; + + labelpart = dkpart(dev); + if (lp->d_partitions[labelpart].p_offset != 0) { + if (lp->d_partitions[0].p_offset != 0) + return (EXDEV); /* not quite right */ + labelpart = 0; + } + bp = geteblk((int)lp->d_secsize); + bp->b_dev = makedev(major(dev), dkminor(dkunit(dev), labelpart)); + bp->b_blkno = LABELSECTOR; + bp->b_bcount = lp->d_secsize; + bp->b_flags = B_READ; + (*strat)(bp); + if (error = biowait(bp)) + goto done; + for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *) + ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags = B_WRITE; + (*strat)(bp); + error = biowait(bp); + goto done; + } + } + error = ESRCH; +done: + brelse(bp); + return (error); +} + +/* + * Compute checksum for disk label. + */ +dkcksum(lp) + register struct disklabel *lp; +{ + register u_short *start, *end; + register u_short sum = 0; + + start = (u_short *)lp; + end = (u_short *)&lp->d_partitions[lp->d_npartitions]; + while (start < end) + sum ^= *start++; + return (sum); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf + * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. + * The message should be completed (with at least a newline) with printf + * or addlog, respectively. There is no trailing space. + */ +void +diskerr(bp, dname, what, pri, blkdone, lp) + register struct buf *bp; + char *dname, *what; + int pri, blkdone; + register struct disklabel *lp; +{ + int unit = dkunit(bp->b_dev), part = dkpart(bp->b_dev); + register void (*pr) __P((const char *, ...)); + char partname = 'a' + part; + int sn; + + if (pri != LOG_PRINTF) { + log(pri, ""); + pr = addlog; + } else + pr = printf; + (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, + bp->b_flags & B_READ ? "read" : "writ"); + sn = bp->b_blkno; + if (bp->b_bcount <= DEV_BSIZE) + (*pr)("%d", sn); + else { + if (blkdone >= 0) { + sn += blkdone; + (*pr)("%d of ", sn); + } + (*pr)("%d-%d", bp->b_blkno, + bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); + } + if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { +#ifdef tahoe + sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ +#endif + sn += lp->d_partitions[part].p_offset; + (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn, + sn / lp->d_secpercyl); + sn %= lp->d_secpercyl; + (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors); + } +} diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h new file mode 100644 index 0000000..e25923e --- /dev/null +++ b/sys/ufs/ufs/ufs_extern.h @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_extern.h 8.3 (Berkeley) 4/16/94 + */ + +struct buf; +struct direct; +struct disklabel; +struct fid; +struct flock; +struct inode; +struct mbuf; +struct mount; +struct nameidata; +struct proc; +struct ucred; +struct uio; +struct vattr; +struct vnode; +struct ufs_args; + +__BEGIN_DECLS +void diskerr + __P((struct buf *, char *, char *, int, int, struct disklabel *)); +void disksort __P((struct buf *, struct buf *)); +u_int dkcksum __P((struct disklabel *)); +char *readdisklabel __P((dev_t, int (*)(), struct disklabel *)); +int setdisklabel __P((struct disklabel *, struct disklabel *, u_long)); +int writedisklabel __P((dev_t, int (*)(), struct disklabel *)); + +int ufs_abortop __P((struct vop_abortop_args *)); +int ufs_access __P((struct vop_access_args *)); +int ufs_advlock __P((struct vop_advlock_args *)); +int ufs_bmap __P((struct vop_bmap_args *)); +int ufs_check_export __P((struct mount *, struct ufid *, struct mbuf *, + struct vnode **, int *exflagsp, struct ucred **)); +int ufs_checkpath __P((struct inode *, struct inode *, struct ucred *)); +int ufs_close __P((struct vop_close_args *)); +int ufs_create __P((struct vop_create_args *)); +void ufs_dirbad __P((struct inode *, doff_t, char *)); +int ufs_dirbadentry __P((struct vnode *, struct direct *, int)); +int ufs_dirempty __P((struct inode *, ino_t, struct ucred *)); +int ufs_direnter __P((struct inode *, struct vnode *,struct componentname *)); +int ufs_dirremove __P((struct vnode *, struct componentname*)); +int ufs_dirrewrite + __P((struct inode *, struct inode *, struct componentname *)); +int ufs_getattr __P((struct vop_getattr_args *)); +int ufs_getlbns __P((struct vnode *, daddr_t, struct indir *, int *)); +struct vnode * + ufs_ihashget __P((dev_t, ino_t)); +void ufs_ihashinit __P((void)); +void ufs_ihashins 
__P((struct inode *)); +struct vnode * + ufs_ihashlookup __P((dev_t, ino_t)); +void ufs_ihashrem __P((struct inode *)); +int ufs_inactive __P((struct vop_inactive_args *)); +int ufs_init __P((void)); +int ufs_ioctl __P((struct vop_ioctl_args *)); +int ufs_islocked __P((struct vop_islocked_args *)); +int ufs_link __P((struct vop_link_args *)); +int ufs_lock __P((struct vop_lock_args *)); +int ufs_lookup __P((struct vop_lookup_args *)); +int ufs_makeinode __P((int mode, struct vnode *, struct vnode **, struct componentname *)); +int ufs_mkdir __P((struct vop_mkdir_args *)); +int ufs_mknod __P((struct vop_mknod_args *)); +int ufs_mmap __P((struct vop_mmap_args *)); +int ufs_open __P((struct vop_open_args *)); +int ufs_pathconf __P((struct vop_pathconf_args *)); +int ufs_print __P((struct vop_print_args *)); +int ufs_readdir __P((struct vop_readdir_args *)); +int ufs_readlink __P((struct vop_readlink_args *)); +int ufs_reclaim __P((struct vop_reclaim_args *)); +int ufs_remove __P((struct vop_remove_args *)); +int ufs_rename __P((struct vop_rename_args *)); +int ufs_rmdir __P((struct vop_rmdir_args *)); +int ufs_root __P((struct mount *, struct vnode **)); +int ufs_seek __P((struct vop_seek_args *)); +int ufs_select __P((struct vop_select_args *)); +int ufs_setattr __P((struct vop_setattr_args *)); +int ufs_start __P((struct mount *, int, struct proc *)); +int ufs_strategy __P((struct vop_strategy_args *)); +int ufs_symlink __P((struct vop_symlink_args *)); +int ufs_unlock __P((struct vop_unlock_args *)); +int ufs_vinit __P((struct mount *, + int (**)(), int (**)(), struct vnode **)); +int ufsspec_close __P((struct vop_close_args *)); +int ufsspec_read __P((struct vop_read_args *)); +int ufsspec_write __P((struct vop_write_args *)); + +#ifdef FIFO +int ufsfifo_read __P((struct vop_read_args *)); +int ufsfifo_write __P((struct vop_write_args *)); +int ufsfifo_close __P((struct vop_close_args *)); +#endif +__END_DECLS diff --git a/sys/ufs/ufs/ufs_ihash.c b/sys/ufs/ufs/ufs_ihash.c new file mode 100644 index 0000000..4a37c90 --- /dev/null +++ b/sys/ufs/ufs/ufs_ihash.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_ihash.c 8.4 (Berkeley) 12/30/93 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/proc.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufs_extern.h> + +/* + * Structures associated with inode cacheing. + */ +struct inode **ihashtbl; +u_long ihash; /* size of hash table - 1 */ +#define INOHASH(device, inum) (((device) + (inum)) & ihash) + +/* + * Initialize inode hash table. + */ +void +ufs_ihashinit() +{ + + ihashtbl = hashinit(desiredvnodes, M_UFSMNT, &ihash); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, return it, even if it is locked. + */ +struct vnode * +ufs_ihashlookup(device, inum) + dev_t device; + ino_t inum; +{ + register struct inode *ip; + + for (ip = ihashtbl[INOHASH(device, inum)];; ip = ip->i_next) { + if (ip == NULL) + return (NULL); + if (inum == ip->i_number && device == ip->i_dev) + return (ITOV(ip)); + } + /* NOTREACHED */ +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, but locked, wait for it. + */ +struct vnode * +ufs_ihashget(device, inum) + dev_t device; + ino_t inum; +{ + register struct inode *ip; + struct vnode *vp; + + for (;;) + for (ip = ihashtbl[INOHASH(device, inum)];; ip = ip->i_next) { + if (ip == NULL) + return (NULL); + if (inum == ip->i_number && device == ip->i_dev) { + if (ip->i_flag & IN_LOCKED) { + ip->i_flag |= IN_WANTED; + sleep(ip, PINOD); + break; + } + vp = ITOV(ip); + if (!vget(vp, 1)) + return (vp); + break; + } + } + /* NOTREACHED */ +} + +/* + * Insert the inode into the hash table, and return it locked. + */ +void +ufs_ihashins(ip) + struct inode *ip; +{ + struct inode **ipp, *iq; + + ipp = &ihashtbl[INOHASH(ip->i_dev, ip->i_number)]; + if (iq = *ipp) + iq->i_prev = &ip->i_next; + ip->i_next = iq; + ip->i_prev = ipp; + *ipp = ip; + if (ip->i_flag & IN_LOCKED) + panic("ufs_ihashins: already locked"); + if (curproc) + ip->i_lockholder = curproc->p_pid; + else + ip->i_lockholder = -1; + ip->i_flag |= IN_LOCKED; +} + +/* + * Remove the inode from the hash table. + */ +void +ufs_ihashrem(ip) + register struct inode *ip; +{ + register struct inode *iq; + + if (iq = ip->i_next) + iq->i_prev = ip->i_prev; + *ip->i_prev = iq; +#ifdef DIAGNOSTIC + ip->i_next = NULL; + ip->i_prev = NULL; +#endif +} diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c new file mode 100644 index 0000000..ac876f9 --- /dev/null +++ b/sys/ufs/ufs/ufs_inode.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_inode.c 8.4 (Berkeley) 1/21/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/kernel.h> +#include <sys/malloc.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +u_long nextgennumber; /* Next generation number to assign. */ +int prtactive = 0; /* 1 => print out reclaim of active vnodes */ + +int +ufs_init() +{ + static int first = 1; + + if (!first) + return (0); + first = 0; + +#ifdef DIAGNOSTIC + if ((sizeof(struct inode) - 1) & sizeof(struct inode)) + printf("ufs_init: bad size %d\n", sizeof(struct inode)); +#endif + ufs_ihashinit(); + dqinit(); + return (0); +} + +/* + * Last reference to an inode. If necessary, write or delete it. + */ +int +ufs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + struct timeval tv; + int mode, error; + extern int prtactive; + + if (prtactive && vp->v_usecount != 0) + vprint("ffs_inactive: pushing active", vp); + + /* Get rid of inodes related to stale file handles. 
*/ + if (ip->i_mode == 0) { + if ((vp->v_flag & VXLOCK) == 0) + vgone(vp); + return (0); + } + + error = 0; +#ifdef DIAGNOSTIC + if (VOP_ISLOCKED(vp)) + panic("ffs_inactive: locked inode"); + if (curproc) + ip->i_lockholder = curproc->p_pid; + else + ip->i_lockholder = -1; +#endif + ip->i_flag |= IN_LOCKED; + if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { +#ifdef QUOTA + if (!getinoquota(ip)) + (void)chkiq(ip, -1, NOCRED, 0); +#endif + error = VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, NULL); + ip->i_rdev = 0; + mode = ip->i_mode; + ip->i_mode = 0; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + VOP_VFREE(vp, ip->i_number, mode); + } + if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { + tv = time; + VOP_UPDATE(vp, &tv, &tv, 0); + } + VOP_UNLOCK(vp); + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + if (vp->v_usecount == 0 && ip->i_mode == 0) + vgone(vp); + return (error); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +ufs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip; + int i, type; + + if (prtactive && vp->v_usecount != 0) + vprint("ufs_reclaim: pushing active", vp); + /* + * Remove the inode from its hash chain. + */ + ip = VTOI(vp); + ufs_ihashrem(ip); + /* + * Purge old data structures associated with the inode. + */ + cache_purge(vp); + if (ip->i_devvp) { + vrele(ip->i_devvp); + ip->i_devvp = 0; + } +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) { + if (ip->i_dquot[i] != NODQUOT) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } +#endif + switch (vp->v_mount->mnt_stat.f_type) { + case MOUNT_UFS: + type = M_FFSNODE; + break; + case MOUNT_MFS: + type = M_MFSNODE; + break; + case MOUNT_LFS: + type = M_LFSNODE; + break; + default: + panic("ufs_reclaim: not ufs file"); + } + FREE(vp->v_data, type); + vp->v_data = NULL; + return (0); +} diff --git a/sys/ufs/ufs/ufs_lockf.c b/sys/ufs/ufs/ufs_lockf.c new file mode 100644 index 0000000..cb9a737 --- /dev/null +++ b/sys/ufs/ufs/ufs_lockf.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/fcntl.h> + +#include <ufs/ufs/lockf.h> +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufs_extern.h> + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. + */ +int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +int lockf_debug = 0; +#endif + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 + +/* + * Set a byte-range lock. + */ +int +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct inode *ip = lock->lf_inode; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while (block = lf_getblock(lock)) { + /* + * Free the structure and return if nonblocking. + */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + wproc = (struct proc *)block->lf_id; + while (wproc->p_wchan && + (wproc->p_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)wproc->p_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. 
+ */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + lf_addblock(block, lock); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + if (error = tsleep((caddr_t)lock, priority, lockstr, 0)) { + /* + * Delete ourselves from the waiting to lock list. + */ + for (block = lock->lf_next; + block != NOLOCKF; + block = block->lf_block) { + if (block->lf_block != lock) + continue; + block->lf_block = block->lf_block->lf_block; + break; + } + /* + * If we did not find ourselves on the list, but + * are still linked onto a lock list, then something + * is very wrong. + */ + if (block == NOLOCKF && lock->lf_next != NOLOCKF) + panic("lf_setlock: lost lock"); + free(lock, M_LOCKF); + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. + * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. + */ + prev = &ip->i_lockf; + block = ip->i_lockf; + needtolink = 1; + for (;;) { + if (ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap)) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. + */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + ltmp = lock->lf_block; + lock->lf_block = overlap->lf_block; + lf_addblock(lock, ltmp); + } + /* + * Add the new lock if necessary and delete the overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. 
+ */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. + * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. + */ +int +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct inode *ip = unlock->lf_inode; + register struct lockf *lf = ip->i_lockf; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = &ip->i_lockf; + while (ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) { + /* + * Wakeup the list of locks to be retried. + */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. + */ +int +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if (block = lf_getblock(lock)) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. + */ +struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = lock->lf_inode->i_lockf; + int ovcase; + + prev = &lock->lf_inode->i_lockf; + while (ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. 
+ */ +int +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) || (lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Add a lock to the end of the blocked list. + */ +void +lf_addblock(lock, blocked) + struct lockf *lock; + struct lockf *blocked; +{ + register struct lockf *lf; + + if (blocked == NOLOCKF) + return; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("addblock: adding", blocked); + lf_print("to blocked list of", lock); + } +#endif /* LOCKF_DEBUG */ + if ((lf = lock->lf_block) == NOLOCKF) { + lock->lf_block = blocked; + return; + } + while (lf->lf_block != NOLOCKF) + lf = lf->lf_block; + lf->lf_block = blocked; + return; +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +void +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if spliting into only two pieces. 
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + splitlock->lf_block = NOLOCKF; + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +void +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *blocklist, *wakelock; + + blocklist = listhead->lf_block; + listhead->lf_block = NOLOCKF; + while (blocklist != NOLOCKF) { + wakelock = blocklist; + blocklist = blocklist->lf_block; + wakelock->lf_block = NOLOCKF; + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup((caddr_t)wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +void +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock 0x%lx for ", tag, lock); + if (lock->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid); + else + printf("id 0x%x", lock->lf_id); + printf(" in ino %d on dev <%d, %d>, %s, start %d, end %d", + lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? "unlock" : + "unknown", lock->lf_start, lock->lf_end); + if (lock->lf_block) + printf(" block 0x%x\n", lock->lf_block); + else + printf("\n"); +} + +void +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf; + + printf("%s: Lock list for ino %d on dev <%d, %d>:\n", + tag, lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock 0x%lx for ", lf); + if (lf->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid); + else + printf("id 0x%x", lf->lf_id); + printf(", %s, start %d, end %d", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", lf->lf_start, lf->lf_end); + if (lf->lf_block) + printf(" block 0x%x\n", lf->lf_block); + else + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c new file mode 100644 index 0000000..87c6802 --- /dev/null +++ b/sys/ufs/ufs/ufs_lookup.c @@ -0,0 +1,970 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lookup.c 8.6 (Berkeley) 4/1/94 + */ + +#include <sys/param.h> +#include <sys/namei.h> +#include <sys/buf.h> +#include <sys/file.h> +#include <sys/mount.h> +#include <sys/vnode.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/dir.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +struct nchstats nchstats; +#ifdef DIAGNOSTIC +int dirchk = 1; +#else +int dirchk = 0; +#endif + +#define FSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does a vrele and vput + * instead of two vputs. 
+ * + * Overall outline of ufs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ +int +ufs_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vdp; /* vnode for directory being searched */ + register struct inode *dp; /* inode for directory being searched */ + struct buf *bp; /* a buffer of directory entries */ + register struct direct *ep; /* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + enum {NONE, COMPACT, FOUND} slotstatus; + doff_t slotoffset; /* offset of area with free space */ + int slotsize; /* size of area at slotoffset */ + int slotfreespace; /* amount of space free in slot */ + int slotneeded; /* size of the entry we're seeking */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + doff_t prevoff; /* prev entry dp->i_offset */ + struct vnode *pdp; /* saved dp during symlink work */ + struct vnode *tdp; /* returned by VFS_VGET */ + doff_t enduseful; /* pointer past last used dir slot */ + u_long bmask; /* block offset mask */ + int lockparent; /* 1 => lockparent flag is set */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int namlen, error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + int nameiop = cnp->cn_nameiop; + + bp = NULL; + slotoffset = -1; + *vpp = NULL; + vdp = ap->a_dvp; + dp = VTOI(vdp); + lockparent = flags & LOCKPARENT; + wantparent = flags & (LOCKPARENT|WANTPARENT); + + /* + * Check accessibility of directory. + */ + if ((dp->i_mode & IFMT) != IFDIR) + return (ENOTDIR); + if (error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc)) + return (error); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Before tediously performing a linear scan of the directory, + * check the name cache to see if the directory/name pair + * we are looking for is known already. + */ + if (error = cache_lookup(vdp, vpp, cnp)) { + int vpid; /* capability number of vnode */ + + if (error == ENOENT) + return (error); + /* + * Get the next vnode in the path. + * See comment below starting `Step through' for + * an explanation of the locking protocol. + */ + pdp = vdp; + dp = VTOI(*vpp); + vdp = *vpp; + vpid = vdp->v_id; + if (pdp == vdp) { /* lookup on "." */ + VREF(vdp); + error = 0; + } else if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp); + error = vget(vdp, 1); + if (!error && lockparent && (flags & ISLASTCN)) + error = VOP_LOCK(pdp); + } else { + error = vget(vdp, 1); + if (!lockparent || error || !(flags & ISLASTCN)) + VOP_UNLOCK(pdp); + } + /* + * Check that the capability number did not change + * while we were waiting for the lock. 
+ */ + if (!error) { + if (vpid == vdp->v_id) + return (0); + vput(vdp); + if (lockparent && pdp != vdp && (flags & ISLASTCN)) + VOP_UNLOCK(pdp); + } + if (error = VOP_LOCK(pdp)) + return (error); + vdp = pdp; + dp = VTOI(pdp); + *vpp = NULL; + } + + /* + * Suppress search for slots unless creating + * file and at end of pathname, in which case + * we watch for a place to put the new file in + * case it doesn't already exist. + */ + slotstatus = FOUND; + slotfreespace = slotsize = slotneeded = 0; + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN)) { + slotstatus = NONE; + slotneeded = (sizeof(struct direct) - MAXNAMLEN + + cnp->cn_namelen + 3) &~ 3; + } + + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. + * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. + */ + bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + if (nameiop != LOOKUP || dp->i_diroff == 0 || + dp->i_diroff > dp->i_size) { + entryoffsetinblock = 0; + dp->i_offset = 0; + numdirpasses = 1; + } else { + dp->i_offset = dp->i_diroff; + if ((entryoffsetinblock = dp->i_offset & bmask) && + (error = VOP_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp))) + return (error); + numdirpasses = 2; + nchstats.ncs_2passes++; + } + prevoff = dp->i_offset; + endsearch = roundup(dp->i_size, DIRBLKSIZ); + enduseful = 0; + +searchloop: + while (dp->i_offset < endsearch) { + /* + * If necessary, get the next directory block. + */ + if ((dp->i_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp); + if (error = + VOP_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp)) + return (error); + entryoffsetinblock = 0; + } + /* + * If still looking for a slot, and at a DIRBLKSIZE + * boundary, have to start looking for free space again. + */ + if (slotstatus == NONE && + (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { + slotoffset = -1; + slotfreespace = 0; + } + /* + * Get pointer to next entry. + * Full validation checks are slow, so we only check + * enough to insure forward progress through the + * directory. Complete checks can be run by patching + * "dirchk" to be true. + */ + ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); + if (ep->d_reclen == 0 || + dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock)) { + int i; + + ufs_dirbad(dp, dp->i_offset, "mangled entry"); + i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); + dp->i_offset += i; + entryoffsetinblock += i; + continue; + } + + /* + * If an appropriate sized slot has not yet been found, + * check to see if one is available. Also accumulate space + * in the current block so that we can determine if + * compaction is viable. + */ + if (slotstatus != FOUND) { + int size = ep->d_reclen; + + if (ep->d_ino != 0) + size -= DIRSIZ(FSFMT(vdp), ep); + if (size > 0) { + if (size >= slotneeded) { + slotstatus = FOUND; + slotoffset = dp->i_offset; + slotsize = ep->d_reclen; + } else if (slotstatus == NONE) { + slotfreespace += size; + if (slotoffset == -1) + slotoffset = dp->i_offset; + if (slotfreespace >= slotneeded) { + slotstatus = COMPACT; + slotsize = dp->i_offset + + ep->d_reclen - slotoffset; + } + } + } + } + + /* + * Check for a name match. 
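
The slot-search arithmetic above sizes each candidate slot as the fixed directory-entry header plus the name and its terminating NUL, rounded up to a 4-byte boundary (the "&~ 3" expression). A small userland sketch of that calculation follows; struct dirhdr and dir_recsize are illustrative stand-ins, not names from this source.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for the fixed part of a directory entry. */
struct dirhdr {
	uint32_t d_ino;		/* inode number, 0 means the slot is free */
	uint16_t d_reclen;	/* total length of this record */
	uint8_t  d_type;	/* file type */
	uint8_t  d_namlen;	/* length of the name that follows */
};

/*
 * Smallest record that can hold a name of the given length:
 * header + name + NUL, rounded up to a 4-byte boundary, which is
 * what the "(... + cn_namelen + 3) &~ 3" expression computes.
 */
static size_t
dir_recsize(size_t namelen)
{
	return ((sizeof(struct dirhdr) + namelen + 1 + 3) & ~(size_t)3);
}

int
main(void)
{
	const char *names[] = { "a", "passwd", "averylongfilename" };
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-20s needs %zu bytes\n",
		    names[i], dir_recsize(strlen(names[i])));
	return (0);
}
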
+ */ + if (ep->d_ino) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (vdp->v_mount->mnt_maxsymlinklen > 0) + namlen = ep->d_namlen; + else + namlen = ep->d_type; +# else + namlen = ep->d_namlen; +# endif + if (namlen == cnp->cn_namelen && + !bcmp(cnp->cn_nameptr, ep->d_name, + (unsigned)namlen)) { + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. + */ + dp->i_ino = ep->d_ino; + dp->i_reclen = ep->d_reclen; + brelse(bp); + goto found; + } + } + prevoff = dp->i_offset; + dp->i_offset += ep->d_reclen; + entryoffsetinblock += ep->d_reclen; + if (ep->d_ino) + enduseful = dp->i_offset; + } +/* notfound: */ + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + dp->i_offset = 0; + endsearch = dp->i_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp); + /* + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. + */ + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN) && dp->i_nlink != 0) { + /* + * Access for write is interpreted as allowing + * creation of files in the directory. + */ + if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) + return (error); + /* + * Return an indication of where the new directory + * entry should be put. If we didn't find a slot, + * then set dp->i_count to 0 indicating + * that the new slot belongs at the end of the + * directory. If we found a slot, then the new entry + * can be put in the range from dp->i_offset to + * dp->i_offset + dp->i_count. + */ + if (slotstatus == NONE) { + dp->i_offset = roundup(dp->i_size, DIRBLKSIZ); + dp->i_count = 0; + enduseful = dp->i_offset; + } else { + dp->i_offset = slotoffset; + dp->i_count = slotsize; + if (enduseful < slotoffset + slotsize) + enduseful = slotoffset + slotsize; + } + dp->i_endoff = roundup(enduseful, DIRBLKSIZ); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * We return with the directory locked, so that + * the parameters we set up above will still be + * valid if we actually decide to do a direnter(). + * We return ni_vp == NULL to indicate that the entry + * does not currently exist; we leave a pointer to + * the (locked) directory inode in ndp->ni_dvp. + * The pathname buffer is saved so that the name + * can be obtained later. + * + * NB - if the directory is unlocked, then this + * information cannot be used. + */ + cnp->cn_flags |= SAVENAME; + if (!lockparent) + VOP_UNLOCK(vdp); + return (EJUSTRETURN); + } + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(vdp, *vpp, cnp); + return (ENOENT); + +found: + if (numdirpasses == 2) + nchstats.ncs_pass2++; + /* + * Check that directory length properly reflects presence + * of this entry. + */ + if (entryoffsetinblock + DIRSIZ(FSFMT(vdp), ep) > dp->i_size) { + ufs_dirbad(dp, dp->i_offset, "i_size too small"); + dp->i_size = entryoffsetinblock + DIRSIZ(FSFMT(vdp), ep); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + } + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. + */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1); + + /* + * If deleting, and at end of pathname, return + * parameters which can be used to remove file. 
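
The BYTE_ORDER conditional above exists because the old (4.2BSD) directory format stored the name length as a 16-bit field in the bytes that the new format splits into d_type and d_namlen; on a little-endian machine the old length therefore shows up in d_type. A rough demonstration, using simplified header layouts that are assumptions for illustration only:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* New format splits the old 16-bit name-length field into type + namlen. */
struct newhdr { uint32_t d_ino; uint16_t d_reclen; uint8_t d_type; uint8_t d_namlen; };
struct oldhdr { uint32_t d_ino; uint16_t d_reclen; uint16_t d_namlen; };

int
main(void)
{
	struct oldhdr o = { 2, 12, 5 };	/* old-format entry, name length 5 */
	struct newhdr n;

	memcpy(&n, &o, sizeof(n));
	/*
	 * On a little-endian host the old name length lands in d_type
	 * (prints "d_type=5 d_namlen=0"); on big-endian its zero high
	 * byte lands there instead, so d_namlen is still correct.
	 * That is what the #if above compensates for.
	 */
	printf("d_type=%u d_namlen=%u\n", n.d_type, n.d_namlen);
	return (0);
}
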
+ * If the wantparent flag isn't set, we return only + * the directory (in ndp->ni_dvp), otherwise we go + * on and lock the inode, being careful with ".". + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + /* + * Write access to directory required to delete files. + */ + if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) + return (error); + /* + * Return pointer to current entry in dp->i_offset, + * and distance past previous entry (if there + * is a previous entry in this block) in dp->i_count. + * Save directory inode pointer in ndp->ni_dvp for dirremove(). + */ + if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) + dp->i_count = 0; + else + dp->i_count = dp->i_offset - prevoff; + if (dp->i_number == dp->i_ino) { + VREF(vdp); + *vpp = vdp; + return (0); + } + if (error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp)) + return (error); + /* + * If directory is "sticky", then user must own + * the directory, or the file in it, else she + * may not delete it (unless she's root). This + * implements append-only directories. + */ + if ((dp->i_mode & ISVTX) && + cred->cr_uid != 0 && + cred->cr_uid != dp->i_uid && + VTOI(tdp)->i_uid != cred->cr_uid) { + vput(tdp); + return (EPERM); + } + *vpp = tdp; + if (!lockparent) + VOP_UNLOCK(vdp); + return (0); + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && wantparent && + (flags & ISLASTCN)) { + if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) + return (error); + /* + * Careful about locking second inode. + * This can only occur if the target is ".". + */ + if (dp->i_number == dp->i_ino) + return (EISDIR); + if (error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp)) + return (error); + *vpp = tdp; + cnp->cn_flags |= SAVENAME; + if (!lockparent) + VOP_UNLOCK(vdp); + return (0); + } + + /* + * Step through the translation in the name. We do not `vput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the VFS_VGET for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the file system has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = vdp; + if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp); /* race to get the inode */ + if (error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp)) { + VOP_LOCK(pdp); + return (error); + } + if (lockparent && (flags & ISLASTCN) && + (error = VOP_LOCK(pdp))) { + vput(tdp); + return (error); + } + *vpp = tdp; + } else if (dp->i_number == dp->i_ino) { + VREF(vdp); /* we want ourself, ie "." 
*/ + *vpp = vdp; + } else { + if (error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp)) + return (error); + if (!lockparent || !(flags & ISLASTCN)) + VOP_UNLOCK(pdp); + *vpp = tdp; + } + + /* + * Insert name into cache if appropriate. + */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} + +void +ufs_dirbad(ip, offset, how) + struct inode *ip; + doff_t offset; + char *how; +{ + struct mount *mp; + + mp = ITOV(ip)->v_mount; + (void)printf("%s: bad dir ino %d at offset %d: %s\n", + mp->mnt_stat.f_mntonname, ip->i_number, offset, how); + if ((mp->mnt_stat.f_flags & MNT_RDONLY) == 0) + panic("bad dir"); +} + +/* + * Do consistency checking on a directory entry: + * record length must be multiple of 4 + * entry must fit in rest of its DIRBLKSIZ block + * record must be large enough to contain entry + * name is not longer than MAXNAMLEN + * name must be as long as advertised, and null terminated + */ +int +ufs_dirbadentry(dp, ep, entryoffsetinblock) + struct vnode *dp; + register struct direct *ep; + int entryoffsetinblock; +{ + register int i; + int namlen; + +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (dp->v_mount->mnt_maxsymlinklen > 0) + namlen = ep->d_namlen; + else + namlen = ep->d_type; +# else + namlen = ep->d_namlen; +# endif + if ((ep->d_reclen & 0x3) != 0 || + ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || + ep->d_reclen < DIRSIZ(FSFMT(dp), ep) || namlen > MAXNAMLEN) { + /*return (1); */ + printf("First bad\n"); + goto bad; + } + for (i = 0; i < namlen; i++) + if (ep->d_name[i] == '\0') { + /*return (1); */ + printf("Second bad\n"); + goto bad; + } + if (ep->d_name[i]) + goto bad; + return (ep->d_name[i]); +bad: + return(1); +} + +/* + * Write a directory entry after a call to namei, using the parameters + * that it left in nameidata. The argument ip is the inode which the new + * directory entry will refer to. Dvp is a pointer to the directory to + * be written, which was left locked by namei. Remaining parameters + * (dp->i_offset, dp->i_count) indicate how the space for the new + * entry is to be obtained. + */ +int +ufs_direnter(ip, dvp, cnp) + struct inode *ip; + struct vnode *dvp; + register struct componentname *cnp; +{ + register struct direct *ep, *nep; + register struct inode *dp; + struct buf *bp; + struct direct newdir; + struct iovec aiov; + struct uio auio; + u_int dsize; + int error, loc, newentrysize, spacefree; + char *dirbuf; + +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & SAVENAME) == 0) + panic("direnter: missing name"); +#endif + dp = VTOI(dvp); + newdir.d_ino = ip->i_number; + newdir.d_namlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); + if (dvp->v_mount->mnt_maxsymlinklen > 0) + newdir.d_type = IFTODT(ip->i_mode); + else { + newdir.d_type = 0; +# if (BYTE_ORDER == LITTLE_ENDIAN) + { u_char tmp = newdir.d_namlen; + newdir.d_namlen = newdir.d_type; + newdir.d_type = tmp; } +# endif + } + newentrysize = DIRSIZ(FSFMT(dvp), &newdir); + if (dp->i_count == 0) { + /* + * If dp->i_count is 0, then namei could find no + * space in the directory. Here, dp->i_offset will + * be on a directory block boundary and we will write the + * new entry into a fresh block. 
+ */ + if (dp->i_offset & (DIRBLKSIZ - 1)) + panic("ufs_direnter: newblk"); + auio.uio_offset = dp->i_offset; + newdir.d_reclen = DIRBLKSIZ; + auio.uio_resid = newentrysize; + aiov.iov_len = newentrysize; + aiov.iov_base = (caddr_t)&newdir; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = (struct proc *)0; + error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred); + if (DIRBLKSIZ > + VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) + /* XXX should grow with balloc() */ + panic("ufs_direnter: frag size"); + else if (!error) { + dp->i_size = roundup(dp->i_size, DIRBLKSIZ); + dp->i_flag |= IN_CHANGE; + } + return (error); + } + + /* + * If dp->i_count is non-zero, then namei found space + * for the new entry in the range dp->i_offset to + * dp->i_offset + dp->i_count in the directory. + * To use this space, we may have to compact the entries located + * there, by copying them together towards the beginning of the + * block, leaving the free space in one usable chunk at the end. + */ + + /* + * Increase size of directory if entry eats into new space. + * This should never push the size past a new multiple of + * DIRBLKSIZE. + * + * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. + */ + if (dp->i_offset + dp->i_count > dp->i_size) + dp->i_size = dp->i_offset + dp->i_count; + /* + * Get the block containing the space for the new directory entry. + */ + if (error = VOP_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp)) + return (error); + /* + * Find space for the new entry. In the simple case, the entry at + * offset base will have the space. If it does not, then namei + * arranged that compacting the region dp->i_offset to + * dp->i_offset + dp->i_count would yield the + * space. + */ + ep = (struct direct *)dirbuf; + dsize = DIRSIZ(FSFMT(dvp), ep); + spacefree = ep->d_reclen - dsize; + for (loc = ep->d_reclen; loc < dp->i_count; ) { + nep = (struct direct *)(dirbuf + loc); + if (ep->d_ino) { + /* trim the existing slot */ + ep->d_reclen = dsize; + ep = (struct direct *)((char *)ep + dsize); + } else { + /* overwrite; nothing there; header is ours */ + spacefree += dsize; + } + dsize = DIRSIZ(FSFMT(dvp), nep); + spacefree += nep->d_reclen - dsize; + loc += nep->d_reclen; + bcopy((caddr_t)nep, (caddr_t)ep, dsize); + } + /* + * Update the pointer fields in the previous entry (if any), + * copy in the new entry, and write out the block. + */ + if (ep->d_ino == 0) { + if (spacefree + dsize < newentrysize) + panic("ufs_direnter: compact1"); + newdir.d_reclen = spacefree + dsize; + } else { + if (spacefree < newentrysize) + panic("ufs_direnter: compact2"); + newdir.d_reclen = spacefree; + ep->d_reclen = dsize; + ep = (struct direct *)((char *)ep + dsize); + } + bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize); + error = VOP_BWRITE(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + if (!error && dp->i_endoff && dp->i_endoff < dp->i_size) + error = VOP_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_SYNC, + cnp->cn_cred, cnp->cn_proc); + return (error); +} + +/* + * Remove a directory entry after a call to namei, using + * the parameters which it left in nameidata. The entry + * dp->i_offset contains the offset into the directory of the + * entry to be eliminated. The dp->i_count field contains the + * size of the previous record in the directory. If this + * is 0, the first entry is being deleted, so we need only + * zero the inode number to mark the entry as free. 
If the + * entry is not the first in the directory, we must reclaim + * the space of the now empty record by adding the record size + * to the size of the previous entry. + */ +int +ufs_dirremove(dvp, cnp) + struct vnode *dvp; + struct componentname *cnp; +{ + register struct inode *dp; + struct direct *ep; + struct buf *bp; + int error; + + dp = VTOI(dvp); + if (dp->i_count == 0) { + /* + * First entry in block: set d_ino to zero. + */ + if (error = + VOP_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) + return (error); + ep->d_ino = 0; + error = VOP_BWRITE(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); + } + /* + * Collapse new free space into previous entry. + */ + if (error = VOP_BLKATOFF(dvp, (off_t)(dp->i_offset - dp->i_count), + (char **)&ep, &bp)) + return (error); + ep->d_reclen += dp->i_reclen; + error = VOP_BWRITE(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Rewrite an existing directory entry to point at the inode + * supplied. The parameters describing the directory entry are + * set up by a call to namei. + */ +int +ufs_dirrewrite(dp, ip, cnp) + struct inode *dp, *ip; + struct componentname *cnp; +{ + struct buf *bp; + struct direct *ep; + struct vnode *vdp = ITOV(dp); + int error; + + if (error = VOP_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp)) + return (error); + ep->d_ino = ip->i_number; + if (vdp->v_mount->mnt_maxsymlinklen > 0) + ep->d_type = IFTODT(ip->i_mode); + error = VOP_BWRITE(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. + * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct direct. + * + * NB: does not handle corrupted directories. + */ +int +ufs_dirempty(ip, parentino, cred) + register struct inode *ip; + ino_t parentino; + struct ucred *cred; +{ + register off_t off; + struct dirtemplate dbuf; + register struct direct *dp = (struct direct *)&dbuf; + int error, count, namlen; +#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + + for (off = 0; off < ip->i_size; off += dp->d_reclen) { + error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, + UIO_SYSSPACE, IO_NODELOCKED, cred, &count, (struct proc *)0); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. + */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->d_ino == 0) + continue; + /* accept only "." and ".." */ +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0) + namlen = dp->d_namlen; + else + namlen = dp->d_type; +# else + namlen = dp->d_namlen; +# endif + if (namlen > 2) + return (0); + if (dp->d_name[0] != '.') + return (0); + /* + * At this point namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (namlen == 1) + continue; + if (dp->d_name[1] == '.' && dp->d_ino == parentino) + continue; + return (0); + } + return (1); +} + +/* + * Check if source directory is in the path of the target directory. + * Target is supplied locked, source is unlocked. + * The target is always vput before returning. 
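
ufs_dirremove above frees an entry in one of two ways: if it is first in its DIRBLKSIZ block it just clears d_ino, otherwise it folds the record length into the preceding entry. A minimal userland model of the same idea, with dir_remove and struct dirhdr as hypothetical names:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirhdr {			/* illustrative fixed entry header */
	uint32_t d_ino;
	uint16_t d_reclen;
	uint8_t  d_type;
	uint8_t  d_namlen;
};

/*
 * Remove the entry at "offset" within one directory block.  "prevlen"
 * is the distance back to the previous entry (0 if this entry starts
 * the block), corresponding to dp->i_count in the code above.
 */
static void
dir_remove(char *block, unsigned offset, unsigned prevlen)
{
	struct dirhdr *ep = (struct dirhdr *)(block + offset);

	if (prevlen == 0) {
		ep->d_ino = 0;			/* first in block: mark free */
	} else {
		struct dirhdr *prev =
		    (struct dirhdr *)(block + offset - prevlen);
		prev->d_reclen += ep->d_reclen;	/* absorb the freed record */
	}
}

int
main(void)
{
	char block[512];
	struct dirhdr *e1 = (struct dirhdr *)block;
	struct dirhdr *e2 = (struct dirhdr *)(block + 16);

	memset(block, 0, sizeof(block));
	e1->d_ino = 2; e1->d_reclen = 16;
	e2->d_ino = 5; e2->d_reclen = sizeof(block) - 16;

	dir_remove(block, 16, e1->d_reclen);	/* delete the second entry */
	printf("first entry now spans %u bytes\n", e1->d_reclen);
	return (0);
}
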
+ */ +int +ufs_checkpath(source, target, cred) + struct inode *source, *target; + struct ucred *cred; +{ + struct vnode *vp; + int error, rootino, namlen; + struct dirtemplate dirbuf; + + vp = ITOV(target); + if (target->i_number == source->i_number) { + error = EEXIST; + goto out; + } + rootino = ROOTINO; + error = 0; + if (target->i_number == rootino) + goto out; + + for (;;) { + if (vp->v_type != VDIR) { + error = ENOTDIR; + break; + } + error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED, cred, (int *)0, (struct proc *)0); + if (error != 0) + break; +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen > 0) + namlen = dirbuf.dotdot_namlen; + else + namlen = dirbuf.dotdot_type; +# else + namlen = dirbuf.dotdot_namlen; +# endif + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + error = ENOTDIR; + break; + } + if (dirbuf.dotdot_ino == source->i_number) { + error = EINVAL; + break; + } + if (dirbuf.dotdot_ino == rootino) + break; + vput(vp); + if (error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino, &vp)) { + vp = NULL; + break; + } + } + +out: + if (error == ENOTDIR) + printf("checkpath: .. not a directory\n"); + if (vp != NULL) + vput(vp); + return (error); +} diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c new file mode 100644 index 0000000..15cb1cf --- /dev/null +++ b/sys/ufs/ufs/ufs_quota.c @@ -0,0 +1,938 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_quota.c 8.2 (Berkeley) 12/30/93 + */ +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/malloc.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/mount.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +/* + * Quota name to error message mapping. + */ +static char *quotatypes[] = INITQFNAMES; + +/* + * Set up the quotas for an inode. + * + * This routine completely defines the semantics of quotas. + * If other criterion want to be used to establish quotas, the + * MAXQUOTAS value in quotas.h should be increased, and the + * additional dquots set up here. + */ +int +getinoquota(ip) + register struct inode *ip; +{ + struct ufsmount *ump; + struct vnode *vp = ITOV(ip); + int error; + + ump = VFSTOUFS(vp->v_mount); + /* + * Set up the user quota based on file uid. + * EINVAL means that quotas are not enabled. + */ + if (ip->i_dquot[USRQUOTA] == NODQUOT && + (error = + dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) && + error != EINVAL) + return (error); + /* + * Set up the group quota based on file gid. + * EINVAL means that quotas are not enabled. + */ + if (ip->i_dquot[GRPQUOTA] == NODQUOT && + (error = + dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && + error != EINVAL) + return (error); + return (0); +} + +/* + * Update disk usage, and take corrective action. + */ +int +chkdq(ip, change, cred, flags) + register struct inode *ip; + long change; + struct ucred *cred; + int flags; +{ + register struct dquot *dq; + register int i; + int ncurblocks, error; + +#ifdef DIAGNOSTIC + if ((flags & CHOWN) == 0) + chkdquot(ip); +#endif + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + ncurblocks = dq->dq_curblocks + change; + if (ncurblocks >= 0) + dq->dq_curblocks = ncurblocks; + else + dq->dq_curblocks = 0; + dq->dq_flags &= ~DQ_BLKS; + dq->dq_flags |= DQ_MOD; + } + return (0); + } + if ((flags & FORCE) == 0 && cred->cr_uid != 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + if (error = chkdqchg(ip, change, cred, i)) + return (error); + } + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + dq->dq_curblocks += change; + dq->dq_flags |= DQ_MOD; + } + return (0); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +int +chkdqchg(ip, change, cred, type) + struct inode *ip; + long change; + struct ucred *cred; + int type; +{ + register struct dquot *dq = ip->i_dquot[type]; + long ncurblocks = dq->dq_curblocks + change; + + /* + * If user would exceed their hard limit, disallow space allocation. + */ + if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { + if ((dq->dq_flags & DQ_BLKS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s disk limit reached\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type]); + dq->dq_flags |= DQ_BLKS; + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow space + * allocation. Reset time limit as they cross their soft limit. 
+ */ + if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { + if (dq->dq_curblocks < dq->dq_bsoftlimit) { + dq->dq_btime = time.tv_sec + + VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type]; + if (ip->i_uid == cred->cr_uid) + uprintf("\n%s: warning, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], "disk quota exceeded"); + return (0); + } + if (time.tv_sec > dq->dq_btime) { + if ((dq->dq_flags & DQ_BLKS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], + "disk quota exceeded for too long"); + dq->dq_flags |= DQ_BLKS; + } + return (EDQUOT); + } + } + return (0); +} + +/* + * Check the inode limit, applying corrective action. + */ +int +chkiq(ip, change, cred, flags) + register struct inode *ip; + long change; + struct ucred *cred; + int flags; +{ + register struct dquot *dq; + register int i; + int ncurinodes, error; + +#ifdef DIAGNOSTIC + if ((flags & CHOWN) == 0) + chkdquot(ip); +#endif + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + ncurinodes = dq->dq_curinodes + change; + if (ncurinodes >= 0) + dq->dq_curinodes = ncurinodes; + else + dq->dq_curinodes = 0; + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + } + return (0); + } + if ((flags & FORCE) == 0 && cred->cr_uid != 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + if (error = chkiqchg(ip, change, cred, i)) + return (error); + } + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + dq->dq_curinodes += change; + dq->dq_flags |= DQ_MOD; + } + return (0); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +int +chkiqchg(ip, change, cred, type) + struct inode *ip; + long change; + struct ucred *cred; + int type; +{ + register struct dquot *dq = ip->i_dquot[type]; + long ncurinodes = dq->dq_curinodes + change; + + /* + * If user would exceed their hard limit, disallow inode allocation. + */ + if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { + if ((dq->dq_flags & DQ_INODS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s inode limit reached\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type]); + dq->dq_flags |= DQ_INODS; + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow inode + * allocation. Reset time limit as they cross their soft limit. 
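
Together, the hard-limit and soft-limit tests above implement the standard quota policy: a hard limit can never be exceeded, while a soft limit may be exceeded only until a grace timer, started the moment the limit is first crossed, runs out. A condensed model of that decision; struct quota, quota_chkblk and the field names are hypothetical, and the uprintf messages and flag handling of the real code are omitted:

#include <stdio.h>
#include <time.h>

struct quota {
	long	curblocks;	/* blocks currently charged */
	long	bsoftlimit;	/* soft limit, 0 means none */
	long	bhardlimit;	/* hard limit, 0 means none */
	time_t	btime;		/* when the grace period expires */
	long	grace;		/* grace period length in seconds */
};

/* Return 0 if "change" more blocks may be charged, -1 (EDQUOT) if not. */
static int
quota_chkblk(struct quota *q, long change, time_t now)
{
	long ncur = q->curblocks + change;

	if (q->bhardlimit && ncur >= q->bhardlimit)
		return (-1);			/* hard limit: always refuse */
	if (q->bsoftlimit && ncur >= q->bsoftlimit) {
		if (q->curblocks < q->bsoftlimit) {
			/* first crossing: start the grace clock, allow it */
			q->btime = now + q->grace;
			return (0);
		}
		if (now > q->btime)
			return (-1);		/* grace period expired */
	}
	return (0);
}

int
main(void)
{
	struct quota q = { 90, 100, 200, 0, 7 * 24 * 60 * 60 };
	time_t now = time(NULL);

	/* crossing the soft limit is allowed, but the grace clock starts */
	printf("charge 20: %s\n", quota_chkblk(&q, 20, now) ? "EDQUOT" : "ok");
	printf("charge 150: %s\n", quota_chkblk(&q, 150, now) ? "EDQUOT" : "ok");
	return (0);
}
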
+ */ + if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + if (dq->dq_curinodes < dq->dq_isoftlimit) { + dq->dq_itime = time.tv_sec + + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type]; + if (ip->i_uid == cred->cr_uid) + uprintf("\n%s: warning, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], "inode quota exceeded"); + return (0); + } + if (time.tv_sec > dq->dq_itime) { + if ((dq->dq_flags & DQ_INODS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], + "inode quota exceeded for too long"); + dq->dq_flags |= DQ_INODS; + } + return (EDQUOT); + } + } + return (0); +} + +#ifdef DIAGNOSTIC +/* + * On filesystems with quotas enabled, it is an error for a file to change + * size and not to have a dquot structure associated with it. + */ +void +chkdquot(ip) + register struct inode *ip; +{ + struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount); + register int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] == NULLVP || + (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING))) + continue; + if (ip->i_dquot[i] == NODQUOT) { + vprint("chkdquot: missing dquot", ITOV(ip)); + panic("missing dquot"); + } + } +} +#endif + +/* + * Code to process quotactl commands. + */ + +/* + * Q_QUOTAON - set up a quota file for a particular file system. + */ +int +quotaon(p, mp, type, fname) + struct proc *p; + struct mount *mp; + register int type; + caddr_t fname; +{ + register struct ufsmount *ump = VFSTOUFS(mp); + register struct vnode *vp, **vpp; + struct vnode *nextvp; + struct dquot *dq; + int error; + struct nameidata nd; + + vpp = &ump->um_quotas[type]; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, p); + if (error = vn_open(&nd, FREAD|FWRITE, 0)) + return (error); + vp = nd.ni_vp; + VOP_UNLOCK(vp); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (EACCES); + } + if (vfs_busy(mp)) { + (void) vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (EBUSY); + } + if (*vpp != vp) + quotaoff(p, mp, type); + ump->um_qflags[type] |= QTF_OPENING; + mp->mnt_flag |= MNT_QUOTA; + vp->v_flag |= VSYSTEM; + *vpp = vp; + /* + * Save the credential of the process that turned on quotas. + * Set up the time limits for this quota. + */ + crhold(p->p_ucred); + ump->um_cred[type] = p->p_ucred; + ump->um_btime[type] = MAX_DQ_TIME; + ump->um_itime[type] = MAX_IQ_TIME; + if (dqget(NULLVP, 0, ump, type, &dq) == 0) { + if (dq->dq_btime > 0) + ump->um_btime[type] = dq->dq_btime; + if (dq->dq_itime > 0) + ump->um_itime[type] = dq->dq_itime; + dqrele(NULLVP, dq); + } + /* + * Search vnodes associated with this mount point, + * adding references to quota file being opened. + * NB: only need to add dquot's for inodes being modified. + */ +again: + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { + nextvp = vp->v_mntvnodes.le_next; + if (vp->v_writecount == 0) + continue; + if (vget(vp, 1)) + goto again; + if (error = getinoquota(VTOI(vp))) { + vput(vp); + break; + } + vput(vp); + if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) + goto again; + } + ump->um_qflags[type] &= ~QTF_OPENING; + if (error) + quotaoff(p, mp, type); + vfs_unbusy(mp); + return (error); +} + +/* + * Q_QUOTAOFF - turn off disk quotas for a filesystem. 
+ */ +int +quotaoff(p, mp, type) + struct proc *p; + struct mount *mp; + register int type; +{ + register struct vnode *vp; + struct vnode *qvp, *nextvp; + struct ufsmount *ump = VFSTOUFS(mp); + register struct dquot *dq; + register struct inode *ip; + int error; + + if ((mp->mnt_flag & MNT_MPBUSY) == 0) + panic("quotaoff: not busy"); + if ((qvp = ump->um_quotas[type]) == NULLVP) + return (0); + ump->um_qflags[type] |= QTF_CLOSING; + /* + * Search vnodes associated with this mount point, + * deleting any references to quota file being closed. + */ +again: + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { + nextvp = vp->v_mntvnodes.le_next; + if (vget(vp, 1)) + goto again; + ip = VTOI(vp); + dq = ip->i_dquot[type]; + ip->i_dquot[type] = NODQUOT; + dqrele(vp, dq); + vput(vp); + if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) + goto again; + } + dqflush(qvp); + qvp->v_flag &= ~VSYSTEM; + error = vn_close(qvp, FREAD|FWRITE, p->p_ucred, p); + ump->um_quotas[type] = NULLVP; + crfree(ump->um_cred[type]); + ump->um_cred[type] = NOCRED; + ump->um_qflags[type] &= ~QTF_CLOSING; + for (type = 0; type < MAXQUOTAS; type++) + if (ump->um_quotas[type] != NULLVP) + break; + if (type == MAXQUOTAS) + mp->mnt_flag &= ~MNT_QUOTA; + return (error); +} + +/* + * Q_GETQUOTA - return current values in a dqblk structure. + */ +int +getquota(mp, id, type, addr) + struct mount *mp; + u_long id; + int type; + caddr_t addr; +{ + struct dquot *dq; + int error; + + if (error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq)) + return (error); + error = copyout((caddr_t)&dq->dq_dqb, addr, sizeof (struct dqblk)); + dqrele(NULLVP, dq); + return (error); +} + +/* + * Q_SETQUOTA - assign an entire dqblk structure. + */ +int +setquota(mp, id, type, addr) + struct mount *mp; + u_long id; + int type; + caddr_t addr; +{ + register struct dquot *dq; + struct dquot *ndq; + struct ufsmount *ump = VFSTOUFS(mp); + struct dqblk newlim; + int error; + + if (error = copyin(addr, (caddr_t)&newlim, sizeof (struct dqblk))) + return (error); + if (error = dqget(NULLVP, id, ump, type, &ndq)) + return (error); + dq = ndq; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. + */ + newlim.dqb_curblocks = dq->dq_curblocks; + newlim.dqb_curinodes = dq->dq_curinodes; + if (dq->dq_id != 0) { + newlim.dqb_btime = dq->dq_btime; + newlim.dqb_itime = dq->dq_itime; + } + if (newlim.dqb_bsoftlimit && + dq->dq_curblocks >= newlim.dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) + newlim.dqb_btime = time.tv_sec + ump->um_btime[type]; + if (newlim.dqb_isoftlimit && + dq->dq_curinodes >= newlim.dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) + newlim.dqb_itime = time.tv_sec + ump->um_itime[type]; + dq->dq_dqb = newlim; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + dqrele(NULLVP, dq); + return (0); +} + +/* + * Q_SETUSE - set current inode and block usage. 
+ */ +int +setuse(mp, id, type, addr) + struct mount *mp; + u_long id; + int type; + caddr_t addr; +{ + register struct dquot *dq; + struct ufsmount *ump = VFSTOUFS(mp); + struct dquot *ndq; + struct dqblk usage; + int error; + + if (error = copyin(addr, (caddr_t)&usage, sizeof (struct dqblk))) + return (error); + if (error = dqget(NULLVP, id, ump, type, &ndq)) + return (error); + dq = ndq; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + /* + * Reset time limit if have a soft limit and were + * previously under it, but are now over it. + */ + if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && + usage.dqb_curblocks >= dq->dq_bsoftlimit) + dq->dq_btime = time.tv_sec + ump->um_btime[type]; + if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && + usage.dqb_curinodes >= dq->dq_isoftlimit) + dq->dq_itime = time.tv_sec + ump->um_itime[type]; + dq->dq_curblocks = usage.dqb_curblocks; + dq->dq_curinodes = usage.dqb_curinodes; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + dqrele(NULLVP, dq); + return (0); +} + +/* + * Q_SYNC - sync quota files to disk. + */ +int +qsync(mp) + struct mount *mp; +{ + struct ufsmount *ump = VFSTOUFS(mp); + register struct vnode *vp, *nextvp; + register struct dquot *dq; + register int i; + + /* + * Check if the mount point has any quotas. + * If not, simply return. + */ + if ((mp->mnt_flag & MNT_MPBUSY) == 0) + panic("qsync: not busy"); + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + if (i == MAXQUOTAS) + return (0); + /* + * Search vnodes associated with this mount point, + * synchronizing any modified dquot structures. + */ +again: + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { + nextvp = vp->v_mntvnodes.le_next; + if (VOP_ISLOCKED(vp)) + continue; + if (vget(vp, 1)) + goto again; + for (i = 0; i < MAXQUOTAS; i++) { + dq = VTOI(vp)->i_dquot[i]; + if (dq != NODQUOT && (dq->dq_flags & DQ_MOD)) + dqsync(vp, dq); + } + vput(vp); + if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) + goto again; + } + return (0); +} + +/* + * Code pertaining to management of the in-core dquot data structures. + */ +struct dquot **dqhashtbl; +u_long dqhash; + +/* + * Dquot free list. + */ +#define DQUOTINC 5 /* minimum free dquots desired */ +struct dquot *dqfreel, **dqback = &dqfreel; +long numdquot, desireddquot = DQUOTINC; + +/* + * Initialize the quota system. + */ +void +dqinit() +{ + + dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash); +} + +/* + * Obtain a dquot structure for the specified identifier and quota file + * reading the information from the file if necessary. + */ +int +dqget(vp, id, ump, type, dqp) + struct vnode *vp; + u_long id; + register struct ufsmount *ump; + register int type; + struct dquot **dqp; +{ + register struct dquot *dq, *dp, **dpp; + register struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + + dqvp = ump->um_quotas[type]; + if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { + *dqp = NODQUOT; + return (EINVAL); + } + /* + * Check the cache first. + */ + dpp = &dqhashtbl[((((int)(dqvp)) >> 8) + id) & dqhash]; + for (dq = *dpp; dq; dq = dq->dq_forw) { + if (dq->dq_id != id || + dq->dq_ump->um_quotas[dq->dq_type] != dqvp) + continue; + /* + * Cache hit with no references. Take + * the structure off the free list. 
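
The cache lookup above hashes on the pair (quota file vnode, id): the vnode pointer is shifted right to discard its low alignment bits, the id is added, and the sum is masked into a power-of-two bucket array (dqhash holds the table size minus one). A hypothetical restatement of just that bucket computation:

#include <stdint.h>
#include <stdio.h>

/*
 * Bucket index for a (quota file, id) pair.  "mask" is the table
 * size minus one, as produced by hashinit() in the code above.
 */
static unsigned long
dq_bucket(const void *qvp, unsigned long id, unsigned long mask)
{
	return ((((uintptr_t)qvp >> 8) + id) & mask);
}

int
main(void)
{
	int dummy_vnode;	/* stands in for the quota file's vnode */

	printf("bucket %lu\n", dq_bucket(&dummy_vnode, 1001, 255));
	return (0);
}
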
+ */ + if (dq->dq_cnt == 0) { + if ((dp = dq->dq_freef) != NODQUOT) + dp->dq_freeb = dq->dq_freeb; + else + dqback = dq->dq_freeb; + *dq->dq_freeb = dp; + } + DQREF(dq); + *dqp = dq; + return (0); + } + /* + * Not in cache, allocate a new one. + */ + if (dqfreel == NODQUOT && numdquot < MAXQUOTAS * desiredvnodes) + desireddquot += DQUOTINC; + if (numdquot < desireddquot) { + dq = (struct dquot *)malloc(sizeof *dq, M_DQUOT, M_WAITOK); + bzero((char *)dq, sizeof *dq); + numdquot++; + } else { + if ((dq = dqfreel) == NULL) { + tablefull("dquot"); + *dqp = NODQUOT; + return (EUSERS); + } + if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) + panic("free dquot isn't"); + if ((dp = dq->dq_freef) != NODQUOT) + dp->dq_freeb = &dqfreel; + else + dqback = &dqfreel; + dqfreel = dp; + dq->dq_freef = NULL; + dq->dq_freeb = NULL; + if (dp = dq->dq_forw) + dp->dq_back = dq->dq_back; + *dq->dq_back = dp; + } + /* + * Initialize the contents of the dquot structure. + */ + if (vp != dqvp) + VOP_LOCK(dqvp); + if (dp = *dpp) + dp->dq_back = &dq->dq_forw; + dq->dq_forw = dp; + dq->dq_back = dpp; + *dpp = dq; + DQREF(dq); + dq->dq_flags = DQ_LOCK; + dq->dq_id = id; + dq->dq_ump = ump; + dq->dq_type = type; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (caddr_t)&dq->dq_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(id * sizeof (struct dqblk)); + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = (struct proc *)0; + error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); + if (auio.uio_resid == sizeof(struct dqblk) && error == 0) + bzero((caddr_t)&dq->dq_dqb, sizeof(struct dqblk)); + if (vp != dqvp) + VOP_UNLOCK(dqvp); + if (dq->dq_flags & DQ_WANT) + wakeup((caddr_t)dq); + dq->dq_flags = 0; + /* + * I/O error in reading quota file, release + * quota structure and reflect problem to caller. + */ + if (error) { + if (dp = dq->dq_forw) + dp->dq_back = dq->dq_back; + *dq->dq_back = dp; + dq->dq_forw = NULL; + dq->dq_back = NULL; + dqrele(vp, dq); + *dqp = NODQUOT; + return (error); + } + /* + * Check for no limit to enforce. + * Initialize time values if necessary. + */ + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + if (dq->dq_id != 0) { + if (dq->dq_btime == 0) + dq->dq_btime = time.tv_sec + ump->um_btime[type]; + if (dq->dq_itime == 0) + dq->dq_itime = time.tv_sec + ump->um_itime[type]; + } + *dqp = dq; + return (0); +} + +/* + * Obtain a reference to a dquot. + */ +void +dqref(dq) + struct dquot *dq; +{ + + dq->dq_cnt++; +} + +/* + * Release a reference to a dquot. + */ +void +dqrele(vp, dq) + struct vnode *vp; + register struct dquot *dq; +{ + + if (dq == NODQUOT) + return; + if (dq->dq_cnt > 1) { + dq->dq_cnt--; + return; + } + if (dq->dq_flags & DQ_MOD) + (void) dqsync(vp, dq); + if (--dq->dq_cnt > 0) + return; + if (dqfreel != NODQUOT) { + *dqback = dq; + dq->dq_freeb = dqback; + } else { + dqfreel = dq; + dq->dq_freeb = &dqfreel; + } + dq->dq_freef = NODQUOT; + dqback = &dq->dq_freef; +} + +/* + * Update the disk quota in the quota file. 
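
dqget above also shows the quota file's on-disk layout: it is a flat array of fixed-size records indexed by uid or gid, so the record for id N starts at byte offset N times the record size, and a short read past end of file means no limits have been set yet. A userland sketch of that addressing; struct dqrec and read_quota are illustrative stand-ins rather than the real struct dqblk:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Illustrative stand-in for a fixed-size per-id quota record. */
struct dqrec {
	uint32_t bhardlimit, bsoftlimit, curblocks;
	uint32_t ihardlimit, isoftlimit, curinodes;
	int32_t	 btime, itime;
};

static int
read_quota(const char *qfile, unsigned long id, struct dqrec *out)
{
	int fd = open(qfile, O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return (-1);
	/* The record for "id" starts at id * record-size. */
	n = pread(fd, out, sizeof(*out), (off_t)id * sizeof(*out));
	close(fd);
	if (n < 0)
		return (-1);
	if (n < (ssize_t)sizeof(*out))
		memset(out, 0, sizeof(*out));	/* past EOF: no quota yet */
	return (0);
}

int
main(int argc, char **argv)
{
	struct dqrec q;

	if (argc == 3 &&
	    read_quota(argv[1], strtoul(argv[2], NULL, 10), &q) == 0)
		printf("blocks %u, soft %u, hard %u\n",
		    q.curblocks, q.bsoftlimit, q.bhardlimit);
	return (0);
}
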
+ */ +int +dqsync(vp, dq) + struct vnode *vp; + register struct dquot *dq; +{ + struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + + if (dq == NODQUOT) + panic("dqsync: dquot"); + if ((dq->dq_flags & DQ_MOD) == 0) + return (0); + if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) + panic("dqsync: file"); + if (vp != dqvp) + VOP_LOCK(dqvp); + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+2); + if ((dq->dq_flags & DQ_MOD) == 0) { + if (vp != dqvp) + VOP_UNLOCK(dqvp); + return (0); + } + } + dq->dq_flags |= DQ_LOCK; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (caddr_t)&dq->dq_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk)); + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_procp = (struct proc *)0; + error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); + if (auio.uio_resid && error == 0) + error = EIO; + if (dq->dq_flags & DQ_WANT) + wakeup((caddr_t)dq); + dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT); + if (vp != dqvp) + VOP_UNLOCK(dqvp); + return (error); +} + +/* + * Flush all entries from the cache for a particular vnode. + */ +void +dqflush(vp) + register struct vnode *vp; +{ + register struct dquot *dq, *dp, **dpp, *nextdq; + + /* + * Move all dquot's that used to refer to this quota + * file off their hash chains (they will eventually + * fall off the head of the free list and be re-used). + */ + for (dpp = &dqhashtbl[dqhash]; dpp >= dqhashtbl; dpp--) { + for (dq = *dpp; dq; dq = nextdq) { + nextdq = dq->dq_forw; + if (dq->dq_ump->um_quotas[dq->dq_type] != vp) + continue; + if (dq->dq_cnt) + panic("dqflush: stray dquot"); + if (dp = dq->dq_forw) + dp->dq_back = dq->dq_back; + *dq->dq_back = dp; + dq->dq_forw = NULL; + dq->dq_back = NULL; + dq->dq_ump = (struct ufsmount *)0; + } + } +} diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c new file mode 100644 index 0000000..5ead2c1 --- /dev/null +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -0,0 +1,295 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_readwrite.c 8.7 (Berkeley) 1/21/94 + */ + +#ifdef LFS_READWRITE +#define BLKSIZE(a, b, c) blksize(a) +#define FS struct lfs +#define I_FS i_lfs +#define READ lfs_read +#define READ_S "lfs_read" +#define WRITE lfs_write +#define WRITE_S "lfs_write" +#define fs_bsize lfs_bsize +#define fs_maxfilesize lfs_maxfilesize +#else +#define BLKSIZE(a, b, c) blksize(a, b, c) +#define FS struct fs +#define I_FS i_fs +#define READ ffs_read +#define READ_S "ffs_read" +#define WRITE ffs_write +#define WRITE_S "ffs_write" +#endif + +/* + * Vnode op for reading. + */ +/* ARGSUSED */ +READ(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp; + register struct inode *ip; + register struct uio *uio; + register FS *fs; + struct buf *bp; + daddr_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error; + u_short mode; + + vp = ap->a_vp; + ip = VTOI(vp); + mode = ip->i_mode; + uio = ap->a_uio; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("%s: mode", READ_S); + + if (vp->v_type == VLNK) { + if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) + panic("%s: short symlink", READ_S); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("%s: type %d", READ_S, vp->v_type); +#endif + fs = ip->I_FS; + if ((u_quad_t)uio->uio_offset > fs->fs_maxfilesize) + return (EFBIG); + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + size = BLKSIZE(fs, ip, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + +#ifdef LFS_READWRITE + (void)lfs_check(vp, lbn); + error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp); +#else + if (lblktosize(fs, nextlbn) > ip->i_size) + error = bread(vp, lbn, size, NOCRED, &bp); + else if (doclusterread) + error = cluster_read(vp, + ip->i_size, lbn, size, NOCRED, &bp); + else if (lbn - 1 == vp->v_lastr) { + int nextsize = BLKSIZE(fs, ip, nextlbn); + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, &bp); + } else + error = bread(vp, lbn, size, NOCRED, &bp); +#endif + if (error) + break; + vp->v_lastr = lbn; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. 
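
The per-iteration arithmetic in the read loop above is plain block math: find the logical block holding the current offset, how far into that block the offset falls, and how many bytes can be copied without running past the block, the request, or the file. A standalone restatement with hypothetical names:

#include <stdio.h>

/*
 * Given a file size, a read offset, a request length and the
 * filesystem block size, compute the parameters of one pass of
 * the read loop above: logical block, offset within it, and the
 * number of bytes to copy out of that block.
 */
static void
read_step(long long fsize, long long off, long long resid, long bsize,
    long long *lbn, long *blkoff, long long *xfer)
{
	long long infile = fsize - off;

	*lbn = off / bsize;
	*blkoff = (long)(off % bsize);
	*xfer = bsize - *blkoff;
	if (*xfer > resid)
		*xfer = resid;
	if (*xfer > infile)
		*xfer = infile;
}

int
main(void)
{
	long long lbn, xfer;
	long blkoff;

	/* e.g. an 8192-byte-block filesystem, reading 100 bytes at 8200 */
	read_step(20000, 8200, 100, 8192, &lbn, &blkoff, &xfer);
	printf("lbn %lld, blkoff %ld, xfersize %lld\n", lbn, blkoff, xfer);
	return (0);
}
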
+ */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + if (error = + uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio)) + break; + + if (S_ISREG(mode) && (xfersize + blkoffset == fs->fs_bsize || + uio->uio_offset == ip->i_size)) + bp->b_flags |= B_AGE; + brelse(bp); + } + if (bp != NULL) + brelse(bp); + ip->i_flag |= IN_ACCESS; + return (error); +} + +/* + * Vnode op for writing. + */ +WRITE(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp; + register struct uio *uio; + register struct inode *ip; + register FS *fs; + struct buf *bp; + struct proc *p; + daddr_t lbn; + off_t osize; + int blkoffset, error, flags, ioflag, resid, size, xfersize; + + ioflag = ap->a_ioflag; + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("%s: mode", WRITE_S); +#endif + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->i_size; + if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("%s: nonsync dir write", WRITE_S); + break; + default: + panic("%s: type", WRITE_S); + } + + fs = ip->I_FS; + if (uio->uio_offset < 0 || + (u_quad_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) + return (EFBIG); + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no limits, I don't think it matters. + */ + p = uio->uio_procp; + if (vp->v_type == VREG && p && + uio->uio_offset + uio->uio_resid > + p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + psignal(p, SIGXFSZ); + return (EFBIG); + } + + resid = uio->uio_resid; + osize = ip->i_size; + flags = ioflag & IO_SYNC ? B_SYNC : 0; + + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; +#ifdef LFS_READWRITE + (void)lfs_check(vp, lbn); + error = lfs_balloc(vp, xfersize, lbn, &bp); +#else + if (fs->fs_bsize > xfersize) + flags |= B_CLRBUF; + else + flags &= ~B_CLRBUF; + + error = ffs_balloc(ip, + lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); +#endif + if (error) + break; + if (uio->uio_offset + xfersize > ip->i_size) { + ip->i_size = uio->uio_offset + xfersize; + vnode_pager_setsize(vp, (u_long)ip->i_size); + } + (void)vnode_pager_uncache(vp); + + size = BLKSIZE(fs, ip, lbn) - bp->b_resid; + if (size < xfersize) + xfersize = size; + + error = + uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); +#ifdef LFS_READWRITE + (void)VOP_BWRITE(bp); +#else + if (ioflag & IO_SYNC) + (void)bwrite(bp); + else if (xfersize + blkoffset == fs->fs_bsize) + if (doclusterwrite) + cluster_write(bp, ip->i_size); + else { + bp->b_flags |= B_AGE; + bawrite(bp); + } + else + bdwrite(bp); +#endif + if (error || xfersize == 0) + break; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. 
+ */ + if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) + ip->i_mode &= ~(ISUID | ISGID); + if (error) { + if (ioflag & IO_UNIT) { + (void)VOP_TRUNCATE(vp, osize, + ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) + error = VOP_UPDATE(vp, &time, &time, 1); + return (error); +} diff --git a/sys/ufs/ufs/ufs_vfsops.c b/sys/ufs/ufs/ufs_vfsops.c new file mode 100644 index 0000000..f806e0b --- /dev/null +++ b/sys/ufs/ufs/ufs_vfsops.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vfsops.c 8.4 (Berkeley) 4/16/94 + */ + +#include <sys/param.h> +#include <sys/mbuf.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/malloc.h> + +#include <miscfs/specfs/specdev.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +/* + * Flag to permit forcible unmounting. + */ +int doforce = 1; + +/* + * Make a filesystem operational. + * Nothing to do at the moment. + */ +/* ARGSUSED */ +int +ufs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * Return the root of a filesystem. 
+ */ +int +ufs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *nvp; + int error; + + if (error = VFS_VGET(mp, (ino_t)ROOTINO, &nvp)) + return (error); + *vpp = nvp; + return (0); +} + +/* + * Do operations associated with quotas + */ +int +ufs_quotactl(mp, cmds, uid, arg, p) + struct mount *mp; + int cmds; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + int cmd, type, error; + +#ifndef QUOTA + return (EOPNOTSUPP); +#else + if (uid == -1) + uid = p->p_cred->p_ruid; + cmd = cmds >> SUBCMDSHIFT; + + switch (cmd) { + case Q_GETQUOTA: + case Q_SYNC: + if (uid == p->p_cred->p_ruid) + break; + /* fall through */ + default: + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + } + + type = cmd & SUBCMDMASK; + if ((u_int)type >= MAXQUOTAS) + return (EINVAL); + + switch (cmd) { + + case Q_QUOTAON: + return (quotaon(p, mp, type, arg)); + + case Q_QUOTAOFF: + if (vfs_busy(mp)) + return (0); + error = quotaoff(p, mp, type); + vfs_unbusy(mp); + return (error); + + case Q_SETQUOTA: + return (setquota(mp, uid, type, arg)); + + case Q_SETUSE: + return (setuse(mp, uid, type, arg)); + + case Q_GETQUOTA: + return (getquota(mp, uid, type, arg)); + + case Q_SYNC: + if (vfs_busy(mp)) + return (0); + error = qsync(mp); + vfs_unbusy(mp); + return (error); + + default: + return (EINVAL); + } + /* NOTREACHED */ +#endif +} + +/* + * This is the generic part of fhtovp called after the underlying + * filesystem has validated the file handle. + * + * Verify that a host should have access to a filesystem, and if so + * return a vnode for the presented file handle. + */ +int +ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp) + register struct mount *mp; + struct ufid *ufhp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + register struct inode *ip; + register struct netcred *np; + register struct ufsmount *ump = VFSTOUFS(mp); + struct vnode *nvp; + int error; + + /* + * Get the export permission structure for this <mp, client> tuple. + */ + np = vfs_export_lookup(mp, &ump->um_export, nam); + if (np == NULL) + return (EACCES); + + if (error = VFS_VGET(mp, ufhp->ufid_ino, &nvp)) { + *vpp = NULLVP; + return (error); + } + ip = VTOI(nvp); + if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + *exflagsp = np->netc_exflags; + *credanonp = &np->netc_anon; + return (0); +} diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c new file mode 100644 index 0000000..7b7c883 --- /dev/null +++ b/sys/ufs/ufs/ufs_vnops.c @@ -0,0 +1,2159 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.10 (Berkeley) 4/1/94 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/resourcevar.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/conf.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/dirent.h> + +#include <vm/vm.h> + +#include <miscfs/specfs/specdev.h> + +#include <ufs/ufs/lockf.h> +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> +#include <ufs/ufs/dir.h> +#include <ufs/ufs/ufsmount.h> +#include <ufs/ufs/ufs_extern.h> + +static int ufs_chmod __P((struct vnode *, int, struct ucred *, struct proc *)); +static int ufs_chown + __P((struct vnode *, uid_t, gid_t, struct ucred *, struct proc *)); + +union _qcvt { + quad_t qcvt; + long val[2]; +}; +#define SETHIGH(q, h) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_HIGHWORD] = (h); \ + (q) = tmp.qcvt; \ +} +#define SETLOW(q, l) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_LOWWORD] = (l); \ + (q) = tmp.qcvt; \ +} + +/* + * Create a regular file + */ +int +ufs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int error; + + if (error = + ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + ap->a_dvp, ap->a_vpp, ap->a_cnp)) + return (error); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +int +ufs_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vattr *vap = ap->a_vap; + register struct vnode **vpp = ap->a_vpp; + register struct inode *ip; + int error; + + if (error = + ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, vpp, ap->a_cnp)) + return (error); + ip = VTOI(*vpp); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ + ip->i_rdev = vap->va_rdev; + } + /* + * Remove inode so that it will be reloaded by VFS_VGET and + * checked to see if it is an alias of an existing entry in + * the inode cache. 
+ */ + vput(*vpp); + (*vpp)->v_type = VNON; + vgone(*vpp); + *vpp = 0; + return (0); +} + +/* + * Open called. + * + * Nothing to do. + */ +/* ARGSUSED */ +int +ufs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* + * Files marked append-only must be opened for appending. + */ + if ((VTOI(ap->a_vp)->i_flags & APPEND) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + return (0); +} + +/* + * Close called. + * + * Update the times on the inode. + */ +/* ARGSUSED */ +int +ufs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + + if (vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) + ITIMES(ip, &time, &time); + return (0); +} + +int +ufs_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + register struct ucred *cred = ap->a_cred; + mode_t mask, mode = ap->a_mode; + register gid_t *gp; + int i, error; + +#ifdef DIAGNOSTIC + if (!VOP_ISLOCKED(vp)) { + vprint("ufs_access: not locked", vp); + panic("ufs_access: not locked"); + } +#endif +#ifdef QUOTA + if (mode & VWRITE) + switch (vp->v_type) { + case VDIR: + case VLNK: + case VREG: + if (error = getinoquota(ip)) + return (error); + break; + } +#endif + + /* If immutable bit set, nobody gets to write it. */ + if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) + return (EPERM); + + /* Otherwise, user id 0 always gets access. */ + if (cred->cr_uid == 0) + return (0); + + mask = 0; + + /* Otherwise, check the owner. */ + if (cred->cr_uid == ip->i_uid) { + if (mode & VEXEC) + mask |= S_IXUSR; + if (mode & VREAD) + mask |= S_IRUSR; + if (mode & VWRITE) + mask |= S_IWUSR; + return ((ip->i_mode & mask) == mask ? 0 : EACCES); + } + + /* Otherwise, check the groups. */ + for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) + if (ip->i_gid == *gp) { + if (mode & VEXEC) + mask |= S_IXGRP; + if (mode & VREAD) + mask |= S_IRGRP; + if (mode & VWRITE) + mask |= S_IWGRP; + return ((ip->i_mode & mask) == mask ? 0 : EACCES); + } + + /* Otherwise, check everyone else. */ + if (mode & VEXEC) + mask |= S_IXOTH; + if (mode & VREAD) + mask |= S_IROTH; + if (mode & VWRITE) + mask |= S_IWOTH; + return ((ip->i_mode & mask) == mask ? 
0 : EACCES); +} + +/* ARGSUSED */ +int +ufs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + register struct vattr *vap = ap->a_vap; + + ITIMES(ip, &time, &time); + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)ip->i_rdev; + vap->va_size = ip->i_din.di_size; + vap->va_atime = ip->i_atime; + vap->va_mtime = ip->i_mtime; + vap->va_ctime = ip->i_ctime; + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_bytes = dbtob(ip->i_blocks); + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + return (0); +} + +/* + * Set attribute vnode op. called from several syscalls + */ +int +ufs_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vattr *vap = ap->a_vap; + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + register struct ucred *cred = ap->a_cred; + register struct proc *p = ap->a_p; + struct timeval atimeval, mtimeval; + int error; + + /* + * Check for unsettable attributes. + */ + if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + return (EINVAL); + } + if (vap->va_flags != VNOVAL) { + if (cred->cr_uid != ip->i_uid && + (error = suser(cred, &p->p_acflag))) + return (error); + if (cred->cr_uid == 0) { + if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) && + securelevel > 0) + return (EPERM); + ip->i_flags = vap->va_flags; + } else { + if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) + return (EPERM); + ip->i_flags &= SF_SETTABLE; + ip->i_flags |= (vap->va_flags & UF_SETTABLE); + } + ip->i_flag |= IN_CHANGE; + if (vap->va_flags & (IMMUTABLE | APPEND)) + return (0); + } + if (ip->i_flags & (IMMUTABLE | APPEND)) + return (EPERM); + /* + * Go through the fields and update iff not VNOVAL. 
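The setattr path applies only the fields the caller actually supplied; everything else arrives as VNOVAL and is skipped. A small sketch of that convention, using a hypothetical attribute struct and -1 as the "not supplied" sentinel in place of the real VNOVAL from the vnode headers:

#include <stdio.h>

#define ANOVAL	(-1)		/* assumed "field not supplied" sentinel */

struct fattr {
	long	uid, gid, size;
};

/* Copy only the fields the caller filled in, leaving the rest untouched. */
static void
apply_attrs(struct fattr *cur, const struct fattr *req)
{
	if (req->uid != ANOVAL)
		cur->uid = req->uid;
	if (req->gid != ANOVAL)
		cur->gid = req->gid;
	if (req->size != ANOVAL)
		cur->size = req->size;
}

int
main(void)
{
	struct fattr cur = { 100, 100, 4096 };
	struct fattr req = { ANOVAL, 20, ANOVAL };	/* change the group only */

	apply_attrs(&cur, &req);
	printf("uid %ld gid %ld size %ld\n", cur.uid, cur.gid, cur.size);
	return (0);
}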
+ */ + if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) + if (error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, p)) + return (error); + if (vap->va_size != VNOVAL) { + if (vp->v_type == VDIR) + return (EISDIR); + if (error = VOP_TRUNCATE(vp, vap->va_size, 0, cred, p)) + return (error); + } + ip = VTOI(vp); + if (vap->va_atime.ts_sec != VNOVAL || vap->va_mtime.ts_sec != VNOVAL) { + if (cred->cr_uid != ip->i_uid && + (error = suser(cred, &p->p_acflag)) && + ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || + (error = VOP_ACCESS(vp, VWRITE, cred, p)))) + return (error); + if (vap->va_atime.ts_sec != VNOVAL) + ip->i_flag |= IN_ACCESS; + if (vap->va_mtime.ts_sec != VNOVAL) + ip->i_flag |= IN_CHANGE | IN_UPDATE; + atimeval.tv_sec = vap->va_atime.ts_sec; + atimeval.tv_usec = vap->va_atime.ts_nsec / 1000; + mtimeval.tv_sec = vap->va_mtime.ts_sec; + mtimeval.tv_usec = vap->va_mtime.ts_nsec / 1000; + if (error = VOP_UPDATE(vp, &atimeval, &mtimeval, 1)) + return (error); + } + error = 0; + if (vap->va_mode != (mode_t)VNOVAL) + error = ufs_chmod(vp, (int)vap->va_mode, cred, p); + return (error); +} + +/* + * Change the mode on a file. + * Inode must be locked before calling. + */ +static int +ufs_chmod(vp, mode, cred, p) + register struct vnode *vp; + register int mode; + register struct ucred *cred; + struct proc *p; +{ + register struct inode *ip = VTOI(vp); + int error; + + if (cred->cr_uid != ip->i_uid && + (error = suser(cred, &p->p_acflag))) + return (error); + if (cred->cr_uid) { + if (vp->v_type != VDIR && (mode & S_ISTXT)) + return (EFTYPE); + if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) + return (EPERM); + } + ip->i_mode &= ~ALLPERMS; + ip->i_mode |= (mode & ALLPERMS); + ip->i_flag |= IN_CHANGE; + if ((vp->v_flag & VTEXT) && (ip->i_mode & S_ISTXT) == 0) + (void) vnode_pager_uncache(vp); + return (0); +} + +/* + * Perform chown operation on inode ip; + * inode must be locked prior to call. + */ +static int +ufs_chown(vp, uid, gid, cred, p) + register struct vnode *vp; + uid_t uid; + gid_t gid; + struct ucred *cred; + struct proc *p; +{ + register struct inode *ip = VTOI(vp); + uid_t ouid; + gid_t ogid; + int error = 0; +#ifdef QUOTA + register int i; + long change; +#endif + + if (uid == (uid_t)VNOVAL) + uid = ip->i_uid; + if (gid == (gid_t)VNOVAL) + gid = ip->i_gid; + /* + * If we don't own the file, are trying to change the owner + * of the file, or are not a member of the target group, + * the caller must be superuser or the call fails. 
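The comment above spells out the non-superuser chown rule: the caller must already own the file, must not be giving it away, and must be a member of any new group. A hedged sketch of that predicate in isolation; in_groups() is a stand-in for the kernel's groupmember() scan of the credential's group list:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for groupmember(): is gid one of the caller's groups? */
static bool
in_groups(long gid, const long *groups, int ngroups)
{
	for (int i = 0; i < ngroups; i++)
		if (groups[i] == gid)
			return (true);
	return (false);
}

/*
 * Non-superuser chown is allowed only when the caller owns the file,
 * keeps the owner unchanged, and is a member of the target group.
 */
static bool
chown_allowed(long cr_uid, const long *cr_groups, int ngroups,
    long file_uid, long new_uid, long new_gid)
{
	if (cr_uid == 0)
		return (true);			/* superuser always wins */
	return (cr_uid == file_uid &&
	    new_uid == file_uid &&
	    in_groups(new_gid, cr_groups, ngroups));
}

int
main(void)
{
	long groups[] = { 20, 5 };

	printf("%d\n", chown_allowed(1001, groups, 2, 1001, 1001, 20)); /* 1 */
	printf("%d\n", chown_allowed(1001, groups, 2, 1001, 0, 20));    /* 0 */
	return (0);
}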
+ */ + if ((cred->cr_uid != ip->i_uid || uid != ip->i_uid || + !groupmember((gid_t)gid, cred)) && + (error = suser(cred, &p->p_acflag))) + return (error); + ogid = ip->i_gid; + ouid = ip->i_uid; +#ifdef QUOTA + if (error = getinoquota(ip)) + return (error); + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + change = ip->i_blocks; + (void) chkdq(ip, -change, cred, CHOWN); + (void) chkiq(ip, -1, cred, CHOWN); + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } +#endif + ip->i_gid = gid; + ip->i_uid = uid; +#ifdef QUOTA + if ((error = getinoquota(ip)) == 0) { + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { + if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) + goto good; + else + (void) chkdq(ip, -change, cred, CHOWN|FORCE); + } + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } + ip->i_gid = ogid; + ip->i_uid = ouid; + if (getinoquota(ip) == 0) { + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + (void) chkdq(ip, change, cred, FORCE|CHOWN); + (void) chkiq(ip, 1, cred, FORCE|CHOWN); + (void) getinoquota(ip); + } + return (error); +good: + if (getinoquota(ip)) + panic("chown: lost quota"); +#endif /* QUOTA */ + if (ouid != uid || ogid != gid) + ip->i_flag |= IN_CHANGE; + if (ouid != uid && cred->cr_uid != 0) + ip->i_mode &= ~ISUID; + if (ogid != gid && cred->cr_uid != 0) + ip->i_mode &= ~ISGID; + return (0); +} + +/* ARGSUSED */ +int +ufs_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (ENOTTY); +} + +/* ARGSUSED */ +int +ufs_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* + * We should really check to see if I/O is possible. + */ + return (1); +} + +/* + * Mmap a file + * + * NB Currently unsupported. + */ +/* ARGSUSED */ +int +ufs_mmap(ap) + struct vop_mmap_args /* { + struct vnode *a_vp; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (EINVAL); +} + +/* + * Seek on a file + * + * Nothing to do, so just return. 
+ */ +/* ARGSUSED */ +int +ufs_seek(ap) + struct vop_seek_args /* { + struct vnode *a_vp; + off_t a_oldoff; + off_t a_newoff; + struct ucred *a_cred; + } */ *ap; +{ + + return (0); +} + +int +ufs_remove(ap) + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct inode *ip; + register struct vnode *vp = ap->a_vp; + register struct vnode *dvp = ap->a_dvp; + int error; + + ip = VTOI(vp); + if ((ip->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(dvp)->i_flags & APPEND)) { + error = EPERM; + goto out; + } + if ((error = ufs_dirremove(dvp, ap->a_cnp)) == 0) { + ip->i_nlink--; + ip->i_flag |= IN_CHANGE; + } +out: + if (dvp == vp) + vrele(vp); + else + vput(vp); + vput(dvp); + return (error); +} + +/* + * link vnode call + */ +int +ufs_link(ap) + struct vop_link_args /* { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct vnode *tdvp = ap->a_tdvp; + register struct componentname *cnp = ap->a_cnp; + register struct inode *ip; + struct timeval tv; + int error; + +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & HASBUF) == 0) + panic("ufs_link: no name"); +#endif + if (vp->v_mount != tdvp->v_mount) { + VOP_ABORTOP(vp, cnp); + error = EXDEV; + goto out2; + } + if (vp != tdvp && (error = VOP_LOCK(tdvp))) { + VOP_ABORTOP(vp, cnp); + goto out2; + } + ip = VTOI(tdvp); + if ((nlink_t)ip->i_nlink >= LINK_MAX) { + VOP_ABORTOP(vp, cnp); + error = EMLINK; + goto out1; + } + if (ip->i_flags & (IMMUTABLE | APPEND)) { + VOP_ABORTOP(vp, cnp); + error = EPERM; + goto out1; + } + ip->i_nlink++; + ip->i_flag |= IN_CHANGE; + tv = time; + error = VOP_UPDATE(tdvp, &tv, &tv, 1); + if (!error) + error = ufs_direnter(ip, vp, cnp); + if (error) { + ip->i_nlink--; + ip->i_flag |= IN_CHANGE; + } + FREE(cnp->cn_pnbuf, M_NAMEI); +out1: + if (vp != tdvp) + VOP_UNLOCK(tdvp); +out2: + vput(vp); + return (error); +} + + + +/* + * relookup - lookup a path name component + * Used by lookup to re-aquire things. + */ +int +relookup(dvp, vpp, cnp) + struct vnode *dvp, **vpp; + struct componentname *cnp; +{ + register struct vnode *dp = 0; /* the directory we are searching */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; +#ifdef NAMEI_DIAGNOSTIC + int newhash; /* DEBUG: check name hash */ + char *cp; /* DEBUG: check name ptr/len */ +#endif + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + dp = dvp; + VOP_LOCK(dp); + +/* dirloop: */ + /* + * Search a new directory. + * + * The cn_hash value is for use by vfs_cache. + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. 
+ */ +#ifdef NAMEI_DIAGNOSTIC + for (newhash = 0, cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + newhash += (unsigned char)*cp; + if (newhash != cnp->cn_hash) + panic("relookup: bad hash"); + if (cnp->cn_namelen != cp - cnp->cn_nameptr) + panic ("relookup: bad len"); + if (*cp != 0) + panic("relookup: not last component"); + printf("{%s}: ", cnp->cn_nameptr); +#endif + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". + */ + if (cnp->cn_nameptr[0] == '\0') { + if (cnp->cn_nameiop != LOOKUP || wantparent) { + error = EISDIR; + goto bad; + } + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (!(cnp->cn_flags & LOCKLEAF)) + VOP_UNLOCK(dp); + *vpp = dp; + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + if (cnp->cn_flags & ISDOTDOT) + panic ("relookup: lookup on dot-dot"); + + /* + * We now have a segment name to search for, and a directory to search. + */ + if (error = VOP_LOOKUP(dp, vpp, cnp)) { +#ifdef DIAGNOSTIC + if (*vpp != NULL) + panic("leaf should be empty"); +#endif + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly || (dvp->v_mount->mnt_flag & MNT_RDONLY)) { + error = EROFS; + goto bad; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + return (0); + } + dp = *vpp; + +#ifdef DIAGNOSTIC + /* + * Check for symbolic link + */ + if (dp->v_type == VLNK && (cnp->cn_flags & FOLLOW)) + panic ("relookup: symlink found.\n"); +#endif + + /* + * Check for read-only file systems. + */ + if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) { + /* + * Disallow directory write attempts on read-only + * file systems. + */ + if (rdonly || (dp->v_mount->mnt_flag & MNT_RDONLY) || + (wantparent && + (dvp->v_mount->mnt_flag & MNT_RDONLY))) { + error = EROFS; + goto bad2; + } + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + + if (!wantparent) + vrele(dvp); + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) + VOP_UNLOCK(dvp); + vrele(dvp); +bad: + vput(dp); + *vpp = NULL; + return (error); +} + + +/* + * Rename system call. + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. + * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. 
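The rename comment above describes rename("foo", "bar") as unlink("bar"); link("foo", "bar"); unlink("foo"), performed so that the target name never disappears. A userland illustration of the non-atomic emulation the filesystem avoids, using two scratch names in the current directory:

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

/*
 * Naive, non-atomic emulation of rename(from, to): between the unlink()
 * and the link() the name `to' does not exist, which is exactly the
 * window the in-kernel rename closes.
 */
static int
naive_rename(const char *from, const char *to)
{
	(void)unlink(to);		/* ignore "did not exist" */
	if (link(from, to) == -1)
		return (-1);
	return (unlink(from));
}

int
main(void)
{
	int fd = open("foo.tmp", O_CREAT | O_WRONLY, 0644);

	if (fd == -1)
		return (1);
	close(fd);
	if (naive_rename("foo.tmp", "bar.tmp") == -1)
		perror("naive_rename");
	/* rename("foo.tmp", "bar.tmp") would have done this in one step. */
	unlink("bar.tmp");
	return (0);
}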
+ */ +int +ufs_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + struct vnode *tvp = ap->a_tvp; + register struct vnode *tdvp = ap->a_tdvp; + struct vnode *fvp = ap->a_fvp; + register struct vnode *fdvp = ap->a_fdvp; + register struct componentname *tcnp = ap->a_tcnp; + register struct componentname *fcnp = ap->a_fcnp; + register struct inode *ip, *xp, *dp; + struct dirtemplate dirbuf; + struct timeval tv; + int doingdirectory = 0, oldparent = 0, newparent = 0; + int error = 0; + u_char namlen; + +#ifdef DIAGNOSTIC + if ((tcnp->cn_flags & HASBUF) == 0 || + (fcnp->cn_flags & HASBUF) == 0) + panic("ufs_rename: no name"); +#endif + /* + * Check for cross-device rename. + */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; +abortit: + VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ + vrele(fdvp); + vrele(fvp); + return (error); + } + + /* + * Check if just deleting a link name. + */ + if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(tdvp)->i_flags & APPEND))) { + error = EPERM; + goto abortit; + } + if (fvp == tvp) { + if (fvp->v_type == VDIR) { + error = EINVAL; + goto abortit; + } + VOP_ABORTOP(fdvp, fcnp); + vrele(fdvp); + vrele(fvp); + vput(tdvp); + vput(tvp); + tcnp->cn_flags &= ~MODMASK; + tcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + if ((tcnp->cn_flags & SAVESTART) == 0) + panic("ufs_rename: lost from startdir"); + tcnp->cn_nameiop = DELETE; + (void) relookup(tdvp, &tvp, tcnp); + return (VOP_REMOVE(tdvp, tvp, tcnp)); + } + if (error = VOP_LOCK(fvp)) + goto abortit; + dp = VTOI(fdvp); + ip = VTOI(fvp); + if ((ip->i_flags & (IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { + VOP_UNLOCK(fvp); + error = EPERM; + goto abortit; + } + if ((ip->i_mode & IFMT) == IFDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + dp == ip || (fcnp->cn_flags&ISDOTDOT) || + (ip->i_flag & IN_RENAME)) { + VOP_UNLOCK(fvp); + error = EINVAL; + goto abortit; + } + ip->i_flag |= IN_RENAME; + oldparent = dp->i_number; + doingdirectory++; + } + vrele(fdvp); + + /* + * When the target exists, both the directory + * and target vnodes are returned locked. + */ + dp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + ip->i_nlink++; + ip->i_flag |= IN_CHANGE; + tv = time; + if (error = VOP_UPDATE(fvp, &tv, &tv, 1)) { + VOP_UNLOCK(fvp); + goto bad; + } + + /* + * If ".." must be changed (ie the directory gets a new + * parent) then the source directory must not be in the + * directory heirarchy above the target, as this would + * orphan everything below the source directory. Also + * the user must have write permission in the source so + * as to be able to change "..". We must repeat the call + * to namei, as the parent directory is unlocked by the + * call to checkpath(). 
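ufs_checkpath() enforces the rule just described: when a directory gains a new parent, the source directory must not sit above the target, or the subtree would be orphaned. A rough userland analogue walks ".." from the target and compares device/inode pairs against the source; it assumes both paths exist and are directories:

#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <sys/stat.h>

/* Is `src' the same directory as, or an ancestor of, `dst'? */
static bool
is_ancestor(const char *src, const char *dst)
{
	struct stat ss, ds, root;
	char path[4096];

	if (stat(src, &ss) == -1 || stat(dst, &ds) == -1 ||
	    stat("/", &root) == -1)
		return (false);
	snprintf(path, sizeof(path), "%s", dst);
	for (;;) {
		if (ds.st_dev == ss.st_dev && ds.st_ino == ss.st_ino)
			return (true);		/* reached the source */
		if (ds.st_dev == root.st_dev && ds.st_ino == root.st_ino)
			return (false);		/* reached the root */
		strncat(path, "/..", sizeof(path) - strlen(path) - 1);
		if (stat(path, &ds) == -1)
			return (false);
	}
}

int
main(void)
{
	printf("%d\n", is_ancestor("/usr", "/usr/share/man"));	/* 1 */
	printf("%d\n", is_ancestor("/usr/share/man", "/usr"));	/* 0 */
	return (0);
}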
+ */ + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); + VOP_UNLOCK(fvp); + if (oldparent != dp->i_number) + newparent = dp->i_number; + if (doingdirectory && newparent) { + if (error) /* write access check above */ + goto bad; + if (xp != NULL) + vput(tvp); + if (error = ufs_checkpath(ip, dp, tcnp->cn_cred)) + goto out; + if ((tcnp->cn_flags & SAVESTART) == 0) + panic("ufs_rename: lost to startdir"); + if (error = relookup(tdvp, &tvp, tcnp)) + goto out; + dp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + } + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. + */ + if (xp == NULL) { + if (dp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. + */ + if (doingdirectory && newparent) { + if ((nlink_t)dp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + dp->i_nlink++; + dp->i_flag |= IN_CHANGE; + if (error = VOP_UPDATE(tdvp, &tv, &tv, 1)) + goto bad; + } + if (error = ufs_direnter(ip, tdvp, tcnp)) { + if (doingdirectory && newparent) { + dp->i_nlink--; + dp->i_flag |= IN_CHANGE; + (void)VOP_UPDATE(tdvp, &tv, &tv, 1); + } + goto bad; + } + vput(tdvp); + } else { + if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Short circuit rename(foo, foo). + */ + if (xp->i_number == ip->i_number) + panic("rename: same file"); + /* + * If the parent directory is "sticky", then the user must + * own the parent directory, or the destination of the rename, + * otherwise the destination may not be changed (except by + * root). This implements append-only directories. + */ + if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && + tcnp->cn_cred->cr_uid != dp->i_uid && + xp->i_uid != tcnp->cn_cred->cr_uid) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). + */ + if ((xp->i_mode&IFMT) == IFDIR) { + if (!ufs_dirempty(xp, dp->i_number, tcnp->cn_cred) || + xp->i_nlink > 2) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(tdvp); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + if (error = ufs_dirrewrite(dp, ip, tcnp)) + goto bad; + /* + * If the target directory is in the same + * directory as the source directory, + * decrement the link count on the parent + * of the target directory. + */ + if (doingdirectory && !newparent) { + dp->i_nlink--; + dp->i_flag |= IN_CHANGE; + } + vput(tdvp); + /* + * Adjust the link count of the target to + * reflect the dirrewrite above. If this is + * a directory it is empty and there are + * no links to it, so we can squash the inode and + * any space associated with it. We disallowed + * renaming over top of a directory with links to + * it above, as the remaining link would point to + * a directory without "." or ".." entries. + */ + xp->i_nlink--; + if (doingdirectory) { + if (--xp->i_nlink != 0) + panic("rename: linked directory"); + error = VOP_TRUNCATE(tvp, (off_t)0, IO_SYNC, + tcnp->cn_cred, tcnp->cn_proc); + } + xp->i_flag |= IN_CHANGE; + vput(tvp); + xp = NULL; + } + + /* + * 3) Unlink the source. 
+ */ + fcnp->cn_flags &= ~MODMASK; + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + if ((fcnp->cn_flags & SAVESTART) == 0) + panic("ufs_rename: lost from startdir"); + (void) relookup(fdvp, &fvp, fcnp); + if (fvp != NULL) { + xp = VTOI(fvp); + dp = VTOI(fdvp); + } else { + /* + * From name has disappeared. + */ + if (doingdirectory) + panic("rename: lost dir entry"); + vrele(ap->a_fvp); + return (0); + } + /* + * Ensure that the directory entry still exists and has not + * changed while the new name has been entered. If the source is + * a file then the entry may have been unlinked or renamed. In + * either case there is no further work to be done. If the source + * is a directory then it cannot have been rmdir'ed; its link + * count of three would cause a rmdir to fail with ENOTEMPTY. + * The IRENAME flag ensures that it cannot be moved by another + * rename. + */ + if (xp != ip) { + if (doingdirectory) + panic("rename: lost dir entry"); + } else { + /* + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. + */ + if (doingdirectory && newparent) { + dp->i_nlink--; + dp->i_flag |= IN_CHANGE; + error = vn_rdwr(UIO_READ, fvp, (caddr_t)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, + tcnp->cn_cred, (int *)0, (struct proc *)0); + if (error == 0) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (fvp->v_mount->mnt_maxsymlinklen <= 0) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +# else + namlen = dirbuf.dotdot_namlen; +# endif + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + ufs_dirbad(xp, (doff_t)12, + "rename: mangled dir"); + } else { + dirbuf.dotdot_ino = newparent; + (void) vn_rdwr(UIO_WRITE, fvp, + (caddr_t)&dirbuf, + sizeof (struct dirtemplate), + (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED|IO_SYNC, + tcnp->cn_cred, (int *)0, + (struct proc *)0); + cache_purge(fdvp); + } + } + } + error = ufs_dirremove(fdvp, fcnp); + if (!error) { + xp->i_nlink--; + xp->i_flag |= IN_CHANGE; + } + xp->i_flag &= ~IN_RENAME; + } + if (dp) + vput(fdvp); + if (xp) + vput(fvp); + vrele(ap->a_fvp); + return (error); + +bad: + if (xp) + vput(ITOV(xp)); + vput(ITOV(dp)); +out: + if (VOP_LOCK(fvp) == 0) { + ip->i_nlink--; + ip->i_flag |= IN_CHANGE; + vput(fvp); + } else + vrele(fvp); + return (error); +} + +/* + * A virgin directory (no blushing please). + */ +static struct dirtemplate mastertemplate = { + 0, 12, DT_DIR, 1, ".", + 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." +}; +static struct odirtemplate omastertemplate = { + 0, 12, 1, ".", + 0, DIRBLKSIZ - 12, 2, ".." +}; + +/* + * Mkdir system call + */ +int +ufs_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct inode *ip, *dp; + struct vnode *tvp; + struct dirtemplate dirtemplate, *dtp; + struct timeval tv; + int error, dmode; + +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & HASBUF) == 0) + panic("ufs_mkdir: no name"); +#endif + dp = VTOI(dvp); + if ((nlink_t)dp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + dmode = vap->va_mode & 0777; + dmode |= IFDIR; + /* + * Must simulate part of ufs_makeinode here to acquire the inode, + * but not have it entered in the parent directory. 
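mastertemplate above is the image written into a brand-new directory: a "." entry occupying a 12-byte record and a ".." entry whose record length is padded out to the end of the DIRBLKSIZ block, so the two entries claim the whole block. A small sketch of that layout, assuming a 512-byte directory block and hypothetical inode numbers:

#include <stdio.h>

#define DIRBLK	512		/* assumed directory block size */

/* Simplified fixed-name directory entry, after the dirtemplate above. */
struct tmpl_entry {
	unsigned long	ino;	/* inode number */
	unsigned short	reclen;	/* bytes this record occupies */
	unsigned char	type;
	unsigned char	namlen;
	char		name[4];
};

int
main(void)
{
	struct tmpl_entry dot    = { 2, 12,          4 /* DT_DIR */, 1, "." };
	struct tmpl_entry dotdot = { 5, DIRBLK - 12, 4 /* DT_DIR */, 2, ".." };

	/* "." takes 12 bytes; ".." is padded so the block is fully claimed. */
	printf("total %d of %d bytes\n", dot.reclen + dotdot.reclen, DIRBLK);
	return (0);
}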
The entry is + * made later after writing "." and ".." entries. + */ + if (error = VOP_VALLOC(dvp, dmode, cnp->cn_cred, &tvp)) + goto out; + ip = VTOI(tvp); + ip->i_uid = cnp->cn_cred->cr_uid; + ip->i_gid = dp->i_gid; +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, cnp->cn_cred, 0))) { + free(cnp->cn_pnbuf, M_NAMEI); + VOP_VFREE(tvp, ip->i_number, dmode); + vput(tvp); + vput(dvp); + return (error); + } +#endif + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = dmode; + tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ + ip->i_nlink = 2; + tv = time; + error = VOP_UPDATE(tvp, &tv, &tv, 1); + + /* + * Bump link count in parent directory + * to reflect work done below. Should + * be done before reference is created + * so reparation is possible if we crash. + */ + dp->i_nlink++; + dp->i_flag |= IN_CHANGE; + if (error = VOP_UPDATE(dvp, &tv, &tv, 1)) + goto bad; + + /* Initialize directory with "." and ".." from static template. */ + if (dvp->v_mount->mnt_maxsymlinklen > 0) + dtp = &mastertemplate; + else + dtp = (struct dirtemplate *)&omastertemplate; + dirtemplate = *dtp; + dirtemplate.dot_ino = ip->i_number; + dirtemplate.dotdot_ino = dp->i_number; + error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)&dirtemplate, + sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (int *)0, (struct proc *)0); + if (error) { + dp->i_nlink--; + dp->i_flag |= IN_CHANGE; + goto bad; + } + if (DIRBLKSIZ > VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) + panic("ufs_mkdir: blksize"); /* XXX should grow with balloc() */ + else { + ip->i_size = DIRBLKSIZ; + ip->i_flag |= IN_CHANGE; + } + + /* Directory set up, now install it's entry in the parent directory. */ + if (error = ufs_direnter(ip, dvp, cnp)) { + dp->i_nlink--; + dp->i_flag |= IN_CHANGE; + } +bad: + /* + * No need to do an explicit VOP_TRUNCATE here, vrele will do this + * for us because we set the link count to 0. + */ + if (error) { + ip->i_nlink = 0; + ip->i_flag |= IN_CHANGE; + vput(tvp); + } else + *ap->a_vpp = tvp; +out: + FREE(cnp->cn_pnbuf, M_NAMEI); + vput(dvp); + return (error); +} + +/* + * Rmdir system call. + */ +int +ufs_rmdir(ap) + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct vnode *dvp = ap->a_dvp; + register struct componentname *cnp = ap->a_cnp; + register struct inode *ip, *dp; + int error; + + ip = VTOI(vp); + dp = VTOI(dvp); + /* + * No rmdir "." please. + */ + if (dp == ip) { + vrele(dvp); + vput(vp); + return (EINVAL); + } + /* + * Verify the directory is empty (and valid). + * (Rmdir ".." won't be valid since + * ".." will contain a reference to + * the current directory and thus be + * non-empty.) + */ + error = 0; + if (ip->i_nlink != 2 || + !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + error = ENOTEMPTY; + goto out; + } + if ((dp->i_flags & APPEND) || (ip->i_flags & (IMMUTABLE | APPEND))) { + error = EPERM; + goto out; + } + /* + * Delete reference to directory before purging + * inode. If we crash in between, the directory + * will be reattached to lost+found, + */ + if (error = ufs_dirremove(dvp, cnp)) + goto out; + dp->i_nlink--; + dp->i_flag |= IN_CHANGE; + cache_purge(dvp); + vput(dvp); + dvp = NULL; + /* + * Truncate inode. The only stuff left + * in the directory is "." and "..". The + * "." reference is inconsequential since + * we're quashing it. The ".." reference + * has already been adjusted above. 
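The link-count bookkeeping in mkdir and rmdir follows the classic rule: an empty directory has two links ("." plus its name in the parent), each subdirectory adds one more to the parent through its "..", and rmdir undoes both. This is observable from userland with stat(2):

#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>

static void
show_nlink(const char *path)
{
	struct stat st;

	if (stat(path, &st) == 0)
		printf("%s: nlink %ld\n", path, (long)st.st_nlink);
}

int
main(void)
{
	mkdir("d", 0755);		/* fresh directory: nlink 2 */
	show_nlink("d");
	mkdir("d/sub", 0755);		/* child's ".." bumps the parent */
	show_nlink("d");
	rmdir("d/sub");			/* and rmdir drops it again */
	show_nlink("d");
	rmdir("d");
	return (0);
}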
We've + * removed the "." reference and the reference + * in the parent directory, but there may be + * other hard links so decrement by 2 and + * worry about them later. + */ + ip->i_nlink -= 2; + error = VOP_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred, + cnp->cn_proc); + cache_purge(ITOV(ip)); +out: + if (dvp) + vput(dvp); + vput(vp); + return (error); +} + +/* + * symlink -- make a symbolic link + */ +int +ufs_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + register struct vnode *vp, **vpp = ap->a_vpp; + register struct inode *ip; + int len, error; + + if (error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, + vpp, ap->a_cnp)) + return (error); + vp = *vpp; + len = strlen(ap->a_target); + if (len < vp->v_mount->mnt_maxsymlinklen) { + ip = VTOI(vp); + bcopy(ap->a_target, (char *)ip->i_shortlink, len); + ip->i_size = len; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else + error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, (int *)0, + (struct proc *)0); + vput(vp); + return (error); +} + +/* + * Vnode op for reading directories. + * + * The routine below assumes that the on-disk format of a directory + * is the same as that defined by <sys/dirent.h>. If the on-disk + * format changes, then it will be necessary to do a conversion + * from the on-disk format that read returns to the format defined + * by <sys/dirent.h>. + */ +int +ufs_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + register struct uio *uio = ap->a_uio; + int count, lost, error; + + count = uio->uio_resid; + count &= ~(DIRBLKSIZ - 1); + lost = uio->uio_resid - count; + if (count < DIRBLKSIZ || (uio->uio_offset & (DIRBLKSIZ -1))) + return (EINVAL); + uio->uio_resid = count; + uio->uio_iov->iov_len = count; +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { + error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + } else { + struct dirent *dp, *edp; + struct uio auio; + struct iovec aiov; + caddr_t dirbuf; + int readcnt; + u_char tmp; + + auio = *uio; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + aiov.iov_len = count; + MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); + aiov.iov_base = dirbuf; + error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); + if (error == 0) { + readcnt = count - auio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { + tmp = dp->d_namlen; + dp->d_namlen = dp->d_type; + dp->d_type = tmp; + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, uio); + } + FREE(dirbuf, M_TEMP); + } +# else + error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); +# endif + uio->uio_resid += lost; + return (error); +} + +/* + * Return target name of a symbolic link + */ +int +ufs_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + int isize; + + isize = ip->i_size; + if (isize < vp->v_mount->mnt_maxsymlinklen) { + uiomove((char *)ip->i_shortlink, isize, ap->a_uio); + return (0); + } + return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Ufs abort op, called 
after namei() when a CREATE/DELETE isn't actually + * done. If a buffer has been saved in anticipation of a CREATE, delete it. + */ +/* ARGSUSED */ +int +ufs_abortop(ap) + struct vop_abortop_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + } */ *ap; +{ + if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) + FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); + return (0); +} + +/* + * Lock an inode. If its already locked, set the WANT bit and sleep. + */ +int +ufs_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip; + struct proc *p = curproc; /* XXX */ + +start: + while (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + } + if (vp->v_tag == VT_NON) + return (ENOENT); + ip = VTOI(vp); + if (ip->i_flag & IN_LOCKED) { + ip->i_flag |= IN_WANTED; +#ifdef DIAGNOSTIC + if (p) { + if (p->p_pid == ip->i_lockholder) + panic("locking against myself"); + ip->i_lockwaiter = p->p_pid; + } else + ip->i_lockwaiter = -1; +#endif + (void) sleep((caddr_t)ip, PINOD); + goto start; + } +#ifdef DIAGNOSTIC + ip->i_lockwaiter = 0; + if (ip->i_lockholder != 0) + panic("lockholder (%d) != 0", ip->i_lockholder); + if (p && p->p_pid == 0) + printf("locking by process 0\n"); + if (p) + ip->i_lockholder = p->p_pid; + else + ip->i_lockholder = -1; +#endif + ip->i_flag |= IN_LOCKED; + return (0); +} + +/* + * Unlock an inode. If WANT bit is on, wakeup. + */ +int lockcount = 90; +int +ufs_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct inode *ip = VTOI(ap->a_vp); + struct proc *p = curproc; /* XXX */ + +#ifdef DIAGNOSTIC + if ((ip->i_flag & IN_LOCKED) == 0) { + vprint("ufs_unlock: unlocked inode", ap->a_vp); + panic("ufs_unlock NOT LOCKED"); + } + if (p && p->p_pid != ip->i_lockholder && p->p_pid > -1 && + ip->i_lockholder > -1 && lockcount++ < 100) + panic("unlocker (%d) != lock holder (%d)", + p->p_pid, ip->i_lockholder); + ip->i_lockholder = 0; +#endif + ip->i_flag &= ~IN_LOCKED; + if (ip->i_flag & IN_WANTED) { + ip->i_flag &= ~IN_WANTED; + wakeup((caddr_t)ip); + } + return (0); +} + +/* + * Check for a locked inode. + */ +int +ufs_islocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + if (VTOI(ap->a_vp)->i_flag & IN_LOCKED) + return (1); + return (0); +} + +/* + * Calculate the logical to physical mapping if not done already, + * then call the device strategy routine. + */ +int +ufs_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + register struct buf *bp = ap->a_bp; + register struct vnode *vp = bp->b_vp; + register struct inode *ip; + int error; + + ip = VTOI(vp); + if (vp->v_type == VBLK || vp->v_type == VCHR) + panic("ufs_strategy: spec"); + if (bp->b_blkno == bp->b_lblkno) { + if (error = + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) { + bp->b_error = error; + bp->b_flags |= B_ERROR; + biodone(bp); + return (error); + } + if ((long)bp->b_blkno == -1) + clrbuf(bp); + } + if ((long)bp->b_blkno == -1) { + biodone(bp); + return (0); + } + vp = ip->i_devvp; + bp->b_dev = vp->v_rdev; + VOCALL (vp->v_op, VOFFSET(vop_strategy), ap); + return (0); +} + +/* + * Print out the contents of an inode. 
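ufs_symlink above stores very short targets directly in the inode (i_shortlink) when the length is under the mount's mnt_maxsymlinklen, and ufs_readlink hands them back without any block I/O. A sketch of that decision, with a made-up 60-byte inline limit standing in for the mount's real threshold:

#include <stdio.h>
#include <string.h>

#define INLINE_MAX	60	/* assumed stand-in for mnt_maxsymlinklen */

struct fake_inode {
	long	size;
	char	shortlink[INLINE_MAX];	/* reuses the block-pointer area */
};

/* Store a symlink target, inline when it fits, via data blocks otherwise. */
static void
store_symlink(struct fake_inode *ip, const char *target)
{
	size_t len = strlen(target);

	ip->size = (long)len;
	if (len < INLINE_MAX) {
		memcpy(ip->shortlink, target, len);	/* no data block needed */
		printf("inline:  %.*s\n", (int)len, ip->shortlink);
	} else {
		printf("on disk: %zu bytes via the file's data blocks\n", len);
	}
}

int
main(void)
{
	struct fake_inode ino = { 0, "" };

	store_symlink(&ino, "../include");
	store_symlink(&ino,
	    "a/deliberately/very/long/target/path/that/will/not/fit/inline/anywhere");
	return (0);
}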
+ */ +int +ufs_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + + printf("tag VT_UFS, ino %d, on dev %d, %d", ip->i_number, + major(ip->i_dev), minor(ip->i_dev)); +#ifdef FIFO + if (vp->v_type == VFIFO) + fifo_printinfo(vp); +#endif /* FIFO */ + printf("%s\n", (ip->i_flag & IN_LOCKED) ? " (LOCKED)" : ""); + if (ip->i_lockholder == 0) + return (0); + printf("\towner pid %d", ip->i_lockholder); + if (ip->i_lockwaiter) + printf(" waiting pid %d", ip->i_lockwaiter); + printf("\n"); + return (0); +} + +/* + * Read wrapper for special devices. + */ +int +ufsspec_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + + /* + * Set access flag. + */ + VTOI(ap->a_vp)->i_flag |= IN_ACCESS; + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for special devices. + */ +int +ufsspec_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + + /* + * Set update and change flags. + */ + VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for special devices. + * + * Update the times on the inode then do device close. + */ +int +ufsspec_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct inode *ip = VTOI(ap->a_vp); + + if (ap->a_vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) + ITIMES(ip, &time, &time); + return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); +} + +#ifdef FIFO +/* + * Read wrapper for fifo's + */ +int +ufsfifo_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + extern int (**fifo_vnodeop_p)(); + + /* + * Set access flag. + */ + VTOI(ap->a_vp)->i_flag |= IN_ACCESS; + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for fifo's. + */ +int +ufsfifo_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + extern int (**fifo_vnodeop_p)(); + + /* + * Set update and change flags. + */ + VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for fifo's. + * + * Update the times on the inode then do device close. + */ +ufsfifo_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + extern int (**fifo_vnodeop_p)(); + register struct inode *ip = VTOI(ap->a_vp); + + if (ap->a_vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) + ITIMES(ip, &time, &time); + return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); +} +#endif /* FIFO */ + +/* + * Return POSIX pathconf information applicable to ufs filesystems. 
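The per-filesystem limits handed back below (maximum link count, name length, whether chown is restricted, and so on) are the values a process sees through pathconf(2). A quick userland query, assuming "/" sits on a filesystem that implements these names:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Each name maps onto one case of the filesystem's pathconf op. */
	printf("_PC_LINK_MAX          %ld\n", pathconf("/", _PC_LINK_MAX));
	printf("_PC_NAME_MAX          %ld\n", pathconf("/", _PC_NAME_MAX));
	printf("_PC_CHOWN_RESTRICTED  %ld\n", pathconf("/", _PC_CHOWN_RESTRICTED));
	printf("_PC_NO_TRUNC          %ld\n", pathconf("/", _PC_NO_TRUNC));
	return (0);
}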
+ */ +ufs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + case _PC_PATH_MAX: + *ap->a_retval = PATH_MAX; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_NO_TRUNC: + *ap->a_retval = 1; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Advisory record locking support + */ +int +ufs_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + register struct inode *ip = VTOI(ap->a_vp); + register struct flock *fl = ap->a_fl; + register struct lockf *lock; + off_t start, end; + int error; + + /* + * Avoid the common case of unlocking when inode has no locks. + */ + if (ip->i_lockf == (struct lockf *)0) { + if (ap->a_op != F_SETLK) { + fl->l_type = F_UNLCK; + return (0); + } + } + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. + */ + start = fl->l_start; + break; + + case SEEK_END: + start = ip->i_size + fl->l_start; + break; + + default: + return (EINVAL); + } + if (start < 0) + return (EINVAL); + if (fl->l_len == 0) + end = -1; + else + end = start + fl->l_len - 1; + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = ap->a_id; + lock->lf_inode = ip; + lock->lf_type = fl->l_type; + lock->lf_next = (struct lockf *)0; + lock->lf_block = (struct lockf *)0; + lock->lf_flags = ap->a_flags; + /* + * Do the requested operation. + */ + switch(ap->a_op) { + case F_SETLK: + return (lf_setlock(lock)); + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + return (error); + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + return (error); + + default: + free(lock, M_LOCKF); + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Initialize the vnode associated with a new inode, handle aliased + * vnodes. + */ +int +ufs_vinit(mntp, specops, fifoops, vpp) + struct mount *mntp; + int (**specops)(); + int (**fifoops)(); + struct vnode **vpp; +{ + struct inode *ip; + struct vnode *vp, *nvp; + + vp = *vpp; + ip = VTOI(vp); + switch(vp->v_type = IFTOVT(ip->i_mode)) { + case VCHR: + case VBLK: + vp->v_op = specops; + if (nvp = checkalias(vp, ip->i_rdev, mntp)) { + /* + * Discard unneeded vnode, but save its inode. + */ + ufs_ihashrem(ip); + VOP_UNLOCK(vp); + nvp->v_data = vp->v_data; + vp->v_data = NULL; + vp->v_op = spec_vnodeop_p; + vrele(vp); + vgone(vp); + /* + * Reinitialize aliased inode. + */ + vp = nvp; + ip->i_vnode = vp; + ufs_ihashins(ip); + } + break; + case VFIFO: +#ifdef FIFO + vp->v_op = fifoops; + break; +#else + return (EOPNOTSUPP); +#endif + } + if (ip->i_number == ROOTINO) + vp->v_flag |= VROOT; + /* + * Initialize modrev times + */ + SETHIGH(ip->i_modrev, mono_time.tv_sec); + SETLOW(ip->i_modrev, mono_time.tv_usec * 4294); + *vpp = vp; + return (0); +} + +/* + * Allocate a new inode. 
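ufs_advlock above turns the caller's struct flock into an absolute byte range: SEEK_SET and SEEK_CUR ranges start at l_start, SEEK_END ranges start at the file size plus l_start, and a zero l_len means "to end of file" (represented as end = -1). The same conversion in isolation, with the file size passed in rather than read from an inode:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

/* Convert an flock into [*startp, *endp]; *endp == -1 means "to EOF". */
static int
flock_range(const struct flock *fl, long long filesize,
    long long *startp, long long *endp)
{
	long long start;

	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:		/* caller already folded in the file offset */
		start = fl->l_start;
		break;
	case SEEK_END:
		start = filesize + fl->l_start;
		break;
	default:
		return (-1);
	}
	if (start < 0)
		return (-1);
	*startp = start;
	*endp = (fl->l_len == 0) ? -1 : start + fl->l_len - 1;
	return (0);
}

int
main(void)
{
	struct flock fl = { .l_start = 100, .l_len = 50, .l_whence = SEEK_SET };
	long long s, e;

	if (flock_range(&fl, 8192, &s, &e) == 0)
		printf("lock covers [%lld, %lld]\n", s, e);	/* [100, 149] */
	return (0);
}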
+ */ +int +ufs_makeinode(mode, dvp, vpp, cnp) + int mode; + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + register struct inode *ip, *pdir; + struct timeval tv; + struct vnode *tvp; + int error; + + pdir = VTOI(dvp); +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & HASBUF) == 0) + panic("ufs_makeinode: no name"); +#endif + *vpp = NULL; + if ((mode & IFMT) == 0) + mode |= IFREG; + + if (error = VOP_VALLOC(dvp, mode, cnp->cn_cred, &tvp)) { + free(cnp->cn_pnbuf, M_NAMEI); + vput(dvp); + return (error); + } + ip = VTOI(tvp); + ip->i_gid = pdir->i_gid; + if ((mode & IFMT) == IFLNK) + ip->i_uid = pdir->i_uid; + else + ip->i_uid = cnp->cn_cred->cr_uid; +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, cnp->cn_cred, 0))) { + free(cnp->cn_pnbuf, M_NAMEI); + VOP_VFREE(tvp, ip->i_number, mode); + vput(tvp); + vput(dvp); + return (error); + } +#endif + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = mode; + tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ + ip->i_nlink = 1; + if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && + suser(cnp->cn_cred, NULL)) + ip->i_mode &= ~ISGID; + + /* + * Make sure inode goes to disk before directory entry. + */ + tv = time; + if (error = VOP_UPDATE(tvp, &tv, &tv, 1)) + goto bad; + if (error = ufs_direnter(ip, dvp, cnp)) + goto bad; + if ((cnp->cn_flags & SAVESTART) == 0) + FREE(cnp->cn_pnbuf, M_NAMEI); + vput(dvp); + *vpp = tvp; + return (0); + +bad: + /* + * Write error occurred trying to update the inode + * or the directory so must deallocate the inode. + */ + free(cnp->cn_pnbuf, M_NAMEI); + vput(dvp); + ip->i_nlink = 0; + ip->i_flag |= IN_CHANGE; + vput(tvp); + return (error); +} diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h new file mode 100644 index 0000000..237871f --- /dev/null +++ b/sys/ufs/ufs/ufsmount.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufsmount.h 8.2 (Berkeley) 1/12/94 + */ + +struct buf; +struct inode; +struct nameidata; +struct timeval; +struct ucred; +struct uio; +struct vnode; +struct netexport; + +/* This structure describes the UFS specific mount structure data. */ +struct ufsmount { + struct mount *um_mountp; /* filesystem vfs structure */ + dev_t um_dev; /* device mounted */ + struct vnode *um_devvp; /* block device mounted vnode */ + union { /* pointer to superblock */ + struct lfs *lfs; /* LFS */ + struct fs *fs; /* FFS */ + } ufsmount_u; +#define um_fs ufsmount_u.fs +#define um_lfs ufsmount_u.lfs + struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */ + struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */ + u_long um_nindir; /* indirect ptrs per block */ + u_long um_bptrtodb; /* indir ptr to disk block */ + u_long um_seqinc; /* inc between seq blocks */ + time_t um_btime[MAXQUOTAS]; /* block quota time limit */ + time_t um_itime[MAXQUOTAS]; /* inode quota time limit */ + char um_qflags[MAXQUOTAS]; /* quota specific flags */ + struct netexport um_export; /* export information */ +}; +/* + * Flags describing the state of quotas. + */ +#define QTF_OPENING 0x01 /* Q_QUOTAON in progress */ +#define QTF_CLOSING 0x02 /* Q_QUOTAOFF in progress */ + +/* Convert mount ptr to ufsmount ptr. */ +#define VFSTOUFS(mp) ((struct ufsmount *)((mp)->mnt_data)) + +/* + * Macros to access file system parameters in the ufsmount structure. + * Used by ufs_bmap. + */ +#define blkptrtodb(ump, b) ((b) << (ump)->um_bptrtodb) +#define is_sequential(ump, a, b) ((b) == (a) + ump->um_seqinc) +#define MNINDIR(ump) ((ump)->um_nindir) + + |
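The macros at the end of ufsmount.h convert filesystem block pointers to device blocks with a plain shift and detect physically contiguous blocks for clustering. A hedged sketch of one plausible configuration, assuming 8K blocks, 1K fragments, and the traditional 512-byte device sector; the shift and increment values are illustrative, not taken from a real superblock:

#include <stdio.h>

/* Assumed geometry: 8K blocks, 1K fragments, 512-byte device sectors. */
#define DEV_BSIZE	512
#define FRAGSIZE	1024
#define BLKSIZE		8192

#define BPTRTODB	1			/* log2(FRAGSIZE / DEV_BSIZE) */
#define SEQINC		(BLKSIZE / FRAGSIZE)	/* pointer step per full block */

#define blkptrtodb(b)		((b) << BPTRTODB)
#define is_sequential(a, b)	((b) == (a) + SEQINC)

int
main(void)
{
	long a = 1000, b = 1008;	/* hypothetical block pointers */

	printf("ptr %ld -> device block %ld\n", a, blkptrtodb(a));
	printf("contiguous with next full block? %d\n", is_sequential(a, b));
	return (0);
}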