diff options
author | phk <phk@FreeBSD.org> | 2002-08-12 10:32:56 +0000 |
---|---|---|
committer | phk <phk@FreeBSD.org> | 2002-08-12 10:32:56 +0000 |
commit | 58bc3221a411320c74d814c989aa24f9a640f1aa (patch) | |
tree | 09c2255bc091554defd8ca3a39889c5731235979 /sys | |
parent | 5eefae432e950fedd9c42dac08fbf91f644c1bc5 (diff) | |
download | FreeBSD-src-58bc3221a411320c74d814c989aa24f9a640f1aa.zip FreeBSD-src-58bc3221a411320c74d814c989aa24f9a640f1aa.tar.gz |
Stop pretending that the FFS file ufs_readwrite.c is a UFS file.
Instead of #including it, pull it into ffs_vnops.c and name things
correctly.
Sponsored by: DARPA & NAI Labs.
Diffstat (limited to 'sys')
-rw-r--r-- | sys/ufs/ffs/ffs_vnops.c | 1030 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_readwrite.c | 1073 |
2 files changed, 1028 insertions, 1075 deletions
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 9356ff9..350536a 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -1,4 +1,13 @@ /* + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * @@ -55,6 +64,8 @@ #include <vm/vm_page.h> #include <vm/vm_object.h> #include <vm/vm_extern.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> #include <ufs/ufs/extattr.h> #include <ufs/ufs/quota.h> @@ -69,6 +80,9 @@ int ffs_fsync(struct vop_fsync_args *); static int ffs_getpages(struct vop_getpages_args *); static int ffs_read(struct vop_read_args *); static int ffs_write(struct vop_write_args *); +static int ffs_extread(struct vop_read_args *); +static int ffs_extwrite(struct vop_write_args *); + /* Global vfs data structures for ufs. */ vop_t **ffs_vnodeop_p; @@ -106,8 +120,6 @@ VNODEOP_SET(ffs_vnodeop_opv_desc); VNODEOP_SET(ffs_specop_opv_desc); VNODEOP_SET(ffs_fifoop_opv_desc); -#include <ufs/ufs/ufs_readwrite.c> - /* * Synch an open file. */ @@ -273,3 +285,1017 @@ loop: splx(s); return (UFS_UPDATE(vp, wait)); } + + +/* + * Vnode op for reading. 
+ */ +/* ARGSUSED */ +int +ffs_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp; + struct inode *ip; + struct uio *uio; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error, orig_resid; + mode_t mode; + int seqcount; + int ioflag; + vm_object_t object; + + if (ap->a_ioflag & IO_EXT) + return (ffs_extread(ap)); + + GIANT_REQUIRED; + + vp = ap->a_vp; + seqcount = ap->a_ioflag >> 16; + ip = VTOI(vp); + mode = ip->i_mode; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("ffs_read: mode"); + + if (vp->v_type == VLNK) { + if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) + panic("ffs_read: short symlink"); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("ffs_read: type %d", vp->v_type); +#endif + fs = ip->i_fs; + if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize) + return (EFBIG); + + orig_resid = uio->uio_resid; + if (orig_resid <= 0) + return (0); + + object = vp->v_object; + + bytesinfile = ip->i_size - uio->uio_offset; + if (bytesinfile <= 0) { + if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) + ip->i_flag |= IN_ACCESS; + return 0; + } + + if (object) { + vm_object_reference(object); + } + +#ifdef ENABLE_VFS_IOOPT + /* + * If IO optimisation is turned on, + * and we are NOT a VM based IO request, + * (i.e. not headed for the buffer cache) + * but there IS a vm object associated with it. 
+ */ + if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) { + int nread, toread; + + toread = uio->uio_resid; + if (toread > bytesinfile) + toread = bytesinfile; + if (toread >= PAGE_SIZE) { + /* + * Then if it's at least a page in size, try + * get the data from the object using vm tricks + */ + error = uioread(toread, uio, object, &nread); + if ((uio->uio_resid == 0) || (error != 0)) { + /* + * If we finished or there was an error + * then finish up (the reference previously + * obtained on object must be released). + */ + if ((error == 0 || + uio->uio_resid != orig_resid) && + (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) + ip->i_flag |= IN_ACCESS; + + if (object) { + vm_object_vndeallocate(object); + } + return error; + } + } + } +#endif + + /* + * Ok so we couldn't do it all in one vm trick... + * so cycle around trying smaller bites.. + */ + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) + break; +#ifdef ENABLE_VFS_IOOPT + if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) { + /* + * Obviously we didn't finish above, but we + * didn't get an error either. Try the same trick again. + * but this time we are looping. + */ + int nread, toread; + toread = uio->uio_resid; + if (toread > bytesinfile) + toread = bytesinfile; + + /* + * Once again, if there isn't enough for a + * whole page, don't try optimising. + */ + if (toread >= PAGE_SIZE) { + error = uioread(toread, uio, object, &nread); + if ((uio->uio_resid == 0) || (error != 0)) { + /* + * If we finished or there was an + * error then finish up (the reference + * previously obtained on object must + * be released). + */ + if ((error == 0 || + uio->uio_resid != orig_resid) && + (vp->v_mount->mnt_flag & + MNT_NOATIME) == 0) + ip->i_flag |= IN_ACCESS; + if (object) { + vm_object_vndeallocate(object); + } + return error; + } + /* + * To get here we didnt't finish or err. + * If we did get some data, + * loop to try another bite. 
+ */ + if (nread > 0) { + continue; + } + } + } +#endif + + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + + /* + * size of buffer. The buffer representing the + * end of the file is rounded up to the size of + * the block type ( fragment or full block, + * depending ). + */ + size = blksize(fs, ip, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + + /* + * The amount we want to transfer in this iteration is + * one FS block less the amount of the data before + * our startpoint (duh!) + */ + xfersize = fs->fs_bsize - blkoffset; + + /* + * But if we actually want less than the block, + * or the file doesn't have a whole block more of data, + * then use the lesser number. + */ + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + + if (lblktosize(fs, nextlbn) >= ip->i_size) { + /* + * Don't do readahead if this is the end of the file. + */ + error = bread(vp, lbn, size, NOCRED, &bp); + } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { + /* + * Otherwise if we are allowed to cluster, + * grab as much as we can. + * + * XXX This may not be a win if we are not + * doing sequential access. + */ + error = cluster_read(vp, ip->i_size, lbn, + size, NOCRED, uio->uio_resid, seqcount, &bp); + } else if (seqcount > 1) { + /* + * If we are NOT allowed to cluster, then + * if we appear to be acting sequentially, + * fire off a request for a readahead + * as well as a read. Note that the 4th and 5th + * arguments point to arrays of the size specified in + * the 6th argument. + */ + int nextsize = blksize(fs, ip, nextlbn); + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, &bp); + } else { + /* + * Failing all of the above, just read what the + * user asked for. Interestingly, the same as + * the first option above. + */ + error = bread(vp, lbn, size, NOCRED, &bp); + } + if (error) { + brelse(bp); + bp = NULL; + break; + } + + /* + * If IO_DIRECT then set B_DIRECT for the buffer. 
This + * will cause us to attempt to release the buffer later on + * and will cause the buffer cache to attempt to free the + * underlying pages. + */ + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + +#ifdef ENABLE_VFS_IOOPT + if (vfs_ioopt && object && + (bp->b_flags & B_VMIO) && + ((blkoffset & PAGE_MASK) == 0) && + ((xfersize & PAGE_MASK) == 0)) { + /* + * If VFS IO optimisation is turned on, + * and it's an exact page multiple + * And a normal VM based op, + * then use uiomiveco() + */ + error = + uiomoveco((char *)bp->b_data + blkoffset, + (int)xfersize, uio, object, 0); + } else +#endif + { + /* + * otherwise use the general form + */ + error = + uiomove((char *)bp->b_data + blkoffset, + (int)xfersize, uio); + } + + if (error) + break; + + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + /* + * If there are no dependencies, and it's VMIO, + * then we don't need the buf, mark it available + * for freeing. The VM has the data. + */ + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + /* + * Otherwise let whoever + * made the request take care of + * freeing it. We just queue + * it onto another list. + */ + bqrelse(bp); + } + } + + /* + * This can only happen in the case of an error + * because the loop above resets bp to NULL on each iteration + * and on normal completion has not set a new value into it. 
+ * so it must have come from a 'break' statement + */ + if (bp != NULL) { + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bqrelse(bp); + } + } + + if (object) { + vm_object_vndeallocate(object); + } + if ((error == 0 || uio->uio_resid != orig_resid) && + (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) + ip->i_flag |= IN_ACCESS; + return (error); +} + +/* + * Vnode op for writing. + */ +int +ffs_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp; + struct uio *uio; + struct inode *ip; + struct fs *fs; + struct buf *bp; + struct thread *td; + ufs_lbn_t lbn; + off_t osize; + int seqcount; + int blkoffset, error, extended, flags, ioflag, resid, size, xfersize; + vm_object_t object; + + if (ap->a_ioflag & IO_EXT) + return (ffs_extwrite(ap)); + + GIANT_REQUIRED; + + extended = 0; + seqcount = ap->a_ioflag >> 16; + ioflag = ap->a_ioflag; + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + + object = vp->v_object; + if (object) { + vm_object_reference(object); + } + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("ffswrite: mode"); +#endif + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->i_size; + if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) { + if (object) { + vm_object_vndeallocate(object); + } + return (EPERM); + } + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + panic("ffswrite: dir write"); + break; + default: + panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type, + (int)uio->uio_offset, + (int)uio->uio_resid + ); + } + + fs = ip->i_fs; + if (uio->uio_offset < 0 || + (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) { + if (object) { + vm_object_vndeallocate(object); + } + return (EFBIG); + } + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no 
limits, I don't think it matters. + */ + td = uio->uio_td; + if (vp->v_type == VREG && td && + uio->uio_offset + uio->uio_resid > + td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + PROC_LOCK(td->td_proc); + psignal(td->td_proc, SIGXFSZ); + PROC_UNLOCK(td->td_proc); + if (object) { + vm_object_vndeallocate(object); + } + return (EFBIG); + } + + resid = uio->uio_resid; + osize = ip->i_size; + flags = 0; + if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) + flags = IO_SYNC; + +#ifdef ENABLE_VFS_IOOPT + if (object && (object->flags & OBJ_OPT)) { + vm_freeze_copyopts(object, + OFF_TO_IDX(uio->uio_offset), + OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK)); + } +#endif + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + + if (uio->uio_offset + xfersize > ip->i_size) + vnode_pager_setsize(vp, uio->uio_offset + xfersize); + + /* + * We must perform a read-before-write if the transfer size + * does not cover the entire buffer. + */ + if (fs->fs_bsize > xfersize) + flags |= BA_CLRBUF; + else + flags &= ~BA_CLRBUF; +/* XXX is uio->uio_offset the right thing here? */ + error = UFS_BALLOC(vp, uio->uio_offset, xfersize, + ap->a_cred, flags, &bp); + if (error != 0) + break; + /* + * If the buffer is not valid we have to clear out any + * garbage data from the pages instantiated for the buffer. + * If we do not, a failed uiomove() during a write can leave + * the prior contents of the pages exposed to a userland + * mmap(). XXX deal with uiomove() errors a better way. 
+ */ + if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) + vfs_bio_clrbuf(bp); + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + if (ioflag & IO_NOWDRAIN) + bp->b_flags |= B_NOWDRAIN; + + if (uio->uio_offset + xfersize > ip->i_size) { + ip->i_size = uio->uio_offset + xfersize; + DIP(ip, i_size) = ip->i_size; + extended = 1; + } + + size = blksize(fs, ip, lbn) - bp->b_resid; + if (size < xfersize) + xfersize = size; + + error = + uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + } + + /* + * If IO_SYNC each buffer is written synchronously. Otherwise + * if we have a severe page deficiency write the buffer + * asynchronously. Otherwise try to cluster, and if that + * doesn't do it then either do an async write (if O_DIRECT), + * or a delayed write (if not). + */ + if (ioflag & IO_SYNC) { + (void)bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + (ioflag & IO_ASYNC)) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); + } else if (xfersize + blkoffset == fs->fs_bsize) { + if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { + bp->b_flags |= B_CLUSTEROK; + cluster_write(bp, ip->i_size, seqcount); + } else { + bawrite(bp); + } + } else if (ioflag & IO_DIRECT) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); + } else { + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + } + if (error || xfersize == 0) + break; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ + if (resid > uio->uio_resid && ap->a_cred && + suser_cred(ap->a_cred, PRISON_ROOT)) { + ip->i_mode &= ~(ISUID | ISGID); + DIP(ip, i_mode) = ip->i_mode; + } + if (resid > uio->uio_resid) + VN_KNOTE(vp, NOTE_WRITE | (extended ? 
NOTE_EXTEND : 0)); + if (error) { + if (ioflag & IO_UNIT) { + (void)UFS_TRUNCATE(vp, osize, + IO_NORMAL | (ioflag & IO_SYNC), + ap->a_cred, uio->uio_td); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) + error = UFS_UPDATE(vp, 1); + + if (object) { + vm_object_vndeallocate(object); + } + + return (error); +} + +/* + * get page routine + */ +int +ffs_getpages(ap) + struct vop_getpages_args *ap; +{ + off_t foff, physoffset; + int i, size, bsize; + struct vnode *dp, *vp; + vm_object_t obj; + vm_pindex_t pindex, firstindex; + vm_page_t mreq; + int bbackwards, bforwards; + int pbackwards, pforwards; + int firstpage; + ufs2_daddr_t reqblkno, reqlblkno; + int poff; + int pcount; + int rtval; + int pagesperblock; + + GIANT_REQUIRED; + + pcount = round_page(ap->a_count) / PAGE_SIZE; + mreq = ap->a_m[ap->a_reqpage]; + firstindex = ap->a_m[0]->pindex; + + /* + * if ANY DEV_BSIZE blocks are valid on a large filesystem block, + * then the entire page is valid. Since the page may be mapped, + * user programs might reference data beyond the actual end of file + * occuring within the page. We have to zero that data. 
+ */ + if (mreq->valid) { + if (mreq->valid != VM_PAGE_BITS_ALL) + vm_page_zero_invalid(mreq, TRUE); + vm_page_lock_queues(); + for (i = 0; i < pcount; i++) { + if (i != ap->a_reqpage) { + vm_page_free(ap->a_m[i]); + } + } + vm_page_unlock_queues(); + return VM_PAGER_OK; + } + + vp = ap->a_vp; + obj = vp->v_object; + bsize = vp->v_mount->mnt_stat.f_iosize; + pindex = mreq->pindex; + foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */; + + if (bsize < PAGE_SIZE) + return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, + ap->a_count, + ap->a_reqpage); + + /* + * foff is the file offset of the required page + * reqlblkno is the logical block that contains the page + * poff is the index of the page into the logical block + */ + reqlblkno = foff / bsize; + poff = (foff % bsize) / PAGE_SIZE; + + dp = VTOI(vp)->i_devvp; + if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards) + || (reqblkno == -1)) { + vm_page_lock_queues(); + for(i = 0; i < pcount; i++) { + if (i != ap->a_reqpage) + vm_page_free(ap->a_m[i]); + } + vm_page_unlock_queues(); + if (reqblkno == -1) { + if ((mreq->flags & PG_ZERO) == 0) + vm_page_zero_fill(mreq); + vm_page_undirty(mreq); + mreq->valid = VM_PAGE_BITS_ALL; + return VM_PAGER_OK; + } else { + return VM_PAGER_ERROR; + } + } + + physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE; + pagesperblock = bsize / PAGE_SIZE; + /* + * find the first page that is contiguous... + * note that pbackwards is the number of pages that are contiguous + * backwards. + */ + firstpage = 0; + if (ap->a_count) { + pbackwards = poff + bbackwards * pagesperblock; + if (ap->a_reqpage > pbackwards) { + firstpage = ap->a_reqpage - pbackwards; + vm_page_lock_queues(); + for(i=0;i<firstpage;i++) + vm_page_free(ap->a_m[i]); + vm_page_unlock_queues(); + } + + /* + * pforwards is the number of pages that are contiguous + * after the current page. 
+ */ + pforwards = (pagesperblock - (poff + 1)) + + bforwards * pagesperblock; + if (pforwards < (pcount - (ap->a_reqpage + 1))) { + vm_page_lock_queues(); + for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++) + vm_page_free(ap->a_m[i]); + vm_page_unlock_queues(); + pcount = ap->a_reqpage + pforwards + 1; + } + + /* + * number of pages for I/O corrected for the non-contig pages at + * the beginning of the array. + */ + pcount -= firstpage; + } + + /* + * calculate the size of the transfer + */ + + size = pcount * PAGE_SIZE; + + if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) > + obj->un_pager.vnp.vnp_size) + size = obj->un_pager.vnp.vnp_size - + IDX_TO_OFF(ap->a_m[firstpage]->pindex); + + physoffset -= foff; + rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size, + (ap->a_reqpage - firstpage), physoffset); + + return (rtval); +} + +/* + * Vnode op for extended attribute reading. + */ +/* ARGSUSED */ +static int +ffs_extread(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp; + struct inode *ip; + struct ufs2_dinode *dp; + struct uio *uio; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error, orig_resid; + mode_t mode; + int ioflag; + + GIANT_REQUIRED; + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_fs; + dp = ip->i_din2; + mode = ip->i_mode; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) + panic("ffs_extread: mode"); + +#endif + orig_resid = uio->uio_resid; + if (orig_resid <= 0) + return (0); + + bytesinfile = dp->di_extsize - uio->uio_offset; + if (bytesinfile <= 0) { + if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) + ip->i_flag |= IN_ACCESS; + return 0; + } + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) + break; + + lbn = 
lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + + /* + * size of buffer. The buffer representing the + * end of the file is rounded up to the size of + * the block type ( fragment or full block, + * depending ). + */ + size = sblksize(fs, dp->di_extsize, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + + /* + * The amount we want to transfer in this iteration is + * one FS block less the amount of the data before + * our startpoint (duh!) + */ + xfersize = fs->fs_bsize - blkoffset; + + /* + * But if we actually want less than the block, + * or the file doesn't have a whole block more of data, + * then use the lesser number. + */ + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + + if (lblktosize(fs, nextlbn) >= dp->di_extsize) { + /* + * Don't do readahead if this is the end of the info. + */ + error = bread(vp, -1 - lbn, size, NOCRED, &bp); + } else { + /* + * If we have a second block, then + * fire off a request for a readahead + * as well as a read. Note that the 4th and 5th + * arguments point to arrays of the size specified in + * the 6th argument. + */ + int nextsize = sblksize(fs, dp->di_extsize, nextlbn); + + nextlbn = -1 - nextlbn; + error = breadn(vp, -1 - lbn, + size, &nextlbn, &nextsize, 1, NOCRED, &bp); + } + if (error) { + brelse(bp); + bp = NULL; + break; + } + + /* + * If IO_DIRECT then set B_DIRECT for the buffer. This + * will cause us to attempt to release the buffer later on + * and will cause the buffer cache to attempt to free the + * underlying pages. + */ + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. 
+ */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + + error = uiomove((char *)bp->b_data + blkoffset, + (int)xfersize, uio); + if (error) + break; + + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + /* + * If there are no dependencies, and it's VMIO, + * then we don't need the buf, mark it available + * for freeing. The VM has the data. + */ + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + /* + * Otherwise let whoever + * made the request take care of + * freeing it. We just queue + * it onto another list. + */ + bqrelse(bp); + } + } + + /* + * This can only happen in the case of an error + * because the loop above resets bp to NULL on each iteration + * and on normal completion has not set a new value into it. + * so it must have come from a 'break' statement + */ + if (bp != NULL) { + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bqrelse(bp); + } + } + + if ((error == 0 || uio->uio_resid != orig_resid) && + (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) + ip->i_flag |= IN_ACCESS; + return (error); +} + +/* + * Vnode op for external attribute writing. 
+ */ +static int +ffs_extwrite(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp; + struct uio *uio; + struct inode *ip; + struct ufs2_dinode *dp; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn; + off_t osize; + int blkoffset, error, flags, ioflag, resid, size, xfersize; + + GIANT_REQUIRED; + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_fs; + dp = ip->i_din2; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) + panic("ext_write: mode"); +#endif + + if (ioflag & IO_APPEND) + uio->uio_offset = dp->di_extsize; + + if (uio->uio_offset < 0 || + (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) + return (EFBIG); + + resid = uio->uio_resid; + osize = dp->di_extsize; + flags = IO_EXT; + if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) + flags |= IO_SYNC; + + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + + /* + * We must perform a read-before-write if the transfer size + * does not cover the entire buffer. + */ + if (fs->fs_bsize > xfersize) + flags |= BA_CLRBUF; + else + flags &= ~BA_CLRBUF; + error = UFS_BALLOC(vp, uio->uio_offset, xfersize, + ap->a_cred, flags, &bp); + if (error != 0) + break; + /* + * If the buffer is not valid we have to clear out any + * garbage data from the pages instantiated for the buffer. + * If we do not, a failed uiomove() during a write can leave + * the prior contents of the pages exposed to a userland + * mmap(). XXX deal with uiomove() errors a better way. 
+ */ + if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) + vfs_bio_clrbuf(bp); + if (ioflag & IO_DIRECT) + bp->b_flags |= B_DIRECT; + if (ioflag & IO_NOWDRAIN) + bp->b_flags |= B_NOWDRAIN; + + if (uio->uio_offset + xfersize > dp->di_extsize) + dp->di_extsize = uio->uio_offset + xfersize; + + size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; + if (size < xfersize) + xfersize = size; + + error = + uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + if ((ioflag & (IO_VMIO|IO_DIRECT)) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + } + + /* + * If IO_SYNC each buffer is written synchronously. Otherwise + * if we have a severe page deficiency write the buffer + * asynchronously. Otherwise try to cluster, and if that + * doesn't do it then either do an async write (if O_DIRECT), + * or a delayed write (if not). + */ + if (ioflag & IO_SYNC) { + (void)bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + xfersize + blkoffset == fs->fs_bsize || + (ioflag & (IO_ASYNC | IO_DIRECT))) + bawrite(bp); + else + bdwrite(bp); + if (error || xfersize == 0) + break; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. 
+ */ + if (resid > uio->uio_resid && ap->a_cred && + suser_cred(ap->a_cred, PRISON_ROOT)) { + ip->i_mode &= ~(ISUID | ISGID); + dp->di_mode = ip->i_mode; + } + if (error) { + if (ioflag & IO_UNIT) { + (void)UFS_TRUNCATE(vp, osize, + IO_EXT | (ioflag&IO_SYNC), ap->a_cred, uio->uio_td); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) + error = UFS_UPDATE(vp, 1); + return (error); +} diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c deleted file mode 100644 index a37b13b..0000000 --- a/sys/ufs/ufs/ufs_readwrite.c +++ /dev/null @@ -1,1073 +0,0 @@ -/*- - * Copyright (c) 2002 Networks Associates Technology, Inc. - * All rights reserved. - * - * This software was developed for the FreeBSD Project by Marshall - * Kirk McKusick and Network Associates Laboratories, the Security - * Research Division of Network Associates, Inc. under DARPA/SPAWAR - * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS - * research program - * - * Copyright (c) 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 - * $FreeBSD$ - */ - -#define BLKSIZE(a, b, c) blksize(a, b, c) -#define FS struct fs -#define I_FS i_fs -#define READ ffs_read -#define READ_S "ffs_read" -#define WRITE ffs_write -#define WRITE_S "ffs_write" - -#include <vm/vm.h> -#include <vm/vm_object.h> -#include <vm/vm_pager.h> -#include <vm/vm_map.h> -#include <vm/vnode_pager.h> -#include <sys/event.h> -#include <sys/vmmeter.h> - -/* - * Vnode op for reading. 
- */ -/* ARGSUSED */ -int -READ(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - struct vnode *vp; - struct inode *ip; - struct uio *uio; - FS *fs; - struct buf *bp; - ufs_lbn_t lbn, nextlbn; - off_t bytesinfile; - long size, xfersize, blkoffset; - int error, orig_resid; - mode_t mode; - int seqcount; - int ioflag; - vm_object_t object; - - if (ap->a_ioflag & IO_EXT) - return (ufs_extread(ap)); - - GIANT_REQUIRED; - - vp = ap->a_vp; - seqcount = ap->a_ioflag >> 16; - ip = VTOI(vp); - mode = ip->i_mode; - uio = ap->a_uio; - ioflag = ap->a_ioflag; - -#ifdef DIAGNOSTIC - if (uio->uio_rw != UIO_READ) - panic("%s: mode", READ_S); - - if (vp->v_type == VLNK) { - if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) - panic("%s: short symlink", READ_S); - } else if (vp->v_type != VREG && vp->v_type != VDIR) - panic("%s: type %d", READ_S, vp->v_type); -#endif - fs = ip->I_FS; - if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize) - return (EFBIG); - - orig_resid = uio->uio_resid; - if (orig_resid <= 0) - return (0); - - object = vp->v_object; - - bytesinfile = ip->i_size - uio->uio_offset; - if (bytesinfile <= 0) { - if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) - ip->i_flag |= IN_ACCESS; - return 0; - } - - if (object) { - vm_object_reference(object); - } - -#ifdef ENABLE_VFS_IOOPT - /* - * If IO optimisation is turned on, - * and we are NOT a VM based IO request, - * (i.e. not headed for the buffer cache) - * but there IS a vm object associated with it. 
- */ - if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) { - int nread, toread; - - toread = uio->uio_resid; - if (toread > bytesinfile) - toread = bytesinfile; - if (toread >= PAGE_SIZE) { - /* - * Then if it's at least a page in size, try - * get the data from the object using vm tricks - */ - error = uioread(toread, uio, object, &nread); - if ((uio->uio_resid == 0) || (error != 0)) { - /* - * If we finished or there was an error - * then finish up (the reference previously - * obtained on object must be released). - */ - if ((error == 0 || - uio->uio_resid != orig_resid) && - (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) - ip->i_flag |= IN_ACCESS; - - if (object) { - vm_object_vndeallocate(object); - } - return error; - } - } - } -#endif - - /* - * Ok so we couldn't do it all in one vm trick... - * so cycle around trying smaller bites.. - */ - for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { - if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) - break; -#ifdef ENABLE_VFS_IOOPT - if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) { - /* - * Obviously we didn't finish above, but we - * didn't get an error either. Try the same trick again. - * but this time we are looping. - */ - int nread, toread; - toread = uio->uio_resid; - if (toread > bytesinfile) - toread = bytesinfile; - - /* - * Once again, if there isn't enough for a - * whole page, don't try optimising. - */ - if (toread >= PAGE_SIZE) { - error = uioread(toread, uio, object, &nread); - if ((uio->uio_resid == 0) || (error != 0)) { - /* - * If we finished or there was an - * error then finish up (the reference - * previously obtained on object must - * be released). - */ - if ((error == 0 || - uio->uio_resid != orig_resid) && - (vp->v_mount->mnt_flag & - MNT_NOATIME) == 0) - ip->i_flag |= IN_ACCESS; - if (object) { - vm_object_vndeallocate(object); - } - return error; - } - /* - * To get here we didnt't finish or err. - * If we did get some data, - * loop to try another bite. 
- */ - if (nread > 0) { - continue; - } - } - } -#endif - - lbn = lblkno(fs, uio->uio_offset); - nextlbn = lbn + 1; - - /* - * size of buffer. The buffer representing the - * end of the file is rounded up to the size of - * the block type ( fragment or full block, - * depending ). - */ - size = BLKSIZE(fs, ip, lbn); - blkoffset = blkoff(fs, uio->uio_offset); - - /* - * The amount we want to transfer in this iteration is - * one FS block less the amount of the data before - * our startpoint (duh!) - */ - xfersize = fs->fs_bsize - blkoffset; - - /* - * But if we actually want less than the block, - * or the file doesn't have a whole block more of data, - * then use the lesser number. - */ - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; - if (bytesinfile < xfersize) - xfersize = bytesinfile; - - if (lblktosize(fs, nextlbn) >= ip->i_size) { - /* - * Don't do readahead if this is the end of the file. - */ - error = bread(vp, lbn, size, NOCRED, &bp); - } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { - /* - * Otherwise if we are allowed to cluster, - * grab as much as we can. - * - * XXX This may not be a win if we are not - * doing sequential access. - */ - error = cluster_read(vp, ip->i_size, lbn, - size, NOCRED, uio->uio_resid, seqcount, &bp); - } else if (seqcount > 1) { - /* - * If we are NOT allowed to cluster, then - * if we appear to be acting sequentially, - * fire off a request for a readahead - * as well as a read. Note that the 4th and 5th - * arguments point to arrays of the size specified in - * the 6th argument. - */ - int nextsize = BLKSIZE(fs, ip, nextlbn); - error = breadn(vp, lbn, - size, &nextlbn, &nextsize, 1, NOCRED, &bp); - } else { - /* - * Failing all of the above, just read what the - * user asked for. Interestingly, the same as - * the first option above. - */ - error = bread(vp, lbn, size, NOCRED, &bp); - } - if (error) { - brelse(bp); - bp = NULL; - break; - } - - /* - * If IO_DIRECT then set B_DIRECT for the buffer. 
This - * will cause us to attempt to release the buffer later on - * and will cause the buffer cache to attempt to free the - * underlying pages. - */ - if (ioflag & IO_DIRECT) - bp->b_flags |= B_DIRECT; - - /* - * We should only get non-zero b_resid when an I/O error - * has occurred, which should cause us to break above. - * However, if the short read did not cause an error, - * then we want to ensure that we do not uiomove bad - * or uninitialized data. - */ - size -= bp->b_resid; - if (size < xfersize) { - if (size == 0) - break; - xfersize = size; - } - -#ifdef ENABLE_VFS_IOOPT - if (vfs_ioopt && object && - (bp->b_flags & B_VMIO) && - ((blkoffset & PAGE_MASK) == 0) && - ((xfersize & PAGE_MASK) == 0)) { - /* - * If VFS IO optimisation is turned on, - * and it's an exact page multiple - * And a normal VM based op, - * then use uiomoveco() - */ - error = - uiomoveco((char *)bp->b_data + blkoffset, - (int)xfersize, uio, object, 0); - } else -#endif - { - /* - * otherwise use the general form - */ - error = - uiomove((char *)bp->b_data + blkoffset, - (int)xfersize, uio); - } - - if (error) - break; - - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { - /* - * If there are no dependencies, and it's VMIO, - * then we don't need the buf, mark it available - * for freeing. The VM has the data. - */ - bp->b_flags |= B_RELBUF; - brelse(bp); - } else { - /* - * Otherwise let whoever - * made the request take care of - * freeing it. We just queue - * it onto another list. - */ - bqrelse(bp); - } - } - - /* - * This can only happen in the case of an error - * because the loop above resets bp to NULL on each iteration - * and on normal completion has not set a new value into it. 
- * so it must have come from a 'break' statement - */ - if (bp != NULL) { - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { - bp->b_flags |= B_RELBUF; - brelse(bp); - } else { - bqrelse(bp); - } - } - - if (object) { - vm_object_vndeallocate(object); - } - if ((error == 0 || uio->uio_resid != orig_resid) && - (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) - ip->i_flag |= IN_ACCESS; - return (error); -} - -/* - * Vnode op for writing. - */ -int -WRITE(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - struct vnode *vp; - struct uio *uio; - struct inode *ip; - FS *fs; - struct buf *bp; - struct thread *td; - ufs_lbn_t lbn; - off_t osize; - int seqcount; - int blkoffset, error, extended, flags, ioflag, resid, size, xfersize; - vm_object_t object; - - if (ap->a_ioflag & IO_EXT) - return (ufs_extwrite(ap)); - - GIANT_REQUIRED; - - extended = 0; - seqcount = ap->a_ioflag >> 16; - ioflag = ap->a_ioflag; - uio = ap->a_uio; - vp = ap->a_vp; - ip = VTOI(vp); - - object = vp->v_object; - if (object) { - vm_object_reference(object); - } - -#ifdef DIAGNOSTIC - if (uio->uio_rw != UIO_WRITE) - panic("%s: mode", WRITE_S); -#endif - - switch (vp->v_type) { - case VREG: - if (ioflag & IO_APPEND) - uio->uio_offset = ip->i_size; - if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) { - if (object) { - vm_object_vndeallocate(object); - } - return (EPERM); - } - /* FALLTHROUGH */ - case VLNK: - break; - case VDIR: - panic("%s: dir write", WRITE_S); - break; - default: - panic("%s: type %p %d (%d,%d)", WRITE_S, vp, (int)vp->v_type, - (int)uio->uio_offset, - (int)uio->uio_resid - ); - } - - fs = ip->I_FS; - if (uio->uio_offset < 0 || - (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) { - if (object) { - vm_object_vndeallocate(object); - } - return (EFBIG); - } - /* - * Maybe this should be above the vnode op call, but so long as - * file servers have no 
limits, I don't think it matters. - */ - td = uio->uio_td; - if (vp->v_type == VREG && td && - uio->uio_offset + uio->uio_resid > - td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { - PROC_LOCK(td->td_proc); - psignal(td->td_proc, SIGXFSZ); - PROC_UNLOCK(td->td_proc); - if (object) { - vm_object_vndeallocate(object); - } - return (EFBIG); - } - - resid = uio->uio_resid; - osize = ip->i_size; - flags = 0; - if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) - flags = IO_SYNC; - -#ifdef ENABLE_VFS_IOOPT - if (object && (object->flags & OBJ_OPT)) { - vm_freeze_copyopts(object, - OFF_TO_IDX(uio->uio_offset), - OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK)); - } -#endif - for (error = 0; uio->uio_resid > 0;) { - lbn = lblkno(fs, uio->uio_offset); - blkoffset = blkoff(fs, uio->uio_offset); - xfersize = fs->fs_bsize - blkoffset; - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; - - if (uio->uio_offset + xfersize > ip->i_size) - vnode_pager_setsize(vp, uio->uio_offset + xfersize); - - /* - * We must perform a read-before-write if the transfer size - * does not cover the entire buffer. - */ - if (fs->fs_bsize > xfersize) - flags |= BA_CLRBUF; - else - flags &= ~BA_CLRBUF; -/* XXX is uio->uio_offset the right thing here? */ - error = UFS_BALLOC(vp, uio->uio_offset, xfersize, - ap->a_cred, flags, &bp); - if (error != 0) - break; - /* - * If the buffer is not valid we have to clear out any - * garbage data from the pages instantiated for the buffer. - * If we do not, a failed uiomove() during a write can leave - * the prior contents of the pages exposed to a userland - * mmap(). XXX deal with uiomove() errors a better way. 
- */ - if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) - vfs_bio_clrbuf(bp); - if (ioflag & IO_DIRECT) - bp->b_flags |= B_DIRECT; - if (ioflag & IO_NOWDRAIN) - bp->b_flags |= B_NOWDRAIN; - - if (uio->uio_offset + xfersize > ip->i_size) { - ip->i_size = uio->uio_offset + xfersize; - DIP(ip, i_size) = ip->i_size; - extended = 1; - } - - size = BLKSIZE(fs, ip, lbn) - bp->b_resid; - if (size < xfersize) - xfersize = size; - - error = - uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { - bp->b_flags |= B_RELBUF; - } - - /* - * If IO_SYNC each buffer is written synchronously. Otherwise - * if we have a severe page deficiency write the buffer - * asynchronously. Otherwise try to cluster, and if that - * doesn't do it then either do an async write (if O_DIRECT), - * or a delayed write (if not). - */ - if (ioflag & IO_SYNC) { - (void)bwrite(bp); - } else if (vm_page_count_severe() || - buf_dirty_count_severe() || - (ioflag & IO_ASYNC)) { - bp->b_flags |= B_CLUSTEROK; - bawrite(bp); - } else if (xfersize + blkoffset == fs->fs_bsize) { - if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { - bp->b_flags |= B_CLUSTEROK; - cluster_write(bp, ip->i_size, seqcount); - } else { - bawrite(bp); - } - } else if (ioflag & IO_DIRECT) { - bp->b_flags |= B_CLUSTEROK; - bawrite(bp); - } else { - bp->b_flags |= B_CLUSTEROK; - bdwrite(bp); - } - if (error || xfersize == 0) - break; - ip->i_flag |= IN_CHANGE | IN_UPDATE; - } - /* - * If we successfully wrote any data, and we are not the superuser - * we clear the setuid and setgid bits as a precaution against - * tampering. - */ - if (resid > uio->uio_resid && ap->a_cred && - suser_cred(ap->a_cred, PRISON_ROOT)) { - ip->i_mode &= ~(ISUID | ISGID); - DIP(ip, i_mode) = ip->i_mode; - } - if (resid > uio->uio_resid) - VN_KNOTE(vp, NOTE_WRITE | (extended ? 
NOTE_EXTEND : 0)); - if (error) { - if (ioflag & IO_UNIT) { - (void)UFS_TRUNCATE(vp, osize, - IO_NORMAL | (ioflag & IO_SYNC), - ap->a_cred, uio->uio_td); - uio->uio_offset -= resid - uio->uio_resid; - uio->uio_resid = resid; - } - } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) - error = UFS_UPDATE(vp, 1); - - if (object) { - vm_object_vndeallocate(object); - } - - return (error); -} - -/* - * get page routine - */ -int -ffs_getpages(ap) - struct vop_getpages_args *ap; -{ - off_t foff, physoffset; - int i, size, bsize; - struct vnode *dp, *vp; - vm_object_t obj; - vm_pindex_t pindex, firstindex; - vm_page_t mreq; - int bbackwards, bforwards; - int pbackwards, pforwards; - int firstpage; - ufs2_daddr_t reqblkno, reqlblkno; - int poff; - int pcount; - int rtval; - int pagesperblock; - - GIANT_REQUIRED; - - pcount = round_page(ap->a_count) / PAGE_SIZE; - mreq = ap->a_m[ap->a_reqpage]; - firstindex = ap->a_m[0]->pindex; - - /* - * if ANY DEV_BSIZE blocks are valid on a large filesystem block, - * then the entire page is valid. Since the page may be mapped, - * user programs might reference data beyond the actual end of file - * occurring within the page. We have to zero that data. 
- */ - if (mreq->valid) { - if (mreq->valid != VM_PAGE_BITS_ALL) - vm_page_zero_invalid(mreq, TRUE); - vm_page_lock_queues(); - for (i = 0; i < pcount; i++) { - if (i != ap->a_reqpage) { - vm_page_free(ap->a_m[i]); - } - } - vm_page_unlock_queues(); - return VM_PAGER_OK; - } - - vp = ap->a_vp; - obj = vp->v_object; - bsize = vp->v_mount->mnt_stat.f_iosize; - pindex = mreq->pindex; - foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */; - - if (bsize < PAGE_SIZE) - return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, - ap->a_count, - ap->a_reqpage); - - /* - * foff is the file offset of the required page - * reqlblkno is the logical block that contains the page - * poff is the index of the page into the logical block - */ - reqlblkno = foff / bsize; - poff = (foff % bsize) / PAGE_SIZE; - - dp = VTOI(vp)->i_devvp; - if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards) - || (reqblkno == -1)) { - vm_page_lock_queues(); - for(i = 0; i < pcount; i++) { - if (i != ap->a_reqpage) - vm_page_free(ap->a_m[i]); - } - vm_page_unlock_queues(); - if (reqblkno == -1) { - if ((mreq->flags & PG_ZERO) == 0) - vm_page_zero_fill(mreq); - vm_page_undirty(mreq); - mreq->valid = VM_PAGE_BITS_ALL; - return VM_PAGER_OK; - } else { - return VM_PAGER_ERROR; - } - } - - physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE; - pagesperblock = bsize / PAGE_SIZE; - /* - * find the first page that is contiguous... - * note that pbackwards is the number of pages that are contiguous - * backwards. - */ - firstpage = 0; - if (ap->a_count) { - pbackwards = poff + bbackwards * pagesperblock; - if (ap->a_reqpage > pbackwards) { - firstpage = ap->a_reqpage - pbackwards; - vm_page_lock_queues(); - for(i=0;i<firstpage;i++) - vm_page_free(ap->a_m[i]); - vm_page_unlock_queues(); - } - - /* - * pforwards is the number of pages that are contiguous - * after the current page. 
- */ - pforwards = (pagesperblock - (poff + 1)) + - bforwards * pagesperblock; - if (pforwards < (pcount - (ap->a_reqpage + 1))) { - vm_page_lock_queues(); - for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++) - vm_page_free(ap->a_m[i]); - vm_page_unlock_queues(); - pcount = ap->a_reqpage + pforwards + 1; - } - - /* - * number of pages for I/O corrected for the non-contig pages at - * the beginning of the array. - */ - pcount -= firstpage; - } - - /* - * calculate the size of the transfer - */ - - size = pcount * PAGE_SIZE; - - if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) > - obj->un_pager.vnp.vnp_size) - size = obj->un_pager.vnp.vnp_size - - IDX_TO_OFF(ap->a_m[firstpage]->pindex); - - physoffset -= foff; - rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size, - (ap->a_reqpage - firstpage), physoffset); - - return (rtval); -} - -/* - * Vnode op for extended attribute reading. - */ -/* ARGSUSED */ -int -ufs_extread(ap) - struct vop_read_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - struct vnode *vp; - struct inode *ip; - struct ufs2_dinode *dp; - struct uio *uio; - struct fs *fs; - struct buf *bp; - ufs_lbn_t lbn, nextlbn; - off_t bytesinfile; - long size, xfersize, blkoffset; - int error, orig_resid; - mode_t mode; - int ioflag; - - GIANT_REQUIRED; - - vp = ap->a_vp; - ip = VTOI(vp); - fs = ip->i_fs; - dp = ip->i_din2; - mode = ip->i_mode; - uio = ap->a_uio; - ioflag = ap->a_ioflag; - -#ifdef DIAGNOSTIC - if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) - panic("ufs_extread: mode"); - -#endif - orig_resid = uio->uio_resid; - if (orig_resid <= 0) - return (0); - - bytesinfile = dp->di_extsize - uio->uio_offset; - if (bytesinfile <= 0) { - if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) - ip->i_flag |= IN_ACCESS; - return 0; - } - - for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { - if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) - break; - - lbn = lblkno(fs, 
uio->uio_offset); - nextlbn = lbn + 1; - - /* - * size of buffer. The buffer representing the - * end of the file is rounded up to the size of - * the block type ( fragment or full block, - * depending ). - */ - size = sblksize(fs, dp->di_extsize, lbn); - blkoffset = blkoff(fs, uio->uio_offset); - - /* - * The amount we want to transfer in this iteration is - * one FS block less the amount of the data before - * our startpoint (duh!) - */ - xfersize = fs->fs_bsize - blkoffset; - - /* - * But if we actually want less than the block, - * or the file doesn't have a whole block more of data, - * then use the lesser number. - */ - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; - if (bytesinfile < xfersize) - xfersize = bytesinfile; - - if (lblktosize(fs, nextlbn) >= dp->di_extsize) { - /* - * Don't do readahead if this is the end of the info. - */ - error = bread(vp, -1 - lbn, size, NOCRED, &bp); - } else { - /* - * If we have a second block, then - * fire off a request for a readahead - * as well as a read. Note that the 4th and 5th - * arguments point to arrays of the size specified in - * the 6th argument. - */ - int nextsize = sblksize(fs, dp->di_extsize, nextlbn); - - nextlbn = -1 - nextlbn; - error = breadn(vp, -1 - lbn, - size, &nextlbn, &nextsize, 1, NOCRED, &bp); - } - if (error) { - brelse(bp); - bp = NULL; - break; - } - - /* - * If IO_DIRECT then set B_DIRECT for the buffer. This - * will cause us to attempt to release the buffer later on - * and will cause the buffer cache to attempt to free the - * underlying pages. - */ - if (ioflag & IO_DIRECT) - bp->b_flags |= B_DIRECT; - - /* - * We should only get non-zero b_resid when an I/O error - * has occurred, which should cause us to break above. - * However, if the short read did not cause an error, - * then we want to ensure that we do not uiomove bad - * or uninitialized data. 
- */ - size -= bp->b_resid; - if (size < xfersize) { - if (size == 0) - break; - xfersize = size; - } - - error = uiomove((char *)bp->b_data + blkoffset, - (int)xfersize, uio); - if (error) - break; - - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { - /* - * If there are no dependencies, and it's VMIO, - * then we don't need the buf, mark it available - * for freeing. The VM has the data. - */ - bp->b_flags |= B_RELBUF; - brelse(bp); - } else { - /* - * Otherwise let whoever - * made the request take care of - * freeing it. We just queue - * it onto another list. - */ - bqrelse(bp); - } - } - - /* - * This can only happen in the case of an error - * because the loop above resets bp to NULL on each iteration - * and on normal completion has not set a new value into it. - * so it must have come from a 'break' statement - */ - if (bp != NULL) { - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { - bp->b_flags |= B_RELBUF; - brelse(bp); - } else { - bqrelse(bp); - } - } - - if ((error == 0 || uio->uio_resid != orig_resid) && - (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) - ip->i_flag |= IN_ACCESS; - return (error); -} - -/* - * Vnode op for extended attribute writing. 
- */ -int -ufs_extwrite(ap) - struct vop_write_args /* { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - struct vnode *vp; - struct uio *uio; - struct inode *ip; - struct ufs2_dinode *dp; - struct fs *fs; - struct buf *bp; - ufs_lbn_t lbn; - off_t osize; - int blkoffset, error, flags, ioflag, resid, size, xfersize; - - GIANT_REQUIRED; - - vp = ap->a_vp; - ip = VTOI(vp); - fs = ip->i_fs; - dp = ip->i_din2; - uio = ap->a_uio; - ioflag = ap->a_ioflag; - -#ifdef DIAGNOSTIC - if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) - panic("ext_write: mode"); -#endif - - if (ioflag & IO_APPEND) - uio->uio_offset = dp->di_extsize; - - if (uio->uio_offset < 0 || - (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) - return (EFBIG); - - resid = uio->uio_resid; - osize = dp->di_extsize; - flags = IO_EXT; - if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) - flags |= IO_SYNC; - - for (error = 0; uio->uio_resid > 0;) { - lbn = lblkno(fs, uio->uio_offset); - blkoffset = blkoff(fs, uio->uio_offset); - xfersize = fs->fs_bsize - blkoffset; - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; - - /* - * We must perform a read-before-write if the transfer size - * does not cover the entire buffer. - */ - if (fs->fs_bsize > xfersize) - flags |= BA_CLRBUF; - else - flags &= ~BA_CLRBUF; - error = UFS_BALLOC(vp, uio->uio_offset, xfersize, - ap->a_cred, flags, &bp); - if (error != 0) - break; - /* - * If the buffer is not valid we have to clear out any - * garbage data from the pages instantiated for the buffer. - * If we do not, a failed uiomove() during a write can leave - * the prior contents of the pages exposed to a userland - * mmap(). XXX deal with uiomove() errors a better way. 
- */ - if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) - vfs_bio_clrbuf(bp); - if (ioflag & IO_DIRECT) - bp->b_flags |= B_DIRECT; - if (ioflag & IO_NOWDRAIN) - bp->b_flags |= B_NOWDRAIN; - - if (uio->uio_offset + xfersize > dp->di_extsize) - dp->di_extsize = uio->uio_offset + xfersize; - - size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; - if (size < xfersize) - xfersize = size; - - error = - uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); - if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { - bp->b_flags |= B_RELBUF; - } - - /* - * If IO_SYNC each buffer is written synchronously. Otherwise - * if we have a severe page deficiency write the buffer - * asynchronously. Otherwise try to cluster, and if that - * doesn't do it then either do an async write (if O_DIRECT), - * or a delayed write (if not). - */ - if (ioflag & IO_SYNC) { - (void)bwrite(bp); - } else if (vm_page_count_severe() || - buf_dirty_count_severe() || - xfersize + blkoffset == fs->fs_bsize || - (ioflag & (IO_ASYNC | IO_DIRECT))) - bawrite(bp); - else - bdwrite(bp); - if (error || xfersize == 0) - break; - ip->i_flag |= IN_CHANGE | IN_UPDATE; - } - /* - * If we successfully wrote any data, and we are not the superuser - * we clear the setuid and setgid bits as a precaution against - * tampering. - */ - if (resid > uio->uio_resid && ap->a_cred && - suser_cred(ap->a_cred, PRISON_ROOT)) { - ip->i_mode &= ~(ISUID | ISGID); - dp->di_mode = ip->i_mode; - } - if (error) { - if (ioflag & IO_UNIT) { - (void)UFS_TRUNCATE(vp, osize, - IO_EXT | (ioflag&IO_SYNC), ap->a_cred, uio->uio_td); - uio->uio_offset -= resid - uio->uio_resid; - uio->uio_resid = resid; - } - } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) - error = UFS_UPDATE(vp, 1); - return (error); -} |