From 49095e88f133de688d2f8a6cf179be6c7add5285 Mon Sep 17 00:00:00 2001 From: julian Date: Tue, 19 May 1998 19:47:22 +0000 Subject: Import the earliest version of the soft update code that I have. --- sys/contrib/softupdates/README | 251 +++ sys/contrib/softupdates/ffs_softdep.c | 3851 +++++++++++++++++++++++++++++++++ sys/contrib/softupdates/softdep.h | 520 +++++ 3 files changed, 4622 insertions(+) create mode 100644 sys/contrib/softupdates/README create mode 100644 sys/contrib/softupdates/ffs_softdep.c create mode 100644 sys/contrib/softupdates/softdep.h (limited to 'sys/contrib/softupdates') diff --git a/sys/contrib/softupdates/README b/sys/contrib/softupdates/README new file mode 100644 index 0000000..d4676c9 --- /dev/null +++ b/sys/contrib/softupdates/README @@ -0,0 +1,251 @@ +Introduction + +This package constitutes the alpha distribution of the soft update +code updates for the fast filesystem. + +Status + +My `filesystem torture tests' (described below) run for days without +a hitch (no panic's, hangs, filesystem corruption, or memory leaks). +However, I have had several panic's reported to me by folks that +are field testing the code which I have not yet been able to +reproduce or fix. Although these panic's are rare and do not cause +filesystem corruption, the code should only be put into production +on systems where the system administrator is aware that it is being +run, and knows how to turn it off if problems arise. Thus, you may +hand out this code to others, but please ensure that this status +message is included with any distributions. Please also include +the file ffs_softdep.stub.c in any distributions so that folks that +cannot abide by the need to redistribute source will not be left +with a kernel that will not link. It will resolve all the calls +into the soft update code and simply ignores the request to enable +them. Thus you will be able to ensure that your other hooks have +not broken anything and that your kernel is softdep-ready for those +that wish to use them. Please report problems back to me with +kernel backtraces of panics if possible. This is massively complex +code, and people only have to have their filesystems hosed once or +twice to avoid future changes like the plague. I want to find and +fix as many bugs as soon as possible so as to get the code rock +solid before it gets widely released. Please report any bugs that +you uncover to mckusick@mckusick.com. + +Performance + +Running the Andrew Benchmarks yields the following raw data: + + Phase Normal Softdep What it does + 1 3s <1s Creating directories + 2 8s 4s Copying files + 3 6s 6s Recursive directory stats + 4 8s 9s Scanning each file + 5 25s 25s Compilation + + Normal: 19.9u 29.2s 0:52.8 135+630io + Softdep: 20.3u 28.5s 0:47.8 103+363io + +Another interesting datapoint are my `filesystem torture tests'. +They consist of 1000 runs of the andrew benchmarks, 1000 copy and +removes of /etc with randomly selected pauses of 0-60 seconds +between each copy and remove, and 500 find from / with randomly +selected pauses of 100 seconds between each run). The run of the +torture test compares as follows: + +With soft updates: writes: 6 sync, 1,113,686 async; run time 19hr, 50min +Normal filesystem: writes: 1,459,147 sync, 487,031 async; run time 27hr, 15min + +The upshot is 42% less I/O and 28% shorter running time. + +Another interesting test point is a full MAKEDEV. Because it runs +as a shell script, it becomes mostly limited by the execution speed +of the machine on which it runs. Here are the numbers: + +With soft updates: + + labrat# time ./MAKEDEV std + 2.2u 32.6s 0:34.82 100.0% 0+0k 11+36io 0pf+0w + + labrat# ls | wc + 522 522 3317 + +Without soft updates: + + labrat# time ./MAKEDEV std + 2.0u 40.5s 0:42.53 100.0% 0+0k 11+1221io 0pf+0w + + labrat# ls | wc + 522 522 3317 + +Of course, some of the system time is being pushed +to the syncer process, but that is a different story. + +To show a benchmark designed to highlight the soft update code +consider a tar of zero-sized files and an rm -rf of a directory tree +that has at least 50 files or so at each level. Running a test with +a directory tree containing 28 directories holding 202 empty files +produces the following numbers: + +With soft updates: +tar: 0.0u 0.5s 0:00.65 76.9% 0+0k 0+44io 0pf+0w (0 sync, 33 async writes) +rm: 0.0u 0.2s 0:00.20 100.0% 0+0k 0+37io 0pf+0w (0 sync, 72 async writes) + +Normal filesystem: +tar: 0.0u 1.1s 0:07.27 16.5% 0+0k 60+586io 0pf+0w (523 sync, 0 async writes) +rm: 0.0u 0.5s 0:01.84 29.3% 0+0k 0+318io 0pf+0w (258 sync, 65 async writes) + +The large reduction in writes is because inodes are clustered, so +most of a block gets allocated, then the whole block is written +out once rather than having the same block written once for each +inode allocated from it. Similarly each directory block is written +once rather than once for each new directory entry. Effectively +what the update code is doing is allocating a bunch of inodes +and directory entries without writing anything, then ensuring that +the block containing the inodes is written first followed by the +directory block that references them. If there were data in the +files it would further ensure that the data blocks were written +before their inodes claimed them. + +Copyright Restrictions + +Please familiarize yourself with the copyright restrictions +contained at the top of either the sys/ufs/ffs/softdep.h or +sys/ufs/ffs/ffs_softdep.c file. The key provision is similar +to the one used by the DB 2.0 package and goes as follows: + + Redistributions in any form must be accompanied by information + on how to obtain complete source code for any accompanying + software that uses the this software. This source code must + either be included in the distribution or be available for + no more than the cost of distribution plus a nominal fee, + and must be freely redistributable under reasonable + conditions. For an executable file, complete source code + means the source code for all modules it contains. It does + not mean source code for modules or files that typically + accompany the operating system on which the executable file + runs, e.g., standard library modules or system header files. + +The idea is to allow those of you freely redistributing your source +to use it while retaining for myself the right to peddle it for +money to the commercial UNIX vendors. Note that I have included a +stub file ffs_softdep.c.stub that is freely redistributable so that +you can put in all the necessary hooks to run the full soft updates +code, but still allow vendors that want to maintain proprietary +source to have a working system. I do plan to release the code with +a `Berkeley style' copyright once I have peddled it around to the +commercial vendors. If you have concerns about this copyright, +feel free to contact me with them and we can try to resolve any +difficulties. + +Soft Dependency Operation + +The soft update implementation does NOT require ANY changes +to the on-disk format of your filesystems. Furthermore it is +not used by default for any filesystems. It must be enabled on +a filesystem by filesystem basis by running tunefs to set a +bit in the superblock indicating that the filesystem should be +managed using soft updates. If you wish to stop using +soft updates due to performance or reliability reasons, +you can simply run tunefs on it again to turn off the bit and +revert to normal operation. The additional dynamic memory load +placed on the kernel malloc arena is approximately equal to +the amount of memory used by vnodes plus inodes (for a system +with 1000 vnodes, the additional peak memory load is about 300K). + +Kernel Changes + +There are two new changes to the kernel functionality that are not +contained in in the soft update files. The first is a `trickle +sync' facility running in the kernel as process 3. This trickle +sync process replaces the traditional `update' program (which should +be commented out of the /etc/rc startup script). When a vnode is +first written it is placed 30 seconds down on the trickle sync +queue. If it still exists and has dirty data when it reaches the +top of the queue, it is sync'ed. This approach evens out the load +on the underlying I/O system and avoids writing short-lived files. +The papers on trickle-sync tend to favor aging based on buffers +rather than files. However, I sync on file age rather than buffer +age because the data structures are much smaller as there are +typically far fewer files than buffers. Although this can make the +I/O spikey when a big file times out, it is still much better than +the wholesale sync's that were happening before. It also adapts +much better to the soft update code where I want to control +aging to improve performance (inodes age in 10 seconds, directories +in 15 seconds, files in 30 seconds). This ensures that most +dependencies are gone (e.g., inodes are written when directory +entries want to go to disk) reducing the amount of rollback that +is needed. + +The other main kernel change is to split the vnode freelist into +two separate lists. One for vnodes that are still being used to +identify buffers and the other for those vnodes no longer identifying +any buffers. The latter list is used by getnewvnode in preference +to the former. + +Packaging of Kernel Changes + +The sys subdirectory contains the changes and additions to the +kernel. My goal in writing this code was to minimize the changes +that need to be made to the kernel. Thus, most of the new code +is contained in the two new files softdep.h and ffs_softdep.c. +The rest of the kernel changes are simply inserting hooks to +call into these two new files. Although there has been some +structural reorganization of the filesystem code to accommodate +gathering the information required by the soft update code, +the actual ordering of filesystem operations when soft updates +are disabled is unchanged. + +The kernel changes are packaged as a set of diffs. As I am +doing my development in BSD/OS, the diffs are relative to the +BSD/OS versions of the files. Because BSD/OS recently had +4.4BSD-Lite2 merged into it, the Lite2 files are a good starting +point for figuring out the changes. There are 40 files that +require change plus the two new files. Most of these files have +only a few lines of changes in them. However, four files have +fairly extensive changes: kern/vfs_subr.c, ufs/ufs/ufs_lookup.c, +ufs/ufs/ufs_vnops.c, and ufs/ffs/ffs_alloc.c. For these four +files, I have provided the original Lite2 version, the Lite2 +version with the diffs merged in, and the diffs between the +BSD/OS and merged version. Even so, I expect that there will +be some difficulty in doing the merge; I am certainly willing +to assist in helping get the code merged into your system. + +Packaging of Utility Changes + +The utilities subdirectory contains the changes and additions +to the utilities. There are diffs to three utilities enclosed: + + tunefs - add a flag to enable and disable soft updates + + mount - print out whether soft updates are enabled and + also statistics on number of sync and async writes + + fsck - tighter checks on acceptable errors and a slightly + different policy for what to put in lost+found on + filesystems using soft updates + +In addition you should recompile vmstat so as to get reports +on the 13 new memory types used by the soft update code. +It is not necessary to use the new version of fsck, however it +would aid in my debugging if you do. Also, because of the time +lag between deleting a directory entry and the inode it +references, you will find a lot more files showing up in your +lost+found if you do not use the new version. Note that the +new version checks for the soft update flag in the superblock +and only uses the new algorithms if it is set. So, it will run +unchanged on the filesystems that are not using soft updates. + +Operation + +Once you have booted a kernel that incorporates the soft update +code and installed the updated utilities, do the following: + +1) Comment out the update program in /etc/rc. + +2) Run `tunefs -n enable' on one or more test filesystems. + +3) Mount these filesystems and then type `mount' to ensure that + they have been enabled for soft updates. + +4) Copy the test directory to a softdep filesystem, chdir into + it and run `./doit'. You may want to check out each of the + three subtests individually first: doit1 - andrew benchmarks, + doit2 - copy and removal of /etc, doit3 - find from /. diff --git a/sys/contrib/softupdates/ffs_softdep.c b/sys/contrib/softupdates/ffs_softdep.c new file mode 100644 index 0000000..c60c6b2 --- /dev/null +++ b/sys/contrib/softupdates/ffs_softdep.c @@ -0,0 +1,3851 @@ +/* + * Copyright 1997 Marshall Kirk McKusick. All Rights Reserved. + * + * The soft dependency code is derived from work done by Greg Ganger + * at the University of Michigan. + * + * The following are the copyrights and redistribution conditions that + * apply to this copy of the soft dependency software. For a license + * to use, redistribute or sell the soft dependency software under + * conditions other than those described here, please contact the + * author at one of the following addresses: + * + * Marshall Kirk McKusick mckusick@mckusick.com + * 1614 Oxford Street +1-510-843-9542 + * Berkeley, CA 94709-1608 + * USA + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. None of the names of McKusick, Ganger, or the University of Michigan + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * 4. Redistributions in any form must be accompanied by information on + * how to obtain complete source code for any accompanying software + * that uses the this software. This source code must either be included + * in the distribution or be available for no more than the cost of + * distribution plus a nominal fee, and must be freely redistributable + * under reasonable conditions. For an executable file, complete + * source code means the source code for all modules it contains. + * It does not mean source code for modules or files that typically + * accompany the operating system on which the executable file runs, + * e.g., standard library modules or system header files. + * + * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_softdep.c 9.1 (McKusick) 7/9/97 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Internal function prototypes. + */ +static void softdep_error __P((char *, int)); +static int getdirtybuf __P((struct buf **, int)); +static int flush_pagedep_deps __P((struct vnode *, struct pagedep *)); +static int flush_inodedep_deps __P((struct fs *, ino_t)); +static int handle_written_filepage __P((struct pagedep *, struct buf *)); +static int handle_written_inodeblock __P((struct inodedep *, struct buf *)); +static void handle_allocdirect_partdone __P((struct allocdirect *)); +static void handle_allocindir_partdone __P((struct allocindir *)); +static void initiate_write_filepage __P((struct pagedep *, struct buf *)); +static void handle_written_mkdir __P((struct mkdir *, int)); +static void initiate_write_inodeblock __P((struct inodedep *, struct buf *)); +static void handle_workitem_freefile __P((struct freefile *)); +static void handle_workitem_remove __P((struct dirrem *)); +static struct dirrem *newdirrem __P((struct buf *, struct inode *, + struct inode *, int)); +static void free_diradd __P((struct diradd *)); +static void free_allocindir __P((struct allocindir *, struct inodedep *)); +static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t, + long *)); +static void deallocate_dependencies __P((struct buf *, struct inodedep *)); +static void free_allocdirect __P((struct allocdirectlst *, + struct allocdirect *, int)); +static int free_inodedep __P((struct inodedep *)); +static void handle_workitem_freeblocks __P((struct freeblks *)); +static void merge_inode_lists __P((struct inodedep *)); +static void setup_allocindir_phase2 __P((struct buf *, struct inode *, + struct allocindir *)); +static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t, + ufs_daddr_t)); +static void handle_workitem_freefrag __P((struct freefrag *)); +static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long)); +static void allocdirect_merge __P((struct allocdirectlst *, + struct allocdirect *, struct allocdirect *)); +static struct bmsafemap *bmsafemap_lookup __P((struct buf *)); +static int newblk_lookup __P((struct fs *, ufs_daddr_t, int, + struct newblk **)); +static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **)); +static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int, + struct pagedep **)); +static void add_to_worklist __P((struct worklist *)); + +/* + * Exported softdep operations. + */ +struct bio_ops bioops = { + softdep_disk_io_initiation, /* io_start */ + softdep_disk_write_complete, /* io_complete */ + softdep_deallocate_dependencies, /* io_deallocate */ + softdep_process_worklist, /* io_sync */ +}; + +/* + * Names of malloc types. + */ +extern char *memname[]; +#define TYPENAME(type) ((unsigned)(type) < M_LAST ? memname[type] : "???") + +/* + * Locking primitives. + * + * For a uniprocessor, all we need to do is protect against disk + * interrupts. For a multiprocessor, this lock would have to be + * a mutex. A single mutex is used throughout this file, though + * finer grain locking could be used if contention warranted it. + * + * For a multiprocessor, the sleep call would accept a lock and + * release it after the sleep processing was complete. In a uniprocessor + * implementation there is no such interlock, so we simple mark + * the places where it needs to be done with the `interlocked' form + * of the lock calls. Since the uniprocessor sleep already interlocks + * the spl, there is nothing that really needs to be done. + */ +#ifndef /* NOT */ DEBUG +static int lk; +#define ACQUIRE_LOCK(lk) *lk = splbio() +#define FREE_LOCK(lk) splx(*lk) +#define ACQUIRE_LOCK_INTERLOCKED(lk) +#define FREE_LOCK_INTERLOCKED(lk) + +#else /* DEBUG */ +#include +static struct lockit { + int lkt_spl; + pid_t lkt_held; +} lk = { 0, -1 }; +static int lockcnt; + +static void acquire_lock __P((struct lockit *)); +static void free_lock __P((struct lockit *)); +static void acquire_lock_interlocked __P((struct lockit *)); +static void free_lock_interlocked __P((struct lockit *)); + +#define ACQUIRE_LOCK(lk) acquire_lock(lk) +#define FREE_LOCK(lk) free_lock(lk) +#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk) +#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk) + +static void +acquire_lock(lk) + struct lockit *lk; +{ + + if (lk->lkt_held != -1) + if (lk->lkt_held == curproc->p_pid) + panic("softdep_lock: locking against myself"); + else + panic("softdep_lock: lock held by %d", lk->lkt_held); + lk->lkt_spl = splbio(); + lk->lkt_held = curproc->p_pid; + lockcnt++; +} + +static void +free_lock(lk) + struct lockit *lk; +{ + + if (lk->lkt_held == -1) + panic("softdep_unlock: lock not held"); + lk->lkt_held = -1; + splx(lk->lkt_spl); +} + +static void +acquire_lock_interlocked(lk) + struct lockit *lk; +{ + + if (lk->lkt_held != -1) + if (lk->lkt_held == curproc->p_pid) + panic("softdep_lock_interlocked: locking against self"); + else + panic("softdep_lock_interlocked: lock held by %d", + lk->lkt_held); + lk->lkt_held = curproc->p_pid; + lockcnt++; +} + +static void +free_lock_interlocked(lk) + struct lockit *lk; +{ + + if (lk->lkt_held == -1) + panic("softdep_unlock_interlocked: lock not held"); + lk->lkt_held = -1; +} +#endif /* DEBUG */ + +/* + * Place holder for real semaphores. + */ +struct sema { + int value; + pid_t holder; + char *name; + int prio; + int timo; +}; +static void sema_init __P((struct sema *, char *, int, int)); +static int sema_get __P((struct sema *, struct lockit *)); +static void sema_release __P((struct sema *)); + +static void +sema_init(semap, name, prio, timo) + struct sema *semap; + char *name; + int prio, timo; +{ + + semap->holder = -1; + semap->value = 0; + semap->name = name; + semap->prio = prio; + semap->timo = timo; +} + +static int +sema_get(semap, interlock) + struct sema *semap; + struct lockit *interlock; +{ + + if (semap->value++ > 0) { + if (interlock != NULL) + FREE_LOCK_INTERLOCKED(interlock); + tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo); + if (interlock != NULL) { + ACQUIRE_LOCK_INTERLOCKED(interlock); + FREE_LOCK(interlock); + } + return (0); + } + semap->holder = curproc->p_pid; + if (interlock != NULL) + FREE_LOCK(interlock); + return (1); +} + +static void +sema_release(semap) + struct sema *semap; +{ + + if (semap->value <= 0 || semap->holder != curproc->p_pid) + panic("sema_release: not held"); + if (--semap->value > 0) { + semap->value = 0; + wakeup(semap); + } + semap->holder = -1; +} + +/* + * Worklist queue management. + * These routines require that the lock be held. + */ +#ifndef /* NOT */ DEBUG +#define WORKLIST_INSERT(head, item) do { \ + item->wk_state |= ONWORKLIST; \ + LIST_INSERT_HEAD(head, item, wk_list); \ +} while (0) +#define WORKLIST_REMOVE(item) do { \ + item->wk_state &= ~ONWORKLIST; \ + LIST_REMOVE(item, wk_list); \ +} while (0) +#define WORKITEM_FREE(item, type) FREE(item, type) + +#else /* DEBUG */ +static void worklist_insert __P((struct workhead *, struct worklist *)); +static void worklist_remove __P((struct worklist *)); +static void workitem_free __P((struct worklist *, int)); + +#define WORKLIST_INSERT(head, item) worklist_insert(head, item) +#define WORKLIST_REMOVE(item) worklist_remove(item) +#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) + +static void +worklist_insert(head, item) + struct workhead *head; + struct worklist *item; +{ + + if (lk.lkt_held == -1) + panic("worklist_insert: lock not held"); + if (item->wk_state & ONWORKLIST) + panic("worklist_insert: already on list"); + item->wk_state |= ONWORKLIST; + LIST_INSERT_HEAD(head, item, wk_list); +} + +static void +worklist_remove(item) + struct worklist *item; +{ + + if (lk.lkt_held == -1) + panic("worklist_remove: lock not held"); + if ((item->wk_state & ONWORKLIST) == 0) + panic("worklist_remove: not on list"); + item->wk_state &= ~ONWORKLIST; + LIST_REMOVE(item, wk_list); +} + +static void +workitem_free(item, type) + struct worklist *item; + int type; +{ + + if (item->wk_state & ONWORKLIST) + panic("workitem_free: still on list"); + if (item->wk_type != type) + panic("workitem_free: type mismatch"); + FREE(item, type); +} +#endif /* DEBUG */ + +/* + * Workitem queue management + */ +static struct workhead softdep_workitem_pending; +static int softdep_worklist_busy; + +/* + * Add an item to the end of the work queue. + * This routine requires that the lock be held. + * This is the only routine that adds items to the list. + * The following routine is the only one that removes items + * and does so in order from first to last. + */ +static void +add_to_worklist(wk) + struct worklist *wk; +{ + static struct worklist *worklist_tail; + + if (wk->wk_state & ONWORKLIST) + panic("add_to_worklist: already on list"); + wk->wk_state |= ONWORKLIST; + if (LIST_FIRST(&softdep_workitem_pending) == NULL) + LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list); + else + LIST_INSERT_AFTER(worklist_tail, wk, wk_list); + worklist_tail = wk; +} + +/* + * Process that runs once per second to handle items in the background queue. + * + * Note that we ensure that everything is done in the order in which they + * appear in the queue. The code below depends on this property to ensure + * that blocks of a file are freed before the inode itself is freed. This + * ordering ensures that no new triples will be generated + * until all the old ones have been purged from the dependency lists. + */ +int +softdep_process_worklist(matchmnt) + struct mount *matchmnt; +{ + struct worklist *wk; + struct fs *matchfs; + int matchcnt; + + matchcnt = 0; + matchfs = NULL; + if (matchmnt != NULL) + matchfs = VFSTOUFS(matchmnt)->um_fs; + /* + * There is no danger of having multiple processes run this + * code. It is single threaded solely so that softdep_flushfiles + * (below) can get an accurate count of the number of items + * related to its mount point that are in the list. + */ + if (softdep_worklist_busy && matchmnt == NULL) + return (-1); + ACQUIRE_LOCK(&lk); + while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) { + WORKLIST_REMOVE(wk); + FREE_LOCK(&lk); + switch (wk->wk_type) { + + case M_DIRREM: + /* removal of a directory entry */ + if (WK_DIRREM(wk)->dm_mnt == matchmnt) + matchcnt += 1; + handle_workitem_remove(WK_DIRREM(wk)); + break; + + case M_FREEBLKS: + /* releasing blocks and/or fragments from a file */ + if (WK_FREEBLKS(wk)->fb_fs == matchfs) + matchcnt += 1; + handle_workitem_freeblocks(WK_FREEBLKS(wk)); + break; + + case M_FREEFRAG: + /* releasing a fragment when replaced as a file grows */ + if (WK_FREEFRAG(wk)->ff_fs == matchfs) + matchcnt += 1; + handle_workitem_freefrag(WK_FREEFRAG(wk)); + break; + + case M_FREEFILE: + /* releasing an inode when its link count drops to 0 */ + if (WK_FREEFILE(wk)->fx_fs == matchfs) + matchcnt += 1; + handle_workitem_freefile(WK_FREEFILE(wk)); + break; + + default: + panic("%s_process_worklist: Unknown type %s", + "softdep", TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + if (softdep_worklist_busy && matchmnt == NULL) + return (-1); + ACQUIRE_LOCK(&lk); + } + FREE_LOCK(&lk); + return (matchcnt); +} + +/* + * Purge the work list of all items associated with a particular mount point. + */ +int +softdep_flushfiles(oldmnt, flags, p) + struct mount *oldmnt; + int flags; + struct proc *p; +{ + struct vnode *devvp; + int error, loopcnt; + + /* + * Await our turn to clear out the queue. + */ + while (softdep_worklist_busy) + sleep(&lbolt, PRIBIO); + softdep_worklist_busy = 1; + if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) { + softdep_worklist_busy = 0; + return (error); + } + /* + * Alternately flush the block device associated with the mount + * point and process any dependencies that the flushing + * creates. In theory, this loop can happen at most twice, + * but we give it a few extra just to be sure. + */ + devvp = VFSTOUFS(oldmnt)->um_devvp; + for (loopcnt = 10; loopcnt > 0; loopcnt--) { + if (softdep_process_worklist(oldmnt) == 0) { + /* + * Do another flush in case any vnodes were brought in + * as part of the cleanup operations. + */ + if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) + break; + /* + * If we still found nothing to do, we are really done. + */ + if (softdep_process_worklist(oldmnt) == 0) + break; + } + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_FSYNC(devvp, p->p_cred, MNT_WAIT, p); + VOP_UNLOCK(devvp, 0, p); + if (error) + break; + } + softdep_worklist_busy = 0; + if (loopcnt == 0) + panic("softdep_flushfiles: looping"); + return (error); +} + +/* + * Structure hashing. + * + * There are three types of structures that can be looked up: + * 1) pagedep structures identified by mount point, inode number, + * and logical block. + * 2) inodedep structures identified by mount point and inode number. + * 3) newblk structures identified by mount point and + * physical block number. + * + * The "pagedep" and "inodedep" dependency structures are hashed + * separately from the file blocks and inodes to which they correspond. + * This separation helps when the in-memory copy of an inode or + * file block must be replaced. It also obviates the need to access + * an inode or file page when simply updating (or de-allocating) + * dependency structures. Lookup of newblk structures is needed to + * find newly allocated blocks when trying to associate them with + * their allocdirect or allocindir structure. + * + * The lookup routines optionally create and hash a new instance when + * an existing entry is not found. + */ +#define DEPALLOC 0x0001 /* allocate structure if lookup fails */ + +/* + * Structures and routines associated with pagedep caching. + */ +LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; +u_long pagedep_hash; /* size of hash table - 1 */ +#define PAGEDEP_HASH(mp, inum, lbn) \ + (&pagedep_hashtbl[((((int)(mp)) >> 13) + (inum) + (lbn)) & pagedep_hash]) +static struct sema pagedep_in_progress; + +/* + * Look up a pagedep. Return 1 if found, 0 if not found. + * If not found, allocate if DEPALLOC flag is passed. + * Found or allocated entry is returned in pagedeppp. + * This routine must be called with splbio interrupts blocked. + */ +static int +pagedep_lookup(ip, lbn, flags, pagedeppp) + struct inode *ip; + ufs_lbn_t lbn; + int flags; + struct pagedep **pagedeppp; +{ + struct pagedep *pagedep; + struct pagedep_hashhead *pagedephd; + struct mount *mp; + int i; + +#ifdef DEBUG + if (lk.lkt_held == -1) + panic("pagedep_lookup: lock not held"); +#endif + mp = ITOV(ip)->v_mount; + pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); +top: + for (pagedep = LIST_FIRST(pagedephd); pagedep; + pagedep = LIST_NEXT(pagedep, pd_hash)) + if (ip->i_number == pagedep->pd_ino && + lbn == pagedep->pd_lbn && + mp == pagedep->pd_mnt) + break; + if (pagedep) { + *pagedeppp = pagedep; + return (1); + } + if ((flags & DEPALLOC) == 0) { + *pagedeppp = NULL; + return (0); + } + if (sema_get(&pagedep_in_progress, &lk) == 0) { + ACQUIRE_LOCK(&lk); + goto top; + } + MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, + M_WAITOK); + bzero(pagedep, sizeof(struct pagedep)); + pagedep->pd_list.wk_type = M_PAGEDEP; + pagedep->pd_mnt = mp; + pagedep->pd_ino = ip->i_number; + pagedep->pd_lbn = lbn; + LIST_INIT(&pagedep->pd_dirremhd); + LIST_INIT(&pagedep->pd_pendinghd); + for (i = 0; i < DAHASHSZ; i++) + LIST_INIT(&pagedep->pd_diraddhd[i]); + ACQUIRE_LOCK(&lk); + LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); + sema_release(&pagedep_in_progress); + *pagedeppp = pagedep; + return (0); +} + +/* + * Structures and routines associated with inodedep caching. + */ +LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; +u_long inodedep_hash; /* size of hash table - 1 */ +#define INODEDEP_HASH(fs, inum) \ + (&inodedep_hashtbl[((((int)(fs)) >> 13) + (inum)) & inodedep_hash]) +static struct sema inodedep_in_progress; + +/* + * Look up a inodedep. Return 1 if found, 0 if not found. + * If not found, allocate if DEPALLOC flag is passed. + * Found or allocated entry is returned in inodedeppp. + * This routine must be called with splbio interrupts blocked. + */ +static int +inodedep_lookup(fs, inum, flags, inodedeppp) + struct fs *fs; + ino_t inum; + int flags; + struct inodedep **inodedeppp; +{ + struct inodedep *inodedep; + struct inodedep_hashhead *inodedephd; + +#ifdef DEBUG + if (lk.lkt_held == -1) + panic("inodedep_lookup: lock not held"); +#endif + inodedephd = INODEDEP_HASH(fs, inum); +top: + for (inodedep = LIST_FIRST(inodedephd); inodedep; + inodedep = LIST_NEXT(inodedep, id_hash)) + if (inum == inodedep->id_ino && fs == inodedep->id_fs) + break; + if (inodedep) { + *inodedeppp = inodedep; + return (1); + } + if ((flags & DEPALLOC) == 0) { + *inodedeppp = NULL; + return (0); + } + if (sema_get(&inodedep_in_progress, &lk) == 0) { + ACQUIRE_LOCK(&lk); + goto top; + } + MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), + M_INODEDEP, M_WAITOK); + inodedep->id_list.wk_type = M_INODEDEP; + inodedep->id_fs = fs; + inodedep->id_ino = inum; + inodedep->id_state = ALLCOMPLETE; + inodedep->id_nlinkdelta = 0; + inodedep->id_savedino = NULL; + inodedep->id_savedsize = -1; + inodedep->id_buf = NULL; + LIST_INIT(&inodedep->id_pendinghd); + LIST_INIT(&inodedep->id_inowait); + TAILQ_INIT(&inodedep->id_inoupdt); + TAILQ_INIT(&inodedep->id_newinoupdt); + ACQUIRE_LOCK(&lk); + LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); + sema_release(&inodedep_in_progress); + *inodedeppp = inodedep; + return (0); +} + +/* + * Structures and routines associated with newblk caching. + */ +LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; +u_long newblk_hash; /* size of hash table - 1 */ +#define NEWBLK_HASH(fs, inum) \ + (&newblk_hashtbl[((((int)(fs)) >> 13) + (inum)) & newblk_hash]) +static struct sema newblk_in_progress; + +/* + * Look up a newblk. Return 1 if found, 0 if not found. + * If not found, allocate if DEPALLOC flag is passed. + * Found or allocated entry is returned in newblkpp. + */ +static int +newblk_lookup(fs, newblkno, flags, newblkpp) + struct fs *fs; + ufs_daddr_t newblkno; + int flags; + struct newblk **newblkpp; +{ + struct newblk *newblk; + struct newblk_hashhead *newblkhd; + + newblkhd = NEWBLK_HASH(fs, newblkno); +top: + for (newblk = LIST_FIRST(newblkhd); newblk; + newblk = LIST_NEXT(newblk, nb_hash)) + if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) + break; + if (newblk) { + *newblkpp = newblk; + return (1); + } + if ((flags & DEPALLOC) == 0) { + *newblkpp = NULL; + return (0); + } + if (sema_get(&newblk_in_progress, 0) == 0) + goto top; + MALLOC(newblk, struct newblk *, sizeof(struct newblk), + M_NEWBLK, M_WAITOK); + newblk->nb_state = 0; + newblk->nb_fs = fs; + newblk->nb_newblkno = newblkno; + LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); + sema_release(&newblk_in_progress); + *newblkpp = newblk; + return (0); +} + +/* + * Executed during filesystem system initialization before + * mounting any file systems. + */ +void +softdep_initialize() +{ + + LIST_INIT(&mkdirlisthd); + LIST_INIT(&softdep_workitem_pending); + pagedep_hashtbl = hashinit(desiredvnodes * 2, M_PAGEDEP, &pagedep_hash); + sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); + inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); + sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0); + newblk_hashtbl = hashinit(desiredvnodes / 10, M_NEWBLK, &newblk_hash); + sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); +} + +/* + * Called at mount time to notify the dependency code that a + * filesystem wishes to use it. + */ +int +softdep_mount(devvp, mp, fs, cred) + struct vnode *devvp; + struct mount *mp; + struct fs *fs; + struct ucred *cred; +{ + struct csum cstotal; + struct cg *cgp; + struct buf *bp; + int error, cyl; + + mp->mnt_flag |= MNT_SOFTDEP; + /* + * When doing soft updates, the counters in the + * superblock may have gotten out of sync, so we have + * to scan the cylinder groups and recalculate them. + */ + if (fs->fs_clean != 0) + return (0); + bzero(&cstotal, sizeof cstotal); + for (cyl = 0; cyl < fs->fs_ncg; cyl++) { + if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), + fs->fs_cgsize, cred, &bp)) != 0) { + brelse(bp); + return (error); + } + cgp = (struct cg *)bp->b_data; + cstotal.cs_nffree += cgp->cg_cs.cs_nffree; + cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; + cstotal.cs_nifree += cgp->cg_cs.cs_nifree; + cstotal.cs_ndir += cgp->cg_cs.cs_ndir; + fs->fs_cs(fs, cyl) = cgp->cg_cs; + brelse(bp); + } +#ifdef DEBUG + if (!bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) + printf("ffs_mountfs: superblock updated\n"); +#endif + bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); + return (0); +} + +/* + * Protecting the freemaps (or bitmaps). + * + * To eliminate the need to execute fsck before mounting a file system + * after a power failure, one must (conservatively) guarantee that the + * on-disk copy of the bitmaps never indicate that a live inode or block is + * free. So, when a block or inode is allocated, the bitmap should be + * updated (on disk) before any new pointers. When a block or inode is + * freed, the bitmap should not be updated until all pointers have been + * reset. The latter dependency is handled by the delayed de-allocation + * approach described below for block and inode de-allocation. The former + * dependency is handled by calling the following procedure when a block or + * inode is allocated. When an inode is allocated an "inodedep" is created + * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. + * Each "inodedep" is also inserted into the hash indexing structure so + * that any additional link additions can be made dependent on the inode + * allocation. + * + * The ufs file system maintains a number of free block counts (e.g., per + * cylinder group, per cylinder and per pair) + * in addition to the bitmaps. These counts are used to improve efficiency + * during allocation and therefore must be consistent with the bitmaps. + * There is no convenient way to guarantee post-crash consistency of these + * counts with simple update ordering, for two main reasons: (1) The counts + * and bitmaps for a single cylinder group block are not in the same disk + * sector. If a disk write is interrupted (e.g., by power failure), one may + * be written and the other not. (2) Some of the counts are located in the + * superblock rather than the cylinder group block. So, we focus our soft + * updates implementation on protecting the bitmaps. When mounting a + * filesystem, we recompute the auxiliary counts from the bitmaps. + */ + +/* + * Called just after updating the cylinder group block to allocate an inode. + */ +void +softdep_setup_inomapdep(bp, ip, newinum) + struct buf *bp; /* buffer for cylgroup block with inode map */ + struct inode *ip; /* inode related to allocation */ + ino_t newinum; /* new inode number being allocated */ +{ + struct inodedep *inodedep; + struct bmsafemap *bmsafemap; + + /* + * Create a dependency for the newly allocated inode. + * Panic if it already exists as something is seriously wrong. + * Otherwise add it to the dependency list for the buffer holding + * the cylinder group map from which it was allocated. + */ + ACQUIRE_LOCK(&lk); + if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0) + panic("softdep_setup_inomapdep: found inode"); + inodedep->id_buf = bp; + inodedep->id_state &= ~DEPCOMPLETE; + bmsafemap = bmsafemap_lookup(bp); + LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); + FREE_LOCK(&lk); +} + +/* + * Called just after updating the cylinder group block to + * allocate block or fragment. + */ +void +softdep_setup_blkmapdep(bp, fs, newblkno) + struct buf *bp; /* buffer for cylgroup block with block map */ + struct fs *fs; /* filesystem doing allocation */ + ufs_daddr_t newblkno; /* number of newly allocated block */ +{ + struct newblk *newblk; + struct bmsafemap *bmsafemap; + + /* + * Create a dependency for the newly allocated block. + * Add it to the dependency list for the buffer holding + * the cylinder group map from which it was allocated. + */ + if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) + panic("softdep_setup_blkmapdep: found block"); + ACQUIRE_LOCK(&lk); + newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp); + LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); + FREE_LOCK(&lk); +} + +/* + * Find the bmsafemap associated with a cylinder group buffer. + * If none exists, create one. The buffer must be locked when + * this routine is called and this routine must be called with + * splbio interrupts blocked. + */ +static struct bmsafemap * +bmsafemap_lookup(bp) + struct buf *bp; +{ + struct bmsafemap *bmsafemap; + struct worklist *wk; + +#ifdef DEBUG + if (lk.lkt_held == -1) + panic("bmsafemap_lookup: lock not held"); +#endif + for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) + if (wk->wk_type == M_BMSAFEMAP) + return (WK_BMSAFEMAP(wk)); + FREE_LOCK(&lk); + MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), + M_BMSAFEMAP, M_WAITOK); + bmsafemap->sm_list.wk_type = M_BMSAFEMAP; + bmsafemap->sm_list.wk_state = 0; + bmsafemap->sm_buf = bp; + LIST_INIT(&bmsafemap->sm_allocdirecthd); + LIST_INIT(&bmsafemap->sm_allocindirhd); + LIST_INIT(&bmsafemap->sm_inodedephd); + LIST_INIT(&bmsafemap->sm_newblkhd); + ACQUIRE_LOCK(&lk); + WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); + return (bmsafemap); +} + +/* + * Direct block allocation dependencies. + * + * When a new block is allocated, the corresponding disk locations must be + * initialized (with zeros or new data) before the on-disk inode points to + * them. Also, the freemap from which the block was allocated must be + * updated (on disk) before the inode's pointer. These two dependencies are + * independent of each other and are needed for all file blocks and indirect + * blocks that are pointed to directly by the inode. Just before the + * "in-core" version of the inode is updated with a newly allocated block + * number, a procedure (below) is called to setup allocation dependency + * structures. These structures are removed when the corresponding + * dependencies are satisfied or when the block allocation becomes obsolete + * (i.e., the file is deleted, the block is de-allocated, or the block is a + * fragment that gets upgraded). All of these cases are handled in + * procedures described later. + * + * When a file extension causes a fragment to be upgraded, either to a larger + * fragment or to a full block, the on-disk location may change (if the + * previous fragment could not simply be extended). In this case, the old + * fragment must be de-allocated, but not until after the inode's pointer has + * been updated. In most cases, this is handled by later procedures, which + * will construct a "freefrag" structure to be added to the workitem queue + * when the inode update is complete (or obsolete). The main exception to + * this is when an allocation occurs while a pending allocation dependency + * (for the same block pointer) remains. This case is handled in the main + * allocation dependency setup procedure by immediately freeing the + * unreferenced fragments. + */ +void +softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) + struct inode *ip; /* inode to which block is being added */ + ufs_lbn_t lbn; /* block pointer within inode */ + ufs_daddr_t newblkno; /* disk block number being added */ + ufs_daddr_t oldblkno; /* previous block number, 0 unless frag */ + long newsize; /* size of new block */ + long oldsize; /* size of new block */ + struct buf *bp; /* bp for allocated block */ +{ + struct allocdirect *adp, *oldadp; + struct allocdirectlst *adphead; + struct bmsafemap *bmsafemap; + struct inodedep *inodedep; + struct pagedep *pagedep; + struct newblk *newblk; + + MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), + M_ALLOCDIRECT, M_WAITOK); + bzero(adp, sizeof(struct allocdirect)); + adp->ad_list.wk_type = M_ALLOCDIRECT; + adp->ad_lbn = lbn; + adp->ad_newblkno = newblkno; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + adp->ad_state = ATTACHED; + if (newblkno == oldblkno) + adp->ad_freefrag = NULL; + else + adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); + + if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) + panic("softdep_setup_allocdirect: lost block"); + + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); + adp->ad_inodedep = inodedep; + + if (newblk->nb_state == DEPCOMPLETE) { + adp->ad_state |= DEPCOMPLETE; + adp->ad_buf = NULL; + } else { + bmsafemap = newblk->nb_bmsafemap; + adp->ad_buf = bmsafemap->sm_buf; + LIST_REMOVE(newblk, nb_deps); + LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); + } + LIST_REMOVE(newblk, nb_hash); + FREE(newblk, M_NEWBLK); + + WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); + if (lbn >= NDADDR) { + /* allocating an indirect block */ + if (oldblkno != 0) + panic("softdep_setup_allocdirect: non-zero indir"); + } else { + /* + * Allocating a direct block. + * + * If we are allocating a directory block, then we must + * allocate an associated pagedep to track additions and + * deletions. + */ + if ((ip->i_mode & IFMT) == IFDIR && + pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); + } + /* + * The list of allocdirects must be kept in sorted and ascending + * order so that the rollback routines can quickly determine the + * first uncommitted block (the size of the file stored on disk + * ends at the end of the lowest committed fragment, or if there + * are no fragments, at the end of the highest committed block). + * Since files generally grow, the typical case is that the new + * block is to be added at the end of the list. We speed this + * special case by checking against the last allocdirect in the + * list before laboriously traversing the list looking for the + * insertion point. + */ + adphead = &inodedep->id_newinoupdt; + oldadp = TAILQ_LAST(adphead, allocdirectlst); + if (oldadp == NULL || oldadp->ad_lbn <= lbn) { + /* insert at end of list */ + TAILQ_INSERT_TAIL(adphead, adp, ad_next); + if (oldadp != NULL && oldadp->ad_lbn == lbn) + allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(&lk); + return; + } + for (oldadp = TAILQ_FIRST(adphead); oldadp; + oldadp = TAILQ_NEXT(oldadp, ad_next)) { + if (oldadp->ad_lbn >= lbn) + break; + } + if (oldadp == NULL) + panic("softdep_setup_allocdirect: lost entry"); + /* insert in middle of list */ + TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); + if (oldadp->ad_lbn == lbn) + allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(&lk); +} + +/* + * Replace an old allocdirect dependency with a newer one. + * This routine must be called with splbio interrupts blocked. + */ +static void +allocdirect_merge(adphead, newadp, oldadp) + struct allocdirectlst *adphead; /* head of list holding allocdirects */ + struct allocdirect *newadp; /* allocdirect being added */ + struct allocdirect *oldadp; /* existing allocdirect being checked */ +{ + struct freefrag *freefrag; + +#ifdef DEBUG + if (lk.lkt_held == -1) + panic("allocdirect_merge: lock not held"); +#endif + if (newadp->ad_oldblkno != oldadp->ad_newblkno || + newadp->ad_oldsize != oldadp->ad_newsize || + newadp->ad_lbn >= NDADDR) + panic("allocdirect_check: old %d != new %d || lbn %d >= %d", + newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn, + NDADDR); + newadp->ad_oldblkno = oldadp->ad_oldblkno; + newadp->ad_oldsize = oldadp->ad_oldsize; + /* + * If the old dependency had a fragment to free or had never + * previously had a block allocated, then the new dependency + * can immediately post its freefrag and adopt the old freefrag. + * This action is done by swapping the freefrag dependencies. + * The new dependency gains the old one's freefrag, and the + * old one gets the new one and then immediately puts it on + * the worklist when it is freed by free_allocdirect. It is + * not possible to do this swap when the old dependency had a + * non-zero size but no previous fragment to free. This condition + * arises when the new block is an extension of the old block. + * Here, the first part of the fragment allocated to the new + * dependency is part of the block currently claimed on disk by + * the old dependency, so cannot legitimately be freed until the + * conditions for the new dependency are fulfilled. + */ + if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { + freefrag = newadp->ad_freefrag; + newadp->ad_freefrag = oldadp->ad_freefrag; + oldadp->ad_freefrag = freefrag; + } + free_allocdirect(adphead, oldadp, 0); +} + +/* + * Allocate a new freefrag structure if needed. + */ +static struct freefrag * +newfreefrag(ip, blkno, size) + struct inode *ip; + ufs_daddr_t blkno; + long size; +{ + struct freefrag *freefrag; + struct fs *fs; + + if (blkno == 0) + return (NULL); + fs = ip->i_fs; + if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) + panic("newfreefrag: frag size"); + MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), + M_FREEFRAG, M_WAITOK); + freefrag->ff_list.wk_type = M_FREEFRAG; + freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */ + freefrag->ff_inum = ip->i_number; + freefrag->ff_fs = fs; + freefrag->ff_devvp = ip->i_devvp; + freefrag->ff_blkno = blkno; + freefrag->ff_fragsize = size; + return (freefrag); +} + +/* + * This workitem de-allocates fragments that were replaced during + * file block allocation. + */ +static void +handle_workitem_freefrag(freefrag) + struct freefrag *freefrag; +{ + struct inode tip; + + tip.i_fs = freefrag->ff_fs; + tip.i_devvp = freefrag->ff_devvp; + tip.i_dev = freefrag->ff_devvp->v_rdev; + tip.i_number = freefrag->ff_inum; + tip.i_uid = freefrag->ff_state & ~ONWORKLIST; /* XXX - set above */ + ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize); + FREE(freefrag, M_FREEFRAG); +} + +/* + * Indirect block allocation dependencies. + * + * The same dependencies that exist for a direct block also exist when + * a new block is allocated and pointed to by an entry in a block of + * indirect pointers. The undo/redo states described above are also + * used here. Because an indirect block contains many pointers that + * may have dependencies, a second copy of the entire in-memory indirect + * block is kept. The buffer cache copy is always completely up-to-date. + * The second copy, which is used only as a source for disk writes, + * contains only the safe pointers (i.e., those that have no remaining + * update dependencies). The second copy is freed when all pointers + * are safe. The cache is not allowed to replace indirect blocks with + * pending update dependencies. If a buffer containing an indirect + * block with dependencies is written, these routines will mark it + * dirty again. It can only be successfully written once all the + * dependencies are removed. The ffs_fsync routine in conjunction with + * softdep_sync_metadata work together to get all the dependencies + * removed so that a file can be successfully written to disk. Three + * procedures are used when setting up indirect block pointer + * dependencies. The division is necessary because of the organization + * of the "balloc" routine and because of the distinction between file + * pages and file metadata blocks. + */ + +/* + * Allocate a new allocindir structure. + */ +static struct allocindir * +newallocindir(ip, ptrno, newblkno, oldblkno) + struct inode *ip; /* inode for file being extended */ + int ptrno; /* offset of pointer in indirect block */ + ufs_daddr_t newblkno; /* disk block number being added */ + ufs_daddr_t oldblkno; /* previous block number, 0 if none */ +{ + struct allocindir *aip; + + MALLOC(aip, struct allocindir *, sizeof(struct allocindir), + M_ALLOCINDIR, M_WAITOK); + bzero(aip, sizeof(struct allocindir)); + aip->ai_list.wk_type = M_ALLOCINDIR; + aip->ai_state = ATTACHED; + aip->ai_offset = ptrno; + aip->ai_newblkno = newblkno; + aip->ai_oldblkno = oldblkno; + aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); + return (aip); +} + +/* + * Called just before setting an indirect block pointer + * to a newly allocated file page. + */ +void +softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) + struct inode *ip; /* inode for file being extended */ + ufs_lbn_t lbn; /* allocated block number within file */ + struct buf *bp; /* buffer with indirect blk referencing page */ + int ptrno; /* offset of pointer in indirect block */ + ufs_daddr_t newblkno; /* disk block number being added */ + ufs_daddr_t oldblkno; /* previous block number, 0 if none */ + struct buf *nbp; /* buffer holding allocated page */ +{ + struct allocindir *aip; + struct pagedep *pagedep; + + aip = newallocindir(ip, ptrno, newblkno, oldblkno); + ACQUIRE_LOCK(&lk); + /* + * If we are allocating a directory page, then we must + * allocate an associated pagedep to track additions and + * deletions. + */ + if ((ip->i_mode & IFMT) == IFDIR && + pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) + WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); + FREE_LOCK(&lk); + setup_allocindir_phase2(bp, ip, aip); +} + +/* + * Called just before setting an indirect block pointer to a + * newly allocated indirect block. + */ +void +softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) + struct buf *nbp; /* newly allocated indirect block */ + struct inode *ip; /* inode for file being extended */ + struct buf *bp; /* indirect block referencing allocated block */ + int ptrno; /* offset of pointer in indirect block */ + ufs_daddr_t newblkno; /* disk block number being added */ +{ + struct allocindir *aip; + + aip = newallocindir(ip, ptrno, newblkno, 0); + ACQUIRE_LOCK(&lk); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); + FREE_LOCK(&lk); + setup_allocindir_phase2(bp, ip, aip); +} + +/* + * Called to finish the allocation of the "aip" allocated + * by one of the two routines above. + */ +static void +setup_allocindir_phase2(bp, ip, aip) + struct buf *bp; /* in-memory copy of the indirect block */ + struct inode *ip; /* inode for file being extended */ + struct allocindir *aip; /* allocindir allocated by the above routines */ +{ + struct worklist *wk; + struct indirdep *indirdep, *newindirdep; + struct bmsafemap *bmsafemap; + struct allocindir *oldaip; + struct freefrag *freefrag; + struct newblk *newblk; + + if (bp->b_lblkno >= 0) + panic("setup_allocindir_phase2: not indir blk"); + for (indirdep = NULL, newindirdep = NULL; ; ) { + ACQUIRE_LOCK(&lk); + for (wk = LIST_FIRST(&bp->b_dep); wk; + wk = LIST_NEXT(wk, wk_list)) { + if (wk->wk_type != M_INDIRDEP) + continue; + indirdep = WK_INDIRDEP(wk); + break; + } + if (indirdep == NULL && newindirdep) { + indirdep = newindirdep; + WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); + newindirdep = NULL; + } + FREE_LOCK(&lk); + if (indirdep) { + if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, + &newblk) == 0) + panic("setup_allocindir: lost block"); + ACQUIRE_LOCK(&lk); + if (newblk->nb_state == DEPCOMPLETE) { + aip->ai_state |= DEPCOMPLETE; + aip->ai_buf = NULL; + } else { + bmsafemap = newblk->nb_bmsafemap; + aip->ai_buf = bmsafemap->sm_buf; + LIST_REMOVE(newblk, nb_deps); + LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, + aip, ai_deps); + } + LIST_REMOVE(newblk, nb_hash); + FREE(newblk, M_NEWBLK); + aip->ai_indirdep = indirdep; + /* + * Check to see if there is an existing dependency + * for this block. If there is, merge the old + * dependency into the new one. + */ + if (aip->ai_oldblkno == 0) + oldaip = NULL; + else + for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd); + oldaip; oldaip = LIST_NEXT(oldaip, ai_next)) + if (oldaip->ai_offset == aip->ai_offset) + break; + if (oldaip != NULL) { + if (oldaip->ai_newblkno != aip->ai_oldblkno) + panic("setup_allocindir_phase2: blkno"); + aip->ai_oldblkno = oldaip->ai_oldblkno; + freefrag = oldaip->ai_freefrag; + oldaip->ai_freefrag = aip->ai_freefrag; + aip->ai_freefrag = freefrag; + free_allocindir(oldaip, NULL); + } + LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); + ((ufs_daddr_t *)indirdep->ir_savebp->b_data) + [aip->ai_offset] = aip->ai_oldblkno; + FREE_LOCK(&lk); + } + if (newindirdep) { + if (indirdep->ir_savebp != NULL) + brelse(newindirdep->ir_savebp); + WORKITEM_FREE((caddr_t)newindirdep, M_INDIRDEP); + } + if (indirdep) + break; + MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), + M_INDIRDEP, M_WAITOK); + newindirdep->ir_list.wk_type = M_INDIRDEP; + newindirdep->ir_state = ATTACHED; + LIST_INIT(&newindirdep->ir_deplisthd); + LIST_INIT(&newindirdep->ir_donehd); + newindirdep->ir_saveddata = (ufs_daddr_t *)bp->b_data; + newindirdep->ir_savebp = + getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); + bcopy((caddr_t)newindirdep->ir_saveddata, + newindirdep->ir_savebp->b_data, bp->b_bcount); + } +} + +/* + * Block de-allocation dependencies. + * + * When blocks are de-allocated, the on-disk pointers must be nullified before + * the blocks are made available for use by other files. (The true + * requirement is that old pointers must be nullified before new on-disk + * pointers are set. We chose this slightly more stringent requirement to + * reduce complexity.) Our implementation handles this dependency by updating + * the inode (or indirect block) appropriately but delaying the actual block + * de-allocation (i.e., freemap and free space count manipulation) until + * after the updated versions reach stable storage. After the disk is + * updated, the blocks can be safely de-allocated whenever it is convenient. + * This implementation handles only the common case of reducing a file's + * length to zero. Other cases are handled by the conventional synchronous + * write approach. + * + * The ffs implementation with which we worked double-checks + * the state of the block pointers and file size as it reduces + * a file's length. Some of this code is replicated here in our + * soft updates implementation. The freeblks->fb_chkcnt field is + * used to transfer a part of this information to the procedure + * that eventually de-allocates the blocks. + * + * This routine should be called from the routine that shortens + * a file's length, before the inode's size or block pointers + * are modified. It will save the block pointer information for + * later release and zero the inode so that the calling routine + * can release it. + */ +void +softdep_setup_freeblocks(ip, length) + struct inode *ip; /* The inode whose length is to be reduced */ + off_t length; /* The new length for the file */ +{ + struct freeblks *freeblks; + struct inodedep *inodedep; + struct allocdirect *adp; + struct vnode *vp; + struct buf *bp; + struct fs *fs; + int i, error; + + fs = ip->i_fs; + if (length != 0) + panic("softde_setup_freeblocks: non-zero length"); + MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), + M_FREEBLKS, M_WAITOK); + bzero(freeblks, sizeof(struct freeblks)); + freeblks->fb_list.wk_type = M_FREEBLKS; + freeblks->fb_uid = ip->i_uid; + freeblks->fb_previousinum = ip->i_number; + freeblks->fb_devvp = ip->i_devvp; + freeblks->fb_fs = fs; + freeblks->fb_oldsize = ip->i_size; + freeblks->fb_newsize = length; + freeblks->fb_chkcnt = ip->i_blocks; + for (i = 0; i < NDADDR; i++) { + freeblks->fb_dblks[i] = ip->i_db[i]; + ip->i_db[i] = 0; + } + for (i = 0; i < NIADDR; i++) { + freeblks->fb_iblks[i] = ip->i_ib[i]; + ip->i_ib[i] = 0; + } + ip->i_blocks = 0; + ip->i_size = 0; + /* + * Push the zero'ed inode to to its disk buffer so that we are free + * to delete its dependencies below. Once the dependencies are gone + * the buffer can be safely released. + */ + if ((error = bread(ip->i_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp)) != 0) + softdep_error("softdep_setup_freeblocks", error); + *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = + ip->i_din; + /* + * Find and eliminate any inode dependencies. + */ + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep); + if ((inodedep->id_state & IOSTARTED) != 0) + panic("softdep_setup_freeblocks: inode busy"); + /* + * Add the freeblks structure to the list of operations that + * must await the zero'ed inode being written to disk. + */ + WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list); + /* + * Because the file length has been truncated to zero, any + * pending block allocation dependency structures associated + * with this inode are obsolete and can simply be de-allocated. + * We must first merge the two dependency lists to get rid of + * any duplicate freefrag structures, then purge the merged list. + */ + merge_inode_lists(inodedep); + while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) + free_allocdirect(&inodedep->id_inoupdt, adp, 1); + bdwrite(bp); + /* + * We must wait for any I/O in progress to finish so that + * all potential buffers on the dirty list will be visible. + * Once they are all there, walk the list and get rid of + * any dependencies. + */ + vp = ITOV(ip); + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + FREE_LOCK_INTERLOCKED(&lk); + sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); + ACQUIRE_LOCK_INTERLOCKED(&lk); + } + while (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { + bp = LIST_FIRST(&vp->v_dirtyblkhd); + (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); + deallocate_dependencies(bp, inodedep); + bp->b_flags |= B_INVAL; + brelse(bp); + } + /* + * Try freeing the inodedep in case that was the last dependency. + */ + if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0) + (void) free_inodedep(inodedep); + FREE_LOCK(&lk); +} + +/* + * Reclaim any dependency structures from a buffer that is about to + * be reallocated to a new vnode. The buffer must be locked, thus, + * no I/O completion operations can occur while we are manipulating + * its associated dependencies. The mutex is held so that other I/O's + * associated with related dependencies do not occur. + */ +static void +deallocate_dependencies(bp, inodedep) + struct buf *bp; + struct inodedep *inodedep; +{ + struct worklist *wk; + struct indirdep *indirdep; + struct allocindir *aip; + struct pagedep *pagedep; + struct dirrem *dirrem; + struct diradd *dap; + long tmpsize; + caddr_t tmp; + int i; + + while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { + switch (wk->wk_type) { + + case M_INDIRDEP: + indirdep = WK_INDIRDEP(wk); + /* + * None of the indirect pointers will ever be visible, + * so they can simply be tossed. GOINGAWAY ensures + * that allocated pointers will be saved in the buffer + * cache until they are freed. Note that they will + * only be able to be found by their physical address + * since the inode mapping the logical address will + * be gone. The save buffer used for the safe copy + * was allocated in setup_allocindir_phase2 using + * the physical address so it could be used for this + * purpose. Hence we swap the safe copy with the real + * copy, allowing the safe copy to be freed and holding + * on to the real copy for later use in indir_trunc. + */ + if (indirdep->ir_state & GOINGAWAY) + panic("deallocate_dependencies: already gone"); + indirdep->ir_state |= GOINGAWAY; + while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) + free_allocindir(aip, inodedep); + if (bp->b_lblkno >= 0 || + bp->b_blkno != indirdep->ir_savebp->b_lblkno) + panic("deallocate_dependencies: not indir"); + tmp = indirdep->ir_savebp->b_data; + indirdep->ir_savebp->b_data = bp->b_data; + bp->b_data = tmp; + tmpsize = indirdep->ir_savebp->b_bufsize; + indirdep->ir_savebp->b_bufsize = bp->b_bufsize; + bp->b_bufsize = tmpsize; + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); + continue; + + case M_PAGEDEP: + pagedep = WK_PAGEDEP(wk); + /* + * None of the directory additions will ever be + * visible, so they can simply be tossed. + */ + for (i = 0; i < DAHASHSZ; i++) + while (dap=LIST_FIRST(&pagedep->pd_diraddhd[i])) + free_diradd(dap); + while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) + free_diradd(dap); + /* + * Copy any directory remove dependencies to the list + * to be processed after the zero'ed inode is written. + * If the inode has already been written, then they + * can be dumped directly onto the work list. + */ + for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem; + dirrem = LIST_NEXT(dirrem, dm_next)) { + LIST_REMOVE(dirrem, dm_next); + dirrem->dm_dirinum = pagedep->pd_ino; + if (inodedep == NULL) + add_to_worklist(&dirrem->dm_list); + else + WORKLIST_INSERT(&inodedep->id_inowait, + &dirrem->dm_list); + } + WORKLIST_REMOVE(&pagedep->pd_list); + LIST_REMOVE(pagedep, pd_hash); + WORKITEM_FREE(pagedep, M_PAGEDEP); + continue; + + case M_ALLOCINDIR: + free_allocindir(WK_ALLOCINDIR(wk), inodedep); + continue; + + case M_ALLOCDIRECT: + case M_INODEDEP: + panic("deallocate_dependencies: Unexpected type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + + default: + panic("deallocate_dependencies: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } +} + +/* + * Free an allocdirect. Generate a new freefrag work request if appropriate. + * This routine must be called with splbio interrupts blocked. + */ +static void +free_allocdirect(adphead, adp, delay) + struct allocdirectlst *adphead; + struct allocdirect *adp; + int delay; +{ + +#ifdef DEBUG + if (lk.lkt_held == -1) + panic("free_allocdirect: lock not held"); +#endif + if ((adp->ad_state & DEPCOMPLETE) == 0) + LIST_REMOVE(adp, ad_deps); + TAILQ_REMOVE(adphead, adp, ad_next); + if ((adp->ad_state & COMPLETE) == 0) + WORKLIST_REMOVE(&adp->ad_list); + if (adp->ad_freefrag != NULL) { + if (delay) + WORKLIST_INSERT(&adp->ad_inodedep->id_inowait, + &adp->ad_freefrag->ff_list); + else + add_to_worklist(&adp->ad_freefrag->ff_list); + } + WORKITEM_FREE(adp, M_ALLOCDIRECT); +} + +/* + * Prepare an inode to be freed. The actual free operation is not + * done until the zero'ed inode has been written to disk. + */ +void +softdep_freefile(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + struct inode *ip = VTOI(ap->a_pvp); + struct inodedep *inodedep; + struct freefile *freefile; + + /* + * This sets up the inode de-allocation dependency. + */ + MALLOC(freefile, struct freefile *, sizeof(struct freefile), + M_FREEFILE, M_WAITOK); + freefile->fx_list.wk_type = M_FREEFILE; + freefile->fx_list.wk_state = 0; + freefile->fx_mode = ap->a_mode; + freefile->fx_oldinum = ap->a_ino; + freefile->fx_devvp = ip->i_devvp; + freefile->fx_fs = ip->i_fs; + + /* + * If the inodedep does not exist, then the zero'ed inode has + * been written to disk and we can free the file immediately. + */ + ACQUIRE_LOCK(&lk); + if (inodedep_lookup(ip->i_fs, ap->a_ino, 0, &inodedep) == 0) { + add_to_worklist(&freefile->fx_list); + FREE_LOCK(&lk); + return; + } + + /* + * If we still have a bitmap dependency, then the inode has never + * been written to disk. Drop the dependency as it is no longer + * necessary since the inode is being deallocated. We could process + * the freefile immediately, but then we would have to clear the + * id_inowait dependencies here and it is easier just to let the + * zero'ed inode be written and let them be cleaned up in the + * normal followup actions that follow the inode write. + */ + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + inodedep->id_state |= DEPCOMPLETE; + LIST_REMOVE(inodedep, id_deps); + inodedep->id_buf = NULL; + } + /* + * If the inodedep has no dependencies associated with it, + * then we must free it here and free the file immediately. + * This case arises when an early allocation fails (for + * example, the user is over their file quota). + */ + if (free_inodedep(inodedep) == 0) + WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); + else + add_to_worklist(&freefile->fx_list); + FREE_LOCK(&lk); +} + +/* + * Try to free an inodedep structure. Return 1 if it could be freed. + */ +static int +free_inodedep(inodedep) + struct inodedep *inodedep; +{ + + if ((inodedep->id_state & ONWORKLIST) != 0 || + LIST_FIRST(&inodedep->id_pendinghd) != NULL || + LIST_FIRST(&inodedep->id_inowait) != NULL || + TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || + TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || + inodedep->id_nlinkdelta != 0 || inodedep->id_buf != NULL || + inodedep->id_savedino != NULL) + return (0); + LIST_REMOVE(inodedep, id_hash); + WORKITEM_FREE(inodedep, M_INODEDEP); + return (1); +} + +/* + * This workitem routine performs the block de-allocation. + * The workitem is added to the pending list after the updated + * inode block has been written to disk. As mentioned above, + * checks regarding the number of blocks de-allocated (compared + * to the number of blocks allocated for the file) are also + * performed in this function. + */ +static void +handle_workitem_freeblocks(freeblks) + struct freeblks *freeblks; +{ + struct inode tip; + ufs_daddr_t bn; + struct fs *fs; + int i, level, bsize; + long nblocks, blocksreleased = 0; + int error, allerror = 0; + ufs_lbn_t baselbns[NIADDR], tmpval; + + tip.i_number = freeblks->fb_previousinum; + tip.i_devvp = freeblks->fb_devvp; + tip.i_dev = freeblks->fb_devvp->v_rdev; + tip.i_fs = freeblks->fb_fs; + tip.i_size = freeblks->fb_oldsize; + tip.i_uid = freeblks->fb_uid; + fs = freeblks->fb_fs; + tmpval = 1; + baselbns[0] = NDADDR; + for (i = 1; i < NIADDR; i++) { + tmpval *= NINDIR(fs); + baselbns[i] = baselbns[i - 1] + tmpval; + } + nblocks = btodb(fs->fs_bsize); + blocksreleased = 0; + /* + * Indirect blocks first. + */ + for (level = (NIADDR - 1); level >= 0; level--) { + if ((bn = freeblks->fb_iblks[level]) == 0) + continue; + if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level, + baselbns[level], &blocksreleased)) == 0) + allerror = error; + ffs_blkfree(&tip, bn, fs->fs_bsize); + blocksreleased += nblocks; + } + /* + * All direct blocks or frags. + */ + for (i = (NDADDR - 1); i >= 0; i--) { + if ((bn = freeblks->fb_dblks[i]) == 0) + continue; + bsize = blksize(fs, &tip, i); + ffs_blkfree(&tip, bn, bsize); + blocksreleased += btodb(bsize); + } + +#ifdef DIAGNOSTIC + if (freeblks->fb_chkcnt != blocksreleased) + panic("handle_workitem_freeblocks: block count"); + if (allerror) + softdep_error("handle_workitem_freeblks", allerror); +#endif /* DIAGNOSTIC */ + WORKITEM_FREE(freeblks, M_FREEBLKS); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block dbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + */ +static int +indir_trunc(ip, dbn, level, lbn, countp) + struct inode *ip; + ufs_daddr_t dbn; + int level; + ufs_lbn_t lbn; + long *countp; +{ + struct buf *bp; + ufs_daddr_t *bap; + ufs_daddr_t nb; + struct fs *fs; + struct worklist *wk; + struct indirdep *indirdep; + int i, lbnadd, nblocks; + int error, allerror = 0; + + fs = ip->i_fs; + lbnadd = 1; + for (i = level; i > 0; i--) + lbnadd *= NINDIR(fs); + /* + * Get buffer of block pointers to be freed. This routine is not + * called until the zero'ed inode has been written, so it is safe + * to free blocks as they are encountered. Because the inode has + * been zero'ed, calls to bmap on these blocks will fail. So, we + * have to use the on-disk address and the block device for the + * filesystem to look them up. If the file was deleted before its + * indirect blocks were all written to disk, the routine that set + * us up (deallocate_dependencies) will have arranged to leave + * a complete copy of the indirect block in memory for our use. + * Otherwise we have to read the blocks in from the disk. + */ + ACQUIRE_LOCK(&lk); + if ((bp = incore(ip->i_devvp, dbn)) != NULL && + (wk = LIST_FIRST(&bp->b_dep)) != NULL) { + if (wk->wk_type != M_INDIRDEP || + (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || + (indirdep->ir_state & GOINGAWAY) == 0) + panic("indir_trunc: lost indirdep"); + WORKLIST_REMOVE(wk); + WORKITEM_FREE(indirdep, M_INDIRDEP); + if (LIST_FIRST(&bp->b_dep) != NULL) + panic("indir_trunc: dangling dep"); + FREE_LOCK(&lk); + } else { + FREE_LOCK(&lk); + error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp); + if (error) + return (error); + } + /* + * Recursively free indirect blocks. + */ + bap = (ufs_daddr_t *)bp->b_data; + nblocks = btodb(fs->fs_bsize); + for (i = NINDIR(fs) - 1; i >= 0; i--) { + if ((nb = bap[i]) == 0) + continue; + if (level != 0) { + if ((error = indir_trunc(ip, fsbtodb(fs, nb), + level - 1, lbn + (i * lbnadd), countp)) != 0) + allerror = error; + } + ffs_blkfree(ip, nb, fs->fs_bsize); + *countp += nblocks; + } + bp->b_flags |= B_INVAL; + brelse(bp); + return (allerror); +} + +/* + * Free an allocindir. + * This routine must be called with splbio interrupts blocked. + */ +static void +free_allocindir(aip, inodedep) + struct allocindir *aip; + struct inodedep *inodedep; +{ + struct freefrag *freefrag; + +#ifdef DEBUG + if (lk.lkt_held == -1) + panic("free_allocindir: lock not held"); +#endif + if ((aip->ai_state & DEPCOMPLETE) == 0) + LIST_REMOVE(aip, ai_deps); + if (aip->ai_state & ONWORKLIST) + WORKLIST_REMOVE(&aip->ai_list); + LIST_REMOVE(aip, ai_next); + if ((freefrag = aip->ai_freefrag) != NULL) { + if (inodedep == NULL) + add_to_worklist(&freefrag->ff_list); + else + WORKLIST_INSERT(&inodedep->id_inowait, + &freefrag->ff_list); + } + WORKITEM_FREE(aip, M_ALLOCINDIR); +} + +/* + * Directory entry addition dependencies. + * + * When adding a new directory entry, the inode (with its incremented link + * count) must be written to disk before the directory entry's pointer to it. + * Also, if the inode is newly allocated, the corresponding freemap must be + * updated (on disk) before the directory entry's pointer. These requirements + * are met via undo/redo on the directory entry's pointer, which consists + * simply of the inode number. + * + * As directory entries are added and deleted, the free space within a + * directory block can become fragmented. The ufs file system will compact + * a fragmented directory block to make space for a new entry. When this + * occurs, the offsets of previously added entries change. Any "diradd" + * dependency structures corresponding to these entries must be updated with + * the new offsets. + */ + +/* + * This routine is called after the in-memory inode's link + * count has been incremented, but before the directory entry's + * pointer to the inode has been set. + */ +void +softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) + struct buf *bp; /* buffer containing directory block */ + struct inode *dp; /* inode for directory */ + off_t diroffset; /* offset of new entry in directory */ + long newinum; /* inode referenced by new directory entry */ + struct buf *newdirbp; /* non-NULL => contents of new mkdir */ +{ + int offset; /* offset of new entry within directory block */ + ufs_lbn_t lbn; /* block in directory containing new entry */ + struct fs *fs; + struct diradd *dap; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct mkdir *mkdir1, *mkdir2; + + /* + * Whiteouts have no dependencies. + */ + if (newinum == WINO) { + if (newdirbp != NULL) + bdwrite(newdirbp); + return; + } + + fs = dp->i_fs; + lbn = lblkno(fs, diroffset); + offset = blkoff(fs, diroffset); + MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); + bzero(dap, sizeof(struct diradd)); + dap->da_list.wk_type = M_DIRADD; + dap->da_offset = offset; + dap->da_newinum = newinum; + dap->da_state = ATTACHED; + if (newdirbp == NULL) { + dap->da_state |= DEPCOMPLETE; + } else { + dap->da_state |= MKDIR_BODY | MKDIR_PARENT; + MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, + M_WAITOK); + mkdir1->md_list.wk_type = M_MKDIR; + mkdir1->md_state = MKDIR_BODY; + mkdir1->md_diradd = dap; + MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, + M_WAITOK); + mkdir2->md_list.wk_type = M_MKDIR; + mkdir2->md_state = MKDIR_PARENT; + mkdir2->md_diradd = dap; + + } + + ACQUIRE_LOCK(&lk); + /* + * If this directory entry references a new directory, create + * its two additional dependencies: its "." and ".." being written + * to disk and the link count increase for its parent directory. + */ + if (newdirbp != NULL) { + /* + * Dependency on "." and ".." being written to disk + */ + LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); + WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); + bdwrite(newdirbp); + /* + * Dependency on link count increase for parent directory + */ + if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0 + || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + dap->da_state &= ~MKDIR_PARENT; + WORKITEM_FREE(mkdir2, M_MKDIR); + } else { + LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); + WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list); + } + } + /* + * Link into parent directory pagedep and new inode inodedep + * structures to await its being written. + */ + if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); + dap->da_pagedep = pagedep; + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, + da_pdlist); + if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 && + (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) + WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); + else + WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list); + FREE_LOCK(&lk); +} + +/* + * This procedure is called to change the offset of a directory + * entry when compacting a directory block which must be owned + * exclusively by the caller. Note that the actual entry movement + * must be done in this procedure to ensure that no I/O completions + * occur while the move is in progress. + */ +void +softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) + struct inode *dp; /* inode for directory */ + caddr_t base; /* address of dp->i_offset */ + caddr_t oldloc; /* address of old directory location */ + caddr_t newloc; /* address of new directory location */ + int entrysize; /* size of directory entry */ +{ + int oldoffset, newoffset; + struct pagedep *pagedep; + struct diradd *dap; + ufs_lbn_t lbn; + + ACQUIRE_LOCK(&lk); + lbn = lblkno(dp->i_fs, dp->i_offset); + if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) + goto done; + oldoffset = dp->i_offset + (oldloc - base); + newoffset = dp->i_offset + (newloc - base); + for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]); + dap; dap = LIST_NEXT(dap, da_pdlist)) { + if (dap->da_offset != oldoffset) + continue; + dap->da_offset = newoffset; + if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) + break; + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], + dap, da_pdlist); + break; + } +done: + bcopy(oldloc, newloc, entrysize); + FREE_LOCK(&lk); +} + +/* + * Free a diradd dependency structure. This routine must be called + * with splbio interrupts blocked. + */ +static void +free_diradd(dap) + struct diradd *dap; +{ + struct dirrem *dirrem; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct mkdir *mkdir, *nextmd; + +#ifdef DEBUG + if (lk.lkt_held == -1) + panic("free_diradd: lock not held"); +#endif + WORKLIST_REMOVE(&dap->da_list); + LIST_REMOVE(dap, da_pdlist); + if ((dap->da_state & DIRCHG) == 0) { + pagedep = dap->da_pagedep; + } else { + dirrem = dap->da_previous; + pagedep = dirrem->dm_pagedep; + LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); + } + if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum, + 0, &inodedep) != 0) + (void) free_inodedep(inodedep); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { + nextmd = LIST_NEXT(mkdir, md_mkdirs); + if (mkdir->md_diradd != dap) + continue; + dap->da_state &= ~mkdir->md_state; + WORKLIST_REMOVE(&mkdir->md_list); + LIST_REMOVE(mkdir, md_mkdirs); + WORKITEM_FREE(mkdir, M_MKDIR); + } + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) + panic("free_diradd: unfound ref"); + } + WORKITEM_FREE(dap, M_DIRADD); +} + +/* + * Directory entry removal dependencies. + * + * When removing a directory entry, the entry's inode pointer must be + * zero'ed on disk before the corresponding inode's link count is decremented + * (possibly freeing the inode for re-use). This dependency is handled by + * updating the directory entry but delaying the inode count reduction until + * after the directory block has been written to disk. After this point, the + * inode count can be decremented whenever it is convenient. + */ + +/* + * This routine should be called immediately after removing + * a directory entry. The inode's link count should not be + * decremented by the calling procedure -- the soft updates + * code will do this task when it is safe. + */ +void +softdep_setup_remove(bp, dp, ip, isrmdir) + struct buf *bp; /* buffer containing directory block */ + struct inode *dp; /* inode for the directory being modified */ + struct inode *ip; /* inode for directory entry being removed */ + int isrmdir; /* indicates if doing RMDIR */ +{ + struct dirrem *dirrem; + + /* + * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. + */ + dirrem = newdirrem(bp, dp, ip, isrmdir); + if ((dirrem->dm_state & COMPLETE) == 0) { + LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, + dm_next); + } else { + dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; + add_to_worklist(&dirrem->dm_list); + } + FREE_LOCK(&lk); +} + +/* + * Allocate a new dirrem if appropriate and return it along with + * its associated pagedep. Called without a lock, returns with lock. + */ +static struct dirrem * +newdirrem(bp, dp, ip, isrmdir) + struct buf *bp; /* buffer containing directory block */ + struct inode *dp; /* inode for the directory being modified */ + struct inode *ip; /* inode for directory entry being removed */ + int isrmdir; /* indicates if doing RMDIR */ +{ + ufs_lbn_t lbn; + struct diradd *dap; + struct dirrem *dirrem; + struct pagedep *pagedep; + + /* + * Whiteouts have no deletion dependencies. + */ + if (ip == NULL) + panic("newdirrem: whiteout"); + MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), + M_DIRREM, M_WAITOK); + bzero(dirrem, sizeof(struct dirrem)); + dirrem->dm_list.wk_type = M_DIRREM; + dirrem->dm_state = isrmdir ? RMDIR : 0; + dirrem->dm_mnt = ITOV(ip)->v_mount; + dirrem->dm_oldinum = ip->i_number; + + ACQUIRE_LOCK(&lk); + lbn = lblkno(dp->i_fs, dp->i_offset); + if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) + WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); + dirrem->dm_pagedep = pagedep; + for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(dp->i_offset)]); + dap; dap = LIST_NEXT(dap, da_pdlist)) { + /* + * Check for a diradd dependency for the same directory entry. + * If present, then both dependencies become obsolete and can + * be de-allocated. + */ + if (dap->da_offset != dp->i_offset) + continue; + /* + * Must be ATTACHED at this point, so just delete it. + */ + if ((dap->da_state & ATTACHED) == 0) + panic("newdirrem: not ATTACHED"); + if (dap->da_newinum != ip->i_number) + panic("newdirrem: inum %d should be %d", + ip->i_number, dap->da_newinum); + free_diradd(dap); + dirrem->dm_state |= COMPLETE; + break; + } + return (dirrem); +} + +/* + * Directory entry change dependencies. + * + * Changing an existing directory entry requires that an add operation + * be completed first followed by a deletion. The semantics for the addition + * are identical to the description of adding a new entry above except + * that the rollback is to the old inode number rather than zero. Once + * the addition dependency is completed, the removal is done as described + * in the removal routine above. + */ + +/* + * This routine should be called immediately after changing + * a directory entry. The inode's link count should not be + * decremented by the calling procedure -- the soft updates + * code will perform this task when it is safe. + */ +void +softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) + struct buf *bp; /* buffer containing directory block */ + struct inode *dp; /* inode for the directory being modified */ + struct inode *ip; /* inode for directory entry being removed */ + long newinum; /* new inode number for changed entry */ + int isrmdir; /* indicates if doing RMDIR */ +{ + int offset; + struct diradd *dap; + struct dirrem *dirrem; + struct inodedep *inodedep; + + offset = blkoff(dp->i_fs, dp->i_offset); + + /* + * Whiteouts have no addition dependencies. + */ + if (newinum == WINO) { + dap = NULL; + } else { + MALLOC(dap, struct diradd *, sizeof(struct diradd), + M_DIRADD, M_WAITOK); + bzero(dap, sizeof(struct diradd)); + dap->da_list.wk_type = M_DIRADD; + dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; + dap->da_offset = offset; + dap->da_newinum = newinum; + } + + /* + * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. + */ + dirrem = newdirrem(bp, dp, ip, isrmdir); + + /* + * If the inode has already been written, then no addition + * dependency needs to be created. + */ + if (inodedep_lookup(dp->i_fs, newinum, 0, &inodedep) == 0 || + (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + WORKITEM_FREE(dap, M_DIRADD); + dap = NULL; + } + + if (dap) { + dap->da_previous = dirrem; + LIST_INSERT_HEAD( + &dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)], + dap, da_pdlist); + WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list); + } else if ((dirrem->dm_state & COMPLETE) == 0) { + LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, + dm_next); + } else { + dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; + add_to_worklist(&dirrem->dm_list); + } + FREE_LOCK(&lk); +} + +/* + * Called whenever the link count on an inode is increased. + * It creates an inode dependency so that the new reference(s) + * to the inode cannot be committed to disk until the updated + * inode has been written. + */ +void +softdep_increase_linkcnt(ip) + struct inode *ip; /* the inode with the increased link count */ +{ + struct inodedep *inodedep; + + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); + FREE_LOCK(&lk); +} + +/* + * This workitem decrements the inode's link count. + * If the link count reaches zero, the file is removed. + */ +static void +handle_workitem_remove(dirrem) + struct dirrem *dirrem; +{ + struct proc *p = curproc; /* XXX */ + struct inodedep *inodedep; + struct vnode *vp; + struct inode *ip; + int error; + + if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) { + softdep_error("handle_workitem_remove: vget", error); + return; + } + ip = VTOI(vp); + /* + * Normal file deletion. + */ + if ((dirrem->dm_state & RMDIR) == 0) { + ip->i_nlink--; + if (ip->i_nlink < ip->i_effnlink) + panic("handle_workitem_remove: bad file delta"); + ip->i_flag |= IN_CHANGE; + vput(vp); + WORKITEM_FREE(dirrem, M_DIRREM); + return; + } + /* + * Directory deletion. Decrement reference count for both the + * just deleted parent directory entry and the reference for ".". + * Next truncate the directory to length zero. When the + * truncation completes, arrange to have the reference count on + * the parent decremented to account for the loss of "..". + */ + ip->i_nlink -= 2; + if (ip->i_nlink < ip->i_effnlink) + panic("handle_workitem_remove: bad dir delta"); + ip->i_flag |= IN_CHANGE; + if ((error = VOP_TRUNCATE(vp, (off_t)0, 0, p->p_cred, p)) != 0) + softdep_error("handle_workitem_remove: truncate", error); + ACQUIRE_LOCK(&lk); + (void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC, + &inodedep); + dirrem->dm_state = 0; + dirrem->dm_oldinum = dirrem->dm_dirinum; + WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); + FREE_LOCK(&lk); + vput(vp); +} + +/* + * Inode de-allocation dependencies. + * + * When an inode's link count is reduced to zero, it can be de-allocated. We + * found it convenient to postpone de-allocation until after the inode is + * written to disk with its new link count (zero). At this point, all of the + * on-disk inode's block pointers are nullified and, with careful dependency + * list ordering, all dependencies related to the inode will be satisfied and + * the corresponding dependency structures de-allocated. So, if/when the + * inode is reused, there will be no mixing of old dependencies with new + * ones. This artificial dependency is set up by the block de-allocation + * procedure above (softdep_setup_freeblocks) and completed by the + * following procedure. + */ +static void +handle_workitem_freefile(freefile) + struct freefile *freefile; +{ + struct vnode vp; + struct inode tip; + struct inodedep *idp; + struct vop_vfree_args args; + int error; + +#ifdef DEBUG + ACQUIRE_LOCK(&lk); + if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp)) + panic("handle_workitem_freefile: inodedep survived"); + FREE_LOCK(&lk); +#endif + tip.i_devvp = freefile->fx_devvp; + tip.i_dev = freefile->fx_devvp->v_rdev; + tip.i_fs = freefile->fx_fs; + vp.v_data = &tip; + args.a_pvp = &vp; + args.a_ino = freefile->fx_oldinum; + args.a_mode = freefile->fx_mode; + if ((error = ffs_freefile(&args)) != 0) + softdep_error("handle_workitem_freefile", error); + WORKITEM_FREE(freefile, M_FREEFILE); +} + +/* + * Disk writes. + * + * The dependency structures constructed above are most actively used when file + * system blocks are written to disk. No constraints are placed on when a + * block can be written, but unsatisfied update dependencies are made safe by + * modifying (or replacing) the source memory for the duration of the disk + * write. When the disk write completes, the memory block is again brought + * up-to-date. + * + * In-core inode structure reclamation. + * + * Because there are a finite number of "in-core" inode structures, they are + * reused regularly. By transferring all inode-related dependencies to the + * in-memory inode block and indexing them separately (via "inodedep"s), we + * can allow "in-core" inode structures to be reused at any time and avoid + * any increase in contention. + * + * Called just before entering the device driver to initiate a new disk I/O. + * The buffer must be locked, thus, no I/O completion operations can occur + * while we are manipulating its associated dependencies. + */ +void +softdep_disk_io_initiation(bp) + struct buf *bp; /* structure describing disk write to occur */ +{ + struct worklist *wk, *nextwk; + struct indirdep *indirdep; + + /* + * We only care about write operations. There should never + * be dependencies for reads. + */ + if (bp->b_flags & B_READ) + panic("softdep_disk_io_initiation: read"); + /* + * Do any necessary pre-I/O processing. + */ + for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) { + nextwk = LIST_NEXT(wk, wk_list); + switch (wk->wk_type) { + + case M_PAGEDEP: + initiate_write_filepage(WK_PAGEDEP(wk), bp); + continue; + + case M_INODEDEP: + initiate_write_inodeblock(WK_INODEDEP(wk), bp); + continue; + + case M_INDIRDEP: + indirdep = WK_INDIRDEP(wk); + if (indirdep->ir_state & GOINGAWAY) + panic("disk_io_initiation: indirdep gone"); + /* + * If there are no remaining dependencies, this + * will be writing the real pointers, so the + * dependency can be freed. + */ + if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { + brelse(indirdep->ir_savebp); + /* inline expand WORKLIST_REMOVE(wk); */ + wk->wk_state &= ~ONWORKLIST; + LIST_REMOVE(wk, wk_list); + WORKITEM_FREE(indirdep, M_INDIRDEP); + continue; + } + /* + * Replace up-to-date version with safe version. + */ + ACQUIRE_LOCK(&lk); + indirdep->ir_state &= ~ATTACHED; + indirdep->ir_state |= UNDONE; + bp->b_data = indirdep->ir_savebp->b_data; + FREE_LOCK(&lk); + continue; + + case M_MKDIR: + case M_BMSAFEMAP: + case M_ALLOCDIRECT: + case M_ALLOCINDIR: + continue; + + default: + panic("handle_disk_io_initiation: Unexpected type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } +} + +/* + * Called from within the procedure above to deal with unsatisfied + * allocation dependencies in a directory. The buffer must be locked, + * thus, no I/O completion operations can occur while we are + * manipulating its associated dependencies. + */ +static void +initiate_write_filepage(pagedep, bp) + struct pagedep *pagedep; + struct buf *bp; +{ + struct diradd *dap; + struct direct *ep; + int i; + + if (pagedep->pd_state & IOSTARTED) { + /* + * This can only happen if there is a driver that does not + * understand chaining. Here biodone will reissue the call + * to strategy for the incomplete buffers. + */ + printf("initiate_write_filepage: already started\n"); + return; + } + pagedep->pd_state |= IOSTARTED; + ACQUIRE_LOCK(&lk); + for (i = 0; i < DAHASHSZ; i++) { + for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; + dap = LIST_NEXT(dap, da_pdlist)) { + ep = (struct direct *) + ((char *)bp->b_data + dap->da_offset); + if (ep->d_ino != dap->da_newinum) + panic("%s: dir inum %d != new %d", + "initiate_write_filepage", + ep->d_ino, dap->da_newinum); + if (dap->da_state & DIRCHG) + ep->d_ino = dap->da_previous->dm_oldinum; + else + ep->d_ino = 0; + dap->da_state &= ~ATTACHED; + dap->da_state |= UNDONE; + } + } + FREE_LOCK(&lk); +} + +/* + * Called from within the procedure above to deal with unsatisfied + * allocation dependencies in an inodeblock. The buffer must be + * locked, thus, no I/O completion operations can occur while we + * are manipulating its associated dependencies. + */ +static void +initiate_write_inodeblock(inodedep, bp) + struct inodedep *inodedep; + struct buf *bp; /* The inode block */ +{ + struct allocdirect *adp, *lastadp; + struct dinode *dp; + struct fs *fs; + ufs_lbn_t prevlbn; + int i, deplist; + + if (inodedep->id_state & IOSTARTED) + panic("initiate_write_inodeblock: already started"); + inodedep->id_state |= IOSTARTED; + fs = inodedep->id_fs; + dp = (struct dinode *)bp->b_data + + ino_to_fsbo(fs, inodedep->id_ino); + /* + * If the bitmap is not yet written, then the allocated + * inode cannot be written to disk. + */ + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + if (inodedep->id_savedino != NULL) + panic("initiate_write_inodeblock: already doing I/O"); + MALLOC(inodedep->id_savedino, struct dinode *, + sizeof(struct dinode), M_INODEDEP, M_WAITOK); + *inodedep->id_savedino = *dp; + bzero((caddr_t)dp, sizeof(struct dinode)); + return; + } + /* + * If no dependencies, then there is nothing to roll back. + */ + inodedep->id_savedsize = dp->di_size; + if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) + return; + /* + * Set the dependencies to busy. + */ + ACQUIRE_LOCK(&lk); + for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; + adp = TAILQ_NEXT(adp, ad_next)) { +#ifdef DIAGNOSTIC + if (deplist != 0 && prevlbn >= adp->ad_lbn) + panic("softdep_write_inodeblock: lbn order"); + prevlbn = adp->ad_lbn; + if (adp->ad_lbn < NDADDR && + dp->di_db[adp->ad_lbn] != adp->ad_newblkno) + panic("%s: direct pointer #%d mismatch %d != %d", + "softdep_write_inodeblock", adp->ad_lbn, + dp->di_db[adp->ad_lbn], adp->ad_newblkno); + if (adp->ad_lbn >= NDADDR && + dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) + panic("%s: indirect pointer #%d mismatch %d != %d", + "softdep_write_inodeblock", adp->ad_lbn - NDADDR, + dp->di_ib[adp->ad_lbn - NDADDR], adp->ad_newblkno); + deplist |= 1 << adp->ad_lbn; + if ((adp->ad_state & ATTACHED) == 0) + panic("softdep_write_inodeblock: Unknown state 0x%x", + adp->ad_state); +#endif /* DIAGNOSTIC */ + adp->ad_state &= ~ATTACHED; + adp->ad_state |= UNDONE; + } + /* + * The on-disk inode cannot claim to be any larger than the last + * fragment that has been written. Otherwise, the on-disk inode + * might have fragments that were not the last block in the file + * which would corrupt the filesystem. + */ + for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; + lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { + if (adp->ad_lbn >= NDADDR) + break; + dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; + /* keep going until hitting a rollback to a frag */ + if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) + continue; + dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; + for (i = adp->ad_lbn + 1; i < NDADDR; i++) { +#ifdef DIAGNOSTIC + if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) + panic("softdep_write_inodeblock: lost dep1"); +#endif /* DIAGNOSTIC */ + dp->di_db[i] = 0; + } + for (i = 0; i < NIADDR; i++) { +#ifdef DIAGNOSTIC + if (dp->di_ib[i] != 0 && + (deplist & ((1 << NDADDR) << i)) == 0) + panic("softdep_write_inodeblock: lost dep2"); +#endif /* DIAGNOSTIC */ + dp->di_ib[i] = 0; + } + FREE_LOCK(&lk); + return; + } + /* + * If we have zero'ed out the last allocated block of the file, + * roll back the size to the last currently allocated block. + * We know that this last allocated block is a full-sized as + * we already checked for fragments in the loop above. + */ + if (lastadp != NULL && + dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { + for (i = lastadp->ad_lbn; i >= 0; i--) + if (dp->di_db[i] != 0) + break; + dp->di_size = (i + 1) * fs->fs_bsize; + } + /* + * The only dependencies are for indirect blocks. + * + * The file size for indirect block additions is not guaranteed. + * Such a guarantee would be non-trivial to achieve. The conventional + * synchronous write implementation also does not make this guarantee. + * Fsck should catch and fix discrepancies. Arguably, the file size + * can be over-estimated without destroying integrity when the file + * moves into the indirect blocks (i.e., is large). If we want to + * postpone fsck, we are stuck with this argument. + */ + for (; adp; adp = TAILQ_NEXT(adp, ad_next)) + dp->di_ib[adp->ad_lbn - NDADDR] = 0; + FREE_LOCK(&lk); +} + +/* + * This routine is called during the completion interrupt + * service routine for a disk write (from the procedure called + * by the device driver to inform the file system caches of + * a request completion). It should be called early in this + * procedure, before the block is made available to other + * processes or other routines are called. + */ +void +softdep_disk_write_complete(bp) + struct buf *bp; /* describes the completed disk write */ +{ + struct worklist *wk; + struct workhead reattach; + struct newblk *newblk; + struct allocindir *aip; + struct allocdirect *adp; + struct indirdep *indirdep; + struct inodedep *inodedep; + struct bmsafemap *bmsafemap; + +#ifdef DEBUG + if (lk.lkt_held != -1) + panic("softdep_disk_write_complete: lock is held"); + lk.lkt_held = -2; +#endif + LIST_INIT(&reattach); + while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + + case M_PAGEDEP: + if (handle_written_filepage(WK_PAGEDEP(wk), bp)) + WORKLIST_INSERT(&reattach, wk); + continue; + + case M_INODEDEP: + if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) + WORKLIST_INSERT(&reattach, wk); + continue; + + case M_BMSAFEMAP: + bmsafemap = WK_BMSAFEMAP(wk); + while (newblk = LIST_FIRST(&bmsafemap->sm_newblkhd)) { + newblk->nb_state |= DEPCOMPLETE; + newblk->nb_bmsafemap = NULL; + LIST_REMOVE(newblk, nb_deps); + } + while (adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd)) { + adp->ad_state |= DEPCOMPLETE; + adp->ad_buf = NULL; + LIST_REMOVE(adp, ad_deps); + handle_allocdirect_partdone(adp); + } + while (aip = LIST_FIRST(&bmsafemap->sm_allocindirhd)) { + aip->ai_state |= DEPCOMPLETE; + aip->ai_buf = NULL; + LIST_REMOVE(aip, ai_deps); + handle_allocindir_partdone(aip); + } + while ((inodedep = + LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { + inodedep->id_state |= DEPCOMPLETE; + LIST_REMOVE(inodedep, id_deps); + inodedep->id_buf = NULL; + } + WORKITEM_FREE(bmsafemap, M_BMSAFEMAP); + continue; + + case M_MKDIR: + handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); + continue; + + case M_ALLOCDIRECT: + adp = WK_ALLOCDIRECT(wk); + adp->ad_state |= COMPLETE; + handle_allocdirect_partdone(adp); + continue; + + case M_ALLOCINDIR: + aip = WK_ALLOCINDIR(wk); + aip->ai_state |= COMPLETE; + handle_allocindir_partdone(aip); + continue; + + case M_INDIRDEP: + indirdep = WK_INDIRDEP(wk); + if (indirdep->ir_state & GOINGAWAY) + panic("disk_write_complete: indirdep gone"); + bp->b_data = (caddr_t)indirdep->ir_saveddata; + indirdep->ir_state &= ~UNDONE; + indirdep->ir_state |= ATTACHED; + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { + LIST_REMOVE(aip, ai_next); + handle_allocindir_partdone(aip); + } + WORKLIST_INSERT(&reattach, wk); + bdirty(bp); + continue; + + default: + panic("handle_disk_write_complete: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + /* + * Reattach any requests that must be redone. + */ + while ((wk = LIST_FIRST(&reattach)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(&bp->b_dep, wk); + } +#ifdef DEBUG + if (lk.lkt_held != -2) + panic("softdep_disk_write_complete: lock lost"); + lk.lkt_held = -1; +#endif +} + +/* + * Called from within softdep_disk_write_complete above. Note that + * this routine is always called from interrupt level with further + * splbio interrupts blocked. + */ +static void +handle_allocdirect_partdone(adp) + struct allocdirect *adp; /* the completed allocdirect */ +{ + struct allocdirect *listadp; + struct inodedep *inodedep; + long bsize; + + if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + if (adp->ad_buf != NULL) + panic("handle_allocdirect_partdone: dangling dep"); + /* + * The on-disk inode cannot claim to be any larger than the last + * fragment that has been written. Otherwise, the on-disk inode + * might have fragments that were not the last block in the file + * which would corrupt the filesystem. Thus, we cannot free any + * allocdirects after one whose ad_oldblkno claims a fragment as + * these blocks must be rolled back to zero before writing the inode. + * We check the currently active set of allocdirects in id_inoupdt. + */ + inodedep = adp->ad_inodedep; + bsize = inodedep->id_fs->fs_bsize; + for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp; + listadp = TAILQ_NEXT(listadp, ad_next)) { + /* found our block */ + if (listadp == adp) + break; + /* continue if ad_oldlbn is not a fragment */ + if (listadp->ad_oldsize == 0 || + listadp->ad_oldsize == bsize) + continue; + /* hit a fragment */ + return; + } + /* + * If we have reached the end of the current list without + * finding the just finished dependency, then it must be + * on the future dependency list. Future dependencies cannot + * be freed until they are moved to the current list. + */ + if (listadp == NULL) { +#ifdef DEBUG + for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp; + listadp = TAILQ_NEXT(listadp, ad_next)) + /* found our block */ + if (listadp == adp) + break; + if (listadp == NULL) + panic("handle_allocdirect_partdone: lost dep"); +#endif /* DEBUG */ + return; + } + /* + * If we have found the just finished dependency, then free + * it along with anything that follows it that is complete. + */ + for (; adp; adp = listadp) { + listadp = TAILQ_NEXT(adp, ad_next); + if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + free_allocdirect(&inodedep->id_inoupdt, adp, 1); + } + /* + * Try freeing the inodedep in case that was the last dependency. + */ + (void) free_inodedep(inodedep); +} + +/* + * Called from within softdep_disk_write_complete above. Note that + * this routine is always called from interrupt level with further + * splbio interrupts blocked. + */ +static void +handle_allocindir_partdone(aip) + struct allocindir *aip; /* the completed allocindir */ +{ + struct indirdep *indirdep; + + if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + if (aip->ai_buf != NULL) + panic("handle_allocindir_partdone: dangling dependency"); + indirdep = aip->ai_indirdep; + if (indirdep->ir_state & UNDONE) { + LIST_REMOVE(aip, ai_next); + LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); + return; + } + ((ufs_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = + aip->ai_newblkno; + LIST_REMOVE(aip, ai_next); + if (aip->ai_freefrag != NULL) + add_to_worklist(&aip->ai_freefrag->ff_list); + WORKITEM_FREE(aip, M_ALLOCINDIR); +} + +/* + * Called from within softdep_disk_write_complete above to restore + * in-memory inode block contents to their most up-to-date state. Note + * that this routine is always called from interrupt level with further + * splbio interrupts blocked. + */ +static int +handle_written_inodeblock(inodedep, bp) + struct inodedep *inodedep; + struct buf *bp; /* buffer containing the inode block */ +{ + struct pagedep *pagedep; + struct worklist *wk, *filefree; + struct allocdirect *adp, *nextadp; + struct dinode *dp; + struct diradd *dap; + int hadchanges; + + if ((inodedep->id_state & IOSTARTED) == 0) + panic("handle_written_inodeblock: not started"); + inodedep->id_state &= ~IOSTARTED; + inodedep->id_state |= COMPLETE; + dp = (struct dinode *)bp->b_data + + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + /* + * If we had to rollback the inode allocation because of + * bitmaps being incomplete, then simply restore it. + * Keep the block dirty so that it will not be reclaimed until + * all associated dependencies have been cleared and the + * corresponding updates written to disk. + */ + if (inodedep->id_savedino != NULL) { + *dp = *inodedep->id_savedino; + FREE(inodedep->id_savedino, M_INODEDEP); + inodedep->id_savedino = NULL; + bdirty(bp); + return (1); + } + /* + * Roll forward anything that had to be rolled back before + * the inode could be updated. + */ + hadchanges = 0; + for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { + nextadp = TAILQ_NEXT(adp, ad_next); + if (adp->ad_state & ATTACHED) + panic("handle_written_inodeblock: new entry"); + if (adp->ad_lbn < NDADDR) { + if (dp->di_db[adp->ad_lbn] != adp->ad_oldblkno) + panic("%s: %s #%d mismatch %d != %d", + "handle_written_inodeblock", + "direct pointer", adp->ad_lbn, + dp->di_db[adp->ad_lbn], adp->ad_oldblkno); + dp->di_db[adp->ad_lbn] = adp->ad_newblkno; + } else { + if (dp->di_ib[adp->ad_lbn - NDADDR] != 0) + panic("%s: %s #%d allocated as %d", + "handle_written_inodeblock", + "indirect pointer", adp->ad_lbn - NDADDR, + dp->di_ib[adp->ad_lbn - NDADDR]); + dp->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; + } + adp->ad_state &= ~UNDONE; + adp->ad_state |= ATTACHED; + hadchanges = 1; + } + /* + * Reset the file size to its most up-to-date value. + */ + if (inodedep->id_savedsize == -1) + panic("handle_written_inodeblock: bad size"); + if (dp->di_size != inodedep->id_savedsize) { + dp->di_size = inodedep->id_savedsize; + hadchanges = 1; + } + inodedep->id_savedsize = -1; + /* + * If there were any rollbacks in the inode block, then it must be + * marked dirty so that its will eventually get written back in + * its correct form. + */ + if (hadchanges) + bdirty(bp); + /* + * Process any allocdirects that completed during the update. + */ + if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) + handle_allocdirect_partdone(adp); + /* + * Process deallocations that were held pending until the + * inode had been written to disk. Freeing of the inode + * is delayed until after all blocks have been freed to + * avoid creation of new triples + * before the old ones have been deleted. + */ + filefree = NULL; + while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + + case M_FREEFILE: + /* + * We defer adding filefree to the worklist until + * all other additions have been made to ensure + * that it will be done after all the old blocks + * have been freed. + */ + if (filefree != NULL) + panic("handle_written_inodeblock: filefree"); + filefree = wk; + continue; + + case M_MKDIR: + handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); + continue; + + case M_DIRADD: + dap = WK_DIRADD(wk); + dap->da_state |= COMPLETE; + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, + da_pdlist); + } + WORKLIST_INSERT(&inodedep->id_pendinghd, wk); + continue; + + case M_FREEBLKS: + case M_FREEFRAG: + case M_DIRREM: + add_to_worklist(wk); + continue; + + default: + panic("handle_written_inodeblock: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + if (filefree != NULL) + add_to_worklist(filefree); + + /* + * If no outstanding dependencies, free it. + */ + if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0) + return (0); + return (hadchanges); +} + +/* + * Handle the completion of a mkdir dependency. + */ +static void +handle_written_mkdir(mkdir, type) + struct mkdir *mkdir; + int type; +{ + struct diradd *dap; + struct pagedep *pagedep; + + if (mkdir->md_state != type) + panic("handle_written_mkdir: bad type"); + dap = mkdir->md_diradd; + dap->da_state &= ~type; + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + dap->da_state |= DEPCOMPLETE; + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + } + LIST_REMOVE(mkdir, md_mkdirs); + WORKITEM_FREE(mkdir, M_MKDIR); +} + +/* + * Called from within softdep_disk_write_complete above. + * A write operation was just completed. Removed inodes can + * now be freed and associated block pointers may be committed. + * Note that this routine is always called from interrupt level + * with further splbio interrupts blocked. + */ +static int +handle_written_filepage(pagedep, bp) + struct pagedep *pagedep; + struct buf *bp; /* buffer containing the written page */ +{ + struct dirrem *dirrem; + struct diradd *dap, *nextdap; + struct direct *ep; + int i, chgs; + + if ((pagedep->pd_state & IOSTARTED) == 0) + panic("handle_written_filepage: not started"); + pagedep->pd_state &= ~IOSTARTED; + /* + * Process any directory removals that have been committed. + */ + while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { + LIST_REMOVE(dirrem, dm_next); + dirrem->dm_dirinum = pagedep->pd_ino; + add_to_worklist(&dirrem->dm_list); + } + /* + * Free any directory additions that have been committed. + */ + while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) + free_diradd(dap); + /* + * Uncommitted directory entries must be restored. + */ + for (chgs = 0, i = 0; i < DAHASHSZ; i++) { + for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; + dap = nextdap) { + nextdap = LIST_NEXT(dap, da_pdlist); + if (dap->da_state & ATTACHED) + panic("handle_written_filepage: attached"); + ep = (struct direct *) + ((char *)bp->b_data + dap->da_offset); + ep->d_ino = dap->da_newinum; + dap->da_state &= ~UNDONE; + dap->da_state |= ATTACHED; + chgs = 1; + /* + * If the inode referenced by the directory has + * been written out, then the dependency can be + * moved to the pending list. + */ + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, + da_pdlist); + } + } + } + /* + * If there were any rollbacks in the directory, then it must be + * marked dirty so that its will eventually get written back in + * its correct form. + */ + if (chgs) + bdirty(bp); + /* + * If no dependencies remain, the pagedep will be freed. + * Otherwise it will remain to update the page before it + * is written back to disk. + */ + if (LIST_FIRST(&pagedep->pd_dirremhd) == 0 && + LIST_FIRST(&pagedep->pd_pendinghd) == 0) { + for (i = 0; i < DAHASHSZ; i++) + if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL) + break; + if (i == DAHASHSZ) { + LIST_REMOVE(pagedep, pd_hash); + WORKITEM_FREE(pagedep, M_PAGEDEP); + return (0); + } + } + return (1); +} + +/* + * Writing back in-core inode structures. + * + * The file system only accesses an inode's contents when it occupies an + * "in-core" inode structure. These "in-core" structures are separate from + * the page frames used to cache inode blocks. Only the latter are + * transferred to/from the disk. So, when the updated contents of the + * "in-core" inode structure are copied to the corresponding in-memory inode + * block, the dependencies are also transferred. The following procedure is + * called when copying a dirty "in-core" inode to a cached inode block. + */ + +/* + * Called when an inode is loaded from disk. If the effective link count + * differed from the actual link count when it was last flushed, then we + * need to ensure that the correct effective link count is put back. + */ +void +softdep_load_inodeblock(ip) + struct inode *ip; /* the "in_core" copy of the inode */ +{ + struct inodedep *inodedep; + int error, gotit; + + /* + * Check for alternate nlink count. + */ + ip->i_effnlink = ip->i_nlink; + ACQUIRE_LOCK(&lk); + if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { + FREE_LOCK(&lk); + return; + } + if (inodedep->id_nlinkdelta != 0) { + ip->i_effnlink -= inodedep->id_nlinkdelta; + inodedep->id_nlinkdelta = 0; + (void) free_inodedep(inodedep); + } + FREE_LOCK(&lk); +} + +/* + * This routine is called just before the "in-core" inode + * information is to be copied to the in-memory inode block. + * Recall that an inode block contains several inodes. If + * the force flag is set, then the dependencies will be + * cleared so that the update can always be made. Note that + * the buffer is locked when this routine is called, so we + * will never be in the middle of writing the inode block + * to disk. + */ +void +softdep_update_inodeblock(ip, bp, waitfor) + struct inode *ip; /* the "in_core" copy of the inode */ + struct buf *bp; /* the buffer containing the inode block */ + int waitfor; /* 1 => update must be allowed */ +{ + struct inodedep *inodedep; + int error, gotit; + + /* + * If the effective link count is not equal to the actual link + * count, then we must track the difference in an inodedep while + * the inode is (potentially) tossed out of the cache. Otherwise, + * if there is no existing inodedep, then there are no dependencies + * to track. + */ + ACQUIRE_LOCK(&lk); + if (ip->i_effnlink != ip->i_nlink) { + (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, + &inodedep); + } else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { + FREE_LOCK(&lk); + return; + } + if (ip->i_nlink < ip->i_effnlink) + panic("softdep_update_inodeblock: bad delta"); + inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; + /* + * If the last remaining use for the inodedep was to track the + * link count, and there is no difference between the effective + * and actual link count, then we can free the inodedep. + */ + if (free_inodedep(inodedep)) { + FREE_LOCK(&lk); + return; + } + /* + * Changes have been initiated. Anything depending on these + * changes cannot occur until this inode has been written. + */ + inodedep->id_state &= ~COMPLETE; + if ((inodedep->id_state & ONWORKLIST) == 0) + WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); + /* + * Any new dependencies associated with the incore inode must + * now be moved to the list associated with the buffer holding + * the in-memory copy of the inode. Once merged process any + * allocdirects that are completed by the merger. + */ + merge_inode_lists(inodedep); + if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); + /* + * Newly allocated inodes cannot be written until the bitmap + * that allocates them have been written (indicated by + * DEPCOMPLETE being set in id_state). If we are doing a + * forced sync (e.g., an fsync on a file), we force the bitmap + * to be written so that the update can be done. + */ + if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) { + FREE_LOCK(&lk); + return; + } + gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); + FREE_LOCK(&lk); + if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0) + softdep_error("softdep_update_inodeblock: bwrite", error); + if ((inodedep->id_state & DEPCOMPLETE) == 0) + panic("softdep_update_inodeblock: update failed"); +} + +/* + * Merge the new inode dependency list (id_newinoupdt) into the old + * inode dependency list (id_inoupdt). This routine must be called + * with splbio interrupts blocked. + */ +static void +merge_inode_lists(inodedep) + struct inodedep *inodedep; +{ + struct allocdirect *listadp, *newadp; + + newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); + for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) { + if (listadp->ad_lbn < newadp->ad_lbn) { + listadp = TAILQ_NEXT(listadp, ad_next); + continue; + } + TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); + TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); + if (listadp->ad_lbn == newadp->ad_lbn) { + allocdirect_merge(&inodedep->id_inoupdt, newadp, + listadp); + listadp = newadp; + } + newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); + } + while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) { + TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next); + TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next); + } +} + +/* + * If we are doing an fsync, then we must ensure that any directory + * entries for the inode have been written after the inode gets to disk. + */ +int +softdep_fsync(vp) + struct vnode *vp; /* the "in_core" copy of the inode */ +{ + struct diradd *dap, *olddap; + struct inodedep *inodedep; + struct pagedep *pagedep; + struct worklist *wk; + struct mount *mnt; + struct vnode *pvp; + struct inode *ip; + struct buf *bp; + struct fs *fs; + struct proc *p = curproc; /* XXX */ + int error, ret, flushparent; + struct timeval tv; + ino_t parentino; + ufs_lbn_t lbn; + + ip = VTOI(vp); + fs = ip->i_fs; + for (error = 0, flushparent = 0, olddap = NULL; ; ) { + ACQUIRE_LOCK(&lk); + if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) + break; + if (LIST_FIRST(&inodedep->id_inowait) != NULL || + TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || + TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) + panic("softdep_fsync: pending ops"); + if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) + break; + if (wk->wk_type != M_DIRADD) + panic("softdep_fsync: Unexpcted type %s", + TYPENAME(wk->wk_type)); + dap = WK_DIRADD(wk); + /* + * If we have failed to get rid of all the dependencies + * then something is seriously wrong. + */ + if (dap == olddap) + panic("softdep_fsync: flush failed"); + olddap = dap; + /* + * Flush our parent if this directory entry + * has a MKDIR_PARENT dependency. + */ + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; + mnt = pagedep->pd_mnt; + parentino = pagedep->pd_ino; + lbn = pagedep->pd_lbn; + if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) + panic("softdep_fsync: dirty"); + flushparent = dap->da_state & MKDIR_PARENT; + /* + * If we are being fsync'ed as part of vgone'ing this vnode, + * then we will not be able to release and recover the + * vnode below, so we just have to give up on writing its + * directory entry out. It will eventually be written, just + * not now, but then the user was not asking to have it + * written, so we are not breaking any promises. + */ + if (vp->v_flag & VXLOCK) + break; + /* + * We prevent deadlock by always fetching inodes from the + * root, moving down the directory tree. Thus, when fetching + * our parent directory, we must unlock ourselves before + * requesting the lock on our parent. See the comment in + * ufs_lookup for details on possible races. + */ + FREE_LOCK(&lk); + VOP_UNLOCK(vp, 0, p); + if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (flushparent) { + tv = time; + if (error = VOP_UPDATE(pvp, &tv, &tv, MNT_WAIT)) { + vput(pvp); + return (error); + } + } + /* + * Flush directory page containing the inode's name. + */ + error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, + &bp); + vput(pvp); + ret = VOP_BWRITE(bp); + if (error != 0) + return (error); + if (ret != 0) + return (ret); + } + FREE_LOCK(&lk); + return (0); +} + +/* + * This routine is called when we are trying to synchronously flush a + * file. This routine must eliminate any filesystem metadata dependencies + * so that the syncing routine can succeed by pushing the dirty blocks + * associated with the file. If any I/O errors occur, they are returned. + */ +int +softdep_sync_metadata(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct allocdirect *adp; + struct allocindir *aip; + struct buf *bp, *nbp; + struct worklist *wk; + int error, waitfor; + + /* + * Check whether this vnode is involved in a filesystem + * that is doing soft dependency processing. + */ + if (vp->v_type != VBLK) { + if (!DOINGSOFTDEP(vp)) + return (0); + } else + if (vp->v_specmountpoint == NULL || + (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0) + return (0); + /* + * Ensure that any direct block dependencies have been cleared. + */ + ACQUIRE_LOCK(&lk); + if (error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number)) { + FREE_LOCK(&lk); + return (error); + } + /* + * For most files, the only metadata dependencies are the + * cylinder group maps that allocate their inode or blocks. + * The block allocation dependencies can be found by traversing + * the dependency lists for any buffers that remain on their + * dirty buffer list. The inode allocation dependency will + * be resolved when the inode is updated with MNT_WAIT. + * This work is done in two passes. The first pass grabs most + * of the buffers and begins asynchronously writing them. The + * only way to wait for these asynchronous writes is to sleep + * on the filesystem vnode which may stay busy for a long time + * if the filesystem is active. So, instead, we make a second + * pass over the dependencies blocking on each write. In the + * usual case we will be blocking against a write that we + * initiated, so when it is done the dependency will have been + * resolved. Thus the second pass is expected to end quickly. + */ + waitfor = MNT_NOWAIT; +top: + if (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) { + FREE_LOCK(&lk); + return (0); + } + bp = LIST_FIRST(&vp->v_dirtyblkhd); +loop: + /* + * As we hold the buffer locked, none of its dependencies + * will disappear. + */ + for (wk = LIST_FIRST(&bp->b_dep); wk; + wk = LIST_NEXT(wk, wk_list)) { + switch (wk->wk_type) { + + case M_ALLOCDIRECT: + adp = WK_ALLOCDIRECT(wk); + if (adp->ad_state & DEPCOMPLETE) + break; + nbp = adp->ad_buf; + if (getdirtybuf(&nbp, waitfor) == 0) + break; + FREE_LOCK(&lk); + if (waitfor == MNT_NOWAIT) { + bawrite(nbp); + } else if ((error = VOP_BWRITE(nbp)) != 0) { + bawrite(bp); + return (error); + } + ACQUIRE_LOCK(&lk); + break; + + case M_ALLOCINDIR: + aip = WK_ALLOCINDIR(wk); + if (aip->ai_state & DEPCOMPLETE) + break; + nbp = aip->ai_buf; + if (getdirtybuf(&nbp, waitfor) == 0) + break; + FREE_LOCK(&lk); + if (waitfor == MNT_NOWAIT) { + bawrite(nbp); + } else if ((error = VOP_BWRITE(nbp)) != 0) { + bawrite(bp); + return (error); + } + ACQUIRE_LOCK(&lk); + break; + + case M_INDIRDEP: + for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd); + aip; aip = LIST_NEXT(aip, ai_next)) { + if (aip->ai_state & DEPCOMPLETE) + continue; + nbp = aip->ai_buf; + if (getdirtybuf(&nbp, waitfor) == 0) + break; + FREE_LOCK(&lk); + if (waitfor == MNT_NOWAIT) { + bawrite(nbp); + } else if ((error = VOP_BWRITE(nbp)) != 0) { + bawrite(bp); + return (error); + } + ACQUIRE_LOCK(&lk); + continue; + } + break; + + case M_INODEDEP: + if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs, + WK_INODEDEP(wk)->id_ino)) != 0) { + FREE_LOCK(&lk); + bawrite(bp); + return (error); + } + break; + + case M_PAGEDEP: + /* + * We are trying to sync a directory that may + * have dependencies on both its own metadata + * and/or dependencies on the inodes of any + * recently allocated files. We walk its diradd + * lists pushing out the associated inode. + */ + if (error = flush_pagedep_deps(vp, WK_PAGEDEP(wk))) { + FREE_LOCK(&lk); + bawrite(bp); + return (error); + } + break; + + default: + panic("softdep_sync_metadata: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + (void) getdirtybuf(&LIST_NEXT(bp, b_vnbufs), MNT_WAIT); + nbp = LIST_NEXT(bp, b_vnbufs); + FREE_LOCK(&lk); + bawrite(bp); + ACQUIRE_LOCK(&lk); + if (nbp != NULL) { + bp = nbp; + goto loop; + } + /* + * We must wait for any I/O in progress to finish so that + * all potential buffers on the dirty list will be visible. + * Once they are all there, proceed with the second pass + * which will wait for the I/O as per above. + */ + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + FREE_LOCK_INTERLOCKED(&lk); + sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); + ACQUIRE_LOCK_INTERLOCKED(&lk); + } + /* + * The brief unlock is to allow any pent up dependency + * processing to be done. + */ + if (waitfor == MNT_NOWAIT) { + waitfor = MNT_WAIT; + FREE_LOCK(&lk); + ACQUIRE_LOCK(&lk); + goto top; + } + + /* + * If we have managed to get rid of all the dirty buffers, + * then we are done. For certain directories and block + * devices, we may need to do further work. + */ + if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { + FREE_LOCK(&lk); + return (0); + } + + FREE_LOCK(&lk); + /* + * If we are trying to sync a block device, some of its buffers may + * contain metadata that cannot be written until the contents of some + * partially written files have been written to disk. The only easy + * way to accomplish this is to sync the entire filesystem (luckily + * this happens rarely). + */ + if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) && + (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred, + ap->a_p)) != 0) + return (error); + return (0); +} + +/* + * Flush the dependencies associated with an inodedep. + * Called with splbio blocked. + */ +static int +flush_inodedep_deps(fs, ino) + struct fs *fs; + ino_t ino; +{ + struct inodedep *inodedep; + struct allocdirect *adp; + int error, waitfor; + struct buf *bp; + + /* + * This work is done in two passes. The first pass grabs most + * of the buffers and begins asynchronously writing them. The + * only way to wait for these asynchronous writes is to sleep + * on the filesystem vnode which may stay busy for a long time + * if the filesystem is active. So, instead, we make a second + * pass over the dependencies blocking on each write. In the + * usual case we will be blocking against a write that we + * initiated, so when it is done the dependency will have been + * resolved. Thus the second pass is expected to end quickly. + * We give a brief window at the top of the loop to allow + * any pending I/O to complete. + */ + for (waitfor = MNT_NOWAIT; ; ) { + FREE_LOCK(&lk); + ACQUIRE_LOCK(&lk); + if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) + return (0); + for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; + adp = TAILQ_NEXT(adp, ad_next)) { + if (adp->ad_state & DEPCOMPLETE) + continue; + bp = adp->ad_buf; + if (getdirtybuf(&bp, waitfor) == 0) + break; + FREE_LOCK(&lk); + if (waitfor == MNT_NOWAIT) { + bawrite(bp); + } else if ((error = VOP_BWRITE(bp)) != 0) { + ACQUIRE_LOCK(&lk); + return (error); + } + ACQUIRE_LOCK(&lk); + break; + } + if (adp != NULL) + continue; + for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp; + adp = TAILQ_NEXT(adp, ad_next)) { + if (adp->ad_state & DEPCOMPLETE) + continue; + bp = adp->ad_buf; + if (getdirtybuf(&bp, waitfor) == 0) + break; + FREE_LOCK(&lk); + if (waitfor == MNT_NOWAIT) { + bawrite(bp); + } else if ((error = VOP_BWRITE(bp)) != 0) { + ACQUIRE_LOCK(&lk); + return (error); + } + ACQUIRE_LOCK(&lk); + break; + } + if (adp != NULL) + continue; + /* + * If pass2, we are done, otherwise do pass 2. + */ + if (waitfor == MNT_WAIT) + break; + waitfor = MNT_WAIT; + } + /* + * Try freeing inodedep in case all dependencies have been removed. + */ + if (inodedep_lookup(fs, ino, 0, &inodedep) != 0) + (void) free_inodedep(inodedep); + return (0); +} + +/* + * Eliminate a pagedep dependency by flushing out all its diradd dependencies. + * Called with splbio blocked. + */ +static int +flush_pagedep_deps(pvp, pagedep) + struct vnode *pvp; + struct pagedep *pagedep; +{ + struct proc *p = curproc; /* XXX */ + struct diradd *dap; + struct timeval tv; + struct vnode *vp; + int i, error; + ino_t inum; + + for (i = 0, error = 0; i < DAHASHSZ && error == 0; i++) { + while ((dap = LIST_FIRST(&pagedep->pd_diraddhd[i])) != NULL) { + /* + * Flush ourselves if this directory entry + * has a MKDIR_PARENT dependency. + */ + if (dap->da_state & MKDIR_PARENT) { + tv = time; + FREE_LOCK(&lk); + if (error = VOP_UPDATE(pvp, &tv, &tv, MNT_WAIT)) + break; + ACQUIRE_LOCK(&lk); + /* + * If that cleared dependencies, go on to next. + */ + if (dap != LIST_FIRST(&pagedep->pd_diraddhd[i])) + continue; + if (dap->da_state & MKDIR_PARENT) + panic("flush_pagedep_deps: MKDIR"); + } + /* + * Flush the file on which the directory entry depends. + */ + inum = dap->da_newinum; + FREE_LOCK(&lk); + if ((error = VFS_VGET(pagedep->pd_mnt, inum, &vp)) != 0) + break; + if (vp->v_type == VDIR) { + /* + * A newly allocated directory must have its + * "." and ".." entries written out before its + * name can be committed in its parent. We do + * not want or need the full semantics of a + * synchronous VOP_FSYNC as that may end up + * here again, once for each directory level in + * the filesystem. Instead, we push the blocks + * and wait for them to clear. + */ + if (error = + VOP_FSYNC(vp, p->p_cred, MNT_NOWAIT, p)) { + vput(vp); + break; + } + ACQUIRE_LOCK(&lk); + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + FREE_LOCK_INTERLOCKED(&lk); + sleep((caddr_t)&vp->v_numoutput, + PRIBIO + 1); + ACQUIRE_LOCK_INTERLOCKED(&lk); + } + FREE_LOCK(&lk); + } + tv = time; + error = VOP_UPDATE(vp, &tv, &tv, MNT_WAIT); + vput(vp); + if (error) + break; + /* + * If we have failed to get rid of all the dependencies + * then something is seriously wrong. + */ + if (dap == LIST_FIRST(&pagedep->pd_diraddhd[i])) + panic("flush_pagedep_deps: flush failed"); + ACQUIRE_LOCK(&lk); + } + } + if (error) + ACQUIRE_LOCK(&lk); + return (error); +} + +/* + * Acquire exclusive access to a buffer. + * Must be called with splbio blocked. + * Return 1 if buffer was acquired. + */ +static int +getdirtybuf(bpp, waitfor) + struct buf **bpp; + int waitfor; +{ + struct buf *bp; + + for (;;) { + if ((bp = *bpp) == NULL) + return (0); + if ((bp->b_flags & B_BUSY) == 0) + break; + if (waitfor != MNT_WAIT) + return (0); + bp->b_flags |= B_WANTED; + FREE_LOCK_INTERLOCKED(&lk); + sleep((caddr_t)bp, PRIBIO + 1); + ACQUIRE_LOCK_INTERLOCKED(&lk); + } + if ((bp->b_flags & B_DELWRI) == 0) + return (0); + bremfree(bp); + bp->b_flags |= B_BUSY; + return (1); +} + +/* + * Called whenever a buffer that is being invalidated or reallocated + * contains dependencies. This should only happen if an I/O error has + * occurred. The routine is called with the buffer locked. + */ +void +softdep_deallocate_dependencies(bp) + struct buf *bp; +{ + struct worklist *wk; + + if ((bp->b_flags & B_ERROR) == 0) + panic("softdep_deallocate_dependencies: dangling deps"); + softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); + while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + /* + * XXX - should really clean up, but for now we will + * just leak memory and not worry about it. + */ + case M_PAGEDEP: case M_INDIRDEP: case M_INODEDEP: +#ifdef DEBUG + printf("Lost %s\n", TYPENAME(wk->wk_type)); +#endif + break; + default: + panic("softdep_deallocate_dependencies: bad type"); + } + } +} + +/* + * Function to handle asynchronous write errors in the filesystem. + */ +void +softdep_error(func, error) + char *func; + int error; +{ + + /* XXX should do something better! */ + log(LOG_ERR, "%s: got error %d while accessing filesystem\n", + func, error); +} diff --git a/sys/contrib/softupdates/softdep.h b/sys/contrib/softupdates/softdep.h new file mode 100644 index 0000000..3825bce --- /dev/null +++ b/sys/contrib/softupdates/softdep.h @@ -0,0 +1,520 @@ +/* + * Copyright 1997 Marshall Kirk McKusick. All Rights Reserved. + * + * The soft dependency code is derived from work done by Greg Ganger + * at the University of Michigan. + * + * The following are the copyrights and redistribution conditions that + * apply to this copy of the soft dependency software. For a license + * to use, redistribute or sell the soft dependency software under + * conditions other than those described here, please contact the + * author at one of the following addresses: + * + * Marshall Kirk McKusick mckusick@mckusick.com + * 1614 Oxford Street +1-510-843-9542 + * Berkeley, CA 94709-1608 + * USA + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. None of the names of McKusick, Ganger, or the University of Michigan + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * 4. Redistributions in any form must be accompanied by information on + * how to obtain complete source code for any accompanying software + * that uses the this software. This source code must either be included + * in the distribution or be available for no more than the cost of + * distribution plus a nominal fee, and must be freely redistributable + * under reasonable conditions. For an executable file, complete + * source code means the source code for all modules it contains. + * It does not mean source code for modules or files that typically + * accompany the operating system on which the executable file runs, + * e.g., standard library modules or system header files. + * + * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)softdep.h 9.1 (McKusick) 7/9/97 + */ + +#include + +/* + * Allocation dependencies are handled with undo/redo on the in-memory + * copy of the data. A particular data dependency is eliminated when + * it is ALLCOMPLETE: that is ATTACHED, DEPCOMPLETE, and COMPLETE. + * + * ATTACHED means that the data is not currently being written to + * disk. UNDONE means that the data has been rolled back to a safe + * state for writing to the disk. When the I/O completes, the data is + * restored to its current form and the state reverts to ATTACHED. + * The data must be locked throughout the rollback, I/O, and roll + * forward so that the rolled back information is never visible to + * user processes. The COMPLETE flag indicates that the item has been + * written. For example, a dependency that requires that an inode be + * written will be marked COMPLETE after the inode has been written + * to disk. The DEPCOMPLETE flag indicates the completion of any other + * dependencies such as the writing of a cylinder group map has been + * completed. A dependency structure may be freed only when both it + * and its dependencies have completed and any rollbacks that are in + * progress have finished as indicated by the set of ALLCOMPLETE flags + * all being set. The two MKDIR flags indicate additional dependencies + * that must be done when creating a new directory. MKDIR_BODY is + * cleared when the directory data block containing the "." and ".." + * entries has been written. MKDIR_PARENT is cleared when the parent + * inode with the increased link count for ".." has been written. When + * both MKDIR flags have been cleared, the DEPCOMPLETE flag is set to + * indicate that the directory dependencies have been completed. The + * writing of the directory inode itself sets the COMPLETE flag which + * then allows the directory entry for the new directory to be written + * to disk. The RMDIR flag marks a dirrem structure as representing + * the removal of a directory rather than a file. When the removal + * dependencies are completed, additional work needs to be done + * (truncation of the "." and ".." entries, an additional decrement + * of the associated inode, and a decrement of the parent inode). The + * DIRCHG flag marks a diradd structure as representing the changing + * of an existing entry rather than the addition of a new one. When + * the update is complete the dirrem associated with the inode for + * the old name must be added to the worklist to do the necessary + * reference count decrement. The GOINGAWAY flag indicates that the + * data structure is frozen from further change until its dependencies + * have been completed and its resources freed after which it will be + * discarded. The IOSTARTED flag prevents multiple calls to the I/O + * start routine from doing multiple rollbacks. The ONWORKLIST flag + * shows whether the structure is currently linked onto a worklist. + */ +#define ATTACHED 0x0001 +#define UNDONE 0x0002 +#define COMPLETE 0x0004 +#define DEPCOMPLETE 0x0008 +#define MKDIR_PARENT 0x0010 +#define MKDIR_BODY 0x0020 +#define RMDIR 0x0040 +#define DIRCHG 0x0080 +#define GOINGAWAY 0x0100 +#define IOSTARTED 0x0200 +#define ONWORKLIST 0x8000 + +#define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE) + +/* + * The workitem queue. + * + * It is sometimes useful and/or necessary to clean up certain dependencies + * in the background rather than during execution of an application process + * or interrupt service routine. To realize this, we append dependency + * structures corresponding to such tasks to a "workitem" queue. In a soft + * updates implementation, most pending workitems should not wait for more + * than a couple of seconds, so the filesystem syncer process awakens once + * per second to process the items on the queue. + */ + +/* LIST_HEAD(workhead, worklist); -- declared in buf.h */ + +/* + * Each request can be linked onto a work queue through its worklist structure. + * To avoid the need for a pointer to the structure itself, this structure + * MUST be declared FIRST in each type in which it appears! If more than one + * worklist is needed in the structure, then a wk_data field must be added + * and the macros below changed to use it. + */ +struct worklist { + LIST_ENTRY(worklist) wk_list; /* list of work requests */ + unsigned short wk_type; /* type of request */ + unsigned short wk_state; /* state flags */ +}; +#define WK_DATA(wk) ((void *)(wk)) +#define WK_PAGEDEP(wk) ((struct pagedep *)(wk)) +#define WK_INODEDEP(wk) ((struct inodedep *)(wk)) +#define WK_NEWBLK(wk) ((struct newblk *)(wk)) +#define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk)) +#define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk)) +#define WK_INDIRDEP(wk) ((struct indirdep *)(wk)) +#define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk)) +#define WK_FREEFRAG(wk) ((struct freefrag *)(wk)) +#define WK_FREEBLKS(wk) ((struct freeblks *)(wk)) +#define WK_FREEFILE(wk) ((struct freefile *)(wk)) +#define WK_DIRADD(wk) ((struct diradd *)(wk)) +#define WK_MKDIR(wk) ((struct mkdir *)(wk)) +#define WK_DIRREM(wk) ((struct dirrem *)(wk)) + +/* + * Various types of lists + */ +LIST_HEAD(dirremhd, dirrem); +LIST_HEAD(diraddhd, diradd); +LIST_HEAD(newblkhd, newblk); +LIST_HEAD(inodedephd, inodedep); +LIST_HEAD(allocindirhd, allocindir); +LIST_HEAD(allocdirecthd, allocdirect); +TAILQ_HEAD(allocdirectlst, allocdirect); + +/* + * The "pagedep" structure tracks the various dependencies related to + * a particular directory page. If a directory page has any dependencies, + * it will have a pagedep linked to its associated buffer. The + * pd_dirremhd list holds the list of dirrem requests which decrement + * inode reference counts. These requests are processed after the + * directory page with the corresponding zero'ed entries has been + * written. The pd_diraddhd list maintains the list of diradd requests + * which cannot be committed until their corresponding inode has been + * written to disk. Because a directory may have many new entries + * being created, several lists are maintained hashed on bits of the + * offset of the entry into the directory page to keep the lists from + * getting too long. Once a new directory entry has been cleared to + * be written, it is moved to the pd_pendinghd list. After the new + * entry has been written to disk it is removed from the pd_pendinghd + * list, any removed operations are done, and the dependency structure + * is freed. + */ +#define DAHASHSZ 6 +#define DIRADDHASH(offset) (((offset) >> 2) % DAHASHSZ) +struct pagedep { + struct worklist pd_list; /* page buffer */ +# define pd_state pd_list.wk_state /* check for multiple I/O starts */ + LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */ + struct mount *pd_mnt; /* associated mount point */ + ino_t pd_ino; /* associated file */ + ufs_lbn_t pd_lbn; /* block within file */ + struct dirremhd pd_dirremhd; /* dirrem's waiting for page */ + struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */ + struct diraddhd pd_pendinghd; /* directory entries awaiting write */ +}; + +/* + * The "inodedep" structure tracks the set of dependencies associated + * with an inode. Each block that is allocated is represented by an + * "allocdirect" structure (see below). It is linked onto the id_newinoupdt + * list until both its contents and its allocation in the cylinder + * group map have been written to disk. Once the dependencies have been + * satisfied, it is removed from the id_newinoupdt list and any followup + * actions such as releasing the previous block or fragment are placed + * on the id_inowait list. When an inode is updated (copied from the + * in-core inode structure to a disk buffer containing its on-disk + * copy), the "inodedep" structure is linked onto the buffer through + * its worklist. Thus it will be notified when the buffer is about + * to be written and when it is done. At the update time, all the + * elements on the id_newinoupdt list are moved to the id_inoupdt list + * since those changes are now relevant to the copy of the inode in the + * buffer. When the buffer containing the inode is written to disk, any + * updates listed on the id_inoupdt list are rolled back as they are + * not yet safe. Following the write, the changes are once again rolled + * forward and any actions on the id_inowait list are processed (since + * the previously allocated blocks are no longer claimed on the disk). + * The entries on the id_inoupdt and id_newinoupdt lists must be kept + * sorted by logical block number to speed the calculation of the size + * of the rolled back inode (see explanation in initiate_write_inodeblock). + */ +struct inodedep { + struct worklist id_list; /* buffer holding inode block */ +# define id_state id_list.wk_state /* inode dependency state */ + LIST_ENTRY(inodedep) id_hash; /* hashed lookup */ + struct fs *id_fs; /* associated filesystem */ + ino_t id_ino; /* dependent inode */ + nlink_t id_nlinkdelta; /* saved effective link count */ + struct dinode *id_savedino; /* saved dinode contents */ + LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */ + struct buf *id_buf; /* related bmsafemap (if pending) */ + off_t id_savedsize; /* file size saved during rollback */ + struct workhead id_pendinghd; /* entries awaiting directory write */ + struct workhead id_inowait; /* operations after inode written */ + struct allocdirectlst id_inoupdt; /* updates before inode written */ + struct allocdirectlst id_newinoupdt; /* updates when inode written */ +}; + +/* + * A "newblk" structure is attached to a bmsafemap structure when a block + * or fragment is allocated from a cylinder group. Its state is set to + * DEPCOMPLETE when its cylinder group map is written. It is consumed by + * an associated allocdirect or allocindir allocation which will attach + * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag + * is not set (i.e., its cylinder group map has not been written). + */ +struct newblk { + LIST_ENTRY(newblk) nb_hash; /* hashed lookup */ + struct fs *nb_fs; /* associated filesystem */ + ufs_daddr_t nb_newblkno; /* allocated block number */ + int nb_state; /* state of bitmap dependency */ + LIST_ENTRY(newblk) nb_deps; /* bmsafemap's list of newblk's */ + struct bmsafemap *nb_bmsafemap; /* associated bmsafemap */ +}; + +/* + * A "bmsafemap" structure maintains a list of dependency structures + * that depend on the update of a particular cylinder group map. + * It has lists for newblks, allocdirects, allocindirs, and inodedeps. + * It is attached to the buffer of a cylinder group block when any of + * these things are allocated from the cylinder group. It is freed + * after the cylinder group map is written and the state of its + * dependencies are updated with DEPCOMPLETE to indicate that it has + * been processed. + */ +struct bmsafemap { + struct worklist sm_list; /* cylgrp buffer */ + struct buf *sm_buf; /* associated buffer */ + struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */ + struct allocindirhd sm_allocindirhd; /* allocindir deps */ + struct inodedephd sm_inodedephd; /* inodedep deps */ + struct newblkhd sm_newblkhd; /* newblk deps */ +}; + +/* + * An "allocdirect" structure is attached to an "inodedep" when a new block + * or fragment is allocated and pointed to by the inode described by + * "inodedep". The worklist is linked to the buffer that holds the block. + * When the block is first allocated, it is linked to the bmsafemap + * structure associated with the buffer holding the cylinder group map + * from which it was allocated. When the cylinder group map is written + * to disk, ad_state has the DEPCOMPLETE flag set. When the block itself + * is written, the COMPLETE flag is set. Once both the cylinder group map + * and the data itself have been written, it is safe to write the inode + * that claims the block. If there was a previous fragment that had been + * allocated before the file was increased in size, the old fragment may + * be freed once the inode claiming the new block is written to disk. + * This ad_fragfree request is attached to the id_inowait list of the + * associated inodedep (pointed to by ad_inodedep) for processing after + * the inode is written. + */ +struct allocdirect { + struct worklist ad_list; /* buffer holding block */ +# define ad_state ad_list.wk_state /* block pointer state */ + TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */ + ufs_lbn_t ad_lbn; /* block within file */ + ufs_daddr_t ad_newblkno; /* new value of block pointer */ + ufs_daddr_t ad_oldblkno; /* old value of block pointer */ + long ad_newsize; /* size of new block */ + long ad_oldsize; /* size of old block */ + LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */ + struct buf *ad_buf; /* cylgrp buffer (if pending) */ + struct inodedep *ad_inodedep; /* associated inodedep */ + struct freefrag *ad_freefrag; /* fragment to be freed (if any) */ +}; + +/* + * A single "indirdep" structure manages all allocation dependencies for + * pointers in an indirect block. The up-to-date state of the indirect + * block is stored in ir_savedata. The set of pointers that may be safely + * written to the disk is stored in ir_safecopy. The state field is used + * only to track whether the buffer is currently being written (in which + * case it is not safe to update ir_safecopy). Ir_deplisthd contains the + * list of allocindir structures, one for each block that needs to be + * written to disk. Once the block and its bitmap allocation have been + * written the safecopy can be updated to reflect the allocation and the + * allocindir structure freed. If ir_state indicates that an I/O on the + * indirect block is in progress when ir_safecopy is to be updated, the + * update is deferred by placing the allocindir on the ir_donehd list. + * When the I/O on the indirect block completes, the entries on the + * ir_donehd list are processed by updating their corresponding ir_safecopy + * pointers and then freeing the allocindir structure. + */ +struct indirdep { + struct worklist ir_list; /* buffer holding indirect block */ +# define ir_state ir_list.wk_state /* indirect block pointer state */ + ufs_daddr_t *ir_saveddata; /* buffer cache contents */ + struct buf *ir_savebp; /* buffer holding safe copy */ + struct allocindirhd ir_donehd; /* done waiting to update safecopy */ + struct allocindirhd ir_deplisthd; /* allocindir deps for this block */ +}; + +/* + * An "allocindir" structure is attached to an "indirdep" when a new block + * is allocated and pointed to by the indirect block described by the + * "indirdep". The worklist is linked to the buffer that holds the new block. + * When the block is first allocated, it is linked to the bmsafemap + * structure associated with the buffer holding the cylinder group map + * from which it was allocated. When the cylinder group map is written + * to disk, ai_state has the DEPCOMPLETE flag set. When the block itself + * is written, the COMPLETE flag is set. Once both the cylinder group map + * and the data itself have been written, it is safe to write the entry in + * the indirect block that claims the block; the "allocindir" dependency + * can then be freed as it is no longer applicable. + */ +struct allocindir { + struct worklist ai_list; /* buffer holding indirect block */ +# define ai_state ai_list.wk_state /* indirect block pointer state */ + LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */ + int ai_offset; /* pointer offset in indirect block */ + ufs_daddr_t ai_newblkno; /* new block pointer value */ + ufs_daddr_t ai_oldblkno; /* old block pointer value */ + struct freefrag *ai_freefrag; /* block to be freed when complete */ + struct indirdep *ai_indirdep; /* address of associated indirdep */ + LIST_ENTRY(allocindir) ai_deps; /* bmsafemap's list of allocindir's */ + struct buf *ai_buf; /* cylgrp buffer (if pending) */ +}; + +/* + * A "freefrag" structure is attached to an "inodedep" when a previously + * allocated fragment is replaced with a larger fragment, rather than extended. + * The "freefrag" structure is constructed and attached when the replacement + * block is first allocated. It is processed after the inode claiming the + * bigger block that replaces it has been written to disk. Note that the + * ff_state field is is used to store the uid, so may lose data. However, + * the uid is used only in printing an error message, so is not critical. + * Keeping it in a short keeps the data structure down to 32 bytes. + */ +struct freefrag { + struct worklist ff_list; /* id_inowait or delayed worklist */ +# define ff_state ff_list.wk_state /* owning user; should be uid_t */ + struct vnode *ff_devvp; /* filesystem device vnode */ + struct fs *ff_fs; /* addr of superblock */ + ufs_daddr_t ff_blkno; /* fragment physical block number */ + long ff_fragsize; /* size of fragment being deleted */ + ino_t ff_inum; /* owning inode number */ +}; + +/* + * A "freeblks" structure is attached to an "inodedep" when the + * corresponding file's length is reduced to zero. It records all + * the information needed to free the blocks of a file after its + * zero'ed inode has been written to disk. + */ +struct freeblks { + struct worklist fb_list; /* id_inowait or delayed worklist */ + ino_t fb_previousinum; /* inode of previous owner of blocks */ + struct vnode *fb_devvp; /* filesystem device vnode */ + struct fs *fb_fs; /* addr of superblock */ + off_t fb_oldsize; /* previous file size */ + off_t fb_newsize; /* new file size */ + int fb_chkcnt; /* used to check cnt of blks released */ + uid_t fb_uid; /* uid of previous owner of blocks */ + ufs_daddr_t fb_dblks[NDADDR]; /* direct blk ptrs to deallocate */ + ufs_daddr_t fb_iblks[NIADDR]; /* indirect blk ptrs to deallocate */ +}; + +/* + * A "freefile" structure is attached to an inode when its + * link count is reduced to zero. It marks the inode as free in + * the cylinder group map after the zero'ed inode has been written + * to disk and any associated blocks and fragments have been freed. + */ +struct freefile { + struct worklist fx_list; /* id_inowait or delayed worklist */ + mode_t fx_mode; /* mode of inode */ + ino_t fx_oldinum; /* inum of the unlinked file */ + struct vnode *fx_devvp; /* filesystem device vnode */ + struct fs *fx_fs; /* addr of superblock */ +}; + +/* + * A "diradd" structure is linked to an "inodedep" id_inowait list when a + * new directory entry is allocated that references the inode described + * by "inodedep". When the inode itself is written (either the initial + * allocation for new inodes or with the increased link count for + * existing inodes), the COMPLETE flag is set in da_state. If the entry + * is for a newly allocated inode, the "inodedep" structure is associated + * with a bmsafemap which prevents the inode from being written to disk + * until the cylinder group has been updated. Thus the da_state COMPLETE + * flag cannot be set until the inode bitmap dependency has been removed. + * When creating a new file, it is safe to write the directory entry that + * claims the inode once the referenced inode has been written. Since + * writing the inode clears the bitmap dependencies, the DEPCOMPLETE flag + * in the diradd can be set unconditionally when creating a file. When + * creating a directory, there are two additional dependencies described by + * mkdir structures (see their description below). When these dependencies + * are resolved the DEPCOMPLETE flag is set in the diradd structure. + * If there are multiple links created to the same inode, there will be + * a separate diradd structure created for each link. The diradd is + * linked onto the pg_diraddhd list of the pagedep for the directory + * page that contains the entry. When a directory page is written, + * the pg_diraddhd list is traversed to rollback any entries that are + * not yet ready to be written to disk. If a directory entry is being + * changed (by rename) rather than added, the DIRCHG flag is set and + * the da_previous entry points to the entry that will be "removed" + * once the new entry has been committed. During rollback, entries + * with da_previous are replaced with the previous inode number rather + * than zero. + * + * The overlaying of da_pagedep and da_previous is done to keep the + * structure down to 32 bytes in size on a 32-bit machine. If a + * da_previous entry is present, the pointer to its pagedep is available + * in the associated dirrem entry. If the DIRCHG flag is set, the + * da_previous entry is valid; if not set the da_pagedep entry is valid. + * The DIRCHG flag never changes; it is set when the structure is created + * if appropriate and is never cleared. + */ +struct diradd { + struct worklist da_list; /* id_inowait and id_pendinghd list */ +# define da_state da_list.wk_state /* state of the new directory entry */ + LIST_ENTRY(diradd) da_pdlist; /* pagedep holding directory block */ + doff_t da_offset; /* offset of new dir entry in dir blk */ + ino_t da_newinum; /* inode number for the new dir entry */ + union { + struct dirrem *dau_previous; /* entry being replaced in dir change */ + struct pagedep *dau_pagedep; /* pagedep dependency for addition */ + } da_un; +}; +#define da_previous da_un.dau_previous +#define da_pagedep da_un.dau_pagedep + +/* + * Two "mkdir" structures are needed to track the additional dependencies + * associated with creating a new directory entry. Normally a directory + * addition can be committed as soon as the newly referenced inode has been + * written to disk with its increased link count. When a directory is + * created there are two additional dependencies: writing the directory + * data block containing the "." and ".." entries (MKDIR_BODY) and writing + * the parent inode with the increased link count for ".." (MKDIR_PARENT). + * These additional dependencies are tracked by two mkdir structures that + * reference the associated "diradd" structure. When they have completed, + * they set the DEPCOMPLETE flag on the diradd so that it knows that its + * extra dependencies have been completed. The md_state field is used only + * to identify which type of dependency the mkdir structure is tracking. + * It is not used in the mainline code for any purpose other than consistency + * checking. All the mkdir structures in the system are linked together on + * a list. This list is needed so that a diradd can find its associated + * mkdir structures and deallocate them if it is prematurely freed (as for + * example if a mkdir is immediately followed by a rmdir of the same directory). + * Here, the free of the diradd must traverse the list to find the associated + * mkdir structures that reference it. The deletion would be faster if the + * diradd structure were simply augmented to have two pointers that referenced + * the associated mkdir's. However, this would increase the size of the diradd + * structure from 32 to 64-bits to speed a very infrequent operation. + */ +struct mkdir { + struct worklist md_list; /* id_inowait or buffer holding dir */ +# define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */ + struct diradd *md_diradd; /* associated diradd */ + LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */ +}; +LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; + +/* + * A "dirrem" structure describes an operation to decrement the link + * count on an inode. The dirrem structure is attached to the pg_dirremhd + * list of the pagedep for the directory page that contains the entry. + * It is processed after the directory page with the deleted entry has + * been written to disk. + * + * The overlaying of dm_pagedep and dm_dirinum is done to keep the + * structure down to 32 bytes in size on a 32-bit machine. It works + * because they are never used concurrently. + */ +struct dirrem { + struct worklist dm_list; /* delayed worklist */ +# define dm_state dm_list.wk_state /* state of the old directory entry */ + LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */ + struct mount *dm_mnt; /* associated mount point */ + ino_t dm_oldinum; /* inum of the removed dir entry */ + union { + struct pagedep *dmu_pagedep; /* pagedep dependency for remove */ + ino_t dmu_dirinum; /* parent inode number (for rmdir) */ + } dm_un; +}; +#define dm_pagedep dm_un.dmu_pagedep +#define dm_dirinum dm_un.dmu_dirinum -- cgit v1.1