summaryrefslogtreecommitdiffstats
path: root/sys/ufs
diff options
context:
space:
mode:
authordillon <dillon@FreeBSD.org>2000-11-18 23:06:26 +0000
committerdillon <dillon@FreeBSD.org>2000-11-18 23:06:26 +0000
commit2ace35208525bb250b47fe7af60ec2ce681c6c92 (patch)
tree8b9f3edb21d176840f55c8efbf3c9ffe76fdabc6 /sys/ufs
parent59e131028ff3997be98ab838d5ab9f965b1589ca (diff)
downloadFreeBSD-src-2ace35208525bb250b47fe7af60ec2ce681c6c92.zip
FreeBSD-src-2ace35208525bb250b47fe7af60ec2ce681c6c92.tar.gz
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory situations prior to now. The new code is based on the concept that I/O must be able to function in a low-memory situation. All major modules related to I/O (except networking) have been adjusted to allow allocation out of the system reserve memory pool. These modules now detect a low-memory situation, but rather than blocking they continue to operate, then return resources to the memory pool instead of caching them or leaving them wired. Code has been added to stall in a low-memory situation prior to a vnode being locked. Thus situations where a process blocks in a low-memory condition while holding a locked vnode have been reduced to near nothing. Not only will I/O continue to operate, but many prior deadlock conditions simply no longer exist. Implement a number of VFS/BIO fixes (found by Ian): in biodone(), bogus-page replacement code, the loop was not properly incrementing loop variables prior to a continue statement. We do not believe this code can be hit anyway but we aren't taking any chances. We'll turn the whole section into a panic (as it already is in brelse()) after the release is rolled. In biodone(), the foff calculation was incorrectly clamped to the iosize, causing the wrong foff to be calculated for pages in the case of an I/O error or biodone() called without initiating I/O. The problem always caused a panic before. Now it doesn't. The problem is mainly an issue with NFS. Fixed casts for ~PAGE_MASK. This code worked properly before only because the calculations use signed arithmetic. Better to properly extend PAGE_MASK first before inverting it for the 64-bit masking op. In brelse(), the bogus_page fixup code was improperly throwing away the original contents of 'm' when it did the j-loop to fix the bogus pages. The result was that it would potentially invalidate parts of the *WRONG* page(!), leading to corruption. 
There may still be cases where a background bitmap write is being duplicated, causing potential corruption. We have identified a potentially serious bug related to this but the fix is still TBD. So instead this patch contains a KASSERT to detect the problem and panic the machine rather than continue to corrupt the filesystem. The problem does not occur very often; it is very hard to reproduce, and it may or may not be the cause of the corruption people have reported. Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>) Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
Diffstat (limited to 'sys/ufs')
-rw-r--r--sys/ufs/ffs/ffs_inode.c3
-rw-r--r--sys/ufs/ffs/ffs_softdep.c36
-rw-r--r--sys/ufs/ufs/ufs_readwrite.c4
3 files changed, 26 insertions, 17 deletions
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 30f36ee7..a8ae464 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -45,6 +45,7 @@
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
+#include <sys/vmmeter.h>
#include <sys/stat.h>
#include <vm/vm.h>
@@ -111,6 +112,8 @@ ffs_update(vp, waitfor)
ino_to_fsbo(fs, ip->i_number)) = ip->i_din;
if (waitfor && !DOINGASYNC(vp)) {
return (bwrite(bp));
+ } else if (vm_page_count_severe() || buf_dirty_count_severe()) {
+ return (bwrite(bp));
} else {
if (bp->b_bufsize == fs->fs_bsize)
bp->b_flags |= B_CLUSTEROK;
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 98ad959..c6ac0bd 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -91,6 +91,8 @@ MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
+#define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
+
#define D_PAGEDEP 0
#define D_INODEDEP 1
#define D_NEWBLK 2
@@ -802,7 +804,7 @@ top:
goto top;
}
MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP,
- M_WAITOK);
+ M_SOFTDEP_FLAGS);
bzero(pagedep, sizeof(struct pagedep));
pagedep->pd_list.wk_type = D_PAGEDEP;
pagedep->pd_mnt = mp;
@@ -879,7 +881,7 @@ top:
}
num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
- M_INODEDEP, M_WAITOK);
+ M_INODEDEP, M_SOFTDEP_FLAGS);
inodedep->id_list.wk_type = D_INODEDEP;
inodedep->id_fs = fs;
inodedep->id_ino = inum;
@@ -941,7 +943,7 @@ top:
if (sema_get(&newblk_in_progress, 0) == 0)
goto top;
MALLOC(newblk, struct newblk *, sizeof(struct newblk),
- M_NEWBLK, M_WAITOK);
+ M_NEWBLK, M_SOFTDEP_FLAGS);
newblk->nb_state = 0;
newblk->nb_fs = fs;
newblk->nb_newblkno = newblkno;
@@ -1127,7 +1129,7 @@ bmsafemap_lookup(bp)
return (WK_BMSAFEMAP(wk));
FREE_LOCK(&lk);
MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
- M_BMSAFEMAP, M_WAITOK);
+ M_BMSAFEMAP, M_SOFTDEP_FLAGS);
bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
bmsafemap->sm_list.wk_state = 0;
bmsafemap->sm_buf = bp;
@@ -1187,7 +1189,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
struct newblk *newblk;
MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
- M_ALLOCDIRECT, M_WAITOK);
+ M_ALLOCDIRECT, M_SOFTDEP_FLAGS);
bzero(adp, sizeof(struct allocdirect));
adp->ad_list.wk_type = D_ALLOCDIRECT;
adp->ad_lbn = lbn;
@@ -1339,7 +1341,7 @@ newfreefrag(ip, blkno, size)
if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
panic("newfreefrag: frag size");
MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
- M_FREEFRAG, M_WAITOK);
+ M_FREEFRAG, M_SOFTDEP_FLAGS);
freefrag->ff_list.wk_type = D_FREEFRAG;
freefrag->ff_state = ip->i_uid & ~ONWORKLIST; /* XXX - used below */
freefrag->ff_inum = ip->i_number;
@@ -1408,7 +1410,7 @@ newallocindir(ip, ptrno, newblkno, oldblkno)
struct allocindir *aip;
MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
- M_ALLOCINDIR, M_WAITOK);
+ M_ALLOCINDIR, M_SOFTDEP_FLAGS);
bzero(aip, sizeof(struct allocindir));
aip->ai_list.wk_type = D_ALLOCINDIR;
aip->ai_state = ATTACHED;
@@ -1561,7 +1563,7 @@ setup_allocindir_phase2(bp, ip, aip)
if (indirdep)
break;
MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
- M_INDIRDEP, M_WAITOK);
+ M_INDIRDEP, M_SOFTDEP_FLAGS);
newindirdep->ir_list.wk_type = D_INDIRDEP;
newindirdep->ir_state = ATTACHED;
LIST_INIT(&newindirdep->ir_deplisthd);
@@ -1623,7 +1625,7 @@ softdep_setup_freeblocks(ip, length)
if (length != 0)
panic("softde_setup_freeblocks: non-zero length");
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
- M_FREEBLKS, M_WAITOK);
+ M_FREEBLKS, M_SOFTDEP_FLAGS);
bzero(freeblks, sizeof(struct freeblks));
freeblks->fb_list.wk_type = D_FREEBLKS;
freeblks->fb_uid = ip->i_uid;
@@ -1870,7 +1872,7 @@ softdep_freefile(pvp, ino, mode)
* This sets up the inode de-allocation dependency.
*/
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
- M_FREEFILE, M_WAITOK);
+ M_FREEFILE, M_SOFTDEP_FLAGS);
freefile->fx_list.wk_type = D_FREEFILE;
freefile->fx_list.wk_state = 0;
freefile->fx_mode = mode;
@@ -2186,7 +2188,7 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
fs = dp->i_fs;
lbn = lblkno(fs, diroffset);
offset = blkoff(fs, diroffset);
- MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK);
+ MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS);
bzero(dap, sizeof(struct diradd));
dap->da_list.wk_type = D_DIRADD;
dap->da_offset = offset;
@@ -2198,12 +2200,12 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
} else {
dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
- M_WAITOK);
+ M_SOFTDEP_FLAGS);
mkdir1->md_list.wk_type = D_MKDIR;
mkdir1->md_state = MKDIR_BODY;
mkdir1->md_diradd = dap;
MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
- M_WAITOK);
+ M_SOFTDEP_FLAGS);
mkdir2->md_list.wk_type = D_MKDIR;
mkdir2->md_state = MKDIR_PARENT;
mkdir2->md_diradd = dap;
@@ -2438,7 +2440,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp)
(void) request_cleanup(FLUSH_REMOVE, 0);
num_dirrem += 1;
MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
- M_DIRREM, M_WAITOK);
+ M_DIRREM, M_SOFTDEP_FLAGS);
bzero(dirrem, sizeof(struct dirrem));
dirrem->dm_list.wk_type = D_DIRREM;
dirrem->dm_state = isrmdir ? RMDIR : 0;
@@ -2535,7 +2537,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
*/
if (newinum != WINO) {
MALLOC(dap, struct diradd *, sizeof(struct diradd),
- M_DIRADD, M_WAITOK);
+ M_DIRADD, M_SOFTDEP_FLAGS);
bzero(dap, sizeof(struct diradd));
dap->da_list.wk_type = D_DIRADD;
dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
@@ -2841,7 +2843,7 @@ softdep_disk_io_initiation(bp)
* Replace up-to-date version with safe version.
*/
MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
- M_INDIRDEP, M_WAITOK);
+ M_INDIRDEP, M_SOFTDEP_FLAGS);
ACQUIRE_LOCK(&lk);
indirdep->ir_state &= ~ATTACHED;
indirdep->ir_state |= UNDONE;
@@ -2942,7 +2944,7 @@ initiate_write_inodeblock(inodedep, bp)
if (inodedep->id_savedino != NULL)
panic("initiate_write_inodeblock: already doing I/O");
MALLOC(inodedep->id_savedino, struct dinode *,
- sizeof(struct dinode), M_INODEDEP, M_WAITOK);
+ sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS);
*inodedep->id_savedino = *dp;
bzero((caddr_t)dp, sizeof(struct dinode));
return;
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index be43550..785219c 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -48,6 +48,7 @@
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <sys/event.h>
+#include <sys/vmmeter.h>
#define VN_KNOTE(vp, b) \
KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
@@ -501,6 +502,9 @@ WRITE(ap)
} else {
bawrite(bp);
}
+ } else if (vm_page_count_severe() || buf_dirty_count_severe()) {
+ bp->b_flags |= B_CLUSTEROK;
+ bawrite(bp);
} else {
bp->b_flags |= B_CLUSTEROK;
bdwrite(bp);
OpenPOWER on IntegriCloud