diff options
author | dillon <dillon@FreeBSD.org> | 2001-09-08 20:02:33 +0000 |
---|---|---|
committer | dillon <dillon@FreeBSD.org> | 2001-09-08 20:02:33 +0000 |
commit | d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5 (patch) | |
tree | 5f36b4fb47a0f9f3d486afb708c5f95a6d1f5472 | |
parent | df61d9eb64550a7afc1b41ccc9e0261af45c98c1 (diff) | |
download | FreeBSD-src-d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5.zip FreeBSD-src-d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5.tar.gz |
This brings in a Yahoo coredump patch from Paul, with additional mods by
me (addition of vn_rdwr_inchunks). The problem Yahoo is solving is that
if you have large process images core dumping, or you have a large number of
forked processes all core dumping at the same time, the original coredump code
would leave the vnode locked throughout. This can cause the directory vnode
to get locked up, which can cause the parent directory vnode to get locked
up, and so on all the way to the root node, locking the entire machine up
for extremely long periods of time.
This patch solves the problem in two ways. First, it uses an advisory
non-blocking lock so that when multiple processes try to dump core to the
same file, those that fail to get the lock abort the dump rather than wait.
Second (my contribution), it chunks up the writes and uses bwillwrite()
to avoid holding the vnode locked while blocking in the buffer cache.
Submitted by: ps
Reviewed by: dillon
MFC after: 2 weeks
-rw-r--r-- | sys/compat/pecoff/imgact_pecoff.c | 8 | ||||
-rw-r--r-- | sys/kern/imgact_aout.c | 8 | ||||
-rw-r--r-- | sys/kern/imgact_elf.c | 9 | ||||
-rw-r--r-- | sys/kern/kern_sig.c | 23 | ||||
-rw-r--r-- | sys/kern/vfs_vnops.c | 39 | ||||
-rw-r--r-- | sys/sys/vnode.h | 3 |
6 files changed, 74 insertions, 16 deletions
diff --git a/sys/compat/pecoff/imgact_pecoff.c b/sys/compat/pecoff/imgact_pecoff.c index 0adc03d..9d08513 100644 --- a/sys/compat/pecoff/imgact_pecoff.c +++ b/sys/compat/pecoff/imgact_pecoff.c @@ -187,15 +187,15 @@ pecoff_coredump(register struct proc * p, register struct vnode * vp, #endif error = cpu_coredump(p, vp, cred); if (error == 0) - error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, + error = vn_rdwr_inchunks(UIO_WRITE, vp, vm->vm_daddr, (int) ctob(vm->vm_dsize), (off_t) ctob(UPAGES), UIO_USERSPACE, - IO_NODELOCKED | IO_UNIT, cred, (int *) NULL, p); + IO_UNIT, cred, (int *) NULL, p); if (error == 0) - error = vn_rdwr(UIO_WRITE, vp, + error = vn_rdwr_inchunks(UIO_WRITE, vp, (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)), round_page(ctob(vm->vm_ssize)), (off_t) ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE, - IO_NODELOCKED | IO_UNIT, cred, (int *) NULL, p); + IO_UNIT, cred, (int *) NULL, p); return (error); } diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c index 9994ad5..9aa8b3d 100644 --- a/sys/kern/imgact_aout.c +++ b/sys/kern/imgact_aout.c @@ -264,15 +264,15 @@ aout_coredump(p, vp, limit) fill_kinfo_proc(p, &p->p_addr->u_kproc); error = cpu_coredump(p, vp, cred); if (error == 0) - error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, + error = vn_rdwr_inchunks(UIO_WRITE, vp, vm->vm_daddr, (int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE, - IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p); + IO_UNIT, cred, (int *) NULL, p); if (error == 0) - error = vn_rdwr(UIO_WRITE, vp, + error = vn_rdwr_inchunks(UIO_WRITE, vp, (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)), round_page(ctob(vm->vm_ssize)), (off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE, - IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p); + IO_UNIT, cred, (int *) NULL, p); return (error); } diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 6157650..63999d1 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -794,9 +794,10 @@ elf_coredump(p, vp, limit) 
php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = hdrsize; for (i = 0; i < seginfo.count; i++) { - error = vn_rdwr(UIO_WRITE, vp, (caddr_t)php->p_vaddr, + error = vn_rdwr_inchunks(UIO_WRITE, vp, + (caddr_t)php->p_vaddr, php->p_filesz, offset, UIO_USERSPACE, - IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p); + IO_UNIT, cred, (int *)NULL, p); if (error != 0) break; offset += php->p_filesz; @@ -958,8 +959,8 @@ elf_corehdr(p, vp, cred, numsegs, hdr, hdrsize) free(tempdata, M_TEMP); /* Write it to the core file. */ - return vn_rdwr(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, NULL, p); + return vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, + UIO_SYSSPACE, IO_UNIT, cred, NULL, p); } static void diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index c64af8f..607c78c 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -68,6 +68,7 @@ #include <sys/sysent.h> #include <sys/sysctl.h> #include <sys/malloc.h> +#include <sys/unistd.h> #include <machine/cpu.h> @@ -1853,6 +1854,7 @@ coredump(p) { register struct vnode *vp; register struct ucred *cred = p->p_ucred; + struct flock lf; struct nameidata nd; struct vattr vattr; int error, error1, flags; @@ -1895,8 +1897,19 @@ restart: return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; + + VOP_UNLOCK(vp, 0, p); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_WRLCK; + error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK); + if (error) + goto out2; + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { - VOP_UNLOCK(vp, 0, p); + lf.l_type = F_UNLCK; + VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); if ((error = vn_close(vp, FWRITE, cred, p)) != 0) return (error); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) @@ -1908,7 +1921,7 @@ restart: if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) { error = EFAULT; - goto out; + goto out1; } VATTR_NULL(&vattr); vattr.va_size = 0; @@ 
-1922,9 +1935,11 @@ restart: p->p_sysent->sv_coredump(p, vp, limit) : ENOSYS; -out: - VOP_UNLOCK(vp, 0, p); +out1: + lf.l_type = F_UNLCK; + VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); vn_finished_write(mp); +out2: error1 = vn_close(vp, FWRITE, cred, p); if (error == 0) error = error1; diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 1cd6c99..9a6f87a 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -335,6 +335,45 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) } /* + * Package up an I/O request on a vnode into a uio and do it. The I/O + * request is split up into smaller chunks and we try to avoid saturating + * the buffer cache while potentially holding a vnode locked, so we + * check bwillwrite() before calling vn_rdwr() + */ +int +vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct proc *p; +{ + int error = 0; + + do { + int chunk = (len > MAXBSIZE) ? MAXBSIZE : len; + + if (rw != UIO_READ && vp->v_type == VREG) + bwillwrite(); + error = vn_rdwr(rw, vp, base, chunk, offset, segflg, + ioflg, cred, aresid, p); + len -= chunk; /* aresid calc already includes length */ + if (error) + break; + offset += chunk; + base += chunk; + } while (len); + if (aresid) + *aresid += len; + return (error); +} + +/* * File table vnode read routine. 
*/ static int diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 9ee79d1..b4d7828 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -608,6 +608,9 @@ int vn_pollrecord __P((struct vnode *vp, struct proc *p, int events)); int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *cred, int *aresid, struct proc *p)); +int vn_rdwr_inchunks __P((enum uio_rw rw, struct vnode *vp, caddr_t base, + int len, off_t offset, enum uio_seg segflg, int ioflg, + struct ucred *cred, int *aresid, struct proc *p)); int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); int vn_start_write __P((struct vnode *vp, struct mount **mpp, int flags)); dev_t vn_todev __P((struct vnode *vp)); |