summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordillon <dillon@FreeBSD.org>2001-09-08 20:02:33 +0000
committerdillon <dillon@FreeBSD.org>2001-09-08 20:02:33 +0000
commitd73b3c59f0f82580650e5b2965bdd3dd4cac7bd5 (patch)
tree5f36b4fb47a0f9f3d486afb708c5f95a6d1f5472
parentdf61d9eb64550a7afc1b41ccc9e0261af45c98c1 (diff)
downloadFreeBSD-src-d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5.zip
FreeBSD-src-d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5.tar.gz
This brings in a Yahoo coredump patch from Paul, with additional mods by
me (addition of vn_rdwr_inchunks).

The problem Yahoo is solving is that if you have large process images core dumping, or you have a large number of forked processes all core dumping at the same time, the original coredump code would leave the vnode locked throughout. This can cause the directory vnode to get locked up, which can cause the parent directory vnode to get locked up, and so on all the way to the root node, locking the entire machine up for extremely long periods of time.

This patch solves the problem in two ways. First, it uses an advisory non-blocking lock to abort multiple processes trying to core to the same file. Second (my contribution), it chunks up the writes and uses bwillwrite() to avoid holding the vnode locked while blocking in the buffer cache.

Submitted by: ps
Reviewed by: dillon
MFC after: 2 weeks
-rw-r--r--sys/compat/pecoff/imgact_pecoff.c8
-rw-r--r--sys/kern/imgact_aout.c8
-rw-r--r--sys/kern/imgact_elf.c9
-rw-r--r--sys/kern/kern_sig.c23
-rw-r--r--sys/kern/vfs_vnops.c39
-rw-r--r--sys/sys/vnode.h3
6 files changed, 74 insertions, 16 deletions
diff --git a/sys/compat/pecoff/imgact_pecoff.c b/sys/compat/pecoff/imgact_pecoff.c
index 0adc03d..9d08513 100644
--- a/sys/compat/pecoff/imgact_pecoff.c
+++ b/sys/compat/pecoff/imgact_pecoff.c
@@ -187,15 +187,15 @@ pecoff_coredump(register struct proc * p, register struct vnode * vp,
#endif
error = cpu_coredump(p, vp, cred);
if (error == 0)
- error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
+ error = vn_rdwr_inchunks(UIO_WRITE, vp, vm->vm_daddr,
(int) ctob(vm->vm_dsize), (off_t) ctob(UPAGES), UIO_USERSPACE,
- IO_NODELOCKED | IO_UNIT, cred, (int *) NULL, p);
+ IO_UNIT, cred, (int *) NULL, p);
if (error == 0)
- error = vn_rdwr(UIO_WRITE, vp,
+ error = vn_rdwr_inchunks(UIO_WRITE, vp,
(caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)),
round_page(ctob(vm->vm_ssize)),
(off_t) ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE,
- IO_NODELOCKED | IO_UNIT, cred, (int *) NULL, p);
+ IO_UNIT, cred, (int *) NULL, p);
return (error);
}
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
index 9994ad5..9aa8b3d 100644
--- a/sys/kern/imgact_aout.c
+++ b/sys/kern/imgact_aout.c
@@ -264,15 +264,15 @@ aout_coredump(p, vp, limit)
fill_kinfo_proc(p, &p->p_addr->u_kproc);
error = cpu_coredump(p, vp, cred);
if (error == 0)
- error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
+ error = vn_rdwr_inchunks(UIO_WRITE, vp, vm->vm_daddr,
(int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE,
- IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
+ IO_UNIT, cred, (int *) NULL, p);
if (error == 0)
- error = vn_rdwr(UIO_WRITE, vp,
+ error = vn_rdwr_inchunks(UIO_WRITE, vp,
(caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)),
round_page(ctob(vm->vm_ssize)),
(off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE,
- IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
+ IO_UNIT, cred, (int *) NULL, p);
return (error);
}
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 6157650..63999d1 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -794,9 +794,10 @@ elf_coredump(p, vp, limit)
php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
offset = hdrsize;
for (i = 0; i < seginfo.count; i++) {
- error = vn_rdwr(UIO_WRITE, vp, (caddr_t)php->p_vaddr,
+ error = vn_rdwr_inchunks(UIO_WRITE, vp,
+ (caddr_t)php->p_vaddr,
php->p_filesz, offset, UIO_USERSPACE,
- IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p);
+ IO_UNIT, cred, (int *)NULL, p);
if (error != 0)
break;
offset += php->p_filesz;
@@ -958,8 +959,8 @@ elf_corehdr(p, vp, cred, numsegs, hdr, hdrsize)
free(tempdata, M_TEMP);
/* Write it to the core file. */
- return vn_rdwr(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
- UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, NULL, p);
+ return vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
+ UIO_SYSSPACE, IO_UNIT, cred, NULL, p);
}
static void
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index c64af8f..607c78c 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -68,6 +68,7 @@
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
+#include <sys/unistd.h>
#include <machine/cpu.h>
@@ -1853,6 +1854,7 @@ coredump(p)
{
register struct vnode *vp;
register struct ucred *cred = p->p_ucred;
+ struct flock lf;
struct nameidata nd;
struct vattr vattr;
int error, error1, flags;
@@ -1895,8 +1897,19 @@ restart:
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
+
+ VOP_UNLOCK(vp, 0, p);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_WRLCK;
+ error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK);
+ if (error)
+ goto out2;
+
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
- VOP_UNLOCK(vp, 0, p);
+ lf.l_type = F_UNLCK;
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
if ((error = vn_close(vp, FWRITE, cred, p)) != 0)
return (error);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
@@ -1908,7 +1921,7 @@ restart:
if (vp->v_type != VREG ||
VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) {
error = EFAULT;
- goto out;
+ goto out1;
}
VATTR_NULL(&vattr);
vattr.va_size = 0;
@@ -1922,9 +1935,11 @@ restart:
p->p_sysent->sv_coredump(p, vp, limit) :
ENOSYS;
-out:
- VOP_UNLOCK(vp, 0, p);
+out1:
+ lf.l_type = F_UNLCK;
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
vn_finished_write(mp);
+out2:
error1 = vn_close(vp, FWRITE, cred, p);
if (error == 0)
error = error1;
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 1cd6c99..9a6f87a 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -335,6 +335,45 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
}
/*
+ * Package up an I/O request on a vnode into a uio and do it, split into
+ * chunks of at most MAXBSIZE.  bwillwrite() is called before each write
+ * chunk to a regular file.  Stops at the first error or short transfer;
+ * if aresid is non-NULL, *aresid is the total byte count not transferred.
+ */
+int
+vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
+	enum uio_rw rw;
+	struct vnode *vp;
+	caddr_t base;
+	int len;
+	off_t offset;
+	enum uio_seg segflg;
+	int ioflg;
+	struct ucred *cred;
+	int *aresid;
+	struct proc *p;
+{
+	int error = 0;
+	int iaresid = 0;	/* residual reported for the current chunk */
+	do {
+		int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;
+
+		if (rw != UIO_READ && vp->v_type == VREG)
+			bwillwrite();
+		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
+		    ioflg, cred, aresid ? &iaresid : NULL, p);
+		len -= chunk;	/* the chunk's own residual is kept in iaresid */
+		if (error || iaresid)	/* continuing past a short chunk leaves a gap */
+			break;
+		offset += chunk;
+		base += chunk;
+	} while (len);
+	if (aresid)
+		*aresid = len + iaresid;	/* unattempted bytes + short-chunk remainder */
+	return (error);
+}
+
+/*
* File table vnode read routine.
*/
static int
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 9ee79d1..b4d7828 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -608,6 +608,9 @@ int vn_pollrecord __P((struct vnode *vp, struct proc *p, int events));
int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base,
int len, off_t offset, enum uio_seg segflg, int ioflg,
struct ucred *cred, int *aresid, struct proc *p));
+int vn_rdwr_inchunks __P((enum uio_rw rw, struct vnode *vp, caddr_t base,
+ int len, off_t offset, enum uio_seg segflg, int ioflg,
+ struct ucred *cred, int *aresid, struct proc *p));
int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p));
int vn_start_write __P((struct vnode *vp, struct mount **mpp, int flags));
dev_t vn_todev __P((struct vnode *vp));
OpenPOWER on IntegriCloud