This brings in a Yahoo coredump patch from Paul, with additional mods by me (addition of vn_rdwr_inchunks).

The problem Yahoo is solving is that if you have large process images core dumping, or you have a large number of forked processes all core dumping at the same time, the original coredump code would leave the vnode locked throughout. This can cause the directory vnode to get locked up, which can cause the parent directory vnode to get locked up, and so on all the way to the root node, locking the entire machine up for extremely long periods of time.

This patch solves the problem in two ways. First, it uses an advisory non-blocking lock to abort multiple processes trying to core to the same file. Second (my contribution), it chunks up the writes and uses bwillwrite() to avoid holding the vnode locked while blocking in the buffer cache.

Submitted by: ps
Reviewed by: dillon
MFC after: 2 weeks
author: dillon <dillon@FreeBSD.org> 2001-09-08 20:02:33 +0000
committer: dillon <dillon@FreeBSD.org> 2001-09-08 20:02:33 +0000
commit: d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5 (patch)
tree: 5f36b4fb47a0f9f3d486afb708c5f95a6d1f5472
parent: df61d9eb64550a7afc1b41ccc9e0261af45c98c1 (diff)
download: FreeBSD-src-d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5.zip FreeBSD-src-d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5.tar.gz
6 files changed, 74 insertions, 16 deletions
diff --git a/sys/compat/pecoff/imgact_pecoff.c b/sys/compat/pecoff/imgact_pecoff.c
index 0adc03d..9d08513 100644
--- a/sys/compat/pecoff/imgact_pecoff.c
+++ b/sys/compat/pecoff/imgact_pecoff.c
@@ -187,15 +187,15 @@ pecoff_coredump(register struct proc * p, register struct vnode * vp,
 #endif
 	error = cpu_coredump(p, vp, cred);
 	if (error == 0)
-		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
+		error = vn_rdwr_inchunks(UIO_WRITE, vp, vm->vm_daddr,
 				(int) ctob(vm->vm_dsize), (off_t) ctob(UPAGES), UIO_USERSPACE,
-			    IO_NODELOCKED | IO_UNIT, cred, (int *) NULL, p);
+			    IO_UNIT, cred, (int *) NULL, p);
 	if (error == 0)
-		error = vn_rdwr(UIO_WRITE, vp,
+		error = vn_rdwr_inchunks(UIO_WRITE, vp,
 			(caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)),
 				round_page(ctob(vm->vm_ssize)),
 		   (off_t) ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE,
-			    IO_NODELOCKED | IO_UNIT, cred, (int *) NULL, p);
+			    IO_UNIT, cred, (int *) NULL, p);
 	return (error);
 
 }
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
index 9994ad5..9aa8b3d 100644
--- a/sys/kern/imgact_aout.c
+++ b/sys/kern/imgact_aout.c
@@ -264,15 +264,15 @@ aout_coredump(p, vp, limit)
 	fill_kinfo_proc(p, &p->p_addr->u_kproc);
 	error = cpu_coredump(p, vp, cred);
 	if (error == 0)
-		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
+		error = vn_rdwr_inchunks(UIO_WRITE, vp, vm->vm_daddr,
 		    (int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE,
-		    IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
+		    IO_UNIT, cred, (int *) NULL, p);
 	if (error == 0)
-		error = vn_rdwr(UIO_WRITE, vp,
+		error = vn_rdwr_inchunks(UIO_WRITE, vp,
 		    (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)),
 		    round_page(ctob(vm->vm_ssize)),
 		    (off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE,
-		    IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
+		    IO_UNIT, cred, (int *) NULL, p);
 	return (error);
 }
 
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 6157650..63999d1 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -794,9 +794,10 @@ elf_coredump(p, vp, limit)
 		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
 		offset = hdrsize;
 		for (i = 0;  i < seginfo.count;  i++) {
-			error = vn_rdwr(UIO_WRITE, vp, (caddr_t)php->p_vaddr,
+			error = vn_rdwr_inchunks(UIO_WRITE, vp, 
+			    (caddr_t)php->p_vaddr,
 			    php->p_filesz, offset, UIO_USERSPACE,
-			    IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p);
+			    IO_UNIT, cred, (int *)NULL, p);
 			if (error != 0)
 				break;
 			offset += php->p_filesz;
@@ -958,8 +959,8 @@ elf_corehdr(p, vp, cred, numsegs, hdr, hdrsize)
 	free(tempdata, M_TEMP);
 
 	/* Write it to the core file. */
-	return vn_rdwr(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
-	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, NULL, p);
+	return vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
+	    UIO_SYSSPACE, IO_UNIT, cred, NULL, p);
 }
 
 static void
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index c64af8f..607c78c 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -68,6 +68,7 @@
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
+#include <sys/unistd.h>
 
 #include <machine/cpu.h>
 
@@ -1853,6 +1854,7 @@ coredump(p)
 {
 	register struct vnode *vp;
 	register struct ucred *cred = p->p_ucred;
+	struct flock lf;
 	struct nameidata nd;
 	struct vattr vattr;
 	int error, error1, flags;
@@ -1895,8 +1897,19 @@ restart:
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
+
+	VOP_UNLOCK(vp, 0, p);
+	lf.l_whence = SEEK_SET;
+	lf.l_start = 0;
+	lf.l_len = 0;
+	lf.l_type = F_WRLCK;
+	error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK);
+	if (error)
+		goto out2;
+
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
-		VOP_UNLOCK(vp, 0, p);
+		lf.l_type = F_UNLCK;
+		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 		if ((error = vn_close(vp, FWRITE, cred, p)) != 0)
 			return (error);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
@@ -1908,7 +1921,7 @@ restart:
 	if (vp->v_type != VREG ||
 	    VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) {
 		error = EFAULT;
-		goto out;
+		goto out1;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_size = 0;
@@ -1922,9 +1935,11 @@ restart:
 	  p->p_sysent->sv_coredump(p, vp, limit) :
 	  ENOSYS;
 
-out:
-	VOP_UNLOCK(vp, 0, p);
+out1:
+	lf.l_type = F_UNLCK;
+	VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 	vn_finished_write(mp);
+out2:
 	error1 = vn_close(vp, FWRITE, cred, p);
 	if (error == 0)
 		error = error1;
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 1cd6c99..9a6f87a 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -335,6 +335,45 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
 }
 
 /*
+ * Package up an I/O request on a vnode into a uio and do it.  The I/O
+ * request is split up into smaller chunks and we try to avoid saturating
+ * the buffer cache while potentially holding a vnode locked, so we 
+ * check bwillwrite() before calling vn_rdwr()
+ */
+int
+vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
+	enum uio_rw rw;
+	struct vnode *vp;
+	caddr_t base;
+	int len;
+	off_t offset;
+	enum uio_seg segflg;
+	int ioflg;
+	struct ucred *cred;
+	int *aresid;
+	struct proc *p;
+{
+	int error = 0;
+
+	do {
+		int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;
+
+		if (rw != UIO_READ && vp->v_type == VREG)
+			bwillwrite();
+		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
+		    ioflg, cred, aresid, p);
+		len -= chunk;	/* aresid calc already includes length */
+		if (error)
+			break;
+		offset += chunk;
+		base += chunk;
+	} while (len);
+	if (aresid)
+		*aresid += len;
+	return (error);
+}
+
+/*
  * File table vnode read routine.
  */
 static int
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 9ee79d1..b4d7828 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -608,6 +608,9 @@ int	vn_pollrecord __P((struct vnode *vp, struct proc *p, int events));
 int	vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *cred, int *aresid, struct proc *p));
+int	vn_rdwr_inchunks __P((enum uio_rw rw, struct vnode *vp, caddr_t base,
+	    int len, off_t offset, enum uio_seg segflg, int ioflg,
+	    struct ucred *cred, int *aresid, struct proc *p));
 int	vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p));
 int	vn_start_write __P((struct vnode *vp, struct mount **mpp, int flags));
 dev_t	vn_todev __P((struct vnode *vp));
author	dillon <dillon@FreeBSD.org>	2001-09-08 20:02:33 +0000
committer	dillon <dillon@FreeBSD.org>	2001-09-08 20:02:33 +0000
commit	d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5 (patch)
tree	5f36b4fb47a0f9f3d486afb708c5f95a6d1f5472
parent	df61d9eb64550a7afc1b41ccc9e0261af45c98c1 (diff)
download	FreeBSD-src-d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5.zip FreeBSD-src-d73b3c59f0f82580650e5b2965bdd3dd4cac7bd5.tar.gz