path: root/sys/kern/uipc_syscalls.c
author     dg <dg@FreeBSD.org>  1998-11-05 14:28:26 +0000
committer  dg <dg@FreeBSD.org>  1998-11-05 14:28:26 +0000
commit     b178f74f12a0446656640ae873e0bc71057f5de3 (patch)
tree       d66b2ab9c599863a9679de94cdc9514d6a3dbd1d /sys/kern/uipc_syscalls.c
parent     60b560b337fe2a6159e49741372219aa0353da03 (diff)
Implemented zero-copy TCP/IP extensions via sendfile(2) - send a file to a
stream socket. sendfile(2) is similar to implementations in HP-UX, Linux, and
other systems, but the API is more extensive and addresses many of the
complaints that the Apache Group and others have had with those other
implementations. Thanks to Marc Slemko of the Apache Group for helping me
work out the best API for this.

Anyway, this has the "net" result of speeding up sends of files over TCP/IP
sockets by about 10X (that is to say, uses 1/10th of the CPU cycles) when
compared to a traditional read/write loop.
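
For reference, the prototype documented in the new code below is
int sendfile(int fd, int s, off_t offset, size_t nbytes, struct sf_hdtr *hdtr,
off_t *sbytes, int flags). The following is only a minimal userland sketch of
a caller, assuming struct sf_hdtr (fields headers/hdr_cnt/trailers/trl_cnt, as
used in the kernel code) is exported through <sys/socket.h>; the helper name
is hypothetical and error handling is abbreviated.

/*
 * Illustrative only: send a small header followed by an entire file
 * (nbytes == 0 means "until EOF"), collecting the byte count in sbytes.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
send_file_with_header(int sock, const char *path, const char *hdr, size_t hdrlen)
{
	struct iovec hiov;
	struct sf_hdtr hdtr;
	off_t sbytes = 0;
	int fd, error;

	hiov.iov_base = (void *)hdr;
	hiov.iov_len = hdrlen;
	hdtr.headers = &hiov;		/* header iovec(s), sent via writev(2) */
	hdtr.hdr_cnt = 1;
	hdtr.trailers = NULL;		/* no trailers in this example */
	hdtr.trl_cnt = 0;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return (-1);
	error = sendfile(fd, sock, 0, 0, &hdtr, &sbytes, 0);
	close(fd);
	printf("sendfile sent %lld bytes\n", (long long)sbytes);
	return (error);
}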
Diffstat (limited to 'sys/kern/uipc_syscalls.c')
-rw-r--r--  sys/kern/uipc_syscalls.c  404
1 file changed, 403 insertions(+), 1 deletion(-)
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index a841bfa..f0cca24 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -2,6 +2,9 @@
* Copyright (c) 1982, 1986, 1989, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
+ * sendfile(2) and related extensions:
+ * Copyright (c) 1998, David Greenman. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -31,7 +34,7 @@
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
- * $Id: uipc_syscalls.c,v 1.40 1998/06/10 10:30:23 dfr Exp $
+ * $Id: uipc_syscalls.c,v 1.41 1998/08/23 03:06:59 wollman Exp $
*/
#include "opt_compat.h"
@@ -39,6 +42,7 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/kernel.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
@@ -51,9 +55,27 @@
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <machine/limits.h>
+
+static void sf_buf_init(void *arg);
+SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
+static struct sf_buf *sf_buf_alloc(void);
+static void sf_buf_ref(caddr_t addr, u_int size);
+static void sf_buf_free(caddr_t addr, u_int size);
static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
static int recvit __P((struct proc *p, int s, struct msghdr *mp,
@@ -65,6 +87,11 @@ static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
int compat));
+static SLIST_HEAD(, sf_buf) sf_freelist;
+static vm_offset_t sf_base;
+static struct sf_buf *sf_bufs;
+static int sf_buf_alloc_want;
+
/*
* System call interface to the socket abstraction.
*/
@@ -1274,3 +1301,378 @@ getsock(fdp, fdes, fpp)
*fpp = fp;
return (0);
}
+
+/*
+ * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
+ * XXX - The sf_buf functions are currently private to sendfile(2), so have
+ * been made static, but may be useful in the future for doing zero-copy in
+ * other parts of the networking code.
+ */
+static void
+sf_buf_init(void *arg)
+{
+ int i;
+
+ SLIST_INIT(&sf_freelist);
+ sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
+ sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
+ bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
+ for (i = 0; i < nsfbufs; i++) {
+ sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
+ SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
+ }
+}
+
+/*
+ * Get an sf_buf from the freelist. Will block if none are available.
+ */
+static struct sf_buf *
+sf_buf_alloc()
+{
+ struct sf_buf *sf;
+ int s;
+
+ s = splimp();
+ while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
+ sf_buf_alloc_want = 1;
+ tsleep(&sf_freelist, PVM, "sfbufa", 0);
+ }
+ SLIST_REMOVE_HEAD(&sf_freelist, free_list);
+ splx(s);
+ sf->refcnt = 1;
+ return (sf);
+}
+
+#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
+static void
+sf_buf_ref(caddr_t addr, u_int size)
+{
+ struct sf_buf *sf;
+
+ sf = dtosf(addr);
+ if (sf->refcnt == 0)
+ panic("sf_buf_ref: referencing a free sf_buf");
+ sf->refcnt++;
+}
+
+/*
+ * Lose a reference to an sf_buf. When none left, detach mapped page
+ * and release resources back to the system.
+ *
+ * Must be called at splimp.
+ */
+static void
+sf_buf_free(caddr_t addr, u_int size)
+{
+ struct sf_buf *sf;
+ struct vm_page *m;
+
+ sf = dtosf(addr);
+ if (sf->refcnt == 0)
+ panic("sf_buf_free: freeing free sf_buf");
+ sf->refcnt--;
+ if (sf->refcnt == 0) {
+ pmap_qremove((vm_offset_t)addr, 1);
+ m = sf->m;
+ vm_page_unwire(m, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (m->wire_count == 0 && m->object == NULL)
+ vm_page_free(m);
+ sf->m = NULL;
+ SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
+ if (sf_buf_alloc_want) {
+ sf_buf_alloc_want = 0;
+ wakeup(&sf_freelist);
+ }
+ }
+}
+
+/*
+ * sendfile(2).
+ * int sendfile(int fd, int s, off_t offset, size_t nbytes,
+ * struct sf_hdtr *hdtr, off_t *sbytes, int flags)
+ *
+ * Send a file specified by 'fd' and starting at 'offset' to a socket
+ * specified by 's'. Send only 'nbytes' of the file or until EOF if
+ * nbytes == 0. Optionally add a header and/or trailer to the socket
+ * output. If specified, write the total number of bytes sent into *sbytes.
+ */
+int
+sendfile(struct proc *p, struct sendfile_args *uap)
+{
+ struct file *fp;
+ struct filedesc *fdp = p->p_fd;
+ struct vnode *vp;
+ struct vm_object *obj;
+ struct socket *so;
+ struct mbuf *m;
+ struct sf_buf *sf;
+ struct vm_page *pg;
+ struct writev_args nuap;
+ struct sf_hdtr hdtr;
+ off_t off, xfsize, sbytes = 0;
+ int error = 0, i, s;
+
+ /*
+ * Do argument checking. Must be a regular file in, stream
+ * type and connected socket out, positive offset.
+ */
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
+ (fp->f_flag & FREAD) == 0) {
+ error = EBADF;
+ goto done;
+ }
+ if (fp->f_type != DTYPE_VNODE) {
+ error = EINVAL;
+ goto done;
+ }
+ vp = (struct vnode *)fp->f_data;
+ obj = vp->v_object;
+ if (vp->v_type != VREG || obj == NULL) {
+ error = EINVAL;
+ goto done;
+ }
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
+ goto done;
+ so = (struct socket *)fp->f_data;
+ if (so->so_type != SOCK_STREAM) {
+ error = EINVAL;
+ goto done;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ goto done;
+ }
+ if (uap->offset < 0) {
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * If specified, get the pointer to the sf_hdtr struct for
+ * any headers/trailers.
+ */
+ if (uap->hdtr != NULL) {
+ error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
+ if (error)
+ goto done;
+ /*
+ * Send any headers. Wimp out and use writev(2).
+ */
+ if (hdtr.headers != NULL) {
+ nuap.fd = uap->s;
+ nuap.iovp = hdtr.headers;
+ nuap.iovcnt = hdtr.hdr_cnt;
+ error = writev(p, &nuap);
+ if (error)
+ goto done;
+ sbytes += p->p_retval[0];
+ }
+ }
+
+ /*
+ * Protect against multiple writers to the socket.
+ */
+ (void) sblock(&so->so_snd, M_WAITOK);
+
+ /*
+ * Loop through the pages in the file, starting with the requested
+ * offset. Get a file page (do I/O if necessary), map the file page
+ * into an sf_buf, attach an mbuf header to the sf_buf, and queue
+ * it on the socket.
+ */
+ for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
+ vm_pindex_t pindex;
+
+ pindex = OFF_TO_IDX(off);
+retry_lookup:
+ /*
+ * Calculate the amount to transfer. Not to exceed a page,
+ * the EOF, or the passed in nbytes.
+ */
+ xfsize = obj->un_pager.vnp.vnp_size - off;
+ if (xfsize > PAGE_SIZE)
+ xfsize = PAGE_SIZE;
+ if (off & PAGE_MASK)
+ xfsize -= (off & PAGE_MASK);
+ if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
+ xfsize = uap->nbytes - sbytes;
+ if (xfsize <= 0)
+ break;
+ /*
+ * Attempt to look up the page. If the page doesn't exist or the
+ * part we're interested in isn't valid, then read it from disk.
+ * If some other part of the kernel has this page (i.e. it's busy),
+ * then disk I/O may be occurring on it, so wait and retry.
+ */
+ pg = vm_page_lookup(obj, pindex);
+ if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy &&
+ !vm_page_is_valid(pg, off & PAGE_MASK, xfsize))) {
+ struct uio auio;
+ struct iovec aiov;
+ int bsize;
+
+ if (pg == NULL) {
+ pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
+ if (pg == NULL) {
+ VM_WAIT;
+ goto retry_lookup;
+ }
+ vm_page_flag_clear(pg, PG_BUSY);
+ }
+ /*
+ * Ensure that our page is still around when the I/O completes.
+ */
+ vm_page_io_start(pg);
+ vm_page_wire(pg);
+ /*
+ * Get the page from backing store.
+ */
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = 0;
+ aiov.iov_len = MAXBSIZE;
+ auio.uio_resid = MAXBSIZE;
+ auio.uio_offset = trunc_page(off);
+ auio.uio_segflg = UIO_NOCOPY;
+ auio.uio_rw = UIO_READ;
+ auio.uio_procp = curproc;
+ vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
+ error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
+ p->p_ucred);
+ VOP_UNLOCK(vp, 0, p);
+ vm_page_io_finish(pg);
+ vm_page_flag_clear(pg, PG_ZERO);
+ if (error) {
+ vm_page_unwire(pg, 0);
+ /*
+ * See if anyone else might know about this page.
+ * If not and it is not valid, then free it.
+ */
+ if (pg->wire_count == 0 && pg->valid == 0 &&
+ pg->busy == 0 && !(pg->flags & PG_BUSY) &&
+ pg->hold_count == 0)
+ vm_page_free(pg);
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ } else {
+ if ((pg->flags & PG_BUSY) || pg->busy) {
+ s = splvm();
+ if ((pg->flags & PG_BUSY) || pg->busy) {
+ /*
+ * Page is busy. Wait and retry.
+ */
+ vm_page_flag_set(pg, PG_WANTED);
+ tsleep(pg, PVM, "sfpbsy", 0);
+ splx(s);
+ goto retry_lookup;
+ }
+ splx(s);
+ }
+ /*
+ * Protect from having the page ripped out from beneath us.
+ */
+ vm_page_wire(pg);
+ }
+ /*
+ * Allocate a kernel virtual page and insert the physical page
+ * into it.
+ */
+ sf = sf_buf_alloc();
+ sf->m = pg;
+ pmap_qenter(sf->kva, &pg, 1);
+ /*
+ * Get an mbuf header and set it up as having external storage.
+ */
+ MGETHDR(m, M_WAIT, MT_DATA);
+ m->m_ext.ext_free = sf_buf_free;
+ m->m_ext.ext_ref = sf_buf_ref;
+ m->m_ext.ext_buf = (void *)sf->kva;
+ m->m_ext.ext_size = PAGE_SIZE;
+ m->m_data = (char *) sf->kva + (off & PAGE_MASK);
+ m->m_flags |= M_EXT;
+ m->m_pkthdr.len = m->m_len = xfsize;
+ /*
+ * Add the buffer to the socket buffer chain.
+ */
+ s = splnet();
+retry_space:
+ /*
+ * Make sure that the socket is still able to take more data.
+ * CANTSENDMORE being true usually means that the connection
+ * was closed. so_error is true when an error was sensed after
+ * a previous send.
+ * The state is checked after the page mapping and buffer
+ * allocation above since those operations may block and make
+ * any socket checks stale. From this point forward, nothing
+ * blocks before the pru_send (or more accurately, any blocking
+ * results in a loop back to here to re-check).
+ */
+ if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
+ if (so->so_state & SS_CANTSENDMORE) {
+ error = EPIPE;
+ } else {
+ error = so->so_error;
+ so->so_error = 0;
+ }
+ m_freem(m);
+ sbunlock(&so->so_snd);
+ splx(s);
+ goto done;
+ }
+ /*
+ * Wait for socket space to become available. We do this just
+ * after checking the connection state above in order to avoid
+ * a race condition with sbwait().
+ */
+ if (sbspace(&so->so_snd) <= 0) {
+ error = sbwait(&so->so_snd);
+ /*
+ * An error from sbwait usually indicates that we've
+ * been interrupted by a signal. If we've sent anything
+ * then return bytes sent, otherwise return the error.
+ */
+ if (error) {
+ m_freem(m);
+ sbunlock(&so->so_snd);
+ splx(s);
+ goto done;
+ }
+ goto retry_space;
+ }
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
+ splx(s);
+ if (error) {
+ sbunlock(&so->so_snd);
+ goto done;
+ }
+ }
+ sbunlock(&so->so_snd);
+
+ /*
+ * Send trailers. Wimp out and use writev(2).
+ */
+ if (uap->hdtr != NULL && hdtr.trailers != NULL) {
+ nuap.fd = uap->s;
+ nuap.iovp = hdtr.trailers;
+ nuap.iovcnt = hdtr.trl_cnt;
+ error = writev(p, &nuap);
+ if (error)
+ goto done;
+ sbytes += p->p_retval[0];
+ }
+
+done:
+ if (uap->sbytes != NULL) {
+ copyout(&sbytes, uap->sbytes, sizeof(off_t));
+ }
+ return (error);
+}
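
For comparison with the 10X figure in the commit message, this is roughly the
traditional read/write loop that sendfile(2) replaces; the function is a
sketch only and the buffer size is arbitrary.

/*
 * Each block is copied from the kernel into a user buffer and then back
 * into the kernel socket buffer; avoiding those copies (and the extra
 * system calls) is where sendfile(2) saves its CPU cycles.
 */
#include <sys/types.h>
#include <unistd.h>

static ssize_t
copy_loop(int fd, int sock)
{
	char buf[8192];			/* arbitrary transfer size */
	ssize_t n, w, total = 0;
	char *p;

	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		for (p = buf; n > 0; p += w, n -= w, total += w) {
			w = write(sock, p, n);
			if (w < 0)
				return (-1);
		}
	}
	return (n < 0 ? -1 : total);
}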