 sys/net/bpf_zerocopy.c | 510
 1 file changed, 510 insertions(+), 0 deletions(-)
diff --git a/sys/net/bpf_zerocopy.c b/sys/net/bpf_zerocopy.c
new file mode 100644
index 0000000..896ad1d
--- /dev/null
+++ b/sys/net/bpf_zerocopy.c
@@ -0,0 +1,510 @@
+/*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bpf.h"
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sf_buf.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+#include <machine/atomic.h>
+
+#include <net/if.h>
+#include <net/bpf.h>
+#include <net/bpfdesc.h>
+#include <net/bpf_zerocopy.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
+/*
+ * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
+ * are mapped into the kernel address space using sf_bufs and used directly
+ * by BPF.  The memory is wired, since page faults cannot be tolerated in
+ * the contexts where the buffers are written to (locks held, interrupt
+ * context, etc.).  Access to the shared memory buffers is synchronized
+ * using a header on
+ * each buffer, allowing the number of system calls to go to zero as BPF
+ * reaches saturation (buffers filled as fast as they can be drained by the
+ * user process). Full details of the protocol for communicating between the
+ * user process and BPF may be found in bpf(4).
+ */
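
/*
 * Illustrative sketch (not part of this patch): the user-space half of the
 * generation handshake, as described in bpf(4).  It assumes the
 * bpf_zbuf_header fields used below (bzh_kernel_gen, bzh_kernel_len,
 * bzh_user_gen) are exported through <net/bpf.h>, and it uses GCC/Clang
 * __atomic builtins to mirror the kernel's acquire/release ordering.
 */
#include <net/bpf.h>		/* struct bpf_zbuf_header (assumed exported). */
#include <stddef.h>

/*
 * Consume one donated buffer: "buf" points at the page-aligned region whose
 * first bytes hold the shared header.  Returns the number of packet bytes
 * handed to "process", or 0 if the kernel has not completed a buffer since
 * the last acknowledgement.
 */
static size_t
zbuf_consume(void *buf, void (*process)(const void *, size_t))
{
	struct bpf_zbuf_header *hdr = buf;
	unsigned int kgen, len;

	/* Acquire pairs with the kernel's release in bpf_zerocopy_bufheld(). */
	kgen = __atomic_load_n(&hdr->bzh_kernel_gen, __ATOMIC_ACQUIRE);
	if (kgen == hdr->bzh_user_gen)
		return (0);	/* No new data; the kernel still owns the buffer. */
	len = hdr->bzh_kernel_len;
	process((const char *)buf + sizeof(*hdr), len);

	/* Release lets bpf_zerocopy_canfreebuf() reclaim the buffer. */
	__atomic_store_n(&hdr->bzh_user_gen, kgen, __ATOMIC_RELEASE);
	return (len);
}
/*
 * bpf_zerocopy_bufheld() below publishes bzh_kernel_len and then bumps
 * bzh_kernel_gen with release semantics, so the acquire load above
 * guarantees the packet data is visible before "process" runs.
 */
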
+
+/*
+ * Maximum number of pages per buffer.  Since each BPF device uses two
+ * buffers, the maximum per device is 2*BPF_MAX_PAGES.  Resource limits on
+ * the number of
+ * sf_bufs may be an issue, so do not set this too high. On older systems,
+ * kernel address space limits may also be an issue.
+ */
+#define BPF_MAX_PAGES 512
+
+/*
+ * struct zbuf describes a memory buffer loaned by a user process to the
+ * kernel. We represent this as a series of pages managed using an array of
+ * sf_bufs. Even though the memory is contiguous in user space, it may not
+ * be mapped contiguously in the kernel (i.e., a set of physically
+ * non-contiguous pages in the direct map region) so we must implement
+ * scatter-gather copying. One significant mitigating factor is that on
+ * systems with a direct memory map, we can avoid TLB misses.
+ *
+ * At the front of the shared memory region is a bpf_zbuf_header, which
+ * contains shared control data to allow user space and the kernel to
+ * synchronize.  The header is included in zb_size, but not in bd_bufsize,
+ * so that BPF knows the header space is not available for packet data.
+ */
+struct zbuf {
+ vm_offset_t zb_uaddr; /* User address, may be stale. */
+ size_t zb_size; /* Size of buffer, incl. header. */
+ u_int zb_numpages; /* Number of pages. */
+ struct sf_buf **zb_pages; /* Pages themselves. */
+ struct bpf_zbuf_header *zb_header; /* Shared header. */
+};
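
/*
 * Worked example of the size accounting (illustrative, assuming 4 KB pages):
 * a donated 64 KB buffer gives zb_size = 65536 and zb_numpages = 16, while
 * the space later exposed to BPF as bd_bufsize is
 * 65536 - sizeof(struct bpf_zbuf_header), since the start of the first page
 * is reserved for the shared header rather than packet data.
 */
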
+
+/*
+ * Release a page we've previously wired.
+ */
+static void
+zbuf_page_free(vm_page_t pp)
+{
+
+ vm_page_lock_queues();
+ vm_page_unwire(pp, 0);
+ if (pp->wire_count == 0 && pp->object == NULL)
+ vm_page_free(pp);
+ vm_page_unlock_queues();
+}
+
+/*
+ * Free an sf_buf with attached page.
+ */
+static void
+zbuf_sfbuf_free(struct sf_buf *sf)
+{
+ vm_page_t pp;
+
+ pp = sf_buf_page(sf);
+ sf_buf_free(sf);
+ zbuf_page_free(pp);
+}
+
+/*
+ * Free a zbuf, including its page array, sf_bufs, and pages.  Partially
+ * allocated zbufs are accepted so that this function may be used to clean
+ * up during zbuf setup.
+ */
+static void
+zbuf_free(struct zbuf *zb)
+{
+ int i;
+
+ for (i = 0; i < zb->zb_numpages; i++) {
+ if (zb->zb_pages[i] != NULL)
+ zbuf_sfbuf_free(zb->zb_pages[i]);
+ }
+ free(zb->zb_pages, M_BPF);
+ free(zb, M_BPF);
+}
+
+/*
+ * Given a user pointer to a page of user memory, return an sf_buf for the
+ * page. Because we may be requesting quite a few sf_bufs, prefer failure to
+ * deadlock and use SFB_NOWAIT.
+ */
+static struct sf_buf *
+zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
+{
+ struct sf_buf *sf;
+ vm_page_t pp;
+
+ if (vm_fault_quick((caddr_t) uaddr, VM_PROT_READ | VM_PROT_WRITE) <
+ 0)
+ return (NULL);
+ pp = pmap_extract_and_hold(map->pmap, uaddr, VM_PROT_READ |
+ VM_PROT_WRITE);
+ if (pp == NULL)
+ return (NULL);
+ vm_page_lock_queues();
+ vm_page_wire(pp);
+ vm_page_unhold(pp);
+ vm_page_unlock_queues();
+ sf = sf_buf_alloc(pp, SFB_NOWAIT);
+ if (sf == NULL) {
+ zbuf_page_free(pp);
+ return (NULL);
+ }
+ return (sf);
+}
+
+/*
+ * Create a zbuf describing a range of user address space memory. Validate
+ * page alignment, size requirements, etc.
+ */
+static int
+zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
+ struct zbuf **zbp)
+{
+ struct zbuf *zb;
+ struct vm_map *map;
+ int error, i;
+
+ *zbp = NULL;
+
+ /*
+ * User address must be page-aligned.
+ */
+ if (uaddr & PAGE_MASK)
+ return (EINVAL);
+
+ /*
+ * Length must be an integer number of full pages.
+ */
+ if (len & PAGE_MASK)
+ return (EINVAL);
+
+ /*
+ * Length must not exceed per-buffer resource limit.
+ */
+ if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
+ return (EINVAL);
+
+ /*
+	 * Allocate the buffer and set up each page with its own sf_buf.
+ */
+ error = 0;
+ zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
+ zb->zb_uaddr = uaddr;
+ zb->zb_size = len;
+ zb->zb_numpages = len / PAGE_SIZE;
+ zb->zb_pages = malloc(sizeof(struct sf_buf *) *
+ zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
+ map = &td->td_proc->p_vmspace->vm_map;
+ for (i = 0; i < zb->zb_numpages; i++) {
+ zb->zb_pages[i] = zbuf_sfbuf_get(map,
+ uaddr + (i * PAGE_SIZE));
+ if (zb->zb_pages[i] == NULL) {
+ error = EFAULT;
+ goto error;
+ }
+ }
+ zb->zb_header =
+ (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
+ bzero(zb->zb_header, sizeof(*zb->zb_header));
+ *zbp = zb;
+ return (0);
+
+error:
+ zbuf_free(zb);
+ return (error);
+}
+
+/*
+ * Copy bytes from a source into the specified zbuf. The caller is
+ * responsible for performing bounds checking, etc.
+ */
+void
+bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len)
+{
+ u_int count, page, poffset;
+ u_char *src_bytes;
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_append_bytes: not in zbuf mode"));
+ KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));
+
+ src_bytes = (u_char *)src;
+ zb = (struct zbuf *)buf;
+
+ /*
+ * Scatter-gather copy to user pages mapped into kernel address space
+ * using sf_bufs: copy up to a page at a time.
+ */
+ offset += sizeof(struct bpf_zbuf_header);
+ page = offset / PAGE_SIZE;
+ poffset = offset % PAGE_SIZE;
+ while (len > 0) {
+ KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
+ " page overflow (%d p %d np)\n", page, zb->zb_numpages));
+
+ count = min(len, PAGE_SIZE - poffset);
+ bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) +
+ poffset, count);
+ poffset += count;
+ if (poffset == PAGE_SIZE) {
+ poffset = 0;
+ page++;
+ }
+ KASSERT(poffset < PAGE_SIZE,
+ ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
+ poffset));
+ len -= count;
+ src_bytes += count;
+ }
+}
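
/*
 * Worked example of the page walk above (assuming 4 KB pages and, purely
 * for illustration, a 32-byte bpf_zbuf_header): a caller offset of 4060
 * becomes absolute offset 4092, i.e. page 0 at page offset 4092, so a
 * 100-byte append copies 4 bytes to the tail of page 0 and the remaining
 * 96 bytes to the start of page 1.
 */
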
+
+/*
+ * Copy bytes from an mbuf chain to the specified zbuf: copying will be
+ * scatter-gather both from mbufs, which may be fragmented over memory, and
+ * to pages, which may not be contiguously mapped in kernel address space.
+ * As with bpf_zerocopy_append_bytes(), the caller is responsible for
+ * checking that this will not exceed the buffer limit.
+ */
+void
+bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
+ void *src, u_int len)
+{
+ u_int count, moffset, page, poffset;
+ const struct mbuf *m;
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_append_mbuf not in zbuf mode"));
+ KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));
+
+ m = (struct mbuf *)src;
+ zb = (struct zbuf *)buf;
+
+ /*
+ * Scatter gather both from an mbuf chain and to a user page set
+ * mapped into kernel address space using sf_bufs. If we're lucky,
+ * each mbuf requires one copy operation, but if page alignment and
+ * mbuf alignment work out less well, we'll be doing two copies per
+ * mbuf.
+ */
+ offset += sizeof(struct bpf_zbuf_header);
+ page = offset / PAGE_SIZE;
+ poffset = offset % PAGE_SIZE;
+ moffset = 0;
+ while (len > 0) {
+ KASSERT(page < zb->zb_numpages,
+ ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
+ "np)\n", page, zb->zb_numpages));
+ KASSERT(m != NULL,
+ ("bpf_zerocopy_append_mbuf: end of mbuf chain"));
+
+ count = min(m->m_len - moffset, len);
+ count = min(count, PAGE_SIZE - poffset);
+ bcopy(mtod(m, u_char *) + moffset,
+ ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset,
+ count);
+ poffset += count;
+ if (poffset == PAGE_SIZE) {
+ poffset = 0;
+ page++;
+ }
+ KASSERT(poffset < PAGE_SIZE,
+ ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
+ poffset));
+ moffset += count;
+ if (moffset == m->m_len) {
+ m = m->m_next;
+ moffset = 0;
+ }
+ len -= count;
+ }
+}
+
+/*
+ * Notification from the BPF framework that a buffer has moved into the held
+ * slot on a descriptor. Zero-copy BPF will update the shared page to let
+ * the user process know.
+ */
+void
+bpf_zerocopy_bufheld(struct bpf_d *d)
+{
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_bufheld: not in zbuf mode"));
+
+ zb = (struct zbuf *)d->bd_hbuf;
+ KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
+ zb->zb_header->bzh_kernel_len = d->bd_hlen;
+ atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1);
+}
+
+/*
+ * Query from the BPF framework as to whether the buffer currently in the
+ * held position can be moved to the free position.  The user process
+ * indicates this by setting its generation number equal to the kernel
+ * generation number.
+ */
+int
+bpf_zerocopy_canfreebuf(struct bpf_d *d)
+{
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_canfreebuf: not in zbuf mode"));
+
+ zb = (struct zbuf *)d->bd_hbuf;
+ if (zb == NULL)
+ return (0);
+ if (zb->zb_header->bzh_kernel_gen ==
+ atomic_load_acq_int(&zb->zb_header->bzh_user_gen))
+ return (1);
+ return (0);
+}
+
+/*
+ * Free zero-copy buffers at the request of the descriptor.
+ */
+void
+bpf_zerocopy_free(struct bpf_d *d)
+{
+ struct zbuf *zb;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_free: not in zbuf mode"));
+
+ zb = (struct zbuf *)d->bd_sbuf;
+ if (zb != NULL)
+ zbuf_free(zb);
+ zb = (struct zbuf *)d->bd_hbuf;
+ if (zb != NULL)
+ zbuf_free(zb);
+ zb = (struct zbuf *)d->bd_fbuf;
+ if (zb != NULL)
+ zbuf_free(zb);
+}
+
+/*
+ * Ioctl to return the maximum buffer size.
+ */
+int
+bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
+{
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));
+
+ *i = BPF_MAX_PAGES * PAGE_SIZE;
+ return (0);
+}
+
+/*
+ * Ioctl to force rotation of the two buffers, if there's any data available.
+ * This can be used by user space to implement timeouts when waiting for a
+ * buffer to fill.
+ */
+int
+bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
+ struct bpf_zbuf *bz)
+{
+ struct zbuf *bzh;
+
+ bzero(bz, sizeof(*bz));
+ BPFD_LOCK(d);
+ if (d->bd_hbuf == NULL && d->bd_slen != 0) {
+ ROTATE_BUFFERS(d);
+ bzh = (struct zbuf *)d->bd_hbuf;
+ bz->bz_bufa = (void *)bzh->zb_uaddr;
+ bz->bz_buflen = d->bd_hlen;
+ }
+ BPFD_UNLOCK(d);
+ return (0);
+}
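
/*
 * Illustrative sketch (not part of this patch): from user space the
 * rotation ioctl is typically combined with select(2) or a timer.  The
 * BIOCROTZBUF ioctl name is assumed from bpf(4); struct bpf_zbuf is the
 * same structure used by the handler above.
 */
#include <sys/ioctl.h>
#include <net/bpf.h>		/* BIOCROTZBUF, struct bpf_zbuf (assumed). */
#include <err.h>
#include <stddef.h>
#include <string.h>

/*
 * After a read timeout expires with no completed buffer, force whatever has
 * accumulated in the store buffer into the held slot.  Returns the number
 * of bytes now held, or 0 if there was nothing to rotate.
 */
static size_t
zbuf_force_rotate(int bpf_fd)
{
	struct bpf_zbuf bz;

	memset(&bz, 0, sizeof(bz));
	if (ioctl(bpf_fd, BIOCROTZBUF, &bz) < 0)
		err(1, "BIOCROTZBUF");
	return (bz.bz_buflen);
}
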
+
+/*
+ * Ioctl to configure zero-copy buffers -- may be done only once.
+ */
+int
+bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
+ struct bpf_zbuf *bz)
+{
+ struct zbuf *zba, *zbb;
+ int error;
+
+ KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
+ ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));
+
+ /*
+ * Must set both buffers. Cannot clear them.
+ */
+ if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
+ return (EINVAL);
+
+ /*
+ * Buffers must have a size greater than 0. Alignment and other size
+ * validity checking is done in zbuf_setup().
+ */
+ if (bz->bz_buflen == 0)
+ return (EINVAL);
+
+ /*
+ * Allocate new buffers.
+ */
+ error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
+ &zba);
+ if (error)
+ return (error);
+ error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
+ &zbb);
+ if (error) {
+ zbuf_free(zba);
+ return (error);
+ }
+
+ /*
+ * We only allow buffers to be installed once, so atomically check
+ * that no buffers are currently installed and install new buffers.
+ */
+ BPFD_LOCK(d);
+ if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
+ d->bd_bif != NULL) {
+ BPFD_UNLOCK(d);
+ zbuf_free(zba);
+ zbuf_free(zbb);
+ return (EINVAL);
+ }
+ d->bd_fbuf = (caddr_t)zbb;
+ d->bd_sbuf = (caddr_t)zba;
+ d->bd_slen = 0;
+ d->bd_hlen = 0;
+
+ /*
+	 * We expose only the space remaining in the buffer after the shared
+	 * management region at its head.
+ */
+ d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
+ BPFD_UNLOCK(d);
+ return (0);
+}
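
/*
 * Illustrative sketch (not part of this patch): the user-space side of
 * buffer installation.  The BIOCSETBUFMODE, BIOCGETZMAX, and BIOCSETZBUF
 * ioctl names are assumed from bpf(4) rather than this file, and error
 * handling is minimal.  Note that, per the check above, buffers must be
 * installed before the descriptor is attached to an interface.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <net/bpf.h>	/* BPF_BUFMODE_ZBUF, BIOC* ioctls, struct bpf_zbuf (assumed). */
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/*
 * Open a BPF descriptor, switch it to zero-copy mode, and donate two
 * page-aligned buffers of "len" bytes each; "len" must be a multiple of the
 * page size and no larger than the BIOCGETZMAX limit.  Returns the
 * descriptor and hands the mapped buffers back through bufa/bufb.
 */
static int
zbuf_open(size_t len, void **bufa, void **bufb)
{
	u_int mode = BPF_BUFMODE_ZBUF;
	struct bpf_zbuf bz;
	size_t zmax;
	int fd;

	if ((fd = open("/dev/bpf", O_RDWR)) < 0)
		err(1, "/dev/bpf");
	if (ioctl(fd, BIOCSETBUFMODE, &mode) < 0)
		err(1, "BIOCSETBUFMODE");
	if (ioctl(fd, BIOCGETZMAX, &zmax) < 0)
		err(1, "BIOCGETZMAX");
	if (len > zmax)
		errx(1, "buffer larger than BIOCGETZMAX limit");

	/* mmap() returns page-aligned memory, satisfying zbuf_setup(). */
	*bufa = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	*bufb = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (*bufa == MAP_FAILED || *bufb == MAP_FAILED)
		err(1, "mmap");

	memset(&bz, 0, sizeof(bz));
	bz.bz_bufa = *bufa;
	bz.bz_bufb = *bufb;
	bz.bz_buflen = len;
	if (ioctl(fd, BIOCSETZBUF, &bz) < 0)
		err(1, "BIOCSETZBUF");
	return (fd);
}
/*
 * Once BIOCSETZBUF succeeds, the descriptor can be bound to an interface
 * and the consumer loop sketched near the top of this file polls the two
 * shared headers.
 */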