summaryrefslogtreecommitdiffstats
path: root/sys/dev/md/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/dev/md/md.c')
-rw-r--r--sys/dev/md/md.c1153
1 files changed, 1153 insertions, 0 deletions
diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c
new file mode 100644
index 0000000..e2922fc
--- /dev/null
+++ b/sys/dev/md/md.c
@@ -0,0 +1,1153 @@
+/*
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $FreeBSD$
+ *
+ */
+
+/*
+ * The following functions are based in the vn(4) driver: mdstart_swap(),
+ * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
+ * and as such under the following copyright:
+ *
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah Hdr: vn.c 1.13 94/04/02
+ *
+ * from: @(#)vn.c 8.6 (Berkeley) 4/1/94
+ * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
+ */
+
+#include "opt_md.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/devicestat.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mdioctl.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/stdint.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
+#include <vm/uma.h>
+
+#define MD_MODVER 1
+
+#define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. */
+
+#ifndef MD_NSECT
+#define MD_NSECT (10000 * 2)
+#endif
+
+static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
+static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");
+
+static int md_debug;
+SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
+
+#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
+/* Image gets put here: */
+static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
+static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
+#endif
+
+static int mdrootready;
+static int mdunits;
+static dev_t status_dev = 0;
+
+#define CDEV_MAJOR 95
+
+static d_strategy_t mdstrategy;
+static d_open_t mdopen;
+static d_close_t mdclose;
+static d_ioctl_t mdioctl, mdctlioctl;
+
+static struct cdevsw md_cdevsw = {
+ /* open */ mdopen,
+ /* close */ mdclose,
+ /* read */ physread,
+ /* write */ physwrite,
+ /* ioctl */ mdioctl,
+ /* poll */ nopoll,
+ /* mmap */ nommap,
+ /* strategy */ mdstrategy,
+ /* name */ MD_NAME,
+ /* maj */ CDEV_MAJOR,
+ /* dump */ nodump,
+ /* psize */ nopsize,
+ /* flags */ D_DISK | D_CANFREE | D_MEMDISK,
+};
+
+static struct cdevsw mdctl_cdevsw = {
+ /* open */ nullopen,
+ /* close */ nullclose,
+ /* read */ noread,
+ /* write */ nowrite,
+ /* ioctl */ mdctlioctl,
+ /* poll */ nopoll,
+ /* mmap */ nommap,
+ /* strategy */ nostrategy,
+ /* name */ MD_NAME,
+ /* maj */ CDEV_MAJOR
+};
+
+static struct cdevsw mddisk_cdevsw;
+
+static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
+
+#define NINDIR (PAGE_SIZE / sizeof(uintptr_t))
+#define NMASK (NINDIR-1)
+static int nshift;
+
+struct indir {
+ uintptr_t *array;
+ uint total;
+ uint used;
+ uint shift;
+};
+
+struct md_s {
+ int unit;
+ LIST_ENTRY(md_s) list;
+ struct devstat stats;
+ struct bio_queue_head bio_queue;
+ struct disk disk;
+ dev_t dev;
+ enum md_types type;
+ unsigned nsect;
+ unsigned opencount;
+ unsigned secsize;
+ unsigned flags;
+ char name[20];
+ struct proc *procp;
+
+ /* MD_MALLOC related fields */
+ struct indir *indir;
+ uma_zone_t uma;
+
+ /* MD_PRELOAD related fields */
+ u_char *pl_ptr;
+ unsigned pl_len;
+
+ /* MD_VNODE related fields */
+ struct vnode *vnode;
+ struct ucred *cred;
+
+ /* MD_SWAP related fields */
+ vm_object_t object;
+};
+
+static int mddestroy(struct md_s *sc, struct thread *td);
+
+static struct indir *
+new_indir(uint shift)
+{
+ struct indir *ip;
+
+ ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO);
+ if (ip == NULL)
+ return (NULL);
+ ip->array = malloc(sizeof(uintptr_t) * NINDIR,
+ M_MDSECT, M_NOWAIT | M_ZERO);
+ if (ip->array == NULL) {
+ free(ip, M_MD);
+ return (NULL);
+ }
+ ip->total = NINDIR;
+ ip->shift = shift;
+ return (ip);
+}
+
+static void
+del_indir(struct indir *ip)
+{
+
+ free(ip->array, M_MDSECT);
+ free(ip, M_MD);
+}
+
+static void
+destroy_indir(struct md_s *sc, struct indir *ip)
+{
+ int i;
+
+ for (i = 0; i < NINDIR; i++) {
+ if (!ip->array[i])
+ continue;
+ if (ip->shift)
+ destroy_indir(sc, (struct indir*)(ip->array[i]));
+ else if (ip->array[i] > 255)
+ uma_zfree(sc->uma, (void *)(ip->array[i]));
+ }
+ del_indir(ip);
+}
+
+/*
+ * This function does the math and alloctes the top level "indir" structure
+ * for a device of "size" sectors.
+ */
+
+static struct indir *
+dimension(off_t size)
+{
+ off_t rcnt;
+ struct indir *ip;
+ int i, layer;
+
+ rcnt = size;
+ layer = 0;
+ while (rcnt > NINDIR) {
+ rcnt /= NINDIR;
+ layer++;
+ }
+ /* figure out log2(NINDIR) */
+ for (i = NINDIR, nshift = -1; i; nshift++)
+ i >>= 1;
+
+ /*
+ * XXX: the top layer is probably not fully populated, so we allocate
+ * too much space for ip->array in new_indir() here.
+ */
+ ip = new_indir(layer * nshift);
+ return (ip);
+}
+
+/*
+ * Read a given sector
+ */
+
+static uintptr_t
+s_read(struct indir *ip, off_t offset)
+{
+ struct indir *cip;
+ int idx;
+ uintptr_t up;
+
+ if (md_debug > 1)
+ printf("s_read(%jd)\n", (intmax_t)offset);
+ up = 0;
+ for (cip = ip; cip != NULL;) {
+ if (cip->shift) {
+ idx = (offset >> cip->shift) & NMASK;
+ up = cip->array[idx];
+ cip = (struct indir *)up;
+ continue;
+ }
+ idx = offset & NMASK;
+ return (cip->array[idx]);
+ }
+ return (0);
+}
+
+/*
+ * Write a given sector, prune the tree if the value is 0
+ */
+
+static int
+s_write(struct indir *ip, off_t offset, uintptr_t ptr)
+{
+ struct indir *cip, *lip[10];
+ int idx, li;
+ uintptr_t up;
+
+ if (md_debug > 1)
+ printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
+ up = 0;
+ li = 0;
+ cip = ip;
+ for (;;) {
+ lip[li++] = cip;
+ if (cip->shift) {
+ idx = (offset >> cip->shift) & NMASK;
+ up = cip->array[idx];
+ if (up != 0) {
+ cip = (struct indir *)up;
+ continue;
+ }
+ /* Allocate branch */
+ cip->array[idx] =
+ (uintptr_t)new_indir(cip->shift - nshift);
+ if (cip->array[idx] == 0)
+ return (ENOSPC);
+ cip->used++;
+ up = cip->array[idx];
+ cip = (struct indir *)up;
+ continue;
+ }
+ /* leafnode */
+ idx = offset & NMASK;
+ up = cip->array[idx];
+ if (up != 0)
+ cip->used--;
+ cip->array[idx] = ptr;
+ if (ptr != 0)
+ cip->used++;
+ break;
+ }
+ if (cip->used != 0 || li == 1)
+ return (0);
+ li--;
+ while (cip->used == 0 && cip != ip) {
+ li--;
+ idx = (offset >> lip[li]->shift) & NMASK;
+ up = lip[li]->array[idx];
+ KASSERT(up == (uintptr_t)cip, ("md screwed up"));
+ del_indir(cip);
+ lip[li]->array[idx] = 0;
+ lip[li]->used--;
+ cip = lip[li];
+ }
+ return (0);
+}
+
+static int
+mdopen(dev_t dev, int flag, int fmt, struct thread *td)
+{
+ struct md_s *sc;
+
+ if (md_debug)
+ printf("mdopen(%s %x %x %p)\n",
+ devtoname(dev), flag, fmt, td);
+
+ sc = dev->si_drv1;
+
+ sc->disk.d_sectorsize = sc->secsize;
+ sc->disk.d_mediasize = (off_t)sc->nsect * sc->secsize;
+ sc->disk.d_fwsectors = sc->nsect > 63 ? 63 : sc->nsect;
+ sc->disk.d_fwheads = 1;
+ sc->opencount++;
+ return (0);
+}
+
+static int
+mdclose(dev_t dev, int flags, int fmt, struct thread *td)
+{
+ struct md_s *sc = dev->si_drv1;
+
+ sc->opencount--;
+ return (0);
+}
+
+static int
+mdioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
+{
+
+ if (md_debug)
+ printf("mdioctl(%s %lx %p %x %p)\n",
+ devtoname(dev), cmd, addr, flags, td);
+
+ return (ENOIOCTL);
+}
+
+static int
+mdstart_malloc(struct md_s *sc, struct bio *bp)
+{
+ int i, error;
+ u_char *dst;
+ unsigned secno, nsec, uc;
+ uintptr_t sp, osp;
+
+ nsec = bp->bio_bcount / sc->secsize;
+ secno = bp->bio_pblkno;
+ dst = bp->bio_data;
+ error = 0;
+ while (nsec--) {
+ osp = s_read(sc->indir, secno);
+ if (bp->bio_cmd == BIO_DELETE) {
+ if (osp != 0)
+ error = s_write(sc->indir, secno, 0);
+ } else if (bp->bio_cmd == BIO_READ) {
+ if (osp == 0)
+ bzero(dst, sc->secsize);
+ else if (osp <= 255)
+ for (i = 0; i < sc->secsize; i++)
+ dst[i] = osp;
+ else
+ bcopy((void *)osp, dst, sc->secsize);
+ osp = 0;
+ } else if (bp->bio_cmd == BIO_WRITE) {
+ if (sc->flags & MD_COMPRESS) {
+ uc = dst[0];
+ for (i = 1; i < sc->secsize; i++)
+ if (dst[i] != uc)
+ break;
+ } else {
+ i = 0;
+ uc = 0;
+ }
+ if (i == sc->secsize) {
+ if (osp != uc)
+ error = s_write(sc->indir, secno, uc);
+ } else {
+ if (osp <= 255) {
+ sp = (uintptr_t) uma_zalloc(
+ sc->uma, M_NOWAIT);
+ if (sp == 0) {
+ error = ENOSPC;
+ break;
+ }
+ bcopy(dst, (void *)sp, sc->secsize);
+ error = s_write(sc->indir, secno, sp);
+ } else {
+ bcopy(dst, (void *)osp, sc->secsize);
+ osp = 0;
+ }
+ }
+ } else {
+ error = EOPNOTSUPP;
+ }
+ if (osp > 255)
+ uma_zfree(sc->uma, (void*)osp);
+ if (error)
+ break;
+ secno++;
+ dst += sc->secsize;
+ }
+ bp->bio_resid = 0;
+ return (error);
+}
+
+static int
+mdstart_preload(struct md_s *sc, struct bio *bp)
+{
+
+ if (bp->bio_cmd == BIO_DELETE) {
+ } else if (bp->bio_cmd == BIO_READ) {
+ bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount);
+ } else {
+ bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount);
+ }
+ bp->bio_resid = 0;
+ return (0);
+}
+
+static int
+mdstart_vnode(struct md_s *sc, struct bio *bp)
+{
+ int error;
+ struct uio auio;
+ struct iovec aiov;
+ struct mount *mp;
+
+ /*
+ * VNODE I/O
+ *
+ * If an error occurs, we set BIO_ERROR but we do not set
+ * B_INVAL because (for a write anyway), the buffer is
+ * still valid.
+ */
+
+ bzero(&auio, sizeof(auio));
+
+ aiov.iov_base = bp->bio_data;
+ aiov.iov_len = bp->bio_bcount;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize;
+ auio.uio_segflg = UIO_SYSSPACE;
+ if(bp->bio_cmd == BIO_READ)
+ auio.uio_rw = UIO_READ;
+ else
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_resid = bp->bio_bcount;
+ auio.uio_td = curthread;
+ /*
+ * When reading set IO_DIRECT to try to avoid double-caching
+ * the data. When writing IO_DIRECT is not optimal, but we
+ * must set IO_NOWDRAIN to avoid a wdrain deadlock.
+ */
+ if (bp->bio_cmd == BIO_READ) {
+ vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
+ } else {
+ (void) vn_start_write(sc->vnode, &mp, V_WAIT);
+ vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred);
+ vn_finished_write(mp);
+ }
+ VOP_UNLOCK(sc->vnode, 0, curthread);
+ bp->bio_resid = auio.uio_resid;
+ return (error);
+}
+
+static int
+mdstart_swap(struct md_s *sc, struct bio *bp)
+{
+
+ if ((bp->bio_cmd == BIO_DELETE) && (sc->flags & MD_RESERVE))
+ biodone(bp);
+ else
+ vm_pager_strategy(sc->object, bp);
+ return (-1);
+}
+
+static void
+mdstrategy(struct bio *bp)
+{
+ struct md_s *sc;
+
+ if (md_debug > 1)
+ printf("mdstrategy(%p) %s %x, %jd, %jd %ld, %p)\n",
+ (void *)bp, devtoname(bp->bio_dev), bp->bio_flags,
+ (intmax_t)bp->bio_blkno,
+ (intmax_t)bp->bio_pblkno,
+ bp->bio_bcount / DEV_BSIZE,
+ (void *)bp->bio_data);
+
+ sc = bp->bio_dev->si_drv1;
+
+ /* XXX: LOCK(sc->lock) */
+ bioqdisksort(&sc->bio_queue, bp);
+ /* XXX: UNLOCK(sc->lock) */
+
+ wakeup(sc);
+}
+
+static void
+md_kthread(void *arg)
+{
+ struct md_s *sc;
+ struct bio *bp;
+ int error;
+
+ sc = arg;
+ curthread->td_base_pri = PRIBIO;
+
+ mtx_lock(&Giant);
+ for (;;) {
+ /* XXX: LOCK(unique unit numbers) */
+ bp = bioq_first(&sc->bio_queue);
+ if (bp)
+ bioq_remove(&sc->bio_queue, bp);
+ /* XXX: UNLOCK(unique unit numbers) */
+ if (!bp) {
+ tsleep(sc, PRIBIO, "mdwait", 0);
+ if (sc->flags & MD_SHUTDOWN) {
+ sc->procp = NULL;
+ wakeup(&sc->procp);
+ kthread_exit(0);
+ }
+ continue;
+ }
+
+ switch (sc->type) {
+ case MD_MALLOC:
+ devstat_start_transaction(&sc->stats);
+ error = mdstart_malloc(sc, bp);
+ break;
+ case MD_PRELOAD:
+ devstat_start_transaction(&sc->stats);
+ error = mdstart_preload(sc, bp);
+ break;
+ case MD_VNODE:
+ devstat_start_transaction(&sc->stats);
+ error = mdstart_vnode(sc, bp);
+ break;
+ case MD_SWAP:
+ error = mdstart_swap(sc, bp);
+ break;
+ default:
+ panic("Impossible md(type)");
+ break;
+ }
+
+ if (error != -1)
+ biofinish(bp, &sc->stats, error);
+ }
+}
+
+static struct md_s *
+mdfind(int unit)
+{
+ struct md_s *sc;
+
+ /* XXX: LOCK(unique unit numbers) */
+ LIST_FOREACH(sc, &md_softc_list, list) {
+ if (sc->unit == unit)
+ break;
+ }
+ /* XXX: UNLOCK(unique unit numbers) */
+ return (sc);
+}
+
+static struct md_s *
+mdnew(int unit)
+{
+ struct md_s *sc;
+ int error, max = -1;
+
+ /* XXX: LOCK(unique unit numbers) */
+ LIST_FOREACH(sc, &md_softc_list, list) {
+ if (sc->unit == unit) {
+ /* XXX: UNLOCK(unique unit numbers) */
+ return (NULL);
+ }
+ if (sc->unit > max)
+ max = sc->unit;
+ }
+ if (unit == -1)
+ unit = max + 1;
+ if (unit > 255)
+ return (NULL);
+ sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
+ sc->unit = unit;
+ sprintf(sc->name, "md%d", unit);
+ error = kthread_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
+ if (error) {
+ free(sc, M_MD);
+ return (NULL);
+ }
+ LIST_INSERT_HEAD(&md_softc_list, sc, list);
+ /* XXX: UNLOCK(unique unit numbers) */
+ return (sc);
+}
+
+static void
+mdinit(struct md_s *sc)
+{
+
+ bioq_init(&sc->bio_queue);
+ devstat_add_entry(&sc->stats, MD_NAME, sc->unit, sc->secsize,
+ DEVSTAT_NO_ORDERED_TAGS,
+ DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
+ DEVSTAT_PRIORITY_OTHER);
+ sc->dev = disk_create(sc->unit, &sc->disk, 0, &md_cdevsw, &mddisk_cdevsw);
+ sc->dev->si_drv1 = sc;
+}
+
+/*
+ * XXX: we should check that the range they feed us is mapped.
+ * XXX: we should implement read-only.
+ */
+
+static int
+mdcreate_preload(struct md_ioctl *mdio)
+{
+ struct md_s *sc;
+
+ if (mdio->md_size == 0)
+ return (EINVAL);
+ if (mdio->md_options & ~(MD_AUTOUNIT))
+ return (EINVAL);
+ if (mdio->md_options & MD_AUTOUNIT) {
+ sc = mdnew(-1);
+ if (sc == NULL)
+ return (ENOMEM);
+ mdio->md_unit = sc->unit;
+ } else {
+ sc = mdnew(mdio->md_unit);
+ if (sc == NULL)
+ return (EBUSY);
+ }
+ sc->type = MD_PRELOAD;
+ sc->secsize = DEV_BSIZE;
+ sc->nsect = mdio->md_size;
+ sc->flags = mdio->md_options & MD_FORCE;
+ /* Cast to pointer size, then to pointer to avoid warning */
+ sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
+ sc->pl_len = (mdio->md_size << DEV_BSHIFT);
+ mdinit(sc);
+ return (0);
+}
+
+
+static int
+mdcreate_malloc(struct md_ioctl *mdio)
+{
+ struct md_s *sc;
+ off_t u;
+ uintptr_t sp;
+ int error;
+
+ error = 0;
+ if (mdio->md_size == 0)
+ return (EINVAL);
+ if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
+ return (EINVAL);
+ /* Compression doesn't make sense if we have reserved space */
+ if (mdio->md_options & MD_RESERVE)
+ mdio->md_options &= ~MD_COMPRESS;
+ if (mdio->md_options & MD_AUTOUNIT) {
+ sc = mdnew(-1);
+ if (sc == NULL)
+ return (ENOMEM);
+ mdio->md_unit = sc->unit;
+ } else {
+ sc = mdnew(mdio->md_unit);
+ if (sc == NULL)
+ return (EBUSY);
+ }
+ sc->type = MD_MALLOC;
+ sc->secsize = DEV_BSIZE;
+ sc->nsect = mdio->md_size;
+ sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
+ sc->indir = dimension(sc->nsect);
+ sc->uma = uma_zcreate(sc->name, sc->secsize,
+ NULL, NULL, NULL, NULL, 0x1ff, 0);
+ if (mdio->md_options & MD_RESERVE) {
+ for (u = 0; u < sc->nsect; u++) {
+ sp = (uintptr_t) uma_zalloc(sc->uma, M_NOWAIT | M_ZERO);
+ if (sp != 0)
+ error = s_write(sc->indir, u, sp);
+ else
+ error = ENOMEM;
+ if (error)
+ break;
+ }
+ }
+ if (!error) {
+ printf("%s%d: Malloc disk\n", MD_NAME, sc->unit);
+ mdinit(sc);
+ } else
+ mddestroy(sc, NULL);
+ return (error);
+}
+
+
+static int
+mdsetcred(struct md_s *sc, struct ucred *cred)
+{
+ char *tmpbuf;
+ int error = 0;
+
+ /*
+ * Set credits in our softc
+ */
+
+ if (sc->cred)
+ crfree(sc->cred);
+ sc->cred = crhold(cred);
+
+ /*
+ * Horrible kludge to establish credentials for NFS XXX.
+ */
+
+ if (sc->vnode) {
+ struct uio auio;
+ struct iovec aiov;
+
+ tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK);
+ bzero(&auio, sizeof(auio));
+
+ aiov.iov_base = tmpbuf;
+ aiov.iov_len = sc->secsize;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = aiov.iov_len;
+ vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
+ error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
+ VOP_UNLOCK(sc->vnode, 0, curthread);
+ free(tmpbuf, M_TEMP);
+ }
+ return (error);
+}
+
+static int
+mdcreate_vnode(struct md_ioctl *mdio, struct thread *td)
+{
+ struct md_s *sc;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error, flags;
+
+ flags = FREAD|FWRITE;
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
+ error = vn_open(&nd, &flags, 0);
+ if (error) {
+ if (error != EACCES && error != EPERM && error != EROFS)
+ return (error);
+ flags &= ~FWRITE;
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
+ error = vn_open(&nd, &flags, 0);
+ if (error)
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp->v_type != VREG ||
+ (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+ (void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
+ return (error ? error : EINVAL);
+ }
+ VOP_UNLOCK(nd.ni_vp, 0, td);
+
+ if (mdio->md_options & MD_AUTOUNIT) {
+ sc = mdnew(-1);
+ mdio->md_unit = sc->unit;
+ } else {
+ sc = mdnew(mdio->md_unit);
+ }
+ if (sc == NULL) {
+ (void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
+ return (EBUSY);
+ }
+
+ sc->type = MD_VNODE;
+ sc->flags = mdio->md_options & MD_FORCE;
+ if (!(flags & FWRITE))
+ sc->flags |= MD_READONLY;
+ sc->secsize = DEV_BSIZE;
+ sc->vnode = nd.ni_vp;
+
+ /*
+ * If the size is specified, override the file attributes.
+ */
+ if (mdio->md_size)
+ sc->nsect = mdio->md_size;
+ else
+ sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */
+ if (sc->nsect == 0) {
+ mddestroy(sc, td);
+ return (EINVAL);
+ }
+ error = mdsetcred(sc, td->td_ucred);
+ if (error) {
+ mddestroy(sc, td);
+ return (error);
+ }
+ mdinit(sc);
+ return (0);
+}
+
+static int
+mddestroy(struct md_s *sc, struct thread *td)
+{
+
+ GIANT_REQUIRED;
+
+ if (sc->dev != NULL) {
+ devstat_remove_entry(&sc->stats);
+ disk_destroy(sc->dev);
+ }
+ sc->flags |= MD_SHUTDOWN;
+ wakeup(sc);
+ while (sc->procp != NULL)
+ tsleep(&sc->procp, PRIBIO, "mddestroy", hz / 10);
+ if (sc->vnode != NULL)
+ (void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
+ FREAD : (FREAD|FWRITE), sc->cred, td);
+ if (sc->cred != NULL)
+ crfree(sc->cred);
+ if (sc->object != NULL) {
+ vm_pager_deallocate(sc->object);
+ }
+ if (sc->indir)
+ destroy_indir(sc, sc->indir);
+ if (sc->uma)
+ uma_zdestroy(sc->uma);
+
+ /* XXX: LOCK(unique unit numbers) */
+ LIST_REMOVE(sc, list);
+ /* XXX: UNLOCK(unique unit numbers) */
+ free(sc, M_MD);
+ return (0);
+}
+
+static int
+mdcreate_swap(struct md_ioctl *mdio, struct thread *td)
+{
+ int error;
+ struct md_s *sc;
+
+ GIANT_REQUIRED;
+
+ if (mdio->md_options & MD_AUTOUNIT) {
+ sc = mdnew(-1);
+ mdio->md_unit = sc->unit;
+ } else {
+ sc = mdnew(mdio->md_unit);
+ }
+ if (sc == NULL)
+ return (EBUSY);
+
+ sc->type = MD_SWAP;
+
+ /*
+ * Range check. Disallow negative sizes or any size less then the
+ * size of a page. Then round to a page.
+ */
+
+ if (mdio->md_size == 0) {
+ mddestroy(sc, td);
+ return (EDOM);
+ }
+
+ /*
+ * Allocate an OBJT_SWAP object.
+ *
+ * sc_secsize is PAGE_SIZE'd
+ *
+ * mdio->size is in DEV_BSIZE'd chunks.
+ * Note the truncation.
+ */
+
+ sc->secsize = PAGE_SIZE;
+ sc->nsect = mdio->md_size / (PAGE_SIZE / DEV_BSIZE);
+ sc->object = vm_pager_allocate(OBJT_SWAP, NULL, sc->secsize * (vm_offset_t)sc->nsect, VM_PROT_DEFAULT, 0);
+ sc->flags = mdio->md_options & MD_FORCE;
+ if (mdio->md_options & MD_RESERVE) {
+ if (swap_pager_reserve(sc->object, 0, sc->nsect) < 0) {
+ vm_pager_deallocate(sc->object);
+ sc->object = NULL;
+ mddestroy(sc, td);
+ return (EDOM);
+ }
+ }
+ error = mdsetcred(sc, td->td_ucred);
+ if (error)
+ mddestroy(sc, td);
+ else
+ mdinit(sc);
+ return (error);
+}
+
+static int
+mddetach(int unit, struct thread *td)
+{
+ struct md_s *sc;
+
+ sc = mdfind(unit);
+ if (sc == NULL)
+ return (ENOENT);
+ if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
+ return (EBUSY);
+ switch(sc->type) {
+ case MD_VNODE:
+ case MD_SWAP:
+ case MD_MALLOC:
+ case MD_PRELOAD:
+ return (mddestroy(sc, td));
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
+static int
+mdctlioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
+{
+ struct md_ioctl *mdio;
+ struct md_s *sc;
+
+ if (md_debug)
+ printf("mdctlioctl(%s %lx %p %x %p)\n",
+ devtoname(dev), cmd, addr, flags, td);
+
+ /*
+ * We assert the version number in the individual ioctl
+ * handlers instead of out here because (a) it is possible we
+ * may add another ioctl in the future which doesn't read an
+ * mdio, and (b) the correct return value for an unknown ioctl
+ * is ENOIOCTL, not EINVAL.
+ */
+ mdio = (struct md_ioctl *)addr;
+ switch (cmd) {
+ case MDIOCATTACH:
+ if (mdio->md_version != MDIOVERSION)
+ return (EINVAL);
+ switch (mdio->md_type) {
+ case MD_MALLOC:
+ return (mdcreate_malloc(mdio));
+ case MD_PRELOAD:
+ return (mdcreate_preload(mdio));
+ case MD_VNODE:
+ return (mdcreate_vnode(mdio, td));
+ case MD_SWAP:
+ return (mdcreate_swap(mdio, td));
+ default:
+ return (EINVAL);
+ }
+ case MDIOCDETACH:
+ if (mdio->md_version != MDIOVERSION)
+ return (EINVAL);
+ if (mdio->md_file != NULL || mdio->md_size != 0 ||
+ mdio->md_options != 0)
+ return (EINVAL);
+ return (mddetach(mdio->md_unit, td));
+ case MDIOCQUERY:
+ if (mdio->md_version != MDIOVERSION)
+ return (EINVAL);
+ sc = mdfind(mdio->md_unit);
+ if (sc == NULL)
+ return (ENOENT);
+ mdio->md_type = sc->type;
+ mdio->md_options = sc->flags;
+ switch (sc->type) {
+ case MD_MALLOC:
+ mdio->md_size = sc->nsect;
+ break;
+ case MD_PRELOAD:
+ mdio->md_size = sc->nsect;
+ mdio->md_base = (uint64_t)(intptr_t)sc->pl_ptr;
+ break;
+ case MD_SWAP:
+ mdio->md_size = sc->nsect * (PAGE_SIZE / DEV_BSIZE);
+ break;
+ case MD_VNODE:
+ mdio->md_size = sc->nsect;
+ /* XXX fill this in */
+ mdio->md_file = NULL;
+ break;
+ }
+ return (0);
+ default:
+ return (ENOIOCTL);
+ };
+ return (ENOIOCTL);
+}
+
+static void
+md_preloaded(u_char *image, unsigned length)
+{
+ struct md_s *sc;
+
+ sc = mdnew(-1);
+ if (sc == NULL)
+ return;
+ sc->type = MD_PRELOAD;
+ sc->secsize = DEV_BSIZE;
+ sc->nsect = length / DEV_BSIZE;
+ sc->pl_ptr = image;
+ sc->pl_len = length;
+ if (sc->unit == 0)
+ mdrootready = 1;
+ mdinit(sc);
+}
+
+static void
+md_drvinit(void *unused)
+{
+
+ caddr_t mod;
+ caddr_t c;
+ u_char *ptr, *name, *type;
+ unsigned len;
+
+#ifdef MD_ROOT_SIZE
+ md_preloaded(mfs_root, MD_ROOT_SIZE*1024);
+#endif
+ mod = NULL;
+ while ((mod = preload_search_next_name(mod)) != NULL) {
+ name = (char *)preload_search_info(mod, MODINFO_NAME);
+ type = (char *)preload_search_info(mod, MODINFO_TYPE);
+ if (name == NULL)
+ continue;
+ if (type == NULL)
+ continue;
+ if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
+ continue;
+ c = preload_search_info(mod, MODINFO_ADDR);
+ ptr = *(u_char **)c;
+ c = preload_search_info(mod, MODINFO_SIZE);
+ len = *(size_t *)c;
+ printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
+ MD_NAME, mdunits, name, len, ptr);
+ md_preloaded(ptr, len);
+ }
+ status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL,
+ 0600, MDCTL_NAME);
+}
+
+static int
+md_modevent(module_t mod, int type, void *data)
+{
+ int error;
+ struct md_s *sc;
+
+ switch (type) {
+ case MOD_LOAD:
+ md_drvinit(NULL);
+ break;
+ case MOD_UNLOAD:
+ LIST_FOREACH(sc, &md_softc_list, list) {
+ error = mddetach(sc->unit, curthread);
+ if (error != 0)
+ return (error);
+ }
+ if (status_dev)
+ destroy_dev(status_dev);
+ status_dev = 0;
+ break;
+ default:
+ break;
+ }
+ return (0);
+}
+
+static moduledata_t md_mod = {
+ MD_NAME,
+ md_modevent,
+ NULL
+};
+DECLARE_MODULE(md, md_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR);
+MODULE_VERSION(md, MD_MODVER);
+
+
+#ifdef MD_ROOT
+static void
+md_takeroot(void *junk)
+{
+ if (mdrootready)
+ rootdevnames[0] = "ufs:/dev/md0";
+}
+
+SYSINIT(md_root, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, md_takeroot, NULL);
+#endif
OpenPOWER on IntegriCloud