26 files changed, 12118 insertions, 0 deletions
diff --git a/sys/dev/vinum/COPYRIGHT b/sys/dev/vinum/COPYRIGHT
new file mode 100644
index 0000000..f0295e6
--- /dev/null
+++ b/sys/dev/vinum/COPYRIGHT
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *  
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $FreeBSD$
+ */
diff --git a/sys/dev/vinum/makestatetext b/sys/dev/vinum/makestatetext
new file mode 100755
index 0000000..c5a7da2
--- /dev/null
+++ b/sys/dev/vinum/makestatetext
@@ -0,0 +1,78 @@
+#!/bin/sh
+# Generate statetexts.h from the state names listed in vinumstate.h.
+# $FreeBSD$
+# $Id: makestatetext,v 1.7 1999/12/29 07:24:54 grog Exp grog $
+infile=vinumstate.h
+ofile=statetexts.h
+echo "/* Created by $0 on" $(date). "Do not edit */" > $ofile
+echo >> $ofile
+cat >> $ofile <<FOO
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called \`\`Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *  
+ * This software is provided \`\`as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ */
+
+FOO
+
+# emit_states PFX: append '  "name",' to $ofile for each 'PFX_name,' line of $infile that has no '='.
+emit_states() { grep -E -e "${1}_[A-Za-z0-9_]*," $infile | grep -v = | sed "s: *${1}_\([^,]*\).*:  \"\1\",:" >>$ofile; }
+printf '%s\n' "/* Drive state texts */" "char *drivestatetext [] = " "  { " >>$ofile
+emit_states drive
+cat <<FOO >> $ofile
+  };
+
+/* Subdisk state texts */
+char *sdstatetext [] =
+  { 
+FOO
+emit_states sd
+cat <<FOO >> $ofile
+  };
+
+/* Plex state texts */
+char *plexstatetext [] =
+  { 
+FOO
+emit_states plex
+cat <<FOO >> $ofile
+  };
+
+/* Volume state texts */
+char *volstatetext [] =
+  { 
+FOO
+emit_states volume
+cat <<FOO >> $ofile
+  };
+FOO
diff --git a/sys/dev/vinum/request.h b/sys/dev/vinum/request.h
new file mode 100644
index 0000000..600130f
--- /dev/null
+++ b/sys/dev/vinum/request.h
@@ -0,0 +1,273 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: request.h,v 1.22 2003/04/24 04:37:08 grog Exp $
+ * $FreeBSD$
+ */
+
+/* Information needed to set up a transfer */
+
+enum xferinfo {						    /* bit flags; several may be ORed together */
+    XFR_NORMAL_READ = 1,
+    XFR_NORMAL_WRITE = 2,				    /* write request in normal mode */
+    XFR_RECOVERY_READ = 4,
+    XFR_DEGRADED_WRITE = 8,
+    XFR_PARITYLESS_WRITE = 0x10,
+    XFR_NO_PARITY_STRIPE = 0x20,			    /* parity stripe is not available */
+    XFR_DATA_BLOCK = 0x40,				    /* data block in request */
+    XFR_PARITY_BLOCK = 0x80,				    /* parity block in request */
+    XFR_BAD_SUBDISK = 0x100,				    /* this subdisk is dead */
+    XFR_MALLOCED = 0x200,				    /* this buffer is malloced */
+#ifdef VINUMDEBUG
+    XFR_PHASE2 = 0x800,					    /* documentation only: 2nd phase write */
+#endif
+    XFR_REVIVECONFLICT = 0x1000,			    /* possible conflict with a revive operation */
+    XFR_BUFLOCKED = 0x2000,				    /* BUF_LOCK performed on this buffer */
+    XFR_COPYBUF = 0x4000,				    /* data buffer was copied */
+    /* masks: operations that need a parity block */
+    XFR_PARITYOP = (XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE),
+    /* operations that use the group parameters */
+    XFR_GROUPOP = (XFR_DEGRADED_WRITE | XFR_RECOVERY_READ),
+    /* operations that use the data parameters */
+    XFR_DATAOP = (XFR_NORMAL_READ | XFR_NORMAL_WRITE | XFR_PARITYLESS_WRITE),
+    /* operations requiring read before write */
+    XFR_RBW = (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE),
+    /* operations that need a malloced buffer (same bits as XFR_PARITYOP) */
+    XFR_NEEDS_MALLOC = (XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE)
+};
+
+/*
+ * Describe one low-level request, part of a
+ * high-level request.  This is an extended
+ * struct buf buffer, and the first element
+ * *must* be a struct buf.  We pass this
+ * structure to the I/O routines instead of a
+ * struct buf in order to be able to locate the
+ * high-level request when it completes.
+ *
+ * All offsets and lengths are in sectors.
+ */
+
+struct rqelement {
+    struct buf b;					    /* buf structure (must stay the first member) */
+    struct rqgroup *rqg;				    /* pointer to our group */
+    /* Information about the transfer */
+    daddr_t sdoffset;					    /* offset in subdisk */
+    int useroffset;					    /* offset in user buffer of normal data */
+    /*
+     * dataoffset and datalen refer to "individual" data
+     * transfers which involve only this drive (normal read,
+     * parityless write); groupoffset and grouplen refer to
+     * "group" operations (recovery read) which involve more
+     * than one drive.  Both the offsets are relative to the
+     * start of the local buffer.  NOTE(review): this comment
+     * used to class degraded write as "individual" and normal
+     * write as "group"; XFR_DATAOP/XFR_GROUPOP say the reverse.
+     */
+    int dataoffset;					    /* offset in buffer of the normal data */
+    int groupoffset;					    /* offset in buffer of group data */
+    short datalen;					    /* length of normal data (sectors) */
+    short grouplen;					    /* length of group data (sectors) */
+    short buflen;					    /* total buffer length to allocate */
+    short flags;					    /* really enum xferinfo (see above) */
+    /* Ways to find other components */
+    short sdno;						    /* subdisk number */
+    short driveno;					    /* drive number */
+    struct timeval launchtime;				    /* time of launch, for info function */
+};
+
+/*
+ * A group of requests built to satisfy an I/O
+ * transfer on a single plex.
+ */
+struct rqgroup {
+    struct rqgroup *next;				    /* pointer to next group */
+    struct request *rq;					    /* pointer to the request */
+    short count;					    /* number of requests in this group */
+    short active;					    /* and number active */
+    short plexno;					    /* index of plex */
+    int badsdno;					    /* index of bad subdisk or -1 */
+    enum xferinfo flags;				    /* description of transfer */
+    struct rangelock *lock;				    /* lock for this transfer */
+    daddr_t lockbase;					    /* and lock address */
+    struct rqelement rqe[0];				    /* trailing zero-length array: the elements of this request (presumably count of them) */
+};
+
+/*
+ * Describe one high-level request and the
+ * work we have to do to satisfy it.
+ */
+struct request {
+    struct buf *bp;					    /* pointer to the high-level request */
+    caddr_t save_data;					    /* for copied write buffers */
+    enum xferinfo flags;				    /* XFR_* flags for the request */
+    union {
+	int volno;					    /* volume index */
+	int plexno;					    /* or plex index */
+    } volplex;						    /* presumably plexno applies when isplex is set -- TODO confirm */
+    int error;						    /* current error indication */
+    int sdno;						    /* reviving subdisk (XFR_REVIVECONFLICT) */
+    short isplex;					    /* set if this is a plex request */
+    short active;					    /* number of subrequests still active */
+    struct rqgroup *rqg;				    /* pointer to the first group of requests */
+    struct rqgroup *lrqg;				    /* and to the last group of requests */
+    struct request *next;				    /* link of waiting requests */
+};
+
+/*
+ * Extended buffer header for subdisk I/O.  Includes
+ * a pointer to the user I/O request.
+ */
+struct sdbuf {
+    struct buf b;					    /* our buffer (first member, as in struct rqelement) */
+    struct buf *bp;					    /* and pointer to parent (the user I/O request) */
+    short driveno;					    /* drive index */
+    short sdno;						    /* and subdisk index */
+};
+
+/*
+ * Values returned by rqe and friends.  Be careful
+ * with these: they are in order of increasing
+ * seriousness.  Some routines check for
+ * > REQUEST_RECOVERED to indicate a failed request. XXX
+ */
+enum requeststatus {
+    REQUEST_OK,						    /* request built OK (lowest value) */
+    REQUEST_RECOVERED,					    /* request OK, but involves RAID5 recovery */
+    REQUEST_DEGRADED,					    /* parts of request failed (first value > REQUEST_RECOVERED) */
+    REQUEST_EOF,					    /* parts of request failed: outside plex */
+    REQUEST_DOWN,					    /* all of request failed: subdisk(s) down */
+    REQUEST_ENOMEM					    /* all of request failed: ran out of memory */
+};
+
+#ifdef VINUMDEBUG
+/* Trace entry for request info (DEBUG_LASTREQS) */
+enum rqinfo_type {
+    loginfo_unused,					    /* never been used (first enumerator, value 0) */
+    loginfo_user_bp,					    /* this is the bp when strategy is called */
+    loginfo_user_bpl,					    /* and this is the bp at launch time */
+    loginfo_rqe,					    /* user RQE */
+    loginfo_iodone,					    /* iodone */
+    loginfo_raid5_data,					    /* write RAID-5 data block */
+    loginfo_raid5_parity,				    /* write RAID-5 parity block */
+    loginfo_sdio,					    /* subdisk I/O */
+    loginfo_sdiol,					    /* subdisk I/O launch */
+    loginfo_sdiodone,					    /* subdisk iodone */
+    loginfo_lockwait,					    /* wait for range lock */
+    loginfo_lock,					    /* lock range */
+    loginfo_unlock,					    /* unlock range */
+};
+
+/*
+ * This is the rangelock structure with an added
+ * buffer pointer and plex number.  We don't need
+ * the plex number for the locking protocol, but
+ * it does help a lot when logging.
+ */
+struct rangelockinfo {
+    daddr_t stripe;					    /* address + 1 of the range being locked  */
+    struct buf *bp;					    /* user's buffer pointer */
+    int plexno;						    /* plex number: not needed for locking, kept for logging */
+};
+
+union rqinfou {						    /* info to pass to logrq */
+    struct buf *bp;					    /* a buffer header */
+    struct rqelement *rqe;				    /* address of request, for correlation */
+    struct rangelockinfo *lockinfo;			    /* range lock details (see struct rangelockinfo) */
+};
+
+struct rqinfo {
+    enum rqinfo_type type;				    /* kind of event */
+    struct timeval timestamp;				    /* time it happened */
+    struct buf *bp;					    /* point to user buffer */
+    int devmajor;					    /* major and minor device info */
+    int devminor;
+    union {
+	struct buf b;					    /* yup, the *whole* buffer header */
+	struct rqelement rqe;				    /* and the whole rqe */
+	struct rangelock lockinfo;			    /* NOTE(review): rqinfou.lockinfo is a struct rangelockinfo * -- confirm type */
+    } info;
+};
+
+#define RQINFO_SIZE 128					    /* number of info slots in buffer */
+
+void logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp);
+#endif
+
+/* Structures for the daemon */
+
+/* types of request to the daemon */
+enum daemonrq {
+    daemonrq_none,					    /* dummy (first enumerator, value 0) to catch bugs */
+    daemonrq_ioerror,					    /* error occurred on I/O */
+    daemonrq_saveconfig,				    /* save configuration */
+    daemonrq_return,					    /* return to userland */
+    daemonrq_ping,					    /* show sign of life */
+    daemonrq_init,					    /* initialize a plex */
+    daemonrq_revive,					    /* revive a subdisk */
+    daemonrq_closedrive,				    /* close a drive */
+};
+
+/* info field for daemon requests */
+union daemoninfo {					    /* and the request information */
+    struct request *rq;					    /* for daemonrq_ioerror */
+    struct sd *sd;					    /* for daemonrq_revive */
+    struct plex *plex;					    /* for daemonrq_init */
+    struct drive *drive;				    /* for daemonrq_closedrive */
+    int nothing;					    /* for passing NULL, e.g. (union daemoninfo) 0 */
+};
+
+struct daemonq {
+    struct daemonq *next;				    /* pointer to next element in queue */
+    enum daemonrq type;					    /* type of request (enum daemonrq) */
+    int privateinuse;					    /* private element, being used */
+    union daemoninfo info;				    /* and the request information; member depends on type */
+};
+
+void queue_daemon_request(enum daemonrq type, union daemoninfo info);
+
+extern int daemon_options;
+
+enum daemon_option {					    /* bit values; presumably stored in daemon_options */
+    daemon_verbose = 1,					    /* talk about what we're doing */
+    daemon_stopped = 2,					    /* presumably: daemon is stopped -- TODO confirm */
+    daemon_noupdate = 4,				    /* don't update the disk config, for recovery */
+};
+
+void freerq(struct request *rq);
+void unlockrange(int plexno, struct rangelock *);
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/statetexts.h b/sys/dev/vinum/statetexts.h
new file mode 100644
index 0000000..88cfc17
--- /dev/null
+++ b/sys/dev/vinum/statetexts.h
@@ -0,0 +1,91 @@
+/* Created by ./makestatetext on Wed Jan 5 10:05:30 CST 2000. Do not edit */
+
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $FreeBSD$
+ */
+
+/* Drive state texts (in vinumstate.h order; presumably indexed by drive state) */
+char *drivestatetext[] =
+{
+    "unallocated",
+    "referenced",
+    "down",
+    "up",
+};
+
+/* Subdisk state texts (in vinumstate.h order; presumably indexed by subdisk state) */
+char *sdstatetext[] =
+{
+    "unallocated",
+    "uninit",
+    "referenced",
+    "init",
+    "empty",
+    "initializing",
+    "initialized",
+    "obsolete",
+    "stale",
+    "crashed",
+    "down",
+    "reviving",
+    "reborn",
+    "up",
+};
+
+/* Plex state texts (in vinumstate.h order; presumably indexed by plex state) */
+char *plexstatetext[] =
+{
+    "unallocated",
+    "referenced",
+    "init",
+    "faulty",
+    "down",
+    "initializing",
+    "corrupt",
+    "degraded",
+    "flaky",
+    "up",
+};
+
+/* Volume state texts (in vinumstate.h order; presumably indexed by volume state) */
+char *volstatetext[] =
+{
+    "unallocated",
+    "uninit",
+    "down",
+    "up",
+};
diff --git a/sys/dev/vinum/vinum.c b/sys/dev/vinum/vinum.c
new file mode 100644
index 0000000..36dfa98
--- /dev/null
+++ b/sys/dev/vinum/vinum.c
@@ -0,0 +1,531 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinum.c,v 1.44 2003/05/23 00:50:55 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#define STATIC static					    /* nothing while we're testing */
+
+#include <dev/vinum/vinumhdr.h>
+#include <sys/sysproto.h>				    /* for sync(2) */
+#ifdef VINUMDEBUG
+#include <sys/reboot.h>
+int debug = 0;						    /* debug flags */
+extern int total_malloced;
+extern int malloccount;
+extern struct mc malloced[];
+#endif
+#include <dev/vinum/request.h>
+
+/* Character device switch for the vinum driver. */
+struct cdevsw vinum_cdevsw = {
+    .d_name = "vinum",
+    .d_maj = VINUM_CDEV_MAJOR,
+    .d_flags = D_DISK,
+    .d_open = vinumopen,
+    .d_close = vinumclose,
+    .d_ioctl = vinumioctl,
+    .d_strategy = vinumstrategy,
+    .d_read = physread,
+    .d_write = physwrite,
+};
+
+/* Called from vinum_modevent() when the module is loaded. */
+void vinumattach(void *);
+STATIC int vinum_modevent(module_t mod, modeventtype_t type, void *unused);
+STATIC void vinum_clone(void *arg, char *name, int namelen, dev_t * dev);
+
+struct _vinum_conf vinum_conf;				    /* configuration information */
+
+dev_t vinum_daemon_dev;
+dev_t vinum_super_dev;
+
+static eventhandler_tag dev_clone_tag;
+
+/*
+ * Mutexes for plex synchronization.  Ideally each plex
+ * should have its own mutex, but the fact that the plex
+ * struct can move makes that very complicated.  Instead,
+ * have plexes share these mutexes, selected by plex number
+ * modulo the number of mutexes.
+ */
+struct mtx plexmutex[PLEXMUTEXES];
+
+/*
+ * Attach the driver (called from vinum_modevent() on MOD_LOAD; dummy is
+ * unused).  Creates the control devices, allocates the initial object
+ * tables and plex mutexes, and honours the loader's autostart options.
+ */
+void
+vinumattach(void *dummy)
+{
+    char *envp;
+    int i;
+#define MUTEXNAMELEN 16
+    static char mutexname[PLEXMUTEXES][MUTEXNAMELEN];	    /* must outlive the mutexes */
+#if PLEXMUTEXES > 10000
+#error Increase size of MUTEXNAMELEN
+#endif
+/* modload should prevent multiple loads, so this is worth a panic */
+    if ((vinum_conf.flags & VF_LOADED) != 0)
+	panic("vinum: already loaded");
+
+    log(LOG_INFO, "vinum: loaded\n");
+#ifdef VINUMDEBUG
+    vinum_conf.flags |= VF_LOADED | VF_HASDEBUG;	    /* we're loaded now, and we support debug */
+#else
+    vinum_conf.flags |= VF_LOADED;			    /* we're loaded now */
+#endif
+
+    daemonq = NULL;					    /* initialize daemon's work queue */
+    dqend = NULL;
+
+    vinum_daemon_dev = make_dev(&vinum_cdevsw,
+	VINUM_DAEMON_MINOR,
+	UID_ROOT,
+	GID_WHEEL,
+	S_IRUSR | S_IWUSR,
+	"vinum/controld");
+    vinum_super_dev = make_dev(&vinum_cdevsw,
+	VINUM_SUPERDEV_MINOR,
+	UID_ROOT,
+	GID_WHEEL,
+	S_IRUSR | S_IWUSR,
+	"vinum/control");
+
+    vinum_conf.version = VINUMVERSION;			    /* note what version we are */
+
+    /* allocate space: drives... */
+    DRIVE = (struct drive *) Malloc(sizeof(struct drive) * INITIAL_DRIVES);
+    CHECKALLOC(DRIVE, "vinum: no memory\n");
+    bzero(DRIVE, sizeof(struct drive) * INITIAL_DRIVES);
+    vinum_conf.drives_allocated = INITIAL_DRIVES;	    /* number of drive slots allocated */
+    vinum_conf.drives_used = 0;				    /* and number in use */
+
+    /* volumes, ... */
+    VOL = (struct volume *) Malloc(sizeof(struct volume) * INITIAL_VOLUMES);
+    CHECKALLOC(VOL, "vinum: no memory\n");
+    bzero(VOL, sizeof(struct volume) * INITIAL_VOLUMES);
+    vinum_conf.volumes_allocated = INITIAL_VOLUMES;	    /* number of volume slots allocated */
+    vinum_conf.volumes_used = 0;			    /* and number in use */
+
+    /* plexes, ... */
+    PLEX = (struct plex *) Malloc(sizeof(struct plex) * INITIAL_PLEXES);
+    CHECKALLOC(PLEX, "vinum: no memory\n");
+    bzero(PLEX, sizeof(struct plex) * INITIAL_PLEXES);
+    vinum_conf.plexes_allocated = INITIAL_PLEXES;	    /* number of plex slots allocated */
+    vinum_conf.plexes_used = 0;				    /* and number in use */
+
+    /* mtx_init keeps the name pointer, so each name needs its own static storage */
+    for (i = 0; i < PLEXMUTEXES; i++) {
+	snprintf(mutexname[i], MUTEXNAMELEN, "vinumplex%d", i);
+	mtx_init(&plexmutex[i], mutexname[i], "plex", MTX_DEF);
+    }
+
+    /* and subdisks */
+    SD = (struct sd *) Malloc(sizeof(struct sd) * INITIAL_SUBDISKS);
+    CHECKALLOC(SD, "vinum: no memory\n");
+    bzero(SD, sizeof(struct sd) * INITIAL_SUBDISKS);
+    vinum_conf.subdisks_allocated = INITIAL_SUBDISKS;	    /* number of sd slots allocated */
+    vinum_conf.subdisks_used = 0;			    /* and number in use */
+    dev_clone_tag = EVENTHANDLER_REGISTER(dev_clone, vinum_clone, 0, 1000);
+
+    /*
+     * See if the loader has passed us any of the autostart
+     * options.
+     */
+    if ((envp = getenv("vinum.autostart")) != NULL) {	    /* start all drives now */
+	vinum_scandisk(NULL);
+	freeenv(envp);
+    } else if ((envp = getenv("vinum.drives")) != NULL) {
+	vinum_scandisk(envp);
+	freeenv(envp);
+    }
+}
+
+/*
+ * Report whether vinum is idle, i.e. nothing is open.
+ * Open volumes always count; if confopen is != 0, an
+ * open super device counts as well.
+ *
+ * Return 1 if inactive, 0 if something is open.
+ */
+int
+vinum_inactive(int confopen)
+{
+    int volno;
+    int inactive;
+
+    /* open by vinum(8)?  Then we can't do it. */
+    if (confopen && (vinum_conf.flags & VF_OPEN))
+	return 0;
+    inactive = 1;					    /* until an open volume turns up */
+    lock_config();
+    for (volno = 0; inactive && (volno < vinum_conf.volumes_allocated); volno++) {
+	/* VF_OPEN only counts on volumes above volume_down */
+	if ((VOL[volno].flags & VF_OPEN) && (VOL[volno].state > volume_down))
+	    inactive = 0;
+    }
+    unlock_config();
+    return inactive;
+}
+
+/*
+ * Tear down all vinum objects and release their tables.
+ * If cleardrive is 0 the drives are only freed, keeping
+ * the configuration; otherwise each drive is removed,
+ * taking the vinum configuration with it.
+ *
+ * The caller must ensure beforehand that no volumes are open.
+ */
+void
+free_vinum(int cleardrive)
+{
+    const int stopping = VF_STOPPING | VF_DAEMONOPEN;
+    int drivecount = vinum_conf.drives_allocated;	    /* snapshot taken up front */
+    int objno;
+
+    /* we're stopping with at least one daemon open: make it return */
+    while ((vinum_conf.flags & stopping) == stopping) {
+	queue_daemon_request(daemonrq_return, (union daemoninfo) 0); /* stop the daemon */
+	tsleep(&vinumclose, PUSER, "vstop", 1);		    /* and wait for it */
+    }
+
+    /* drives first ... */
+    if (DRIVE != NULL) {
+	for (objno = 0; objno < drivecount; objno++) {
+	    if (cleardrive)
+		remove_drive(objno);			    /* remove the vinum config */
+	    else
+		free_drive(&DRIVE[objno]);		    /* keep it: close files and things */
+	}
+	Free(DRIVE);
+    }
+
+    /* ... then subdisks, plexes and volumes that are in use */
+    if (SD != NULL) {
+	for (objno = 0; objno < vinum_conf.subdisks_allocated; objno++)
+	    if (SD[objno].state != sd_unallocated)
+		free_sd(objno);
+	Free(SD);
+    }
+
+    if (PLEX != NULL) {
+	for (objno = 0; objno < vinum_conf.plexes_allocated; objno++)
+	    if (PLEX[objno].state != plex_unallocated)	    /* we have real data there */
+		free_plex(objno);
+	Free(PLEX);
+    }
+
+    if (VOL != NULL) {
+	for (objno = 0; objno < vinum_conf.volumes_allocated; objno++)
+	    if (VOL[objno].state != volume_unallocated)
+		free_volume(objno);
+	Free(VOL);
+    }
+
+    /* wipe the whole configuration, then reinstate the version number */
+    bzero(&vinum_conf, sizeof(vinum_conf));
+    vinum_conf.version = VINUMVERSION;
+}
+
+/* Module event handler.  MOD_LOAD attaches the driver; MOD_UNLOAD returns */
+/* EBUSY while anything is open, else tears it down.  Other events: 0.  */
+STATIC int
+vinum_modevent(module_t mod, modeventtype_t type, void *unused)
+{
+    struct sync_args dummyarg = {0};
+    int i;
+
+    switch (type) {
+    case MOD_LOAD:
+	vinumattach(NULL);
+	return 0;					    /* OK */
+    case MOD_UNLOAD:
+	if (!vinum_inactive(1))				    /* is anything open? */
+	    return EBUSY;				    /* yes, we can't do it */
+	vinum_conf.flags |= VF_STOPPING;		    /* note that we want to stop */
+	sync(curthread, &dummyarg);			    /* write out buffers */
+	free_vinum(0);					    /* clean up */
+#ifdef VINUMDEBUG
+	if (total_malloced) {
+#ifdef INVARIANTS
+	    int *poke;
+#endif
+
+	    for (i = 0; i < malloccount; i++) {
+		if (debug & DEBUG_WARNINGS)		    /* want to hear about them */
+		    log(LOG_WARNING,
+			"vinum: exiting with %d bytes malloced from %s:%d\n",
+			malloced[i].size,
+			malloced[i].file,
+			malloced[i].line);
+#ifdef INVARIANTS
+		poke = &((int *) malloced[i].address)
+		    [malloced[i].size / (2 * sizeof(int))]; /* middle of the area */
+		if (*poke == 0xdeadc0de)		    /* already freed */
+		    log(LOG_ERR,
+			"vinum: exiting with malloc table inconsistency at %p from %s:%d\n",
+			malloced[i].address,
+			malloced[i].file,
+			malloced[i].line);
+#endif
+		Free(malloced[i].address);
+	    }
+	}
+#endif
+	destroy_dev(vinum_daemon_dev);			    /* daemon device */
+	destroy_dev(vinum_super_dev);
+	for (i = 0; i < PLEXMUTEXES; i++)
+	    mtx_destroy(&plexmutex[i]);
+	log(LOG_INFO, "vinum: unloaded\n");		    /* tell the world */
+	EVENTHANDLER_DEREGISTER(dev_clone, dev_clone_tag);
+	return 0;
+    default:
+	break;
+    }
+    return 0;
+}
+
+static moduledata_t vinum_mod =
+{
+    "vinum",						    /* module name */
+    (modeventhand_t) vinum_modevent,			    /* event handler (load/unload) */
+    0							    /* presumably the handler's extra argument; unused */
+};
+DECLARE_MODULE(vinum, vinum_mod, SI_SUB_RAID, SI_ORDER_MIDDLE);
+
+/* ARGSUSED */
+/*
+ * Open a vinum object.  dev identifies the object and td
+ * the opening thread; flags and fmt are not used.
+ *
+ * The daemon and super devices require superuser privilege.
+ * Volumes, plexes and subdisks must exist and be in a state
+ * that permits opening.  A successful open sets VF_OPEN on
+ * the object (VF_DAEMONOPEN or VF_OPEN in vinum_conf for the
+ * two control devices).
+ *
+ * Returns 0 on success, otherwise ENXIO (no such object), EIO
+ * (volume down) or EINVAL (object in an unusable state), or
+ * the error from suser().  A device type not handled below
+ * also returns 0.
+ */
+int
+vinumopen(dev_t dev,
+    int flags,
+    int fmt,
+    struct thread *td)
+{
+    int error = 0;
+    int devminor = minor(dev);				    /* minor number */
+    unsigned int index;
+    struct volume *vol;
+    struct plex *plex;
+    struct sd *sd;
+
+    /* First, decide what we're looking at */
+    switch (DEVTYPE(dev)) {
+    case VINUM_VOLUME_TYPE:
+	/*
+	 * The super device and daemon device are the last two
+	 * volume numbers, so deal with them before real volumes.
+	 */
+	if (devminor == VINUM_DAEMON_MINOR) {		    /* daemon device */
+	    error = suser(td);				    /* are we root? */
+	    if (error == 0)				    /* yes, can do */
+		vinum_conf.flags |= VF_DAEMONOPEN;	    /* we're open */
+	    return error;
+	}
+	if (devminor == VINUM_SUPERDEV_MINOR) {		    /* normal super device */
+	    error = suser(td);				    /* are we root? */
+	    if (error == 0)				    /* yes, can do */
+		vinum_conf.flags |= VF_OPEN;		    /* we're open */
+	    return error;
+	}
+
+	/* Must be a real volume.  Check. */
+	index = Volno(dev);
+	if (index >= vinum_conf.volumes_allocated)
+	    return ENXIO;				    /* no such device */
+	vol = &VOL[index];
+
+	/* only a volume that is up can be opened */
+	if (vol->state == volume_up) {
+	    vol->flags |= VF_OPEN;			    /* note we're open */
+	    return 0;
+	}
+	if (vol->state == volume_down)
+	    return EIO;
+	if ((vol->state == volume_unallocated)
+	    || (vol->state == volume_uninit))
+	    return ENXIO;
+	return EINVAL;					    /* any other state */
+
+    case VINUM_PLEX_TYPE:
+	index = Plexno(dev);				    /* get plex index in vinum_conf */
+	if (index >= vinum_conf.plexes_allocated)
+	    return ENXIO;				    /* no such device */
+	plex = &PLEX[index];
+
+	/* every state but unallocated and referenced can be opened */
+	if (plex->state == plex_unallocated)
+	    return ENXIO;
+	if (plex->state == plex_referenced)
+	    return EINVAL;
+	plex->flags |= VF_OPEN;				    /* note we're open */
+	return 0;
+
+    case VINUM_SD_TYPE:
+    case VINUM_SD2_TYPE:
+	index = Sdno(dev);				    /* get the subdisk number */
+	if (index >= vinum_conf.subdisks_allocated)	    /* not a valid SD entry */
+	    return ENXIO;				    /* no such device */
+	sd = &SD[index];
+
+	/*
+	 * Opening a subdisk is always a special operation, so
+	 * the state is ignored as long as it represents a real
+	 * subdisk.
+	 */
+	if (sd->state == sd_unallocated)
+	    return ENXIO;
+	if ((sd->state == sd_uninit)
+	    || (sd->state == sd_referenced))
+	    return EINVAL;
+	sd->flags |= VF_OPEN;				    /* note we're open */
+	return 0;
+    }
+    return 0;						    /* to keep the compiler happy */
+}
+
+/* ARGSUSED */
+/*
+ * Close a vinum object, clearing the open flag that vinumopen set on
+ * it (plexes and subdisks included).  Returns 0 or an errno value.
+ */
+int
+vinumclose(dev_t dev,
+    int flags,
+    int fmt,
+    struct thread *td)
+{
+    unsigned int index;
+    struct volume *vol;
+    int devminor;
+
+    devminor = minor(dev);
+    /* First, decide what we're looking at */
+    switch (DEVTYPE(dev)) {
+    case VINUM_VOLUME_TYPE:
+	/* The super and daemon devices are the last two volume numbers. */
+	if ((devminor == VINUM_DAEMON_MINOR)		    /* daemon device */
+	||(devminor == VINUM_SUPERDEV_MINOR)) {		    /* or normal super device */
+	    /* don't worry about whether we're root: nobody else would get this far */
+	    if (devminor == VINUM_SUPERDEV_MINOR)	    /* normal superdev */
+		vinum_conf.flags &= ~VF_OPEN;		    /* no longer open */
+	    else {					    /* the daemon device */
+		vinum_conf.flags &= ~VF_DAEMONOPEN;	    /* no longer open */
+		if (vinum_conf.flags & VF_STOPPING)	    /* we're trying to stop, */
+		    wakeup(&vinumclose);		    /* we can continue now */
+	    }
+	    return 0;
+	}
+	/* Real volume */
+	index = Volno(dev);
+	if (index >= vinum_conf.volumes_allocated)
+	    return ENXIO;				    /* no such device */
+	vol = &VOL[index];
+
+	switch (vol->state) {
+	case volume_unallocated:
+	case volume_uninit:
+	    return ENXIO;
+	case volume_up:
+	    vol->flags &= ~VF_OPEN;			    /* reset our flags */
+	    return 0;
+	case volume_down:
+	    return EIO;
+	default:
+	    return EINVAL;
+	}
+
+    case VINUM_PLEX_TYPE:
+	index = Plexno(dev);				    /* same lookup as vinumopen */
+	if ((index >= vinum_conf.plexes_allocated)	    /* no such device */
+	    || (PLEX[index].state == plex_unallocated))	    /* or no such plex */
+	    return ENXIO;
+	PLEX[index].flags &= ~VF_OPEN;			    /* undo vinumopen */
+	return 0;
+
+    case VINUM_SD_TYPE:
+    case VINUM_SD2_TYPE:
+	index = Sdno(dev);				    /* same lookup as vinumopen */
+	if ((index >= vinum_conf.subdisks_allocated)	    /* not a valid SD entry */
+	    || (SD[index].state == sd_unallocated))	    /* or no such subdisk */
+	    return ENXIO;
+	SD[index].flags &= ~VF_OPEN;			    /* undo vinumopen */
+	return 0;
+    default:
+	return ENODEV;					    /* don't know what to do with these */
+    }
+}
+
+/*
+ * dev_clone handler: map the name "vinum/<volume>" to that volume's dev_t.
+ */
+void
+vinum_clone(void *arg, char *name, int namelen, dev_t * dev)
+{
+    static const char prefix[] = "vinum/";
+    const int prefixlen = sizeof(prefix) - 1;		    /* without the NUL */
+    int volno;
+
+    /* leave *dev alone if it is already set or the name is not ours */
+    if ((*dev != NODEV) || (strncmp(name, prefix, prefixlen) != 0))
+	return;
+
+    volno = find_volume(name + prefixlen, 0);
+    if (volno != -1)					    /* found the volume */
+	*dev = VOL[volno].dev;
+}
+
+
+/* Local Variables: */
+/* fill-column: 60 */
+/* End: */
diff --git a/sys/dev/vinum/vinumconfig.c b/sys/dev/vinum/vinumconfig.c
new file mode 100644
index 0000000..2c00921
--- /dev/null
+++ b/sys/dev/vinum/vinumconfig.c
@@ -0,0 +1,2148 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumconfig.c,v 1.41 2003/05/23 00:57:34 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#define STATIC static
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+#define MAXTOKEN 64					    /* maximum number of tokens in a line */
+
+/*
+ * We can afford the luxury of global variables here,
+ * since start_config ensures that these functions
+ * are single-threaded.
+ */
+
+/* These are indices in vinum_conf of the last-mentioned of each kind of object */
+static int current_drive;				    /* note the last drive we mention, for
+							    * some defaults */
+static int current_plex;				    /* and the same for the last plex */
+static int current_volume;				    /* and the last volume */
+/*
+ * Reply structure for the ioctl in progress.  When it is
+ * non-NULL (and we are not reading the configuration from
+ * disk), throw_rude_remark stores its message and error
+ * number here instead of printing to the console.
+ */
+static struct _ioctl_reply *ioctl_reply;		    /* struct to return via ioctl */
+
+
+/* These values are used by most of these routines, so set them as globals */
+static char *token[MAXTOKEN];				    /* pointers to individual tokens */
+static int tokens;					    /* number of tokens */
+
+/*
+ * Output destination flags.  NOTE(review): presumably these
+ * are the values stored in putchar_arg.flags below; nothing
+ * in this part of the file uses them -- confirm.
+ */
+#define TOCONS	0x01
+#define TOTTY	0x02
+#define TOLOG	0x04
+
+/* NOTE(review): looks like a private copy of the kernel printf argument block -- confirm it is still needed */
+struct putchar_arg {
+    int flags;
+    struct tty *tty;
+};
+
+/* Size of the temporary buffer in which throw_rude_remark formats its message */
+#define MSG_MAX 1024					    /* maximum length of a formatted message */
+/*
+ * Format an error message and return it to the user in the
+ * ioctl reply, or print it on the console if there is no
+ * reply structure or we are reading the configuration from
+ * disk.
+ *
+ * CARE: This routine is designed to be called only from the
+ * configuration routines, so it assumes it's the owner of
+ * the configuration lock (it panics if VF_LOCKED is not
+ * set), and unlocks it on exit by calling finish_config.
+ *
+ * error is the error number to store in the reply and to
+ * pass to longjmp; msg and the arguments after it are a
+ * printf-style format and its parameters.
+ *
+ * While VF_READING_CONFIG is set, the function disables
+ * configuration updates and RETURNS to its caller.
+ * Otherwise it does not return: it longjmps to
+ * command_fail.  Callers must cope with both.
+ */
+void
+throw_rude_remark(int error, char *msg,...)
+{
+    va_list ap;
+    char *text;
+    static int finishing;				    /* don't recurse */
+    int was_finishing;
+
+    if ((vinum_conf.flags & VF_LOCKED) == 0)		    /* bug catcher */
+	panic("throw_rude_remark: called without config lock");
+    va_start(ap, msg);
+    if ((ioctl_reply != NULL)				    /* we're called from the user */
+    &&(!(vinum_conf.flags & VF_READING_CONFIG))) {	    /* and not reading from disk: return msg */
+	/*
+	 * We can't just format to ioctl_reply, since it
+	 * may contain our input parameters
+	 */
+	text = Malloc(MSG_MAX);
+	if (text == NULL) {
+	    log(LOG_ERR, "vinum: can't allocate error message buffer\n");
+	    printf("vinum: ");
+	    vprintf(msg, ap);				    /* print to the console */
+	    printf("\n");
+	} else {
+	    /*
+	     * Bound the formatting to the buffer we
+	     * allocated: an unbounded format could write
+	     * past MSG_MAX bytes.  vsnprintf always
+	     * terminates the string.
+	     */
+	    vsnprintf(text, MSG_MAX, msg, ap);
+	    strlcpy(ioctl_reply->msg, text, sizeof(ioctl_reply->msg));
+	    ioctl_reply->error = error;			    /* first byte is the error number */
+	    Free(text);
+	}
+    } else {
+	printf("vinum: ");
+	vprintf(msg, ap);				    /* print to the console */
+	printf("\n");
+    }
+    va_end(ap);
+
+    if (vinum_conf.flags & VF_READING_CONFIG) {		    /* go through to the bitter end, */
+	if ((daemon_options & daemon_noupdate) == 0) {	    /* but don't write back what we have */
+	    log(LOG_NOTICE, "Disabling configuration updates\n");
+	    daemon_options |= daemon_noupdate;
+	}
+	return;
+    }
+    /*
+     * We have a problem here: we want to unlock the
+     * configuration, which implies tidying up, but
+     * if we find an error while tidying up, we
+     * could recurse for ever.  Use this kludge to
+     * only try once.
+     */
+    was_finishing = finishing;
+    finishing = 1;
+    finish_config(was_finishing);			    /* unlock anything we may be holding */
+    finishing = was_finishing;
+    longjmp(command_fail, error);
+}
+
+/*
+ * Look for plex number plexno in the plex table of volume
+ * volno.  Return its index in vol->plex, or -1 if the
+ * volume does not reference that plex.
+ */
+int
+my_plex(int volno, int plexno)
+{
+    struct volume *vol = &VOL[volno];			    /* the volume to search */
+    int plexindex = 0;
+
+    while (plexindex < vol->plexes) {			    /* only the entries in use */
+	if (vol->plex[plexindex] == plexno)
+	    return plexindex;				    /* it's ours */
+	plexindex++;
+    }
+    return -1;						    /* not one of ours */
+}
+
+/*
+ * Look for subdisk number sdno in the subdisk table of plex
+ * plexno.  Return its index in plex->sdnos, or -1 if the
+ * plex does not reference that subdisk.
+ */
+int
+my_sd(int plexno, int sdno)
+{
+    struct plex *plex = &PLEX[plexno];			    /* the plex to search */
+    int sdindex = 0;
+
+    while (sdindex < plex->subdisks) {			    /* only the entries in use */
+	if (plex->sdnos[sdindex] == sdno)
+	    return sdindex;				    /* it's ours */
+	sdindex++;
+    }
+    return -1;						    /* not one of ours */
+}
+
+/*
+ * Add plex plexno to volume volno if possible.
+ *
+ * If the plex is not yet part of the volume, it is appended
+ * to vol->plex, its volno is set, and the volume size is
+ * raised to the length of its longest plex.  If the volume
+ * already had plexes and VF_CONFIG_SETUPSTATE is not set,
+ * the subdisks of the new plex are first marked stale.
+ *
+ * If preferme is non-zero, the plex becomes the preferred
+ * plex of the volume.
+ *
+ * Return the index of the plex in vol->plex.  If the volume
+ * is full, complain with throw_rude_remark; should that
+ * return (it does while the configuration is being read
+ * from disk), return -1 without touching the volume.
+ */
+int
+give_plex_to_volume(int volno, int plexno, int preferme)
+{
+    struct volume *vol;
+    int i;
+    int volplexno;
+
+    /*
+     * It's not an error for the plex to already
+     * belong to the volume, but we need to check a
+     * number of things to make sure it's done right.
+     * Some day.
+     */
+    volplexno = my_plex(volno, plexno);
+    vol = &VOL[volno];					    /* point to volume */
+    if (volplexno < 0) {
+	if (vol->plexes >= MAXPLEX) {			    /* all plexes allocated */
+	    throw_rude_remark(ENOSPC,
+		"Too many plexes for volume %s",
+		vol->name);
+	    /*
+	     * throw_rude_remark comes back here when we're
+	     * reading the configuration from disk.  Don't
+	     * go on to store beyond the last of the MAXPLEX
+	     * entries of vol->plex.
+	     */
+	    return -1;
+	}
+	if ((vol->plexes > 0)				    /* we have other plexes */
+	&&((vol->flags & VF_CONFIG_SETUPSTATE) == 0))	    /* and we're not setting up state */
+	    invalidate_subdisks(&PLEX[plexno], sd_stale);   /* make our subdisks invalid */
+	vol->plex[vol->plexes] = plexno;		    /* this one */
+	vol->plexes++;					    /* add another plex */
+	PLEX[plexno].volno = volno;			    /* note the number of our volume */
+
+	/* Find out how big our volume is */
+	for (i = 0; i < vol->plexes; i++)
+	    vol->size = max(vol->size, PLEX[vol->plex[i]].length);
+	volplexno = vol->plexes - 1;			    /* number of plex in volume */
+    }
+    if (preferme) {
+	if (vol->preferred_plex >= 0)			    /* already had a favourite, */
+	    printf("vinum: changing preferred plex for %s from %s to %s\n",
+		vol->name,
+		PLEX[vol->plex[vol->preferred_plex]].name,
+		PLEX[plexno].name);
+	vol->preferred_plex = volplexno;
+    }
+    return volplexno;
+}
+
+/*
+ * Add subdisk sdno to plex plexno if possible.
+ *
+ * If the subdisk already belongs to the plex, just return
+ * its index.  Otherwise:
+ *
+ * - if the subdisk has no plex offset (< 0), place it after
+ *   the last subdisk (concat) or at stripesize * number of
+ *   subdisks (striped and parity plexes), or at 0 if it is
+ *   the first subdisk;
+ * - complain via throw_rude_remark if the plex already has
+ *   MAXSD subdisks;
+ * - grow the subdisk table if necessary, and adjust the
+ *   length of the plex and the size of its volume;
+ * - insert the subdisk into plex->sdnos, which is kept
+ *   sorted by plex offset.
+ *
+ * Return the index of the subdisk in plex->sdnos.
+ */
+int
+give_sd_to_plex(int plexno, int sdno)
+{
+    int i;
+    int j;
+    struct plex *plex;
+    struct sd *sd;
+
+    /*
+     * It's not an error for the sd to already
+     * belong to the plex, but we need to check a
+     * number of things to make sure it's done right.
+     * Some day.
+     */
+    i = my_sd(plexno, sdno);
+    if (i >= 0)						    /* does it already belong to us? */
+	return i;					    /* that's it */
+
+    plex = &PLEX[plexno];				    /* point to the plex */
+    sd = &SD[sdno];					    /* and the subdisk */
+
+    /* Do we have an offset?  Otherwise put it after the last one */
+    if (sd->plexoffset < 0) {				    /* no offset specified */
+	if (plex->subdisks > 0) {
+	    struct sd *lastsd = &SD[plex->sdnos[plex->subdisks - 1]]; /* last subdisk */
+
+	    if (plex->organization == plex_concat)	    /* concat, */
+		sd->plexoffset = lastsd->sectors + lastsd->plexoffset; /* starts here */
+	    else					    /* striped, RAID-4 or RAID-5 */
+		sd->plexoffset = plex->stripesize * plex->subdisks; /* starts here */
+	} else						    /* first subdisk */
+	    sd->plexoffset = 0;				    /* start at the beginning */
+    }
+    if (plex->subdisks == MAXSD)			    /* we already have our maximum */
+	throw_rude_remark(ENOSPC,			    /* crap out */
+	    "Can't add %s to %s: plex full",
+	    sd->name,
+	    plex->name);
+
+    plex->subdisks++;					    /* another entry */
+    if (plex->subdisks >= plex->subdisks_allocated)	    /* need more space */
+	EXPAND(plex->sdnos, int, plex->subdisks_allocated, INITIAL_SUBDISKS_IN_PLEX);
+
+    /* Adjust size of plex and volume. */
+    if (isparity(plex))					    /* RAID-4 or RAID-5 */
+	plex->length = (plex->subdisks - 1) * sd->sectors;  /* size is one disk short */
+    else
+	plex->length += sd->sectors;			    /* plex gets this much bigger */
+    if (plex->volno >= 0)				    /* we have a volume */
+	VOL[plex->volno].size = max(VOL[plex->volno].size, plex->length); /* adjust its size */
+
+    /*
+     * We need to check that the subdisks don't overlap,
+     * but we can't do that until a point where we *must*
+     * know the size of all the subdisks.  That's not
+     * here.  But we need to sort them by offset.
+     *
+     * Find the first existing subdisk with a larger
+     * offset.  If there is none, i ends up at the new
+     * last entry.
+     */
+    for (i = 0; i < plex->subdisks - 1; i++) {
+	if (sd->plexoffset < SD[plex->sdnos[i]].plexoffset) /* it fits before this one */
+	    break;
+    }
+
+    /*
+     * Make room by moving the following subdisks up one
+     * at a time.  Their index in the plex changes, so
+     * keep the plexsdno of each in step.
+     */
+    for (j = plex->subdisks - 1; j > i; j--) {
+	plex->sdnos[j] = plex->sdnos[j - 1];
+	SD[plex->sdnos[j]].plexsdno = j;
+    }
+
+    /*
+     * Insert the subdisk.  Record the owning plex no
+     * matter where in the table the subdisk lands.
+     */
+    plex->sdnos[i] = sdno;
+    sd->plexsdno = i;					    /* note where we are in the plex */
+    sd->plexno = plex->plexno;				    /* and who we belong to */
+    return i;
+}
+
+/*
+ * Add a subdisk to drive if possible.  The
+ * pointer to the drive must already be stored in
+ * the sd structure, but the drive doesn't know
+ * about the subdisk yet.
+ *
+ * Three kinds of request are handled:
+ *
+ * - sd->sectors == 0: take the largest free list entry
+ *   of the drive in its entirety;
+ * - sd->driveoffset < 0: take the first free list entry
+ *   which is large enough;
+ * - otherwise: the area at the given offset must lie
+ *   within a single free list entry.
+ *
+ * If the drive is not up, only update the subdisk state.
+ * Errors are reported with throw_rude_remark, which may
+ * return if we are reading the configuration from disk.
+ */
+void
+give_sd_to_drive(int sdno)
+{
+    struct sd *sd;					    /* pointer to subdisk */
+    struct drive *drive;				    /* and drive */
+    int fe;						    /* index in free list */
+    int sfe;						    /* and index of subdisk when assigning max */
+    char sdname[sizeof(sd->name)];			    /* name for messages: free_sd clears the sd */
+
+    sd = &SD[sdno];					    /* point to sd */
+    drive = &DRIVE[sd->driveno];			    /* and drive */
+
+    if (drive->state != drive_up) {
+	update_sd_state(sdno);				    /* that crashes the subdisk */
+	return;
+    }
+    strlcpy(sdname, sd->name, sizeof(sdname));		    /* keep the name for error messages */
+    sd->sectorsize = drive->sectorsize;			    /* get sector size from drive */
+    if (drive->flags & VF_HOTSPARE)			    /* the drive is a hot spare, */
+	throw_rude_remark(ENOSPC,
+	    "Can't place %s on hot spare drive %s",
+	    sd->name,
+	    drive->label.name);
+    if ((drive->sectors_available == 0)			    /* no space left */
+    ||(sd->sectors > drive->sectors_available)) {	    /* or too big, */
+	sd->driveoffset = -1;				    /* don't be confusing */
+	free_sd(sd->sdno);
+	throw_rude_remark(ENOSPC, "No space for %s on %s", sdname, drive->label.name);
+	return;						    /* in case we come back here */
+    }
+    drive->subdisks_used++;				    /* one more subdisk */
+
+    if (sd->sectors == 0) {				    /* take the largest chunk */
+	sfe = 0;					    /* to keep the compiler happy */
+	for (fe = 0; fe < drive->freelist_entries; fe++) {
+	    if (drive->freelist[fe].sectors >= sd->sectors) { /* more space here */
+		sd->sectors = drive->freelist[fe].sectors;  /* take it */
+		sd->driveoffset = drive->freelist[fe].offset;
+		sfe = fe;				    /* and note the index for later */
+	    }
+	}
+	if (sd->sectors == 0) {				    /* no luck, */
+	    sd->driveoffset = -1;			    /* don't be confusing */
+	    free_sd(sd->sdno);
+	    throw_rude_remark(ENOSPC,			    /* give up */
+		"No space for %s on %s",
+		sdname,
+		drive->label.name);
+	}
+	/*
+	 * Remove entry sfe: move down the entries after it.
+	 * There are freelist_entries - 1 - sfe of them.
+	 */
+	if (sfe < (drive->freelist_entries - 1))	    /* not the last one, */
+	    bcopy(&drive->freelist[sfe + 1],
+		&drive->freelist[sfe],
+		(drive->freelist_entries - 1 - sfe) * sizeof(struct drive_freelist));
+	drive->freelist_entries--;			    /* one less entry */
+	drive->sectors_available -= sd->sectors;	    /* and note how much less space we have */
+    } else if (sd->driveoffset < 0) {			    /* no offset specified, find one */
+	for (fe = 0; fe < drive->freelist_entries; fe++) {
+	    if (drive->freelist[fe].sectors >= sd->sectors) { /* it'll fit here */
+		sd->driveoffset = drive->freelist[fe].offset;
+		if (sd->sectors == drive->freelist[fe].sectors) { /* used up the entire entry */
+		    if (fe < (drive->freelist_entries - 1)) /* not the last one, */
+			bcopy(&drive->freelist[fe + 1],
+			    &drive->freelist[fe],
+			    (drive->freelist_entries - 1 - fe) * sizeof(struct drive_freelist));
+		    drive->freelist_entries--;		    /* one less entry */
+		} else {
+		    drive->freelist[fe].sectors -= sd->sectors;	/* this much less space */
+		    drive->freelist[fe].offset += sd->sectors; /* this much further on */
+		}
+		drive->sectors_available -= sd->sectors;    /* and note how much less space we have */
+		break;
+	    }
+	}
+	if (sd->driveoffset < 0)
+	    /*
+	     * Didn't find anything.  Although the drive has
+	     * enough space, it's too fragmented
+	     */
+	{
+	    free_sd(sd->sdno);
+	    throw_rude_remark(ENOSPC, "No space for %s on %s", sdname, drive->label.name);
+	}
+    } else {						    /* specific offset */
+	/*
+	 * For a specific offset to work, the space must be
+	 * entirely in a single freelist entry.  Look for it.
+	 *
+	 * NOTE(review): if no entry ends at or after the end
+	 * of the subdisk, the loop finishes without allocating
+	 * anything and without complaining -- confirm whether
+	 * that is intended.
+	 */
+	u_int64_t sdend = sd->driveoffset + sd->sectors;    /* end of our subdisk */
+	for (fe = 0; fe < drive->freelist_entries; fe++) {
+	    u_int64_t dend = drive->freelist[fe].offset + drive->freelist[fe].sectors; /* end of entry */
+	    if (dend >= sdend) {			    /* fits before here */
+		if (drive->freelist[fe].offset > sd->driveoffset) { /* starts after the beginning of sd area */
+		    long long wanted = sd->driveoffset;	    /* remember what was asked for */
+
+		    sd->driveoffset = -1;		    /* don't be confusing */
+		    set_sd_state(sd->sdno, sd_down, setstate_force);
+		    throw_rude_remark(ENOSPC,
+			"No space for %s on drive %s at offset %lld",
+			sd->name,
+			drive->label.name,
+			wanted);
+		    return;
+		}
+		/*
+		 * We've found the space, and we can allocate it.
+		 * We don't need to say that to the subdisk, which
+		 * already knows about it.  We need to tell it to
+		 * the free list, though.  We have four possibilities:
+		 *
+		 * 1.  The subdisk exactly eats up the entry.  That's the
+		 *     same as above.
+		 * 2.  The subdisk starts at the beginning and leaves space
+		 *     at the end.
+		 * 3.  The subdisk starts after the beginning and leaves
+		 *     space at the end as well: we end up with another
+		 *     fragment.
+		 * 4.  The subdisk leaves space at the beginning and finishes
+		 *     at the end.
+		 */
+		drive->sectors_available -= sd->sectors;    /* note how much less space we have */
+		if (sd->driveoffset == drive->freelist[fe].offset) { /* 1 or 2 */
+		    if (sd->sectors == drive->freelist[fe].sectors) { /* 1: used up the entire entry */
+			if (fe < (drive->freelist_entries - 1))	/* not the last one, */
+			    bcopy(&drive->freelist[fe + 1],
+				&drive->freelist[fe],
+				(drive->freelist_entries - 1 - fe) * sizeof(struct drive_freelist));
+			drive->freelist_entries--;	    /* one less entry */
+		    } else {				    /* 2: space at the end */
+			drive->freelist[fe].sectors -= sd->sectors; /* this much less space */
+			drive->freelist[fe].offset += sd->sectors; /* this much further on */
+		    }
+		} else {				    /* 3 or 4 */
+		    drive->freelist[fe].sectors = sd->driveoffset - drive->freelist[fe].offset;
+		    if (dend > sdend) {			    /* 3: space at the end as well */
+			/*
+			 * Open a slot at fe + 1 by moving entry fe
+			 * and all after it up by one; the copy of
+			 * entry fe is overwritten just below.
+			 * NOTE(review): assumes the free list has
+			 * room for one more entry -- confirm.
+			 */
+			if (fe < (drive->freelist_entries - 1))	/* not the last one */
+			    bcopy(&drive->freelist[fe],	    /* move the rest up */
+				&drive->freelist[fe + 1],
+				(drive->freelist_entries - fe) * sizeof(struct drive_freelist));
+			drive->freelist_entries++;	    /* one more entry */
+			drive->freelist[fe + 1].offset = sdend;	/* second entry starts after sd */
+			drive->freelist[fe + 1].sectors = dend - sdend;	/* and is this long */
+		    }
+		}
+		break;
+	    }
+	}
+    }
+    drive->opencount++;					    /* one more subdisk attached */
+}
+
+/*
+ * Get an empty drive entry from the drive table.
+ *
+ * Reuse the first entry in state drive_unallocated; if
+ * there is none, EXPAND the table and use the first new
+ * entry.  The entry is cleared, numbered, flagged
+ * VF_NEWBORN and given the device name "unknown".
+ * Return its index.
+ */
+int
+get_empty_drive(void)
+{
+    int driveno = 0;
+    struct drive *drive;
+
+    /* skip over the entries which are in use */
+    while ((driveno < vinum_conf.drives_allocated)
+	&& (DRIVE[driveno].state != drive_unallocated))
+	driveno++;
+
+    if (driveno >= vinum_conf.drives_allocated)		    /* no free entry: make some more */
+	EXPAND(DRIVE, struct drive, vinum_conf.drives_allocated, INITIAL_DRIVES);
+
+    drive = &DRIVE[driveno];
+    bzero(drive, sizeof(struct drive));			    /* start from a clean slate */
+    strcpy(drive->devicename, "unknown");		    /* we don't know the device yet */
+    drive->flags |= VF_NEWBORN;				    /* newly born drive */
+    drive->driveno = driveno;				    /* the entry knows its own index */
+    return driveno;
+}
+
+/*
+ * Find the drive with label name `name' in the drive
+ * table and return its index.  Only entries beyond state
+ * drive_unallocated with a non-empty label name are
+ * considered.  name may be NULL, in which case nothing
+ * matches.
+ *
+ * If there is no match and create is 0, return -1.
+ * Otherwise take an empty entry, copy in the name (if
+ * any), set its state to drive_referenced and return its
+ * index.
+ *
+ * Don't mark the drive as allocated (XXX SMP)
+ */
+/* XXX check if we have it open from attach */
+int
+find_drive(const char *name, int create)
+{
+    int driveno;
+    struct drive *drive;
+
+    if (name != NULL) {
+	for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) {
+	    drive = &DRIVE[driveno];
+	    if (drive->state <= drive_unallocated)	    /* not a real one */
+		continue;
+	    if (drive->label.name[0] == '\0')		    /* it has no name */
+		continue;
+	    if (strcmp(drive->label.name, name) == 0)	    /* this is the one */
+		return driveno;
+	}
+    }
+    if (!create)					    /* not there, and he doesn't want one */
+	return -1;
+
+    /* not there: make a new entry which just records the name */
+    driveno = get_empty_drive();
+    drive = &DRIVE[driveno];
+    if (name != NULL)
+	strlcpy(drive->label.name, name, sizeof(drive->label.name));
+    drive->state = drive_referenced;			    /* in use, nothing worthwhile there */
+    return driveno;
+}
+
+/*
+ * Find a drive given its device name and return its index
+ * in the drive table.  devname must be valid: it is passed
+ * to strcmp unchecked.  Only entries beyond state
+ * drive_unallocated are considered.
+ *
+ * If there is no match and create is 0, return -1.
+ * Otherwise take an empty entry, store the device name in
+ * it (truncated to fit if necessary), set its state to
+ * drive_referenced and return its index.
+ */
+int
+find_drive_by_name(const char *devname, int create)
+{
+    int driveno;
+    struct drive *drive;
+
+    for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) {
+	drive = &DRIVE[driveno];			    /* point to drive */
+	if ((strcmp(drive->devicename, devname) == 0)	    /* it's this device */
+	&&(drive->state > drive_unallocated))		    /* and it's a real one: found */
+	    return driveno;
+    }
+
+    /* the drive isn't in the list.  Add it if he wants */
+    if (create == 0)					    /* don't want to create */
+	return -1;					    /* give up */
+
+    driveno = get_empty_drive();
+    drive = &DRIVE[driveno];
+    /*
+     * get_empty_drive has preset the name to "unknown",
+     * so copying only strlen(devname) bytes would leave
+     * the tail of that string behind a shorter name, and
+     * would leave a name which fills the field without a
+     * terminator.  strlcpy replaces the whole string and
+     * always terminates it.
+     */
+    strlcpy(drive->devicename,				    /* put in its name */
+	devname,
+	sizeof(drive->devicename));
+    drive->state = drive_referenced;			    /* in use, nothing worthwhile there */
+    return driveno;					    /* return the index */
+}
+
+/*
+ * Find an empty subdisk in the subdisk table.
+ *
+ * Reuse the first entry in state sd_unallocated; if there
+ * is none, EXPAND the table and use the first new entry.
+ * The entry is cleared and flagged VF_NEWBORN, and its
+ * plex, drive, size and both offsets are set to -1
+ * ("none").  Return its index.
+ */
+int
+get_empty_sd(void)
+{
+    int sdno = 0;
+    struct sd *sd;
+
+    /* skip over the entries which are in use */
+    while ((sdno < vinum_conf.subdisks_allocated)
+	&& (SD[sdno].state != sd_unallocated))
+	sdno++;
+
+    /*
+     * If we fell off the end, sdno is where we want the
+     * new entry, but the table is too small.  Enlarge it.
+     *
+     * XXX We should check for overflow here.  We
+     * shouldn't allocate more than VINUM_MAXSD
+     * subdisks (currently at least a quarter of a
+     * million).
+     */
+    if (sdno >= vinum_conf.subdisks_allocated)
+	EXPAND(SD, struct sd, vinum_conf.subdisks_allocated, INITIAL_SUBDISKS);
+
+    sd = &SD[sdno];
+    bzero(sd, sizeof(struct sd));			    /* start from a clean slate */
+    sd->flags |= VF_NEWBORN;				    /* newly born subdisk */
+    sd->driveno = -1;					    /* not on any drive */
+    sd->driveoffset = -1;				    /* so no offset there */
+    sd->plexno = -1;					    /* not in any plex */
+    sd->plexoffset = -1;				    /* so no offset there either */
+    sd->sectors = -1;					    /* and no size */
+    return sdno;
+}
+
+/*
+ * Return a drive to the free pool.
+ *
+ * Nothing is done unless the drive is beyond state
+ * drive_referenced or is marked open.  Otherwise, with the
+ * drive locked: close it if it is open, free its free
+ * list, and zero the whole entry.
+ */
+void
+free_drive(struct drive *drive)
+{
+    if ((drive->state <= drive_referenced)		    /* not a real drive */
+    &&((drive->flags & VF_OPEN) == 0))			    /* and not open either */
+	return;						    /* nothing to give back */
+
+    LOCKDRIVE(drive);
+    if (drive->flags & VF_OPEN) {			    /* how can it be open without a state? */
+	close_locked_drive(drive);			    /* close it anyway */
+	drive->state = drive_down;			    /* and note the fact */
+    }
+    if (drive->freelist != NULL)
+	Free(drive->freelist);
+    bzero(drive, sizeof(struct drive));			    /* this also sets drive_unallocated */
+    unlockdrive(drive);
+}
+
+/*
+ * Find the named subdisk in vinum_conf.sd.
+ *
+ * If create != 0, create an entry if it doesn't exist;
+ * the name is stored in the new entry, truncated to fit if
+ * necessary.
+ *
+ * Return index in vinum_conf.sd, or -1 if the subdisk
+ * doesn't exist and create is 0.
+ */
+int
+find_subdisk(const char *name, int create)
+{
+    int sdno;
+    struct sd *sd;
+
+    for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) {
+	if (strcmp(SD[sdno].name, name) == 0)		    /* found it */
+	    return sdno;
+    }
+
+    /* the subdisk isn't in the list.  Add it if he wants */
+    if (create == 0)					    /* don't want to create */
+	return -1;					    /* give up */
+
+    /* Allocate one and insert the name */
+    sdno = get_empty_sd();
+    sd = &SD[sdno];
+    /*
+     * Use strlcpy, like find_drive: a byte copy limited
+     * only by the size of the field leaves a name which
+     * fills it without a terminator, and the strcmp
+     * above would then run off the end.
+     */
+    strlcpy(sd->name, name, sizeof(sd->name));		    /* put in its name */
+    return sdno;					    /* return the index */
+}
+
+/*
+ * Return space to a drive: put the area of length sectors
+ * starting at offset back into the free list of drive
+ * driveno, merging it with adjacent free blocks where
+ * possible, and add it to sectors_available.
+ *
+ * Nothing is done unless the drive is up.
+ *
+ * NOTE(review): when a new entry has to be created, this
+ * assumes the free list has room for it -- confirm.
+ */
+void
+return_drive_space(int driveno, int64_t offset, int length)
+{
+    struct drive *drive;
+    int fe;						    /* free list entry */
+    u_int64_t sdend;					    /* end of our subdisk */
+    u_int64_t dend;					    /* end of our freelist entry */
+
+    drive = &DRIVE[driveno];
+    if (drive->state != drive_up)
+	return;
+    sdend = offset + length;				    /* end of our subdisk */
+
+    if (drive->freelist_entries == 0) {
+	/*
+	 * The drive was completely full: there is no
+	 * entry to compare with or merge into, so this
+	 * area becomes the only entry.
+	 */
+	drive->freelist[0].offset = offset;
+	drive->freelist[0].sectors = length;
+	drive->freelist_entries = 1;
+	drive->sectors_available += length;
+	return;
+    }
+    /* Look for where to return the sd address space */
+    for (fe = 0;
+	(fe < drive->freelist_entries) && (drive->freelist[fe].offset < offset);
+	fe++);
+    /*
+     * Now we are pointing past the last entry, or to the
+     * first entry which doesn't start before the subdisk.
+     * If there is an entry before that, step back to it:
+     * it's the only one the subdisk can follow.  This
+     * includes stepping back from entry 1 to entry 0.
+     */
+    if ((fe > 0)					    /* not the first entry */
+    &&((fe == drive->freelist_entries)			    /* gone past the end */
+    ||(drive->freelist[fe].offset > offset)))		    /* or past the block we're looking for */
+	fe--;						    /* point to the block before */
+    dend = drive->freelist[fe].offset + drive->freelist[fe].sectors; /* end of the entry */
+
+    /*
+     * At this point, we are pointing to the correct
+     * place in the free list.  A number of possibilities
+     * exist:
+     *
+     * 1.  The block to be freed starts at the end of the
+     *     block to which we are pointing.  This has two
+     *     subcases:
+     *
+     * a.  The block to be freed ends at the beginning
+     *     of the following block.  Merge the three
+     *     areas into a single block.
+     *
+     * b.  The block is shorter than the space between
+     *     the current block and the next one.  Enlarge
+     *     the current block.
+     *
+     * 2.  The block to be freed starts after the end
+     *     of the block, or before the first block.
+     *     Again, we have two cases:
+     *
+     * a.  It ends before the start of the following block.
+     *     Create a new free block.
+     *
+     * b.  It ends at the start of the following block.
+     *     Enlarge the following block downwards.
+     */
+    if (offset == dend) {				    /* Case 1: it starts at the end of this block */
+	if ((fe < drive->freelist_entries - 1)		    /* we're not the last block in the free list */
+	/* and the subdisk ends at the start of the next block */
+	&&(sdend == drive->freelist[fe + 1].offset)) {
+	    drive->freelist[fe].sectors			    /* 1a: merge all three blocks */
+		+= length + drive->freelist[fe + 1].sectors;
+	    if (fe < drive->freelist_entries - 2)	    /* still more blocks after next */
+		bcopy(&drive->freelist[fe + 2],		    /* move down one */
+		    &drive->freelist[fe + 1],
+		    (drive->freelist_entries - 2 - fe)
+		    * sizeof(struct drive_freelist));
+	    drive->freelist_entries--;			    /* one less entry in the free list */
+	} else						    /* 1b: just enlarge this block */
+	    drive->freelist[fe].sectors += length;
+    } else {						    /* Case 2 */
+	if (offset > dend)				    /* it starts after this block */
+	    fe++;					    /* so look at the next block */
+	if ((fe < drive->freelist_entries)		    /* we're not past the last block in the free list */
+	/* and the subdisk ends at the start of this block: case 2b */
+	&&(sdend == drive->freelist[fe].offset)) {
+	    drive->freelist[fe].offset = offset;	    /* it starts where the sd was */
+	    drive->freelist[fe].sectors += length;	    /* and it's this much bigger */
+	} else {					    /* case 2a: non-contiguous */
+	    if (fe < drive->freelist_entries)		    /* not after the last block, */
+		bcopy(&drive->freelist[fe],		    /* move the rest up one entry */
+		    &drive->freelist[fe + 1],
+		    (drive->freelist_entries - fe)
+		    * sizeof(struct drive_freelist));
+	    drive->freelist_entries++;			    /* one more entry */
+	    drive->freelist[fe].offset = offset;	    /* this entry represents the sd */
+	    drive->freelist[fe].sectors = length;
+	}
+    }
+    drive->sectors_available += length;			    /* the sectors are now available */
+}
+
+/*
+ * Free an allocated sd entry.
+ * This performs memory management only.  remove()
+ * is responsible for checking relationships.
+ *
+ * If the subdisk has a drive and a positive size, its
+ * space is returned to the drive; if it has a plex, the
+ * subdisk count of the plex is decremented.  Then the
+ * device is destroyed and the entry is cleared and marked
+ * sd_unallocated.
+ */
+void
+free_sd(int sdno)
+{
+    struct sd *sd = &SD[sdno];
+
+    if (sd->driveno >= 0) {				    /* we have a drive, */
+	if (sd->sectors > 0)				    /* and some space on it: give it back */
+	    return_drive_space(sd->driveno, sd->driveoffset, sd->sectors);
+    }
+    if (sd->plexno >= 0)				    /* we belong to a plex */
+	PLEX[sd->plexno].subdisks--;			    /* which now has one less subdisk */
+    destroy_dev(sd->dev);
+    bzero(sd, sizeof(struct sd));			    /* wipe the entry */
+    sd->state = sd_unallocated;				    /* and mark it reusable */
+    vinum_conf.subdisks_used--;				    /* one less sd in use */
+}
+
+/*
+ * Find an empty plex in the plex table.
+ *
+ * Reuse the first entry in state plex_unallocated; if
+ * there is none, EXPAND the table and use the first new
+ * entry.  The entry is cleared and gets a zeroed subdisk
+ * table with room for INITIAL_SUBDISKS_IN_PLEX entries.
+ * It is flagged VF_NEWBORN, has organization plex_disorg
+ * and belongs to no volume.  Return its index.
+ */
+int
+get_empty_plex(void)
+{
+    int plexno = 0;
+    struct plex *plex;
+
+    /* skip over the entries which are in use */
+    while ((plexno < vinum_conf.plexes_allocated)
+	&& (PLEX[plexno].state != plex_unallocated))
+	plexno++;
+
+    if (plexno >= vinum_conf.plexes_allocated)		    /* no free entry: make some more */
+	EXPAND(PLEX, struct plex, vinum_conf.plexes_allocated, INITIAL_PLEXES);
+
+    plex = &PLEX[plexno];
+    bzero(plex, sizeof(struct plex));			    /* start from a clean slate */
+
+    /* Give it a subdisk table */
+    plex->sdnos = (int *) Malloc(sizeof(int) * INITIAL_SUBDISKS_IN_PLEX);
+    CHECKALLOC(plex->sdnos, "vinum: Can't allocate plex subdisk table");
+    bzero(plex->sdnos, (sizeof(int) * INITIAL_SUBDISKS_IN_PLEX)); /* do we need this? */
+    plex->subdisks_allocated = INITIAL_SUBDISKS_IN_PLEX;    /* room for this many subdisks */
+    plex->subdisks = 0;					    /* of which none are in use */
+
+    plex->flags |= VF_NEWBORN;				    /* newly born plex */
+    plex->organization = plex_disorg;			    /* not organized yet */
+    plex->volno = -1;					    /* and not in a volume */
+    return plexno;
+}
+
+/*
+ * Find the named plex in vinum_conf.plex
+ *
+ * If create != 0, create an entry if it doesn't exist;
+ * the name is stored in the new entry, truncated to fit if
+ * necessary.
+ *
+ * Return index in vinum_conf.plex, or -1 if the plex
+ * doesn't exist and create is 0.
+ */
+int
+find_plex(const char *name, int create)
+{
+    int plexno;
+    struct plex *plex;
+
+    for (plexno = 0; plexno < vinum_conf.plexes_allocated; plexno++) {
+	if (strcmp(PLEX[plexno].name, name) == 0)	    /* found it */
+	    return plexno;
+    }
+
+    /* the plex isn't in the list.  Add it if he wants */
+    if (create == 0)					    /* don't want to create */
+	return -1;					    /* give up */
+
+    /* Allocate one and insert the name */
+    plexno = get_empty_plex();
+    plex = &PLEX[plexno];				    /* point to it */
+    /*
+     * Use strlcpy, like find_drive: a byte copy limited
+     * only by the size of the field leaves a name which
+     * fills it without a terminator, and the strcmp
+     * above would then run off the end.
+     */
+    strlcpy(plex->name, name, sizeof(plex->name));	    /* put in its name */
+    return plexno;					    /* return the index */
+}
+
+/*
+ * Free an allocated plex entry and its associated memory
+ * areas: the subdisk table and the lock table, if present.
+ * The device is destroyed, and the entry is cleared and
+ * marked plex_unallocated.
+ */
+void
+free_plex(int plexno)
+{
+    struct plex *plex = &PLEX[plexno];
+
+    if (plex->sdnos != NULL)				    /* subdisk table */
+	Free(plex->sdnos);
+    if (plex->lock != NULL)				    /* lock table */
+	Free(plex->lock);
+    destroy_dev(plex->dev);
+    bzero(plex, sizeof(struct plex));			    /* wipe the entry */
+    plex->state = plex_unallocated;			    /* and mark it reusable */
+}
+
+/*
+ * Find an empty volume in the volume table.
+ *
+ * Reuse the first entry in state volume_unallocated; if
+ * there is none, EXPAND the table and use the first new
+ * entry.  The entry is cleared and flagged VF_NEWBORN and
+ * VF_CREATED, its read policy is set to round robin, and
+ * all MAXPLEX plex slots are set to -1.  Return its index.
+ */
+int
+get_empty_volume(void)
+{
+    int volno = 0;
+    struct volume *vol;
+    int slot;
+
+    /* skip over the entries which are in use */
+    while ((volno < vinum_conf.volumes_allocated)
+	&& (VOL[volno].state != volume_unallocated))
+	volno++;
+
+    if (volno >= vinum_conf.volumes_allocated)		    /* no free entry: make some more */
+	EXPAND(VOL, struct volume, vinum_conf.volumes_allocated, INITIAL_VOLUMES);
+
+    vol = &VOL[volno];
+    bzero(vol, sizeof(struct volume));			    /* start from a clean slate */
+    for (slot = MAXPLEX - 1; slot >= 0; slot--)		    /* no plexes yet */
+	vol->plex[slot] = -1;
+    vol->preferred_plex = ROUND_ROBIN_READPOL;		    /* round robin */
+    vol->flags |= VF_NEWBORN | VF_CREATED;		    /* newly born volume */
+    return volno;
+}
+
+/*
+ * Find the named volume in vinum_conf.volume.
+ *
+ * If create != 0, create an entry if it doesn't exist;
+ * the name is stored in the new entry, truncated to fit if
+ * necessary, and the block size is set to DEV_BSIZE.
+ *
+ * Return the index in vinum_conf, or -1 if the volume
+ * doesn't exist and create is 0.
+ */
+int
+find_volume(const char *name, int create)
+{
+    int volno;
+    struct volume *vol;
+
+    for (volno = 0; volno < vinum_conf.volumes_allocated; volno++) {
+	if (strcmp(VOL[volno].name, name) == 0)		    /* found it */
+	    return volno;
+    }
+
+    /* the volume isn't in the list.  Add it if he wants */
+    if (create == 0)					    /* don't want to create */
+	return -1;					    /* give up */
+
+    /* Allocate one and insert the name */
+    volno = get_empty_volume();
+    vol = &VOL[volno];
+    /*
+     * Use strlcpy, like find_drive: a byte copy limited
+     * only by the size of the field leaves a name which
+     * fills it without a terminator, and the strcmp
+     * above would then run off the end.
+     */
+    strlcpy(vol->name, name, sizeof(vol->name));	    /* put in its name */
+    vol->blocksize = DEV_BSIZE;				    /* block size of this volume */
+    return volno;					    /* return the index */
+}
+
+/*
+ * Free an allocated volume entry: destroy its device,
+ * clear the entry and mark it volume_unallocated.
+ */
+void
+free_volume(int volno)
+{
+    struct volume *vol = &VOL[volno];
+
+    destroy_dev(vol->dev);
+    bzero(vol, sizeof(struct volume));			    /* wipe the entry */
+    vol->state = volume_unallocated;			    /* and mark it reusable */
+}
+
+/*
+ * Handle a drive definition in token[]; token[1] names the entry in DRIVE.
+ * Do nothing if update is set and the drive is not newly born.
+ *
+ * If we find an error, complain via throw_rude_remark
+ */
+void
+config_drive(int update)
+{
+    enum drive_label_info partition_status;		    /* info about the partition */
+    int parameter;
+    int driveno;					    /* index of drive in vinum_conf */
+    struct drive *drive;				    /* and pointer to it */
+    int otherdriveno;					    /* index of possible second drive */
+    int sdno;
+
+    if (tokens < 2)					    /* not enough tokens */
+	throw_rude_remark(EINVAL, "Drive has no name\n");
+    driveno = find_drive(token[1], 1);			    /* allocate a drive to initialize */
+    drive = &DRIVE[driveno];				    /* and get a pointer */
+    if (update && ((drive->flags & VF_NEWBORN) == 0))	    /* this drive exists already */
+	return;						    /* don't do anything */
+    drive->flags &= ~VF_NEWBORN;			    /* no longer newly born */
+
+    if (drive->state != drive_referenced) {		    /* we already know this drive */
+	/*
+	 * XXX Check which definition is more up-to-date.  Give
+	 * preference for the definition on its own drive.
+	 */
+	return;						    /* XXX */
+    }
+    for (parameter = 2; parameter < tokens; parameter++) {  /* look at the other tokens */
+	switch (get_keyword(token[parameter], &keyword_set)) {
+	case kw_device:
+	    parameter++;				    /* NOTE(review): nothing checks that a name follows */
+	    otherdriveno = find_drive_by_name(token[parameter], 0); /* see if it exists already */
+	    if (otherdriveno >= 0) {			    /* yup, */
+		drive->state = drive_unallocated;	    /* deallocate the drive */
+		throw_rude_remark(EEXIST,		    /* and complain */
+		    "Drive %s would have same device as drive %s",
+		    token[1],
+		    DRIVE[otherdriveno].label.name);
+	    }
+	    if (drive->devicename[0] == '/') {		    /* we know this drive... */
+		if (strcmp(drive->devicename, token[parameter])) /* different name */
+		    close_drive(drive);			    /* close it if it's open */
+		else					    /* no change */
+		    break;
+	    }
+	    /* open the device and get the configuration */
+	    /* strlcpy always terminates, and leaves no tail of an older, longer name */
+	    strlcpy(drive->devicename,			    /* insert device information */
+		token[parameter],
+		sizeof(drive->devicename));
+	    partition_status = read_drive_label(drive, 1);
+	    switch (partition_status) {
+	    case DL_CANT_OPEN:				    /* not our kind */
+		close_drive(drive);
+		if (drive->lasterror == EFTYPE)		    /* wrong kind of partition */
+		    throw_rude_remark(drive->lasterror,
+			"Drive %s has invalid partition type",
+			drive->label.name);
+		else					    /* I/O error of some kind */
+		    throw_rude_remark(drive->lasterror,
+			"Can't initialize drive %s",
+			drive->label.name);
+		break;
+
+	    case DL_WRONG_DRIVE:			    /* valid drive, not the name we expected */
+		if (vinum_conf.flags & VF_FORCECONFIG) {    /* but we'll accept that */
+		    strlcpy(drive->label.name, token[1], sizeof(drive->label.name)); /* don't read beyond the token */
+		    break;
+		}
+		close_drive(drive);
+		/*
+		 * There's a potential race condition here:
+		 * the rude remark refers to a field in an
+		 * unallocated drive, which potentially could
+		 * be reused.  This works because we're the only
+		 * thread accessing the config at the moment.
+		 */
+		drive->state = drive_unallocated;	    /* throw it away completely */
+		throw_rude_remark(drive->lasterror,
+		    "Incorrect drive name %s specified for drive %s",
+		    token[1],
+		    drive->label.name);
+		break;
+
+	    case DL_DELETED_LABEL:			    /* it was a drive, but we deleted it */
+	    case DL_NOT_OURS:				    /* nothing to do with the rest */
+	    case DL_OURS:
+		break;
+	    }
+	    /*
+	     * read_drive_label overwrites the device name.
+	     * If we get here, we can have the drive,
+	     * so put it back again, making sure that
+	     * none of the overwritten name survives.
+	     */
+	    strlcpy(drive->devicename,
+		token[parameter],
+		sizeof(drive->devicename));
+	    break;
+
+	case kw_state:
+	    parameter++;				    /* skip the keyword */
+	    if (vinum_conf.flags & VF_READING_CONFIG)
+		drive->state = DriveState(token[parameter]); /* set the state */
+	    break;
+
+	case kw_hotspare:				    /* this drive is a hot spare */
+	    drive->flags |= VF_HOTSPARE;
+	    break;
+
+	default:
+	    close_drive(drive);
+	    throw_rude_remark(EINVAL,
+		"Drive %s, invalid keyword: %s",
+		token[1],
+		token[parameter]);
+	}
+    }
+
+    if (drive->devicename[0] != '/') {
+	drive->state = drive_unallocated;		    /* deallocate the drive */
+	throw_rude_remark(EINVAL, "No device name for %s", drive->label.name);
+    }
+    vinum_conf.drives_used++;				    /* passed all hurdles: one more in use */
+    /*
+     * If we're replacing a drive, it could be that
+     * we already have subdisks referencing this
+     * drive.  Note where they should be and change
+     * their state to obsolete.
+     */
+    for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) {
+	if ((SD[sdno].state > sd_referenced)
+	    && (SD[sdno].driveno == driveno)) {
+	    give_sd_to_drive(sdno);
+	    if (SD[sdno].state > sd_stale)
+		SD[sdno].state = sd_stale;
+	}
+    }
+}
+
+/*
+ * Handle a subdisk definition in token[].  We
+ * fill in a freshly allocated entry in SD, then
+ * give it to its plex (if any) and its drive.
+ *
+ * On error throw a message back to the caller.
+ */
+void
+config_subdisk(int update)
+{
+    int parameter;
+    int sdno;						    /* index of sd in vinum_conf */
+    struct sd *sd;					    /* and pointer to it */
+    u_int64_t size;
+    int detached = 0;					    /* set to 1 if this is a detached subdisk */
+    int sdindex = -1;					    /* index in plexes subdisk table */
+    enum sdstate state = sd_unallocated;		    /* state to set, if specified */
+    int autosize = 0;					    /* set if we autosize in give_sd_to_drive */
+    int namedsdno;					    /* index of another with this name */
+    char partition = 0;					    /* partition of external subdisk */
+
+    sdno = get_empty_sd();				    /* allocate an SD to initialize */
+    sd = &SD[sdno];					    /* and get a pointer */
+
+    for (parameter = 1; parameter < tokens; parameter++) {  /* look at the other tokens */
+	switch (get_keyword(token[parameter], &keyword_set)) {
+	    /*
+	     * If we have a 'name' parameter, it must
+	     * come first, because we're too lazy to tidy
+	     * up dangling refs if it comes later.
+	     */
+	case kw_name:
+	    namedsdno = find_subdisk(token[++parameter], 0); /* find an existing sd with this name */
+	    if (namedsdno >= 0) {			    /* got one */
+		if (SD[namedsdno].state == sd_referenced) { /* we've been told about this one */
+		    if (parameter > 2)
+			throw_rude_remark(EINVAL,
+			    "sd %s: name parameter must come first\n", /* no go */
+			    token[parameter]);
+		    else {
+			int i;
+			struct plex *plex;		    /* for tidying up dangling references */
+
+			*sd = SD[namedsdno];		    /* copy from the referenced one */
+			SD[namedsdno].state = sd_unallocated; /* and deallocate the referenced one */
+			plex = &PLEX[sd->plexno];	    /* now take a look at our plex */
+			for (i = 0; i < plex->subdisks; i++) { /* look for the pointer */
+			    if (plex->sdnos[i] == namedsdno) /* pointing to the old subdisk */
+				plex->sdnos[i] = sdno;	    /* bend it to point here */
+			}
+		    }
+		}
+		if (update)				    /* are we updating? */
+		    return;				    /* that's OK, nothing more to do */
+		else
+		    throw_rude_remark(EINVAL, "Duplicate subdisk %s", token[parameter]);
+	    } else
+		strlcpy(sd->name,			    /* put in the name, always terminated */
+		    token[parameter],
+		    sizeof(sd->name));
+	    break;
+
+	case kw_detached:
+	    detached = 1;
+	    break;
+
+	case kw_plexoffset:
+	    size = sizespec(token[++parameter]);
+	    if ((size == -1)				    /* unallocated */
+	    &&(vinum_conf.flags & VF_READING_CONFIG))	    /* reading from disk */
+		break;					    /* invalid sd; just ignore it */
+	    if ((size % DEV_BSIZE) != 0)
+		throw_rude_remark(EINVAL,
+		    "sd %s, bad plex offset alignment: %lld",
+		    sd->name,
+		    (long long) size);
+	    else
+		sd->plexoffset = size / DEV_BSIZE;
+	    break;
+
+	case kw_driveoffset:
+	    size = sizespec(token[++parameter]);
+	    if ((size == -1)				    /* unallocated */
+	    &&(vinum_conf.flags & VF_READING_CONFIG))	    /* reading from disk */
+		break;					    /* invalid sd; just ignore it */
+	    if ((size % DEV_BSIZE) != 0)
+		throw_rude_remark(EINVAL,
+		    "sd %s, bad drive offset alignment: %lld",
+		    sd->name,
+		    (long long) size);
+	    else
+		sd->driveoffset = size / DEV_BSIZE;
+	    break;
+
+	case kw_len:
+	    if (get_keyword(token[++parameter], &keyword_set) == kw_max) /* select maximum size from drive */
+		size = 0;				    /* this is how we say it :-) */
+	    else
+		size = sizespec(token[parameter]);
+	    if ((size % DEV_BSIZE) != 0)		    /* size is 64 bits, so print it as long long */
+		throw_rude_remark(EINVAL, "sd %s, length %lld not multiple of sector size", sd->name, (long long) size);
+	    else
+		sd->sectors = size / DEV_BSIZE;
+	    /*
+	     * We have a problem with autosizing: we need to
+	     * give the drive to the plex before we give it
+	     * to the drive, in order to be clean if we give
+	     * up in the middle, but at this time the size hasn't
+	     * been set.  Note that we have to fix up after
+	     * giving the subdisk to the drive.
+	     */
+	    if (size == 0)
+		autosize = 1;				    /* note that we're autosizing */
+	    break;
+
+	case kw_drive:
+	    sd->driveno = find_drive(token[++parameter], 1); /* insert drive information */
+	    break;
+
+	case kw_plex:
+	    sd->plexno = find_plex(token[++parameter], 1);  /* insert plex information */
+	    break;
+
+	    /*
+	     * Set the state.  We can't do this directly,
+	     * because give_sd_to_plex may change it
+	     */
+	case kw_state:
+	    parameter++;				    /* skip the keyword */
+	    if (vinum_conf.flags & VF_READING_CONFIG)
+		state = SdState(token[parameter]);	    /* set the state */
+	    break;
+
+	case kw_partition:
+	    parameter++;				    /* skip the keyword */
+	    if ((strlen(token[parameter]) != 1)
+		|| (token[parameter][0] < 'a')
+		|| (token[parameter][0] > 'h'))
+		throw_rude_remark(EINVAL,
+		    "%s: invalid partition %c",
+		    sd->name,
+		    token[parameter][0]);
+	    else
+		partition = token[parameter][0];
+	    break;
+
+	case kw_retryerrors:
+	    sd->flags |= VF_RETRYERRORS;
+	    break;
+
+	default:
+	    throw_rude_remark(EINVAL, "%s: invalid keyword: %s", sd->name, token[parameter]);
+	}
+    }
+
+    /* Check we have a drive name */
+    if (sd->driveno < 0) {				    /* didn't specify a drive */
+	sd->driveno = current_drive;			    /* set to the current drive */
+	if (sd->driveno < 0)				    /* no current drive? */
+	    throw_rude_remark(EINVAL, "Subdisk %s is not associated with a drive", sd->name);
+    }
+    if (DRIVE[sd->driveno].state != drive_up)
+	sd->state = sd_crashed;
+
+    /*
+     * This is tacky.  If something goes wrong
+     * with the checks, we may end up losing drive
+     * space.  FIXME.
+     */
+    if (autosize != 0)					    /* need to find a size, */
+	give_sd_to_drive(sdno);				    /* do it before the plex */
+
+    /*  Check for a plex name */
+    if ((sd->plexno < 0)				    /* didn't specify a plex */
+    &&(!detached))					    /* and didn't say not to, */
+	sd->plexno = current_plex;			    /* set to the current plex */
+
+    if (sd->plexno >= 0)
+	sdindex = give_sd_to_plex(sd->plexno, sdno);	    /* now tell the plex that it has this sd */
+
+    sd->sdno = sdno;					    /* point to our entry in the table */
+
+    /* Does the subdisk have a name?  If not, give it one */
+    if (sd->name[0] == '\0') {				    /* no name */
+	char sdsuffix[8];				    /* form sd name suffix here */
+
+	/* Do we have a plex name? */
+	if (sdindex >= 0)				    /* we have a plex */
+	    strlcpy(sd->name,				    /* take it from there */
+		PLEX[sd->plexno].name,
+		sizeof(sd->name));
+	else						    /* no way */
+	    throw_rude_remark(EINVAL, "Unnamed sd is not associated with a plex");
+	sprintf(sdsuffix, ".s%d", sdindex);		    /* form the suffix */
+	strlcat(sd->name, sdsuffix, sizeof(sd->name));	    /* and add it to the name */
+    }
+    /* do we have complete info for this subdisk? */
+    if (sd->sectors < 0)
+	throw_rude_remark(EINVAL, "sd %s has no length spec", sd->name);
+
+    if (sd->dev == NULL)
+	/*
+	 * sdno can (at least theoretically) overflow
+	 * into the low order bit of the type field.
+	 * This gives rise to a subdisk with type
+	 * VINUM_SD2_TYPE.  This is a feature, not a
+	 * bug.
+	 */
+	sd->dev = make_dev(&vinum_cdevsw,
+	    VINUMMINOR(sdno, VINUM_SD_TYPE),
+	    UID_ROOT,
+	    GID_OPERATOR,
+	    S_IRUSR | S_IWUSR | S_IRGRP,
+	    "vinum/sd/%s",
+	    sd->name);
+    if (state != sd_unallocated)			    /* we had a specific state to set */
+	sd->state = state;				    /* do it now */
+    else if (sd->state == sd_unallocated)		    /* no, nothing set yet, */
+	sd->state = sd_empty;				    /* must be empty */
+    if (autosize == 0)					    /* no autoconfig, do the drive now */
+	give_sd_to_drive(sdno);
+    vinum_conf.subdisks_used++;				    /* one more in use */
+}
+
+/*
+ * Handle a plex definition in token[]; the result becomes current_plex.
+ */
+void
+config_plex(int update)
+{
+    int parameter;
+    int plexno;						    /* index of plex in vinum_conf */
+    struct plex *plex;					    /* and pointer to it */
+    int pindex = MAXPLEX;				    /* index in volume's plex list */
+    int detached = 0;					    /* don't give it to a volume */
+    int namedplexno;
+    enum plexstate state = plex_init;			    /* state to set at end */
+    int preferme;					    /* set if we want to be preferred access */
+
+    current_plex = -1;					    /* forget the previous plex */
+    preferme = 0;					    /* nothing special yet */
+    plexno = get_empty_plex();				    /* allocate a plex */
+    plex = &PLEX[plexno];				    /* and point to it */
+    plex->plexno = plexno;				    /* and back to the config */
+
+    for (parameter = 1; parameter < tokens; parameter++) {  /* look at the other tokens */
+	switch (get_keyword(token[parameter], &keyword_set)) {
+	    /*
+	     * If we have a 'name' parameter, it must
+	     * come first, because we're too lazy to tidy
+	     * up dangling refs if it comes later.
+	     */
+	case kw_name:
+	    namedplexno = find_plex(token[++parameter], 0); /* find an existing plex with this name */
+	    if (namedplexno >= 0) {			    /* plex exists already, */
+		if (PLEX[namedplexno].state == plex_referenced) { /* we've been told about this one */
+		    if (parameter > 2)			    /* we've done other things first, */
+			throw_rude_remark(EINVAL,
+			    "plex %s: name parameter must come first\n", /* no go */
+			    token[parameter]);
+		    else {
+			int i;
+			struct volume *vol;		    /* for tidying up dangling references */
+
+			*plex = PLEX[namedplexno];	    /* get the info */
+			PLEX[namedplexno].state = plex_unallocated; /* and deallocate the other one */
+			vol = &VOL[plex->volno];	    /* point to the volume */
+			for (i = 0; i < MAXPLEX; i++) {	    /* for each plex */
+			    if (vol->plex[i] == namedplexno)
+				vol->plex[i] = plexno;	    /* bend the pointer */
+			}
+		    }
+		    break;				    /* use this one */
+		}
+		if (update)				    /* are we updating? */
+		    return;				    /* yes: that's OK, just return */
+		else
+		    throw_rude_remark(EINVAL, "Duplicate plex %s", token[parameter]);
+	    } else
+		strlcpy(plex->name,			    /* put in the name, always terminated */
+		    token[parameter],
+		    sizeof(plex->name));
+	    break;
+
+	case kw_detached:
+	    detached = 1;
+	    break;
+
+	case kw_org:					    /* plex organization */
+	    switch (get_keyword(token[++parameter], &keyword_set)) {
+	    case kw_concat:
+		plex->organization = plex_concat;
+		break;
+
+	    case kw_striped:
+		{
+		    int stripesize = sizespec(token[++parameter]);
+
+		    plex->organization = plex_striped;
+		    if (stripesize % DEV_BSIZE != 0)	    /* not a multiple of block size, */
+			throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size",
+			    plex->name,
+			    stripesize);
+		    else
+			plex->stripesize = stripesize / DEV_BSIZE;
+		    break;
+		}
+
+	    case kw_raid4:
+		{
+		    int stripesize = sizespec(token[++parameter]);
+
+		    plex->organization = plex_raid4;
+		    if (stripesize % DEV_BSIZE != 0)	    /* not a multiple of block size, */
+			throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size",
+			    plex->name,
+			    stripesize);
+		    else
+			plex->stripesize = stripesize / DEV_BSIZE;
+		    break;
+		}
+
+	    case kw_raid5:
+		{
+		    int stripesize = sizespec(token[++parameter]);
+
+		    plex->organization = plex_raid5;
+		    if (stripesize % DEV_BSIZE != 0)	    /* not a multiple of block size, */
+			throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size",
+			    plex->name,
+			    stripesize);
+		    else
+			plex->stripesize = stripesize / DEV_BSIZE;
+		    break;
+		}
+
+	    default:
+		throw_rude_remark(EINVAL, "Invalid plex organization");
+	    }
+	    if (isstriped(plex)
+		&& (plex->stripesize == 0))		    /* didn't specify a valid stripe size */
+		throw_rude_remark(EINVAL, "Need a stripe size parameter");
+	    break;
+
+	    /*
+	     * We're the preferred plex of our volume.
+	     * Unfortunately, we don't know who our
+	     * volume is yet.  Note that we want to be
+	     * preferred, and actually do it after we
+	     * get a volume.
+	     */
+	case kw_preferred:
+	    preferme = 1;
+	    break;
+
+	case kw_volume:
+	    plex->volno = find_volume(token[++parameter], 1); /* insert a pointer to the volume */
+	    break;
+
+	case kw_sd:					    /* add a subdisk */
+	    {
+		int sdno;
+
+		sdno = find_subdisk(token[++parameter], 1); /* find a subdisk */
+		SD[sdno].plexoffset = sizespec(token[++parameter]); /* get the offset */
+		give_sd_to_plex(plexno, sdno);		    /* and insert it there */
+		break;
+	    }
+
+	case kw_state:
+	    parameter++;				    /* skip the keyword */
+	    if (vinum_conf.flags & VF_READING_CONFIG)
+		state = PlexState(token[parameter]);	    /* set the state */
+	    break;
+
+	default:
+	    throw_rude_remark(EINVAL, "plex %s, invalid keyword: %s",
+		plex->name,
+		token[parameter]);
+	}
+    }
+
+    if (plex->organization == plex_disorg)
+	throw_rude_remark(EINVAL, "No plex organization specified");
+
+    if ((plex->volno < 0)				    /* we don't have a volume */
+    &&(!detached))					    /* and we wouldn't object */
+	plex->volno = current_volume;
+
+    if (plex->volno >= 0)
+	pindex = give_plex_to_volume(plex->volno,	    /* Now tell the volume that it has this plex */
+	    plexno,
+	    preferme);
+
+    /* Does the plex have a name?  If not, give it one */
+    if (plex->name[0] == '\0') {			    /* no name */
+	char plexsuffix[8];				    /* form plex name suffix here */
+	/* Do we have a volume name? */
+	if (plex->volno >= 0)				    /* we have a volume */
+	    strlcpy(plex->name,				    /* take it from there */
+		VOL[plex->volno].name,
+		sizeof(plex->name));
+	else						    /* no way */
+	    throw_rude_remark(EINVAL, "Unnamed plex is not associated with a volume");
+	sprintf(plexsuffix, ".p%d", pindex);		    /* form the suffix */
+	strlcat(plex->name, plexsuffix, sizeof(plex->name)); /* and add it to the name */
+    }
+    if (isstriped(plex)) {
+	plex->lock = (struct rangelock *)
+	    Malloc(PLEX_LOCKS * sizeof(struct rangelock));
+	CHECKALLOC(plex->lock, "vinum: Can't allocate lock table\n");
+	bzero((char *) plex->lock, PLEX_LOCKS * sizeof(struct rangelock));
+	plex->lockmtx = &plexmutex[plexno % PLEXMUTEXES];   /* use this mutex for locking */
+    }
+    /* Note the last plex we configured */
+    current_plex = plexno;
+    plex->state = state;				    /* set whatever state we chose */
+    vinum_conf.plexes_used++;				    /* one more in use */
+    if (plex->dev == NULL)
+	plex->dev = make_dev(&vinum_cdevsw,
+	    VINUMMINOR(plexno, VINUM_PLEX_TYPE),
+	    UID_ROOT,
+	    GID_OPERATOR,
+	    S_IRUSR | S_IWUSR | S_IRGRP,
+	    "vinum/plex/%s",
+	    plex->name);
+}
+
+/*
+ * Handle a volume definition in token[]; token[1] is the volume name.
+ * If we find an error, complain via throw_rude_remark
+ */
+void
+config_volume(int update)
+{
+    int parameter;
+    int volno;
+    struct volume *vol;					    /* collect volume info here */
+    int i;
+
+    if (tokens < 2)					    /* not enough tokens */
+	throw_rude_remark(EINVAL, "Volume has no name");
+    current_volume = -1;				    /* forget the previous volume */
+    volno = find_volume(token[1], 1);			    /* allocate a volume to initialize */
+    vol = &VOL[volno];					    /* and get a pointer */
+    if (update && ((vol->flags & VF_CREATED) == 0))	    /* this volume exists already */
+	return;						    /* don't do anything */
+    vol->flags &= ~VF_CREATED;				    /* it exists now */
+
+    for (parameter = 2; parameter < tokens; parameter++) {  /* look at all tokens */
+	switch (get_keyword(token[parameter], &keyword_set)) {
+	case kw_plex:
+	    {
+		int plexno;				    /* index of this plex */
+		int myplexno;				    /* and index if it's already ours */
+
+		plexno = find_plex(token[++parameter], 1);  /* find a plex */
+		if (plexno < 0)				    /* couldn't */
+		    break;				    /* we've already had an error message */
+		myplexno = my_plex(volno, plexno);	    /* does it already belong to us? */
+		if (myplexno >= 0)			    /* yes (index 0 counts too), shouldn't get it again */
+		    throw_rude_remark(EINVAL,
+			"Plex %s already belongs to volume %s",
+			token[parameter],
+			vol->name);
+		else if (++vol->plexes > MAXPLEX)	    /* another entry */
+		    throw_rude_remark(EINVAL,
+			"Too many plexes for volume %s",
+			vol->name);
+		vol->plex[vol->plexes - 1] = plexno;
+		PLEX[plexno].state = plex_referenced;	    /* we know something about it */
+		PLEX[plexno].volno = volno;		    /* and this volume references it */
+	    }
+	    break;
+
+	case kw_readpol:
+	    switch (get_keyword(token[++parameter], &keyword_set)) { /* decide what to do */
+	    case kw_round:
+		vol->preferred_plex = ROUND_ROBIN_READPOL;  /* default */
+		break;
+
+	    case kw_prefer:
+		{
+		    int plexno, myplexno;		    /* index in PLEX, and index in our plex list */
+
+		    plexno = find_plex(token[++parameter], 1); /* find a plex */
+		    if (plexno < 0) {			    /* couldn't */
+			printf("vinum: couldn't find preferred plex %s for %s\n",
+			    token[parameter],
+			    vol->name);
+			break;				    /* we've already had an error message */
+		    }
+		    myplexno = my_plex(volno, plexno);	    /* assumes -1 means "not ours" - TODO confirm */
+		    if (myplexno >= 0)			    /* yes */
+			vol->preferred_plex = myplexno;	    /* just note the index */
+		    else if (++vol->plexes > MAXPLEX)	    /* another entry */
+			throw_rude_remark(EINVAL, "Too many plexes");
+		    else {				    /* space for the new plex */
+			vol->plex[vol->plexes - 1] = plexno; /* add the plex itself to our list */
+			vol->preferred_plex = vol->plexes - 1; /* and note the index */
+		    }
+		}
+		break;
+
+	    default:
+		throw_rude_remark(EINVAL, "Invalid read policy");
+	    }
+	    break;					    /* don't fall through and set setupstate */
+	case kw_setupstate:
+	    vol->flags |= VF_CONFIG_SETUPSTATE;		    /* set the volume up later on */
+	    break;
+
+	case kw_state:
+	    parameter++;				    /* skip the keyword */
+	    if (vinum_conf.flags & VF_READING_CONFIG)
+		vol->state = VolState(token[parameter]);    /* set the state */
+	    break;
+
+	    /*
+	     * XXX experimental ideas.  These are not
+	     * documented, and will not be until I
+	     * decide they're worth keeping.
+	     */
+	case kw_writethrough:				    /* set writethrough mode */
+	    vol->flags |= VF_WRITETHROUGH;
+	    break;
+
+	case kw_writeback:				    /* set writeback mode */
+	    vol->flags &= ~VF_WRITETHROUGH;
+	    break;
+
+	default:
+	    throw_rude_remark(EINVAL, "volume %s, invalid keyword: %s",
+		vol->name,
+		token[parameter]);
+	}
+    }
+    current_volume = volno;				    /* note last referred volume */
+    vol->volno = volno;					    /* also note in volume */
+
+    /*
+     * Before we can actually use the volume, we need
+     * a volume label.  We could start to fake one here,
+     * but it will be a lot easier when we have some
+     * to copy from the drives, so defer it until we
+     * set up the configuration. XXX
+     */
+    if (vol->state == volume_unallocated)
+	vol->state = volume_down;			    /* now ready to bring up at the end */
+
+    /* Find out how big our volume is */
+    for (i = 0; i < vol->plexes; i++)
+	vol->size = max(vol->size, PLEX[vol->plex[i]].length);
+    vinum_conf.volumes_used++;				    /* one more in use */
+    if (vol->dev == NULL)
+	vol->dev = make_dev(&vinum_cdevsw,
+	    VINUMMINOR(volno, VINUM_VOLUME_TYPE),
+	    UID_ROOT,
+	    GID_OPERATOR,
+	    S_IRUSR | S_IWUSR | S_IRGRP,
+	    "vinum/%s",
+	    vol->name);
+}
+
+/*
+ * Parse a config entry and pass it to the handler for its first keyword.
+ * CARE!  This destroys the original contents of the config entry: the
+ * tokenizer places \0 characters at the end of each token.
+ *
+ * Return 0 if all is well or the line is a comment, or what tokenize
+ * returned if that was not positive.  Errors go via throw_rude_remark.
+ */
+int
+parse_config(char *cptr, struct keywordset *keyset, int update)
+{
+    tokens = tokenize(cptr, token, MAXTOKEN);		    /* chop up into tokens */
+
+    /* tokenize gave us nothing to work with: hand its result back */
+    if (tokens <= 0)
+	return tokens;
+
+    /* as many tokens as the array holds counts as too many */
+    if (tokens == MAXTOKEN)
+	throw_rude_remark(E2BIG,
+	    "Configuration error for %s: too many parameters",
+	    token[1]);
+
+    if (token[0][0] == '#')				    /* comment line */
+	return 0;
+
+    /* The first token tells us what is being defined */
+    switch (get_keyword(token[0], keyset)) {
+    case kw_volume:
+	config_volume(update);
+	break;
+
+    case kw_plex:
+	config_plex(update);
+	break;
+
+    case kw_subdisk:
+	config_subdisk(update);
+	break;
+
+    case kw_drive:
+	config_drive(update);
+	break;
+
+    default:						    /* nothing else belongs here */
+	throw_rude_remark(EINVAL,			    /* should we die? */
+	    "Invalid configuration information: %s",
+	    token[0]);
+    }
+    return 0;						    /* all is well */
+}
+
+/*
+ * Parse a configuration line which userland handed
+ * in via ioctl.  The buffer holding the line is also
+ * where the reply goes, so point the global
+ * ioctl_reply at it while parse_config does its work
+ * and clear the pointer again afterwards.  A global
+ * is good enough here because configuration is
+ * performed in a single-threaded manner.
+ */
+int
+parse_user_config(char *cptr, struct keywordset *keyset)
+{
+    int result;
+
+    ioctl_reply = (struct _ioctl_reply *) cptr;		    /* complaints go back in here */
+    result = parse_config(cptr, keyset, 0);		    /* parse it, not as an update */
+    ioctl_reply = NULL;					    /* don't do this again */
+    return result;
+}
+
+/* Remove an object; msg describes it and is reused for the reply */
+void
+remove(struct vinum_ioctl_msg *msg)
+{
+    struct vinum_ioctl_msg request = *msg;		    /* keep the request, the reply overwrites it */
+
+    ioctl_reply = (struct _ioctl_reply *) msg;		    /* reinstate the address to reply to */
+    ioctl_reply->msg[0] = '\0';				    /* no message */
+    ioctl_reply->error = 0;				    /* and no error so far */
+
+    /* Hand the request to the remover for this kind of object */
+    switch (request.type) {
+    case volume_object:
+	remove_volume_entry(request.index, request.force, request.recurse);
+	break;
+
+    case plex_object:
+	remove_plex_entry(request.index, request.force, request.recurse);
+	break;
+
+    case sd_object:
+	remove_sd_entry(request.index, request.force, request.recurse);
+	break;
+
+    case drive_object:
+	remove_drive_entry(request.index, request.force);
+	break;
+
+    default:						    /* we don't know this one */
+	ioctl_reply->error = EINVAL;
+	strcpy(ioctl_reply->msg, "Invalid object type");
+	return;						    /* nothing was touched */
+    }
+
+    updateconfig(0);					    /* the remover has been called */
+}
+
+/* Remove a drive.  A drive which is still open needs force, and loses its subdisks */
+void
+remove_drive_entry(int driveno, int force)
+{
+    int sdno;
+
+    if ((driveno < 0) || (driveno >= vinum_conf.drives_allocated) /* not a valid drive */
+    ||(DRIVE[driveno].state == drive_unallocated)) {	    /* or nothing there */
+	ioctl_reply->error = EINVAL;
+	strcpy(ioctl_reply->msg, "No such drive");
+	return;
+    }
+    if (DRIVE[driveno].opencount > 0) {			    /* we have subdisks */
+	if (!force) {
+	    ioctl_reply->error = EBUSY;			    /* can't do that */
+	    return;
+	}
+	/* do it at any cost: the subdisks on this drive go first */
+	for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) {
+	    if ((SD[sdno].state != sd_unallocated)	    /* subdisk is allocated */
+	    &&(SD[sdno].driveno == driveno))		    /* and it belongs to this drive */
+		remove_sd_entry(sdno, force, 0);
+	}
+    }
+    remove_drive(driveno);				    /* now remove it */
+    vinum_conf.drives_used--;				    /* one less drive */
+}
+
+/* remove a subdisk; one which belongs to a plex needs force (recurse is not used) */
+void
+remove_sd_entry(int sdno, int force, int recurse)
+{
+    struct sd *sd;
+
+    if ((sdno < 0) || (sdno >= vinum_conf.subdisks_allocated) /* not a valid sd */
+    ||(SD[sdno].state == sd_unallocated)) {		    /* or nothing there */
+	ioctl_reply->error = EINVAL;
+	strcpy(ioctl_reply->msg, "No such subdisk");
+	return;
+    }
+    sd = &SD[sdno];					    /* now we know it's in the table */
+    if (sd->flags & VF_OPEN)				    /* we're open */
+	ioctl_reply->error = EBUSY;			    /* no getting around that */
+    else if (sd->plexno >= 0) {				    /* we have a plex */
+	if (force) {					    /* do it at any cost */
+	    struct plex *plex = &PLEX[sd->plexno];	    /* point to our plex */
+	    int mysdno;
+
+	    for (mysdno = 0;				    /* look for ourselves */
+		mysdno < plex->subdisks && &SD[plex->sdnos[mysdno]] != sd;
+		mysdno++);
+	    if (mysdno == plex->subdisks)		    /* didn't find it */
+		log(LOG_ERR, "Error removing subdisk %s: not found in plex %s\n",
+		    sd->name, plex->name);		    /* mysdno is no index into SD */
+	    else {					    /* remove the subdisk from plex */
+		if (mysdno < (plex->subdisks - 1))	    /* not the last subdisk */
+		    bcopy(&plex->sdnos[mysdno + 1],
+			&plex->sdnos[mysdno],
+			(plex->subdisks - 1 - mysdno) * sizeof(int));
+		plex->subdisks--;
+		sd->plexno = -1;			    /* disown the subdisk */
+	    }
+	    /*
+	     * Removing a subdisk from a striped or
+	     * RAID-4 or RAID-5 plex really tears the
+	     * hell out of the structure, and it needs
+	     * to be reinitialized.
+	     */
+	    if (plex->organization != plex_concat)	    /* not concatenated, */
+		set_plex_state(plex->plexno, plex_faulty, setstate_force); /* need to reinitialize */
+	    log(LOG_INFO, "vinum: removing %s\n", sd->name);
+	    free_sd(sdno);
+	} else
+	    ioctl_reply->error = EBUSY;			    /* can't do that */
+    } else {
+	log(LOG_INFO, "vinum: removing %s\n", sd->name);
+	free_sd(sdno);
+    }
+}
+
+/* remove a plex; force is needed if it has subdisks or belongs to a volume */
+void
+remove_plex_entry(int plexno, int force, int recurse)
+{
+    struct plex *plex;
+    int sdno;
+
+    if ((plexno < 0) || (plexno >= vinum_conf.plexes_allocated) /* not a valid plex */
+    ||(PLEX[plexno].state == plex_unallocated)) {	    /* or nothing there */
+	ioctl_reply->error = EINVAL;
+	strcpy(ioctl_reply->msg, "No such plex");
+	return;						    /* don't touch a plex we don't have */
+    }
+    plex = &PLEX[plexno];				    /* now we know it's in the table */
+    if (plex->flags & VF_OPEN) {			    /* we're open */
+	ioctl_reply->error = EBUSY;			    /* no getting around that */
+	return;
+    }
+    if (plex->subdisks) {
+	if (force) {					    /* do it anyway */
+	    int sds = plex->subdisks;			    /* note the count before we start */
+	    for (sdno = 0; sdno < sds; sdno++) {
+		if (recurse)				    /* remove all below */
+		    free_sd(plex->sdnos[sdno]);		    /* free the subdisk */
+		else					    /* just tear them out */
+		    SD[plex->sdnos[sdno]].plexno = -1;	    /* no plex any more */
+	    }
+	} else {					    /* can't do it without force */
+	    ioctl_reply->error = EBUSY;			    /* can't do that */
+	    return;
+	}
+    }
+    if (plex->volno >= 0) {				    /* we are part of a volume */
+	if (force) {					    /* do it at any cost */
+	    struct volume *vol = &VOL[plex->volno];
+	    int myplexno;
+
+	    for (myplexno = 0; myplexno < vol->plexes; myplexno++)
+		if (vol->plex[myplexno] == plexno)	    /* found it */
+		    break;
+	    if (myplexno == vol->plexes)		    /* didn't find it.  Huh? */
+		log(LOG_ERR, "Error removing plex %s: not found in volume %s\n",
+		    plex->name, vol->name);
+	    else {					    /* take it out of the volume's list */
+		if (myplexno < (vol->plexes - 1))	    /* not the last plex in the list */
+		    bcopy(&vol->plex[myplexno + 1], &vol->plex[myplexno], /* whole entries, not bytes */
+			(vol->plexes - 1 - myplexno) * sizeof(vol->plex[0]));
+		vol->plexes--;				    /* only if we really took one out */
+	    }
+	} else {
+	    ioctl_reply->error = EBUSY;			    /* can't do that */
+	    return;
+	}
+    }
+    log(LOG_INFO, "vinum: removing %s\n", plex->name);
+    free_plex(plexno);
+    vinum_conf.plexes_used--;				    /* one less plex */
+}
+
+/* remove a volume; failures are reported in ioctl_reply->error */
+void
+remove_volume_entry(int volno, int force, int recurse)
+{
+    struct volume *vol = &VOL[volno];
+    int plexno;
+
+    if ((volno < 0) || (volno >= vinum_conf.volumes_allocated) /* not a valid volume index */
+    ||(vol->state == volume_unallocated)) {		    /* or nothing there */
+	ioctl_reply->error = EINVAL;
+	strcpy(ioctl_reply->msg, "No such volume");
+    } else if (vol->flags & VF_OPEN)			    /* we're open */
+	ioctl_reply->error = EBUSY;			    /* no getting around that */
+    else if (vol->plexes) {
+	if (recurse && force) {				    /* remove all below */
+	    int plexes = vol->plexes;
+
+	    /* Last first: a forced removal closes up vol->plex and decrements vol->plexes */
+	    for (plexno = plexes - 1; plexno >= 0; plexno--)
+		remove_plex_entry(vol->plex[plexno], force, recurse);
+	    log(LOG_INFO, "vinum: removing %s\n", vol->name);
+	    free_volume(volno);
+	    vinum_conf.volumes_used--;			    /* one less volume */
+	} else
+	    ioctl_reply->error = EBUSY;			    /* can't do that */
+    } else {
+	log(LOG_INFO, "vinum: removing %s\n", vol->name);
+	free_volume(volno);
+	vinum_conf.volumes_used--;			    /* one less volume */
+    }
+}
+
+/* Finish configuring a subdisk.  Currently called only from ioctl */
+void
+update_sd_config(int sdno, int diskconfig)
+{
+    if (diskconfig == 0)				    /* diskconfig not set, */
+	set_sd_state(sdno, sd_up, setstate_configuring);    /* so ask for the subdisk to come up */
+    SD[sdno].flags &= ~VF_NEWBORN;			    /* no longer newly born */
+}
+
+void
+update_plex_config(int plexno, int diskconfig)
+{
+    u_int64_t size;					    /* sum of the subdisk sizes */
+    int sdno;						    /* index into plex->sdnos */
+    struct plex *plex = &PLEX[plexno];
+    enum plexstate state = plex_up;			    /* state we want the plex in.  NOTE(review): set below but never read */
+    int remainder;					    /* size of fractional stripe at end */
+    int added_plex;					    /* set if we add a plex to a volume */
+    int required_sds;					    /* number of subdisks we need */
+    struct sd *sd;
+    struct volume *vol;
+    int data_sds = 0;					    /* number of sds carrying data */
+
+    if (plex->state < plex_init)			    /* not a real plex, */
+	return;						    /* leave it alone */
+    added_plex = 0;
+    if (plex->volno >= 0) {				    /* we have a volume */
+	vol = &VOL[plex->volno];
+
+	/*
+	 * If we're newly born,
+	 * and the volume isn't,
+	 * and it has other plexes,
+	 * and we didn't read this mess from disk,
+	 * we were added later.
+	 */
+	if ((plex->flags & VF_NEWBORN)
+	    && ((vol->flags & VF_NEWBORN) == 0)
+	    && (vol->plexes > 0)
+	    && (diskconfig == 0)) {
+	    added_plex = 1;
+	    state = plex_down;				    /* so take ourselves down */
+	}
+    }
+    /*
+     * Check that our subdisks make sense.  For
+     * striped plexes, we need at least two
+     * subdisks, and for RAID-4 and RAID-5 plexes we
+     * need at least three subdisks.  In each case
+     * they must all be the same size.
+     */
+    if (plex->organization == plex_striped) {
+	data_sds = plex->subdisks;			    /* every subdisk carries data */
+	required_sds = 2;
+    } else if (isparity(plex)) {			    /* RAID 4 or 5 */
+	data_sds = plex->subdisks - 1;			    /* one subdisk's worth is parity */
+	required_sds = 3;
+    } else
+	required_sds = 0;				    /* no minimum for other organizations */
+    if (required_sds > 0) {				    /* striped, RAID-4 or RAID-5 */
+	if (plex->subdisks < required_sds) {
+	    log(LOG_ERR,
+		"vinum: plex %s does not have at least %d subdisks\n",
+		plex->name,
+		required_sds);
+	    state = plex_faulty;
+	}
+	/*
+	 * Now see if the plex size is a multiple of
+	 * the stripe size.  If not, trim off the end
+	 * of each subdisk and return it to the drive.
+	 */
+	if (plex->length > 0) {
+	    if (data_sds > 0) {
+		if (plex->stripesize > 0) {
+		    remainder = (int) (plex->length	    /* are we exact? */
+			% ((u_int64_t) plex->stripesize * data_sds));
+		    if (remainder) {			    /* no */
+			log(LOG_INFO, "vinum: removing %d blocks of partial stripe at the end of %s\n",
+			    remainder,
+			    plex->name);
+			plex->length -= remainder;	    /* shorten the plex */
+			remainder /= data_sds;		    /* spread the remainder amongst the sds */
+			for (sdno = 0; sdno < plex->subdisks; sdno++) {
+			    sd = &SD[plex->sdnos[sdno]];    /* point to the subdisk */
+			    return_drive_space(sd->driveno, /* return the space */
+				sd->driveoffset + sd->sectors - remainder,
+				remainder);
+			    sd->sectors -= remainder;	    /* and shorten it */
+			}
+		    }
+		} else					    /* no stripe size.  NOTE(review): this else pairs with the stripesize test, not data_sds */
+		    plex->length = 0;			    /* reset length */
+	    }
+	}
+    }
+    size = 0;
+    for (sdno = 0; sdno < plex->subdisks; sdno++) {
+	sd = &SD[plex->sdnos[sdno]];
+	if (isstriped(plex)
+	    && (sdno > 0)
+	    && (sd->sectors != SD[plex->sdnos[sdno - 1]].sectors)) { /* differs from its predecessor */
+	    log(LOG_ERR, "vinum: %s must have equal sized subdisks\n", plex->name);
+	    state = plex_down;
+	}
+	size += sd->sectors;
+	if (added_plex)					    /* we were added later */
+	    sd->state = sd_stale;			    /* stale until proven otherwise */
+	if (plex->sectorsize != 0) {
+	    if (sd->sectorsize != plex->sectorsize)	    /* incompatible sector sizes? */
+		printf("vinum: incompatible sector sizes.  "
+		    "%s has %d bytes, %s has %d bytes.  Ignored.\n",
+		    sd->name,
+		    sd->sectorsize,
+		    plex->name,
+		    plex->sectorsize);
+	} else						    /* not set yet, */
+	    plex->sectorsize = sd->sectorsize;		    /* take it from this subdisk */
+    }
+
+    if (plex->subdisks) {				    /* plex has subdisks, calculate size */
+	/*
+	 * XXX We shouldn't need to calculate the size any
+	 * more.  Check this some time
+	 */
+	if (isparity(plex))
+	    size = size / plex->subdisks * (plex->subdisks - 1); /* less space for RAID-4 and RAID-5 */
+	if (plex->length != size)
+	    log(LOG_INFO,
+		"Correcting length of %s: was %lld, is %lld\n",
+		plex->name,
+		(long long) plex->length,
+		(long long) size);
+	plex->length = size;
+    } else {						    /* no subdisks, */
+	plex->length = 0;				    /* no size */
+	state = plex_down;				    /* take it down */
+    }
+    update_plex_state(plexno);				    /* set the state; the local state is not passed on */
+    plex->flags &= ~VF_NEWBORN;				    /* no longer newly born */
+}
+
+void
+update_volume_config(int volno)
+{
+    struct volume *vol = &VOL[volno];
+    struct plex *plex;
+    int plexno;
+
+    /*
+     * Recalculate the size of the volume,
+     * which might change if the original
+     * plexes were not a multiple of the
+     * stripe size.  Unallocated volumes
+     * only have VF_NEWBORN cleared.
+     */
+    if (vol->state != volume_unallocated) {
+	vol->size = 0;
+	for (plexno = 0; plexno < vol->plexes; plexno++) {
+	    plex = &PLEX[vol->plex[plexno]];
+	    vol->size = max(plex->length, vol->size);	    /* maximum size */
+	    plex->volplexno = plexno;			    /* note it in the plex */
+	    /* adopt the plex's sector size if the volume has none yet */
+	    if (vol->sectorsize == 0)			    /* not set yet, */
+		vol->sectorsize = plex->sectorsize;
+	    else if (plex->sectorsize != vol->sectorsize)   /* incompatible sector sizes? */
+		printf("vinum: incompatible sector sizes.  "
+		    "%s has %d, %s has %d.  Ignored.\n",
+		    plex->name,
+		    plex->sectorsize,
+		    vol->name,
+		    vol->sectorsize);
+	}
+    }
+    vol->flags &= ~VF_NEWBORN;				    /* no longer newly born */
+}
+
+/*
+ * Update the global configuration.  This is
+ * called after configuration changes.
+ *
+ * diskconfig is != 0 if we're reading in a config
+ * from disk.  In this case, we don't try to bring
+ * the devices up, though we will bring them down
+ * if an error got missed when writing to disk.
+ */
+void
+updateconfig(int diskconfig)
+{
+    int objno;						    /* plex or volume index */
+    struct volume *vol;
+
+    for (objno = 0; objno < vinum_conf.plexes_allocated; objno++)
+	update_plex_config(objno, diskconfig);
+
+    for (objno = 0; objno < vinum_conf.volumes_allocated; objno++) {
+	vol = &VOL[objno];
+	if (vol->state <= volume_uninit)		    /* nothing set up here, */
+	    continue;					    /* skip it */
+	vol->flags &= ~VF_CONFIG_SETUPSTATE;		    /* no more setupstate */
+	update_volume_state(objno);
+	update_volume_config(objno);
+    }
+    save_config();
+}
+
+/*
+ * Start manual changes to the configuration and lock out
+ * others who may wish to do so.
+ * XXX why do we need this and lock_config too?
+ */
+int
+start_config(int force)
+{
+    int error;
+
+    current_drive = -1;					    /* forget the last drive we mentioned, */
+    current_plex = -1;					    /* the last plex */
+    current_volume = -1;				    /* and the last volume */
+    while (vinum_conf.flags & VF_CONFIGURING) {		    /* somebody else is configuring */
+	vinum_conf.flags |= VF_WILL_CONFIGURE;		    /* note that we're waiting */
+	error = tsleep(&vinum_conf, PRIBIO | PCATCH, "vincfg", 0);
+	if (error != 0)					    /* the sleep failed, */
+	    return error;				    /* pass the error on */
+    }
+    /*
+     * We need two flags here: VF_CONFIGURING
+     * tells other processes to hold off (this
+     * function), and VF_CONFIG_INCOMPLETE
+     * tells the state change routines not to
+     * propagate incremental state changes
+     */
+    vinum_conf.flags |= VF_CONFIGURING | VF_CONFIG_INCOMPLETE;
+    if (force)
+	vinum_conf.flags |= VF_FORCECONFIG;		    /* overwrite differently named drives */
+    current_drive = -1;					    /* reset the defaults */
+    current_plex = -1;					    /* and the same for the last plex */
+    current_volume = -1;				    /* and the last volume */
+    return 0;
+}
+
+/*
+ * Unlock the configuration after updating it:
+ * fully if update is nonzero, otherwise (e.g. in
+ * a recursive loop via throw_rude_remark) with
+ * only the updates done for an on-disk config.
+ */
+void
+finish_config(int update)
+{
+    int diskconfig = (update == 0);			    /* 1 asks updateconfig for the partial update */
+
+    /* we've finished our config */
+    vinum_conf.flags &= ~(VF_CONFIG_INCOMPLETE | VF_READING_CONFIG | VF_FORCECONFIG);
+    updateconfig(diskconfig);
+    vinum_conf.flags &= ~VF_CONFIGURING;		    /* and now other people can take a turn */
+    if (vinum_conf.flags & VF_WILL_CONFIGURE) {		    /* somebody said they were waiting */
+	vinum_conf.flags &= ~VF_WILL_CONFIGURE;
+	wakeup_one(&vinum_conf);
+    }
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumdaemon.c b/sys/dev/vinum/vinumdaemon.c
new file mode 100644
index 0000000..3ae09c0
--- /dev/null
+++ b/sys/dev/vinum/vinumdaemon.c
@@ -0,0 +1,281 @@
+/* daemon.c: kernel part of Vinum daemon */
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumdaemon.c,v 1.8 2000/01/03 05:22:03 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+#ifdef VINUMDEBUG
+#include <sys/reboot.h>
+#endif
+
+/* declarations */
+void recover_io(struct request *rq);
+
+int daemon_options = 0;					    /* options */
+int daemonpid;						    /* PID of daemon */
+struct daemonq *daemonq;				    /* daemon's work queue */
+struct daemonq *dqend;					    /* and the end of the queue */
+
+/*
+ * We normally call Malloc to get a queue element.  In interrupt
+ * context, we can't guarantee that we'll get one, since we're not
+ * allowed to wait.  If malloc fails, use one of these elements.
+ */
+
+#define INTQSIZE 4
+struct daemonq intq[INTQSIZE];				    /* queue elements for interrupt context */
+struct daemonq *intqp = intq;				    /* next spare to use; dereferenced, so never NULL */
+
+/* The daemon's main loop: sleep until kicked, then work through the request queue */
+void
+vinum_daemon(void)
+{
+    int s;
+    struct daemonq *request;
+
+    PROC_LOCK(curproc);
+    curproc->p_flag |= P_SYSTEM;			    /* we're a system process */
+    mtx_lock_spin(&sched_lock);
+    curproc->p_sflag |= PS_INMEM;
+    mtx_unlock_spin(&sched_lock);
+    PROC_UNLOCK(curproc);
+    daemon_save_config();				    /* start by saving the configuration */
+    daemonpid = curproc->p_pid;				    /* mark our territory */
+    while (1) {
+	tsleep(&vinum_daemon, PRIBIO, "vinum", 0);	    /* wait for something to happen */
+
+	/*
+	 * It's conceivable that, as the result of an
+	 * I/O error, we'll be out of action long
+	 * enough that another daemon gets started.
+	 * That's OK, just give up gracefully.
+	 */
+	if (curproc->p_pid != daemonpid) {		    /* we've been ousted in our sleep */
+	    if (daemon_options & daemon_verbose)
+		log(LOG_INFO, "vinum: abdicating\n");
+	    return;
+	}
+	while (daemonq != NULL) {			    /* we have work to do, */
+	    s = splhigh();				    /* don't get interrupted here */
+	    request = daemonq;				    /* get the request */
+	    daemonq = daemonq->next;			    /* and detach it */
+	    if (daemonq == NULL)			    /* got to the end, */
+		dqend = NULL;				    /* no end any more */
+	    splx(s);
+
+	    switch (request->type) {
+		/*
+		 * We had an I/O error on a request.  Go through the
+		 * request and try to salvage it
+		 */
+	    case daemonrq_ioerror:
+		if (daemon_options & daemon_verbose) {
+		    struct request *rq = request->info.rq;
+
+		    log(LOG_WARNING,
+			"vinum: recovering I/O request: %p\n%s dev %d.%d, offset 0x%llx, length %ld\n",
+			rq, rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
+			major(rq->bp->b_dev), minor(rq->bp->b_dev),
+			(long long)rq->bp->b_blkno, rq->bp->b_bcount);
+		}
+		recover_io(request->info.rq);		    /* the failed request */
+		break;
+
+		/*
+		 * Write the config to disk.  We could end up with
+		 * quite a few of these in a row.  Only honour the
+		 * last one
+		 */
+	    case daemonrq_saveconfig:
+		if ((daemonq == NULL)			    /* no more requests */
+		||(daemonq->type != daemonrq_saveconfig)) { /* or the next isn't the same */
+		    if (((daemon_options & daemon_noupdate) == 0) /* we're allowed to do it */
+		    &&((vinum_conf.flags & VF_READING_CONFIG) == 0)) { /* and we're not building the config now */
+			/*
+			 * We obviously don't want to save a partial
+			 * configuration.  Less obviously, we don't
+			 * need to do anything if we're asked to write
+			 * the config when we're building it up,
+			 * because we save it at the end.
+			 */
+			if (daemon_options & daemon_verbose)
+			    log(LOG_INFO, "vinum: saving config\n");
+			daemon_save_config();		    /* save it */
+		    }
+		}
+		break;
+
+	    case daemonrq_return:			    /* been told to stop */
+		if (daemon_options & daemon_verbose)
+		    log(LOG_INFO, "vinum: stopping\n");
+		daemon_options |= daemon_stopped;	    /* note that we've stopped */
+		for (;;) {				    /* drop this request and any backed up */
+		    if (request->privateinuse)		    /* a static spare: mustn't Free it, */
+			request->privateinuse = 0;	    /* just mark it available again */
+		    else
+			Free(request);			    /* we got it from Malloc, return it */
+		    if ((request = daemonq) == NULL)	    /* nothing left in the queue */
+			break;
+		    daemonq = request->next;		    /* detach the next one */
+		}
+		wakeup(&vinumclose);			    /* and wake any waiting vinum(8)s */
+		return;
+
+	    case daemonrq_ping:				    /* tell the caller we're here */
+		if (daemon_options & daemon_verbose)
+		    log(LOG_INFO, "vinum: ping reply\n");
+		wakeup(&vinum_finddaemon);		    /* wake up the caller */
+		break;
+
+	    case daemonrq_closedrive:			    /* close a drive */
+		close_drive(request->info.drive);	    /* do it */
+		break;
+
+	    case daemonrq_init:				    /* initialize a plex */
+		/* XXX */
+	    case daemonrq_revive:			    /* revive a subdisk */
+		/* XXX */
+		/* FALLTHROUGH */
+	    default:
+		log(LOG_WARNING, "Invalid request\n");
+		break;
+	    }
+	    if (request->privateinuse)			    /* one of ours, */
+		request->privateinuse = 0;		    /* no longer in use */
+	    else
+		Free(request);				    /* return it */
+	}
+    }
+}
+
+/*
+ * Recover a failed I/O operation.
+ *
+ * The correct way to do this is to examine the request and determine
+ * how to recover each individual failure.  In the case of a write,
+ * this could be as simple as doing nothing: the defective drives may
+ * already be down, and there may be nothing else to do.  In case of
+ * a read, it will be necessary to retry if there are alternative
+ * copies of the data.
+ *
+ * The easy way (here) is just to reissue the request.  This will take
+ * a little longer, but nothing like as long as the failure will have
+ * taken.
+ * rq is the failed request; its buffer rq->bp is reissued as it stands.
+ */
+void
+recover_io(struct request *rq)
+{
+    /*
+     * This should read:
+     *
+     *     vinumstrategy(rq->bp);
+     *
+     * Negotiate with phk to get it fixed.
+     */
+    DEV_STRATEGY(rq->bp);				    /* reissue the command */
+}
+
+/* Functions called to interface with the daemon */
+
+/* queue a request of the given type for the daemon and wake it up */
+void
+queue_daemon_request(enum daemonrq type, union daemoninfo info)
+{
+    int s;
+
+    struct daemonq *qelt = (struct daemonq *) Malloc(sizeof(struct daemonq));
+
+    if (qelt == NULL) {					    /* malloc failed, we're prepared for that */
+	/*
+	 * Take one of our spares.  Give up if it's still in use; the only
+	 * message we're likely to get here is a 'drive failed' message,
+	 * and that'll come by again if we miss it.
+	 */
+	if (intqp == NULL)				    /* spares not used before, */
+	    intqp = intq;				    /* start with the first one */
+	if (intqp->privateinuse)			    /* still in use? */
+	    return;					    /* yes, give up */
+	qelt = intqp++;
+	if (intqp == &intq[INTQSIZE])			    /* got to the end, */
+	    intqp = intq;				    /* wrap around */
+	qelt->privateinuse = 1;				    /* it's ours, and it's in use */
+    } else
+	qelt->privateinuse = 0;
+
+    qelt->next = NULL;					    /* end of the chain */
+    qelt->type = type;
+    qelt->info = info;
+    s = splhigh();
+    if (daemonq)					    /* something queued already, */
+	dqend->next = qelt;				    /* hang it on the end */
+    else						    /* queue is empty, */
+	daemonq = qelt;					    /* this is the whole queue */
+    dqend = qelt;					    /* either way it's the new end */
+    splx(s);
+    wakeup(&vinum_daemon);				    /* and give the daemon a kick */
+}
+
+/*
+ * Make sure a daemon is running: ping an existing one, and if there is
+ * none or it doesn't answer, run the daemon ourselves.  Always returns 0.
+ */
+int
+vinum_finddaemon(void)
+{
+    int result;
+
+    if (daemonpid != 0) {				    /* we think we have a daemon, */
+	queue_daemon_request(daemonrq_ping, (union daemoninfo) 0); /* queue a ping */
+	result = tsleep(&vinum_finddaemon, PUSER, "reap", 2 * hz); /* wait up to 2 * hz ticks for the reply */
+	if (result == 0)				    /* yup, the daemon's up and running */
+	    return 0;
+    }
+    /* no daemon, or we couldn't talk to it: start it */
+    vinum_daemon();					    /* start the daemon; we get back when it returns */
+    return 0;
+}
+
+int
+vinum_setdaemonopts(int options)
+{
+    daemon_options = options;				    /* replaces all the option bits */
+    return 0;						    /* always succeeds */
+}
diff --git a/sys/dev/vinum/vinumext.h b/sys/dev/vinum/vinumext.h
new file mode 100644
index 0000000..807bb5c6
--- /dev/null
+++ b/sys/dev/vinum/vinumext.h
@@ -0,0 +1,263 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumext.h,v 1.33 2003/05/23 00:57:48 grog Exp $
+ * $FreeBSD$
+ */
+
+/* vinumext.h: external definitions */
+
+/* *sigh* We still need this at the moment. */
+#ifdef _KERNEL
+extern struct _vinum_conf vinum_conf;			    /* configuration information */
+extern struct mtx plexmutex[];				    /* mutexes for plexes to use */
+#else
+extern struct __vinum_conf vinum_conf;			    /* configuration information */
+#endif
+
+#ifdef VINUMDEBUG
+extern int debug;					    /* debug flags */
+#endif
+
+/* Physical read and write of a drive: driveio() with the direction filled in */
+#define read_drive(a, b, c, d) driveio (a, b, c, d, BIO_READ)
+#define write_drive(a, b, c, d) driveio (a, b, c, d, BIO_WRITE)
+
+#define CHECKALLOC(ptr, msg) \
+  do { if ((ptr) == NULL) \
+    { \
+    printf ("%s", (msg)); \
+    longjmp (command_fail, -1); \
+    } } while (0)			/* a single statement; msg is printed as data, not as a format */
+#ifndef _KERNEL
+struct vnode;
+struct thread;
+#endif
+
+#ifdef _KERNEL
+int vinum_inactive(int);
+void free_vinum(int);
+int give_sd_to_plex(int plexno, int sdno);
+void give_sd_to_drive(int sdno);
+int give_plex_to_volume(int, int, int);
+struct drive *check_drive(char *);
+enum drive_label_info read_drive_label(struct drive *, int);
+int parse_config(char *, struct keywordset *, int);
+int parse_user_config(char *cptr, struct keywordset *keyset);
+u_int64_t sizespec(char *spec);
+int volume_index(struct volume *volume);
+int plex_index(struct plex *plex);
+int sd_index(struct sd *sd);
+int drive_index(struct drive *drive);
+int my_plex(int volno, int plexno);
+int my_sd(int plexno, int sdno);
+int get_empty_drive(void);
+int find_drive(const char *name, int create);
+int find_drive_by_name(const char *devname, int create);
+int get_empty_sd(void);
+int find_subdisk(const char *name, int create);
+void return_drive_space(int driveno, int64_t offset, int length);
+void free_sd(int sdno);
+void free_volume(int volno);
+int get_empty_plex(void);
+int find_plex(const char *name, int create);
+void free_plex(int plexno);
+int get_empty_volume(void);
+int find_volume(const char *name, int create);
+void config_subdisk(int);
+void config_plex(int);
+void config_volume(int);
+void config_drive(int);
+void updateconfig(int);
+void update_sd_config(int sdno, int kernelstate);
+void update_plex_config(int plexno, int kernelstate);
+void update_volume_config(int volno);
+void update_config(void);
+void drive_io_done(struct buf *);
+void save_config(void);
+void daemon_save_config(void);
+void write_config(char *, int);
+int start_config(int);
+void finish_config(int);
+void remove(struct vinum_ioctl_msg *msg);
+void remove_drive_entry(int driveno, int force);
+void remove_sd_entry(int sdno, int force, int recurse);
+void remove_plex_entry(int plexno, int force, int recurse);
+void remove_volume_entry(int volno, int force, int recurse);
+
+void checkdiskconfig(char *);
+int open_drive(struct drive *, struct thread *, int);
+void close_drive(struct drive *drive);
+void close_locked_drive(struct drive *drive);
+int driveio(struct drive *, char *, size_t, off_t, int);
+int set_drive_parms(struct drive *drive);
+int init_drive(struct drive *, int);
+/* void throw_rude_remark (int, struct _ioctl_reply *, char *, ...); XXX */
+void throw_rude_remark(int, char *,...);
+
+void format_config(char *config, int len);
+void checkkernel(char *op);
+void free_drive(struct drive *drive);
+void down_drive(struct drive *drive);
+void remove_drive(int driveno);
+
+int vinum_scandisk(char *drivename);
+
+/* I/O */
+d_open_t vinumopen;
+d_close_t vinumclose;
+d_strategy_t vinumstrategy;
+d_ioctl_t vinumioctl;
+
+int vinum_super_ioctl(dev_t, u_long, caddr_t);
+int vinumstart(struct buf *bp, int reviveok);
+int launch_requests(struct request *rq, int reviveok);
+void sdio(struct buf *bp);
+
+/* XXX Do we need this? */
+int vinumpart(dev_t);
+
+extern jmp_buf command_fail;				    /* return here if config fails */
+
+#ifdef VINUMDEBUG
+/* Memory allocation and request tracing */
+void vinum_meminfo(caddr_t data);
+int vinum_mallocinfo(caddr_t data);
+int vinum_rqinfo(caddr_t data);
+void LongJmp(jmp_buf, int);
+char *basename(char *);
+#endif
+
+#ifdef VINUMDEBUG
+void expand_table(void **, int, int, char *, int);
+#else
+void expand_table(void **, int, int);
+#endif
+
+struct disklabel;
+struct request;
+struct rqgroup *allocrqg(struct request *rq, int elements);
+void deallocrqg(struct rqgroup *rqg);
+
+/* Device number decoding */
+int Volno(dev_t x);
+int Plexno(dev_t x);
+int Sdno(dev_t x);
+
+/* State transitions */
+int set_drive_state(int driveno, enum drivestate state, enum setstateflags flags);
+int set_sd_state(int sdno, enum sdstate state, enum setstateflags flags);
+enum requeststatus checksdstate(struct sd *sd, struct request *rq, daddr_t diskaddr, daddr_t diskend);
+int set_plex_state(int plexno, enum plexstate state, enum setstateflags flags);
+int set_volume_state(int volumeno, enum volumestate state, enum setstateflags flags);
+void update_sd_state(int sdno);
+void forceup(int plexno);
+void update_plex_state(int plexno);
+void update_volume_state(int volno);
+void invalidate_subdisks(struct plex *, enum sdstate);
+void get_volume_label(char *name, int plexes, u_int64_t size, struct disklabel *lp);
+int write_volume_label(int);
+void start_object(struct vinum_ioctl_msg *);
+void stop_object(struct vinum_ioctl_msg *);
+void setstate(struct vinum_ioctl_msg *msg);
+void setstate_by_force(struct vinum_ioctl_msg *msg);
+void vinum_label(int);
+int vinum_writedisklabel(struct volume *, struct disklabel *);
+int initsd(int, int);
+struct buf *parityrebuild(struct plex *, u_int64_t, int, enum parityop, struct rangelock **, off_t *);
+enum requeststatus sddownstate(struct request *rq);
+
+int restart_plex(int plexno);
+int revive_read(struct sd *sd);
+int revive_block(int sdno);
+void parityops(struct vinum_ioctl_msg *);
+
+/* Auxiliary functions */
+enum sdstates sdstatemap(struct plex *plex);
+enum volplexstate vpstate(struct plex *plex);
+#endif
+
+struct drive *validdrive(int driveno, struct _ioctl_reply *);
+struct sd *validsd(int sdno, struct _ioctl_reply *);
+struct plex *validplex(int plexno, struct _ioctl_reply *);
+struct volume *validvol(int volno, struct _ioctl_reply *);
+void resetstats(struct vinum_ioctl_msg *msg);
+
+/* Locking */
+#ifdef VINUMDEBUG
+int lockdrive(struct drive *drive, char *, int);
+#else
+int lockdrive(struct drive *drive);
+#endif
+void unlockdrive(struct drive *drive);
+int lockvol(struct volume *vol);
+void unlockvol(struct volume *vol);
+int lockplex(struct plex *plex);
+void unlockplex(struct plex *plex);
+struct rangelock *lockrange(daddr_t stripe, struct buf *bp, struct plex *plex);
+int lock_config(void);
+void unlock_config(void);
+
+/* Daemon */
+
+void vinum_daemon(void);
+int vinum_finddaemon(void);
+int vinum_setdaemonopts(int);
+extern struct daemonq *daemonq;				    /* daemon's work queue */
+extern struct daemonq *dqend;				    /* and the end of the queue */
+extern struct cdevsw vinum_cdevsw;
+
+#undef Free						    /* defined in some funny net stuff */
+#ifdef _KERNEL
+#ifdef VINUMDEBUG
+#define Malloc(x)  MMalloc ((x), __FILE__, __LINE__)	    /* show where we came from */
+#define Free(x)	   FFree ((x), __FILE__, __LINE__)	    /* show where we came from */
+caddr_t MMalloc(int size, char *, int);
+void FFree(void *mem, char *, int);
+#define LOCKDRIVE(d) lockdrive (d, __FILE__, __LINE__)	    /* show where we came from */
+#else							    /* kernel, not VINUMDEBUG */
+#define Malloc(x)  malloc((x), M_DEVBUF, \
+	curthread->td_proc->p_intr_nesting_level == 0? M_WAITOK: M_NOWAIT) /* M_NOWAIT when the nesting level is nonzero */
+#define Free(x)    free((x), M_DEVBUF)
+#define LOCKDRIVE(d) lockdrive (d)
+#endif							    /* VINUMDEBUG */
+#else							    /* not _KERNEL */
+#define Malloc(x)  malloc ((x))				    /* just the size */
+#define Free(x)	   free ((x))				    /* just the address */
+#endif							    /* _KERNEL */
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumhdr.h b/sys/dev/vinum/vinumhdr.h
new file mode 100644
index 0000000..e8161e8
--- /dev/null
+++ b/sys/dev/vinum/vinumhdr.h
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *  
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ */
+
+/* Header files used by all modules */
+/*
+ * $Id: vinumhdr.h,v 1.19 2001/05/22 04:07:22 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#ifdef _KERNEL
+#include "opt_vinum.h"
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/conf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/sysctl.h>
+#endif
+#include <sys/errno.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/uio.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/disk.h>
+#include <sys/disklabel.h>
+#include <sys/syslog.h>
+#include <sys/fcntl.h>
+#include <sys/queue.h>
+#ifdef _KERNEL
+#include <machine/setjmp.h>
+#include <machine/stdarg.h>
+#else
+#include <setjmp.h>
+#include <stdarg.h>
+#endif
+#include <vm/vm.h>
+#include <dev/vinum/vinumvar.h>
+#include <dev/vinum/vinumio.h>
+#include <dev/vinum/vinumkw.h>
+#include <dev/vinum/vinumext.h>
+#include <dev/vinum/vinumutil.h>
+#include <machine/cpu.h>
diff --git a/sys/dev/vinum/vinuminterrupt.c b/sys/dev/vinum/vinuminterrupt.c
new file mode 100644
index 0000000..8d72579
--- /dev/null
+++ b/sys/dev/vinum/vinuminterrupt.c
@@ -0,0 +1,467 @@
+/* vinuminterrupt.c: bottom half of the driver */
+
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinuminterrupt.c,v 1.14 2001/05/23 23:03:37 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+#include <sys/resourcevar.h>
+
+void complete_raid5_write(struct rqelement *);		    /* second phase of a RAID-4/5 group write */
+void complete_rqe(struct buf *bp);			    /* completion handler; installed as b_iodone in complete_raid5_write */
+void sdio_done(struct buf *bp);				    /* I/O on subdisk completed */
+
+/*
+ * I/O completion for one request element.  Accounts for the
+ * finished transfer, records and logs any error, updates the
+ * statistics, does RAID-4/5 recovery or parity work, and
+ * completes the user request after its last subrequest.
+ *
+ * bp is in fact a struct rqelement; it is cast back below.
+ */
+void
+complete_rqe(struct buf *bp)
+{
+    struct rqelement *rqe;
+    struct request *rq;
+    struct rqgroup *rqg;
+    struct buf *ubp;					    /* user buffer */
+    struct drive *drive;
+    struct sd *sd;
+    char *gravity;					    /* for error messages: "" or " fatal" */
+
+    rqe = (struct rqelement *) bp;			    /* point to the element that completed */
+    rqg = rqe->rqg;					    /* and the request group */
+    rq = rqg->rq;					    /* and the complete request */
+    ubp = rq->bp;					    /* user buffer */
+
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LASTREQS)
+	logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
+#endif
+    drive = &DRIVE[rqe->driveno];
+    drive->active--;					    /* one less outstanding I/O on this drive */
+    vinum_conf.active--;				    /* one less outstanding I/O globally */
+    if ((drive->active == (DRIVE_MAXACTIVE - 1))	    /* we were at the drive limit */
+    ||(vinum_conf.active == VINUM_MAXACTIVE))		    /* or the global limit; NOTE(review): tests the limit itself, not limit - 1 as the drive test does -- confirm */
+	wakeup(&launch_requests);			    /* let another one at it */
+    if ((bp->b_io.bio_flags & BIO_ERROR) != 0) {	    /* transfer in error */
+	gravity = "";
+	sd = &SD[rqe->sdno];
+
+	if (bp->b_error != 0)				    /* did it return a number? */
+	    rq->error = bp->b_error;			    /* yes, put it in. */
+	else if (rq->error == 0)			    /* no: do we have one already? */
+	    rq->error = EIO;				    /* no: catchall "I/O error" */
+	sd->lasterror = rq->error;
+	if (bp->b_iocmd == BIO_READ) {			    /* read operation */
+	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
+		gravity = " fatal";
+		set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
+	    }
+	    log(LOG_ERR,
+		"%s:%s read error, block %lld for %ld bytes\n",
+		sd->name,				    /* name first: "<sd>: fatal read error" */
+		gravity,
+		(long long)bp->b_blkno,
+		bp->b_bcount);
+	} else {					    /* write operation */
+	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
+		gravity = " fatal";
+		set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
+	    }
+	    log(LOG_ERR,
+		"%s:%s write error, block %lld for %ld bytes\n",
+		sd->name,				    /* name first: "<sd>: fatal write error" */
+		gravity,
+		(long long)bp->b_blkno,
+		bp->b_bcount);
+	}
+	log(LOG_ERR,
+	    "%s: user buffer block %lld for %ld bytes\n",
+	    sd->name,
+	    (long long)ubp->b_blkno,
+	    ubp->b_bcount);
+	if (rq->error == ENXIO) {			    /* the drive's down too */
+	    log(LOG_ERR,
+		"%s: fatal drive I/O error, block %lld for %ld bytes\n",
+		DRIVE[rqe->driveno].label.name,
+		(long long)bp->b_blkno,
+		bp->b_bcount);
+	    DRIVE[rqe->driveno].lasterror = rq->error;
+	    set_drive_state(rqe->driveno,		    /* take the drive down */
+		drive_down,
+		setstate_force);
+	}
+    }
+    /* Now update the statistics: drive, subdisk, plex and (if the plex belongs to one) volume */
+    if (bp->b_iocmd == BIO_READ) {			    /* read operation */
+	DRIVE[rqe->driveno].reads++;
+	DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
+	SD[rqe->sdno].reads++;
+	SD[rqe->sdno].bytes_read += bp->b_bcount;
+	PLEX[rqe->rqg->plexno].reads++;
+	PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
+	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
+	    VOL[PLEX[rqe->rqg->plexno].volno].reads++;
+	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
+	}
+    } else {						    /* write operation */
+	DRIVE[rqe->driveno].writes++;
+	DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
+	SD[rqe->sdno].writes++;
+	SD[rqe->sdno].bytes_written += bp->b_bcount;
+	PLEX[rqe->rqg->plexno].writes++;
+	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
+	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
+	    VOL[PLEX[rqe->rqg->plexno].volno].writes++;
+	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
+	}
+    }
+    if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
+	int *sdata;					    /* source */
+	int *data;					    /* and group data */
+	int length;					    /* and count involved */
+	int count;					    /* loop counter */
+	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
+
+	/* XOR this element's data into the bad subdisk's buffer */
+	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT];	/* old data contents */
+	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
+	length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
+
+	for (count = 0; count < length; count++)
+	    data[count] ^= sdata[count];
+
+	/*
+	 * In a normal read, we will normally read directly
+	 * into the user buffer.  This doesn't work if
+	 * we're also doing a recovery, so we have to
+	 * copy it
+	 */
+	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
+	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
+	    char *dst;
+
+	    dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
+	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
+	    bcopy(src, dst, length);			    /* move it */
+	}
+    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation  */
+    &&(rqg->active == 1))				    /* and this is the last active request */
+	complete_raid5_write(rqe);
+    /*
+     * This is the earliest place where we can be
+     * sure that the request has really finished,
+     * since complete_raid5_write can issue new
+     * requests.
+     */
+    rqg->active--;					    /* this request now finished */
+    if (rqg->active == 0) {				    /* request group finished, */
+	rq->active--;					    /* one less */
+	if (rqg->lock) {				    /* got a lock? */
+	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
+	    rqg->lock = 0;
+	}
+    }
+    if (rq->active == 0) {				    /* request finished, */
+#ifdef VINUMDEBUG
+	if (debug & DEBUG_RESID) {
+	    if (ubp->b_resid != 0)			    /* still something to transfer? */
+		Debugger("resid");
+	}
+#endif
+
+	if (rq->error) {				    /* did we have an error? */
+	    if (rq->isplex) {				    /* plex operation, */
+		ubp->b_io.bio_flags |= BIO_ERROR;	    /* yes, propagate to user; NOTE(review): this path calls neither bufdone() nor freerq() -- confirm who completes ubp */
+		ubp->b_error = rq->error;
+	    } else					    /* try to recover */
+		queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
+	} else {
+	    ubp->b_resid = 0;				    /* completed our transfer */
+	    if (rq->isplex == 0)			    /* volume request, */
+		VOL[rq->volplex.volno].active--;	    /* another request finished */
+	    if (rq->flags & XFR_COPYBUF) {		    /* free our copy and restore the saved data pointer */
+		Free(ubp->b_data);
+		ubp->b_data = rq->save_data;
+	    }
+	    bufdone(ubp);				    /* top level buffer completed */
+	    freerq(rq);					    /* return the request storage */
+	}
+    }
+}
+
+/* Release a request block together with its groups, range locks and element buffers */
+void
+freerq(struct request *rq)
+{
+    struct rqgroup *group = rq->rqg;			    /* walks the chain of request groups */
+    while (group != NULL) {
+	struct rqgroup *following;			    /* successor, noted before group is freed */
+	int idx;
+	if (group->lock)				    /* range lock still held: release it */
+	    unlockrange(group->plexno, group->lock);
+	for (idx = 0; idx < group->count; idx++) {
+	    struct rqelement *elem = &group->rqe[idx];
+	    if ((elem->flags & XFR_MALLOCED) && elem->b.b_data) /* data buffer was allocated and is present */
+		Free(elem->b.b_data);
+	    if (elem->flags & XFR_BUFLOCKED) {		    /* buffer was locked: unlock it again */
+		BUF_UNLOCK(&elem->b);
+		BUF_LOCKFREE(&elem->b);
+	    }
+	}
+	following = group->next;
+	Free(group);
+	group = following;
+    }
+    Free(rq);						    /* finally the request itself */
+}
+
+/* Completion handler for I/O done directly on a subdisk */
+void
+sdio_done(struct buf *bp)
+{
+    struct sdbuf *sdb = (struct sdbuf *) bp;		    /* bp is really a struct sdbuf */
+    struct buf *userbp = sdb->bp;			    /* the buffer this I/O was done for */
+    struct drive *drive = &DRIVE[sdb->driveno];
+    struct sd *sd = &SD[sdb->sdno];
+    if ((sdb->b.b_io.bio_flags & BIO_ERROR) != 0) {	    /* pass an error upwards */
+	userbp->b_io.bio_flags |= BIO_ERROR;
+	userbp->b_error = sdb->b.b_error;
+    }
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LASTREQS)
+	logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
+#endif
+    userbp->b_resid = sdb->b.b_resid;			    /* pass the resid field upwards too */
+    if (bp->b_iocmd != BIO_READ) {			    /* count a write on drive and subdisk */
+	drive->writes++;
+	drive->bytes_written += sdb->b.b_bcount;
+	sd->writes++;
+	sd->bytes_written += sdb->b.b_bcount;
+    } else {						    /* count a read on drive and subdisk */
+	drive->reads++;
+	drive->bytes_read += sdb->b.b_bcount;
+	sd->reads++;
+	sd->bytes_read += sdb->b.b_bcount;
+    }
+    bufdone(userbp);					    /* complete the caller's I/O */
+    BUF_UNLOCK(&sdb->b);
+    BUF_LOCKFREE(&sdb->b);
+    Free(sdb);
+}
+
+/* Start the second phase of a RAID-4 or RAID-5 group write operation: compute the new parity, then issue the data and parity writes. */
+void
+complete_raid5_write(struct rqelement *rqe)
+{
+    int *sdata;						    /* source */
+    int *pdata;						    /* and parity block data */
+    int length;						    /* and count involved, in ints */
+    int count;						    /* loop counter */
+    int rqno;						    /* request index */
+    int rqoffset;					    /* offset of request data from parity data */
+    struct buf *ubp;					    /* user buffer header */
+    struct request *rq;					    /* pointer to our request */
+    struct rqgroup *rqg;				    /* and to the request group */
+    struct rqelement *prqe;				    /* point to the parity block */
+    struct drive *drive;				    /* drive to access */
+
+    rqg = rqe->rqg;					    /* and to our request group */
+    rq = rqg->rq;					    /* point to our request */
+    ubp = rq->bp;					    /* user's buffer header */
+    prqe = &rqg->rqe[0];				    /* point to the parity block */
+
+    /*
+     * If we get to this function, we have normal or
+     * degraded writes, or a combination of both.  We do
+     * the same thing in each case: we perform an
+     * exclusive or to the parity block.  The only
+     * difference is the origin of the data and the
+     * address range.
+     */
+    if (rqe->flags & XFR_DEGRADED_WRITE) {		    /* do the degraded write stuff */
+	pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
+	bzero(pdata, prqe->grouplen << DEV_BSHIFT);	    /* start with nothing in the parity block */
+
+	/* Now get what data we need from each block */
+	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
+	    rqe = &rqg->rqe[rqno];			    /* this request */
+	    sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
+	    length = rqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints, computed as in the other XOR loops */
+
+	    /*
+	     * Add the data block to the parity block.  Before
+	     * we started the request, we zeroed the parity
+	     * block, so the result of adding all the other
+	     * blocks and the block we want to write will be
+	     * the correct parity block.
+	     */
+	    for (count = 0; count < length; count++)
+		pdata[count] ^= sdata[count];
+	    if ((rqe->flags & XFR_MALLOCED)		    /* the buffer was malloced, */
+	    &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {	    /* and we have no normal write, */
+		Free(rqe->b.b_data);			    /* free it now */
+		rqe->flags &= ~XFR_MALLOCED;
+	    }
+	}
+    }
+    if (rqg->flags & XFR_NORMAL_WRITE) {		    /* do normal write stuff */
+	/* Get what data we need from each block */
+	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
+	    rqe = &rqg->rqe[rqno];			    /* this request */
+	    if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
+		== (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
+		sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
+		rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
+		pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
+		length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
+
+		/*
+		 * "remove" the old data block
+		 * from the parity block
+		 */
+		if ((pdata < ((int *) prqe->b.b_data))
+		    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
+		    || (sdata < ((int *) rqe->b.b_data))
+		    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
+		    panic("complete_raid5_write: bounds overflow");
+		for (count = 0; count < length; count++)
+		    pdata[count] ^= sdata[count];
+
+		/* "add" the new data block */
+		sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
+		if ((sdata < ((int *) ubp->b_data))
+		    || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount))))
+		    panic("complete_raid5_write: bounds overflow");
+		for (count = 0; count < length; count++)
+		    pdata[count] ^= sdata[count];
+
+		/* Free the malloced buffer */
+		if (rqe->flags & XFR_MALLOCED) {	    /* the buffer was malloced, */
+		    Free(rqe->b.b_data);		    /* free it */
+		    rqe->flags &= ~XFR_MALLOCED;
+		} else
+		    panic("complete_raid5_write: malloc conflict");
+
+		if ((rqe->b.b_iocmd == BIO_READ)	    /* this was a read */
+		&&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
+		    rqe->b.b_flags &= ~B_DONE;		    /* start a new request */
+		    rqe->b.b_iocmd = BIO_WRITE;		    /* we're writing now */
+		    rqe->b.b_iodone = complete_rqe;	    /* call us here when done */
+		    rqe->flags &= ~XFR_PARITYOP;	    /* reset flags that brought us here */
+		    rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
+		    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
+		    rqe->b.b_bufsize = rqe->b.b_bcount;	    /* don't claim more */
+		    rqe->b.b_resid = rqe->b.b_bcount;	    /* nothing transferred */
+		    rqe->b.b_blkno += rqe->dataoffset;	    /* point to the correct block */
+		    rqg->active++;			    /* another active request */
+		    drive = &DRIVE[rqe->driveno];	    /* drive to access */
+
+		    /* We can't sleep here, so we just increment the counters. */
+		    drive->active++;
+		    if (drive->active >= drive->maxactive)
+			drive->maxactive = drive->active;
+		    vinum_conf.active++;
+		    if (vinum_conf.active >= vinum_conf.maxactive)
+			vinum_conf.maxactive = vinum_conf.active;
+#ifdef VINUMDEBUG
+		    if (debug & DEBUG_ADDRESSES)
+			log(LOG_DEBUG,
+			    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
+			    rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
+			    major(rqe->b.b_dev),
+			    minor(rqe->b.b_dev),
+			    rqe->sdno,
+			    (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
+			    (long long)rqe->b.b_blkno,
+			    rqe->b.b_bcount);
+		    if (debug & DEBUG_LASTREQS)
+			logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp);
+#endif
+		    DEV_STRATEGY(&rqe->b);
+		}
+	    }
+	}
+    }
+    /* Finally, write the parity block */
+    rqe = &rqg->rqe[0];
+    rqe->b.b_flags &= ~B_DONE;				    /* we're not done */
+    rqe->b.b_iocmd = BIO_WRITE;				    /* we're writing now */
+    rqe->b.b_iodone = complete_rqe;			    /* call us here when done */
+    rqg->flags &= ~XFR_PARITYOP;			    /* reset flags that brought us here */
+    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;	    /* length to write */
+    rqe->b.b_bufsize = rqe->b.b_bcount;			    /* don't claim we have more */
+    rqe->b.b_resid = rqe->b.b_bcount;			    /* nothing transferred */
+    rqg->active++;					    /* another active request */
+    drive = &DRIVE[rqe->driveno];			    /* drive to access */
+
+    /* We can't sleep here, so we just increment the counters. */
+    drive->active++;
+    if (drive->active >= drive->maxactive)
+	drive->maxactive = drive->active;
+    vinum_conf.active++;
+    if (vinum_conf.active >= vinum_conf.maxactive)
+	vinum_conf.maxactive = vinum_conf.active;
+
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_ADDRESSES)
+	log(LOG_DEBUG,
+	   "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
+	    rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
+	    major(rqe->b.b_dev),
+	    minor(rqe->b.b_dev),
+	    rqe->sdno,
+	    (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
+	    (long long)rqe->b.b_blkno,
+	    rqe->b.b_bcount);
+    if (debug & DEBUG_LASTREQS)
+	logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp);
+#endif
+    DEV_STRATEGY(&rqe->b);
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumio.c b/sys/dev/vinum/vinumio.c
new file mode 100644
index 0000000..8544f95
--- /dev/null
+++ b/sys/dev/vinum/vinumio.c
@@ -0,0 +1,959 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumio.c,v 1.39 2003/05/23 00:59:53 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+static char *sappend(char *txt, char *s);		    /* file-local helper; its definition is not in this part of the file */
+static int drivecmp(const void *va, const void *vb);	    /* NOTE(review): qsort(3)-style comparison signature, presumably for sorting drives -- confirm */
+
+/*
+ * Find the drive's device by name and open it read/write.  Returns
+ * EBUSY if the drive is already open and ENOENT if no such disk is
+ * found; otherwise the result is stored in drive->lasterror and
+ * returned, with the drive marked VF_OPEN on success and forced
+ * down on failure.  td is not used.
+ */
+int
+open_drive(struct drive *drive, struct thread *td, int verbose)
+{
+    struct cdevsw *devswitch;				    /* device switch entry for the drive */
+
+    if ((drive->flags & VF_OPEN) != 0)			    /* open already: don't do it again */
+	return EBUSY;
+    drive->dev = getdiskbyname(drive->devicename);
+    if (drive->dev == NODEV)				    /* didn't find anything */
+	return ENOENT;
+
+    drive->dev->si_iosize_max = DFLTPHYS;
+    devswitch = devsw(drive->dev);
+    if (devswitch == NULL)				    /* sanity, should not happen */
+	drive->lasterror = ENOENT;
+    else if (!(devswitch->d_flags & D_DISK))		    /* device switch entry lacks D_DISK */
+	drive->lasterror = ENOTBLK;
+    else
+	drive->lasterror = (*devswitch->d_open) (drive->dev, FREAD | FWRITE, 0, NULL);
+    if (drive->lasterror == 0) {
+	drive->flags |= VF_OPEN;			    /* we're open now */
+	return 0;
+    }
+    drive->state = drive_down;				    /* failed: just force it down */
+    if (verbose)
+	log(LOG_WARNING, "vinum open_drive %s: failed with error %d\n",
+	    drive->devicename, drive->lasterror);
+    return drive->lasterror;
+}
+
+/*
+ * Fill in the drive's derived parameters from its sector and media
+ * sizes: block size, label (host name, creation time, size) and an
+ * initial one-entry free list.  Returns 0, or ENOSPC if the drive is
+ * smaller than MINVINUMSLICE or the free list can't be allocated.
+ */
+int
+set_drive_parms(struct drive *drive)
+{
+    drive->blocksize = BLKDEV_IOSIZE;
+    drive->secsperblock = drive->blocksize / drive->sectorsize; /* sectors per block */
+
+    /* Label: host name, time of creation, size in bytes */
+    bcopy(hostname, drive->label.sysname, VINUMHOSTNAMELEN);
+    microtime(&drive->label.date_of_birth);
+    drive->label.drive_size = drive->mediasize;
+#ifdef VINUMDEBUG
+    if ((debug & DEBUG_BIGDRIVE) != 0)			    /* pretend we're 100 times as big */
+	drive->label.drive_size *= 100;
+#endif
+
+    /* Sectors left for subdisks after the first DATASTART sectors */
+    drive->sectors_available = drive->label.drive_size / DEV_BSIZE - DATASTART;
+
+    /* Non-existent slices have been seen to open with length 0: refuse tiny drives */
+    if (drive->label.drive_size < MINVINUMSLICE) {
+	set_drive_state(drive->driveno, drive_down, setstate_force);
+	drive->lasterror = ENOSPC;
+	return ENOSPC;
+    }
+
+    drive->freelist_size = INITIAL_DRIVE_FREELIST;	    /* number of entries allocated */
+    drive->freelist = (struct drive_freelist *)
+	Malloc(INITIAL_DRIVE_FREELIST * sizeof(struct drive_freelist));
+    if (drive->freelist == NULL)			    /* allocation failed */
+	return ENOSPC;
+    /* A single free extent: everything from DATASTART to the end of the drive */
+    drive->freelist_entries = 1;
+    drive->freelist[0].offset = DATASTART;
+    drive->freelist[0].sectors = (drive->label.drive_size >> DEV_BSHIFT) - DATASTART;
+    if (drive->label.name[0] == '\0')			    /* unnamed: we know about it, but that's all */
+	drive->state = drive_referenced;
+    else						    /* named: our drive is accessible */
+	set_drive_state(drive->driveno, drive_up, setstate_force);
+    return 0;
+}
+
+/*
+ * Open the drive's device, ask the driver for the sector size and
+ * the media size, then set up the derived parameters.  Returns 0 or
+ * an error number; for open and ioctl failures the error is also
+ * left in drive->lasterror and the drive is closed again.
+ */
+int
+init_drive(struct drive *drive, int verbose)
+{
+    drive->lasterror = open_drive(drive, curthread, verbose);
+    if (drive->lasterror != 0)				    /* couldn't open it */
+	return drive->lasterror;
+
+    /* The media size is only requested if the sector size was obtained */
+    drive->lasterror = (*devsw(drive->dev)->d_ioctl) (drive->dev,
+	DIOCGSECTORSIZE,
+	(caddr_t) &drive->sectorsize, FREAD, curthread);
+    if (drive->lasterror == 0)
+	drive->lasterror = (*devsw(drive->dev)->d_ioctl) (drive->dev,
+	    DIOCGMEDIASIZE,
+	    (caddr_t) &drive->mediasize, FREAD, curthread);
+
+    if (drive->lasterror == 0)
+	return set_drive_parms(drive);			    /* set various odds and ends */
+
+    /* No dimensions: report it if asked to, and close the drive again */
+    if (verbose)
+	log(LOG_ERR,
+	    "vinum: Can't get drive dimensions for %s: error %d\n",
+	    drive->devicename,
+	    drive->lasterror);
+    close_drive(drive);
+    return drive->lasterror;
+}
+
+/* With the drive locked: close the device if it is open, and lower any state above drive_down to drive_down. */
+void
+close_drive(struct drive *drive)
+{
+    LOCKDRIVE(drive);					    /* result is not checked */
+    if ((drive->flags & VF_OPEN) != 0)
+	close_locked_drive(drive);
+    if (!(drive->state <= drive_down))			    /* state is above "down" */
+	drive->state = drive_down;
+    unlockdrive(drive);
+}
+
+/*
+ * Close the drive's device.  Called with the drive locked and already
+ * known to be open.  Returns nothing: the result of the close is kept
+ * in drive->lasterror, unless an earlier error is recorded there.
+ */
+void
+close_locked_drive(struct drive *drive)
+{
+    int rc;						    /* result of the driver's close */
+
+    /*
+     * NOTE(review): nothing here discards queued I/O before d_close is called -- confirm none is needed.
+     */
+    rc = (*devsw(drive->dev)->d_close) (drive->dev, FREAD | FWRITE, 0, NULL);
+    drive->flags &= ~VF_OPEN;				    /* no longer open */
+    if (drive->lasterror != 0)				    /* keep the earlier error */
+	return;
+    drive->lasterror = rc;
+}
+
+/*
+ * Take a drive out of the configuration.  If the drive is up, its
+ * on-disk header is first rewritten with the magic replaced by
+ * VINUM_NOMAGIC.  Caller must ensure that the drive isn't active.
+ */
+void
+remove_drive(int driveno)
+{
+    struct drive *drive = &vinum_conf.drive[driveno];
+    struct vinum_hdr *header;				    /* copy of the on-disk header */
+    int rc;
+
+    if (drive->state <= drive_referenced)		    /* not a real drive: nothing to do */
+	return;
+    if (drive->state == drive_up) {
+	header = (struct vinum_hdr *) Malloc(VINUMHEADERLEN);
+	CHECKALLOC(header, "Can't allocate memory");
+	rc = read_drive(drive, (void *) header, VINUMHEADERLEN, VINUM_LABEL_OFFSET);
+	if (rc == 0) {
+	    header->magic = VINUM_NOMAGIC;		    /* obliterate the magic, but leave the rest */
+	    write_drive(drive, (void *) header, VINUMHEADERLEN, VINUM_LABEL_OFFSET);
+	} else						    /* couldn't read the header: just note the error */
+	    drive->lasterror = rc;
+	Free(header);
+    }
+    free_drive(drive);					    /* close it and free resources */
+    save_config();					    /* and save the updated configuration */
+}
+
+/*
+ * Transfer data between buf and the drive, in pieces of at most
+ * MAXBSIZE bytes.  flag is stored in each buffer's b_iocmd and so
+ * selects the direction; callers normally use the read_drive and
+ * write_drive wrappers.
+ *
+ * length and offset are in bytes.  They must be multiples of the
+ * sector size: this is *not checked*, and offset is simply divided
+ * by the sector size to obtain the block number.
+ * Returns 0, or the error from the first piece that failed.
+ */
+int
+driveio(struct drive *drive, char *buf, size_t length, off_t offset, int flag)
+{
+    struct buf *xfer;					    /* buffer header for one piece */
+    int chunk;						    /* bytes in this piece */
+    int error = 0;
+
+    while ((error == 0) && (length != 0)) {		    /* stop at the first failure */
+	chunk = min(length, MAXBSIZE);
+	xfer = geteblk(chunk);
+	xfer->b_flags = 0;
+	xfer->b_iocmd = flag;
+	xfer->b_dev = drive->dev;
+	xfer->b_blkno = offset / drive->sectorsize;
+	xfer->b_saveaddr = xfer->b_data;		    /* remember the buffer's own data area */
+	xfer->b_data = buf;				    /* and point it at the caller's memory */
+	xfer->b_bcount = chunk;
+	DEV_STRATEGY(xfer);				    /* initiate the transfer */
+	error = bufwait(xfer);
+	xfer->b_data = xfer->b_saveaddr;		    /* put the data pointer back before releasing */
+	xfer->b_flags |= B_INVAL | B_AGE;
+	xfer->b_ioflags &= ~BIO_ERROR;
+	brelse(xfer);
+	if (error == 0) {				    /* move on to the next piece */
+	    length -= chunk;
+	    buf += chunk;
+	    offset += chunk;
+	}
+    }
+    return error;
+}
+
+/*
+ * Check a drive for a vinum header.  If found, update the drive
+ * information.  We come here with a partially populated drive
+ * structure which includes the device name.
+ *
+ * Return DL_CANT_OPEN if the drive can't be initialized or its
+ * header can't be read (in the latter case the error is left in
+ * drive->lasterror), DL_OURS or DL_WRONG_DRIVE if it carries a
+ * vinum label, DL_DELETED_LABEL if the label has been obliterated,
+ * and DL_NOT_OURS otherwise.
+ *
+ * Called from check_drive, which wants to find out whether the
+ * drive is a Vinum drive, and config_drive, which asserts that it
+ * is.  verbose is meant to select error messages; it is unused here.
+ */
+enum drive_label_info
+read_drive_label(struct drive *drive, int verbose)
+{
+    int error;
+    int result;						    /* result of our search */
+    struct vinum_hdr *vhdr;				    /* and as header */
+
+    error = init_drive(drive, 0);			    /* find the drive */
+    if (error)						    /* can't get at it, */
+	return DL_CANT_OPEN;				    /* not ours */
+
+    vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN);	    /* allocate buffers */
+    CHECKALLOC(vhdr, "Can't allocate memory");
+
+    drive->state = drive_up;				    /* be optimistic */
+    error = read_drive(drive, (void *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET);
+    /* After a failed read the contents of vhdr are indeterminate: don't test the magic */
+    if (error) {
+	drive->lasterror = error;			    /* tell the caller why */
+	result = DL_CANT_OPEN;				    /* there is no separate "can't read" result */
+    } else if (vhdr->magic == VINUM_MAGIC) {		    /* ours! */
+	if (drive->label.name[0]			    /* we have a name for this drive */
+	&&(strcmp(drive->label.name, vhdr->label.name))) {  /* but it doesn't match the real name */
+	    drive->lasterror = EINVAL;
+	    result = DL_WRONG_DRIVE;			    /* it's the wrong drive */
+	    drive->state = drive_unallocated;		    /* put it back, it's not ours */
+	} else
+	    result = DL_OURS;
+	/* Copy the label in either case, so that we have the name which is really on the drive */
+	drive->label = vhdr->label;			    /* put in the label information */
+    } else if (vhdr->magic == VINUM_NOMAGIC)		    /* was ours, but we gave it away */
+	result = DL_DELETED_LABEL;			    /* and return the info */
+    else
+	result = DL_NOT_OURS;				    /* we could have it, but we don't yet */
+    Free(vhdr);						    /* that's all. */
+    return result;
+}
+
+/*
+ * Probe devicename for a vinum label.  An entry we already know
+ * (up or down) is returned as is.  Otherwise, if the label is
+ * ours, take over the subdisks of any place holder entry which
+ * has the same label name but no device path.
+ * Return the drive entry, or NULL if the drive is not ours.
+ */
+struct drive *
+check_drive(char *devicename)
+{
+    int driveno;
+    int other;						    /* index of the entry we compare with */
+    int sdno;
+    struct drive *drive;
+
+    driveno = find_drive_by_name(devicename, 1);	    /* if entry doesn't exist, create it */
+    drive = &vinum_conf.drive[driveno];			    /* and get a pointer */
+    if (drive->state >= drive_down)			    /* up or down, we know it */
+	return drive;
+
+    if (read_drive_label(drive, 0) != DL_OURS) {	    /* not ours, */
+	close_drive(drive);
+	free_drive(drive);				    /* get rid of it */
+	return NULL;
+    }
+
+    /* One of ours.  See if another entry already carries the same label name. */
+    for (other = 0; other < vinum_conf.drives_allocated; other++) {
+	struct drive *mydrive = &DRIVE[other];
+
+	if (other == driveno)				    /* this is us */
+	    continue;
+	if (DRIVE[other].state == drive_unallocated)	    /* nothing there */
+	    continue;
+	if (strcmp(DRIVE[other].label.name, DRIVE[driveno].label.name) != 0) /* different name */
+	    continue;
+	if (mydrive->devicename[0] == '/') {		    /* we know a device name for it */
+	    /*
+	     * Record the clash, but don't take the drive down:
+	     * that would cause unneeded error messages.
+	     */
+	    drive->lasterror = EEXIST;
+	    break;
+	}
+	/* It's just a place holder: point its subdisks to the drive we found */
+	for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) {
+	    if ((SD[sdno].driveno != other)		    /* not pointing to the place holder */
+		|| (SD[sdno].state == sd_unallocated))	    /* or not a real subdisk */
+		continue;
+	    SD[sdno].driveno = drive->driveno;		    /* point to the one we found */
+	    update_sd_state(sdno);			    /* and update its state */
+	}
+	bzero(mydrive, sizeof(struct drive));		    /* don't deallocate it, just remove it */
+    }
+    return drive;
+}
+
+static char *						    /* points at the NUL which ends the copy */
+sappend(char *txt, char *s)				    /* copy txt to s; the size of s is not checked */
+{
+    for (; (*s = *txt) != '\0'; s++, txt++);		    /* copy up to and including the NUL */
+    return s;						    /* the next append can start here */
+}
+
+/*
+ * Write the configuration as text to config, a buffer of len
+ * bytes: first the volumes, then the plexes, then the subdisks.
+ *
+ * Every write, line ends included, is bounded by configend, so
+ * s always points at a NUL inside the buffer.  Text which
+ * doesn't fit is truncated and trips the check at the end.
+ */
+void
+format_config(char *config, int len)
+{
+    int i;
+    int j;
+    char *s = config;					    /* where the next text goes */
+    char *configend = &config[len];			    /* first byte after the buffer */
+
+    bzero(config, len);
+
+    /* First write the volume configuration */
+    for (i = 0; i < vinum_conf.volumes_allocated; i++) {
+	struct volume *vol;
+
+	vol = &vinum_conf.volume[i];
+	if ((vol->state > volume_uninit)
+	    && (vol->name[0] != '\0')) {		    /* paranoia */
+	    snprintf(s,
+		configend - s,
+		"volume %s state %s",
+		vol->name,
+		volume_state(vol->state));
+	    s += strlen(s);				    /* find the end */
+	    if (configend - s > 1)			    /* room for the newline and its NUL */
+		s = sappend("\n", s);
+	}
+    }
+
+    /* Then the plex configuration */
+    for (i = 0; i < vinum_conf.plexes_allocated; i++) {
+	struct plex *plex;
+	struct volume *vol;
+
+	plex = &vinum_conf.plex[i];
+	if ((plex->state > plex_referenced)
+	    && (plex->name[0] != '\0')) {		    /* paranoia */
+	    snprintf(s,
+		configend - s,
+		"plex name %s state %s org %s ",
+		plex->name,
+		plex_state(plex->state),
+		plex_org(plex->organization));
+	    s += strlen(s);				    /* find the end */
+	    if (isstriped(plex)) {
+		snprintf(s, configend - s, "%ds ", (int) plex->stripesize);
+		s += strlen(s);				    /* find the end */
+	    }
+	    if (plex->volno >= 0) {			    /* we have a volume */
+		vol = &VOL[plex->volno];
+		snprintf(s, configend - s, "vol %s ", vol->name);
+		s += strlen(s);				    /* find the end */
+		if ((vol->preferred_plex >= 0)		    /* has a preferred plex */
+		&&vol->plex[vol->preferred_plex] == i)	    /* and it's us */
+		    snprintf(s, configend - s, "preferred ");
+		s += strlen(s);				    /* find the end */
+	    }
+	    /*
+	     * NOTE(review): s is not advanced in this loop, so each
+	     * " sd" entry overwrites the one before, and the newline
+	     * below overwrites the start of the last.  No subdisk name
+	     * reaches the plex line.  Kept as it was: advancing s here
+	     * would change the text which is saved to disk.
+	     */
+	    for (j = 0; j < plex->subdisks; j++)
+		snprintf(s, configend - s, " sd %s", vinum_conf.sd[plex->sdnos[j]].name);
+	    if (configend - s > 1)			    /* room for the newline and its NUL */
+		s = sappend("\n", s);
+	}
+    }
+
+    /* And finally the subdisk configuration */
+    for (i = 0; i < vinum_conf.subdisks_allocated; i++) {
+	struct sd *sd;
+	char *drivename;
+
+	sd = &SD[i];
+	if ((sd->state != sd_referenced)
+	    && (sd->state != sd_unallocated)
+	    && (sd->name[0] != '\0')) {			    /* paranoia */
+	    /*
+	     * XXX We've seen cases of dead subdisks
+	     * which don't have a drive.  If we let them
+	     * through here, the drive name is null, so
+	     * they get the drive named 'plex'.
+	     *
+	     * This is a breakage limiter, not a fix.
+	     */
+	    if ((sd->driveno < 0)			    /* never index the drive table with a negative number */
+		|| (vinum_conf.drive[sd->driveno].label.name[0] == '\0'))
+		drivename = "*invalid*";
+	    else
+		drivename = vinum_conf.drive[sd->driveno].label.name;
+	    snprintf(s,
+		configend - s,
+		"sd name %s drive %s len %llus driveoffset %llus state %s",
+		sd->name,
+		drivename,
+		(unsigned long long) sd->sectors,
+		(unsigned long long) sd->driveoffset,
+		sd_state(sd->state));
+	    s += strlen(s);				    /* find the end */
+	    if (sd->plexno >= 0)
+		snprintf(s,
+		    configend - s,
+		    " plex %s plexoffset %llds",
+		    vinum_conf.plex[sd->plexno].name,
+		    (long long) sd->plexoffset);
+	    else
+		snprintf(s, configend - s, " detached");
+	    s += strlen(s);				    /* find the end */
+	    if (sd->flags & VF_RETRYERRORS) {
+		snprintf(s, configend - s, " retryerrors");
+		s += strlen(s);				    /* find the end */
+	    }
+	    snprintf(s, configend - s, " \n");
+	    s += strlen(s);				    /* find the end */
+	}
+    }
+    if (s > &config[len - 2])
+	panic("vinum: configuration data overflow");
+}
+
+/*
+ * Issue a save config request to the daemon.  The actual work
+ * is done in process context by daemon_save_config.
+ */
+void
+save_config(void)
+{
+    queue_daemon_request(daemonrq_saveconfig, (union daemoninfo) 0);
+}
+
+/*
+ * Write the label and both copies of the configuration to every
+ * drive which is up.  This is performed by the daemon only.
+ */
+void
+daemon_save_config(void)
+{
+    int error;
+    int driveno;
+    struct drive *drive;				    /* point to current drive info */
+    struct vinum_hdr *vhdr;				    /* and as header */
+    char *config;					    /* point to config data */
+
+    /* don't save the configuration while we're still working on it */
+    if (vinum_conf.flags & VF_CONFIGURING)
+	return;
+
+    /* Build a volume header */
+    vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN);	    /* get space for the header */
+    CHECKALLOC(vhdr, "Can't allocate config data");
+    bzero(vhdr, VINUMHEADERLEN);			    /* all of it goes to disk: don't write stale memory */
+    vhdr->magic = VINUM_MAGIC;				    /* magic number */
+    vhdr->config_length = MAXCONFIG;			    /* length of following config info */
+
+    config = Malloc(MAXCONFIG);				    /* get space for the config data */
+    CHECKALLOC(config, "Can't allocate config data");
+
+    format_config(config, MAXCONFIG);
+    error = 0;						    /* no errors yet */
+    for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) {
+	drive = &vinum_conf.drive[driveno];		    /* point to drive */
+	if (drive->state > drive_referenced) {
+	    LOCKDRIVE(drive);				    /* don't let it change */
+
+	    /*
+	     * First, do some drive consistency checks.  Some
+	     * of these are kludges, others require a process
+	     * context and couldn't be done before.
+	     */
+	    if ((drive->devicename[0] == '\0')
+		|| (drive->label.name[0] == '\0')) {
+		unlockdrive(drive);
+		free_drive(drive);			    /* get rid of it */
+		continue;				    /* but still save to the remaining drives */
+	    }
+	    if (((drive->flags & VF_OPEN) == 0)		    /* drive not open */
+	    &&(drive->state > drive_down)) {		    /* and it thinks it's not down */
+		unlockdrive(drive);
+		set_drive_state(driveno, drive_down, setstate_force); /* tell it what's what */
+		continue;
+	    }
+	    if ((drive->state == drive_down)		    /* it's down */
+	    &&(drive->flags & VF_OPEN)) {		    /* but open, */
+		unlockdrive(drive);
+		close_drive(drive);			    /* close it */
+	    } else if (drive->state > drive_down) {
+		microtime(&drive->label.last_update);	    /* time of last update is now */
+		bcopy((char *) &drive->label,		    /* and the label info from the drive structure */
+		    (char *) &vhdr->label,
+		    sizeof(vhdr->label));
+		if ((drive->state != drive_unallocated)
+		    && (drive->state != drive_referenced)) { /* and it's a real drive */
+		    error = write_drive(drive,
+			(char *) vhdr,
+			VINUMHEADERLEN,
+			VINUM_LABEL_OFFSET);
+		    if (error == 0)			    /* first config copy */
+			error = write_drive(drive,
+			    config,
+			    MAXCONFIG,
+			    VINUM_CONFIG_OFFSET);
+		    if (error == 0)
+			error = write_drive(drive,	    /* second copy */
+			    config,
+			    MAXCONFIG,
+			    VINUM_CONFIG_OFFSET + MAXCONFIG);
+		    unlockdrive(drive);
+		    if (error) {
+			log(LOG_ERR,
+			    "vinum: Can't write config to %s, error %d\n",
+			    drive->devicename,
+			    error);
+			set_drive_state(drive->driveno, drive_down, setstate_force);
+		    }
+		} else					    /* nothing to write, */
+		    unlockdrive(drive);			    /* but don't leave it locked */
+	    } else					    /* not worth looking at, */
+		unlockdrive(drive);			    /* just unlock it again */
+	}
+    }
+    Free(vhdr);
+    Free(config);
+}
+
+/*
+ * Disk labels are a mess.  The correct way to
+ * access them is with the DIOC[GSW]DINFO ioctls,
+ * but some programs, such as newfs, access the
+ * disk directly, so we have to write things
+ * there.  We do this only on request.  If a user
+ * request tries to read it directly, we fake up
+ * one on the fly.
+ */
+
+/*
+ * get_volume_label fills in lp, which is allocated by the
+ * caller, with a label for volume name of size sectors.
+ */
+void
+get_volume_label(char *name, int plexes, u_int64_t size, struct disklabel *lp)
+{
+    bzero(lp, sizeof(struct disklabel));
+
+    strncpy(lp->d_typename, "vinum", sizeof(lp->d_typename));
+    lp->d_type = DTYPE_VINUM;
+    /* name is a pointer: bound the copy by the field, not by sizeof(name) */
+    strncpy(lp->d_packname, name, sizeof(lp->d_packname));
+    lp->d_rpm = 14400 * plexes;				    /* to keep them guessing */
+    lp->d_interleave = 1;
+    lp->d_flags = 0;
+
+    /*
+     * A Vinum volume has a single track with all
+     * its sectors.
+     */
+    lp->d_secsize = DEV_BSIZE;				    /* bytes per sector */
+    lp->d_nsectors = size;				    /* data sectors per track */
+    lp->d_ntracks = 1;					    /* tracks per cylinder */
+    lp->d_ncylinders = 1;				    /* data cylinders per unit */
+    lp->d_secpercyl = size;				    /* data sectors per cylinder */
+    lp->d_secperunit = size;				    /* data sectors per unit */
+
+    lp->d_bbsize = BBSIZE;
+    lp->d_sbsize = 0;					    /* no longer used?  */
+    lp->d_magic = DISKMAGIC;
+    lp->d_magic2 = DISKMAGIC;
+
+    /*
+     * Set up partitions a, b and c to be identical
+     * and the size of the volume.  a is UFS, b is
+     * swap, c is nothing.
+     */
+    lp->d_partitions[0].p_size = size;
+    lp->d_partitions[0].p_fstype = FS_BSDFFS;		    /* FreeBSD File System :-) */
+    lp->d_partitions[0].p_fsize = 1024;			    /* FS fragment size */
+    lp->d_partitions[0].p_frag = 8;			    /* and fragments per block */
+    lp->d_partitions[SWAP_PART].p_size = size;
+    lp->d_partitions[SWAP_PART].p_fstype = FS_SWAP;	    /* swap partition */
+    lp->d_partitions[LABEL_PART].p_size = size;
+    lp->d_npartitions = LABEL_PART + 1;
+    /* All other fields are set by now: compute the checksum last */
+    lp->d_checksum = dkcksum(lp);
+}
+
+/*
+ * Search disks on system for vinum slices and add
+ * them to the configuration if they're not
+ * there already.  devicename is a blank-separated
+ * list of device names.  If not provided, use
+ * sysctl to get a list of all disks on the
+ * system.
+ *
+ * Return 0, or an error number (ENOENT if no drives were found).
+ */
+int
+vinum_scandisk(char *devicename)
+{
+    struct drive *volatile drive;
+    volatile int driveno;
+    volatile int gooddrives;				    /* number of usable drives found */
+    int firsttime;					    /* set if we have never configured before */
+    int error;
+    char *config_text;					    /* read the config info from disk into here */
+    char *volatile cptr;				    /* pointer into config information */
+    char *eptr;						    /* end pointer into config information */
+    char *config_line;					    /* copy the config line to */
+    volatile int status;
+    int *drivelist;					    /* list of drive indices */
+    char *partname;					    /* for creating partition names */
+    char *cp;						    /* pointer to start of disk name */
+    char *ep;						    /* and to first char after name */
+    char *np;						    /* name pointer in name we build */
+    size_t alloclen;
+    int malloced;
+    int partnamelen;					    /* length of partition name */
+    int drives;
+
+    malloced = 0;					    /* devicename not malloced */
+    if (devicename == NULL) {				    /* no devices specified, */
+	/* get a list of all disks in the system: first the size of the list */
+	error = kernel_sysctlbyname(&thread0, "kern.disks", NULL, NULL, NULL, 0, &alloclen);
+	if (error) {
+	    log(LOG_ERR, "vinum: can't get disk list: %d\n", error);
+	    return EINVAL;
+	}
+	devicename = Malloc(alloclen);
+	if (devicename == NULL) {
+	    printf("vinum: can't allocate memory for drive list");
+	    return ENOMEM;
+	}
+	malloced = 1;
+	/* Now get the list of disks */
+	error = kernel_sysctlbyname(&thread0, "kern.disks", devicename, &alloclen, NULL, 0, NULL);
+	if (error) {					    /* the list is unusable */
+	    log(LOG_ERR, "vinum: can't get disk list: %d\n", error);
+	    Free(devicename);
+	    return EINVAL;
+	}
+    }
+    status = 0;						    /* success indication */
+    partname = Malloc(MAXPATHLEN);			    /* extract name of disk here */
+    if (partname == NULL) {
+	printf("vinum_scandisk: can't allocate memory for drive name");
+	if (malloced)
+	    Free(devicename);
+	return ENOMEM;
+    }
+    gooddrives = 0;					    /* number of usable drives found */
+    firsttime = vinum_conf.drives_used == 0;		    /* are we a virgin? */
+
+    /* allocate a drive pointer list */
+    drives = 256;					    /* should be enough for most cases */
+    drivelist = (int *) Malloc(drives * sizeof(int));
+    CHECKALLOC(drivelist, "Can't allocate memory");
+    error = lock_config();				    /* make sure we're alone here */
+    if (error) {					    /* no lock: give back what we took */
+	Free(drivelist);
+	Free(partname);
+	if (malloced)
+	    Free(devicename);
+	return error;
+    }
+    vinum_conf.flags |= VF_READING_CONFIG;		    /* reading config from disk; set only once we hold the lock */
+    /*
+     * NOTE(review): a longjmp to here returns with the config locked and
+     * the buffers above allocated; what the thrower undoes isn't visible here.
+     */
+    error = setjmp(command_fail);			    /* come back here on error */
+    if (error)						    /* longjmped out */
+	return error;
+
+    /* Open all drives and find which was modified most recently */
+    for (cp = devicename; *cp; cp = ep) {
+	char part;					    /* UNIX partition */
+	int slice;
+
+	while (*cp == ' ')
+	    cp++;					    /* find start of name */
+	if (*cp == '\0')				    /* done, */
+	    break;
+	ep = cp;
+	while (*ep && (*ep != ' '))			    /* find end of name */
+	    ep++;
+	if ((ep - cp) > MAXPATHLEN - 9)			    /* no room for "/dev/", "s4h" and the NUL */
+	    continue;					    /* skip this name */
+
+	np = partname;					    /* start building up a name here */
+	if (*cp != '/') {				    /* name doesn't start with /, */
+	    strcpy(np, "/dev/");			    /* assume /dev */
+	    np += strlen("/dev/");
+	}
+	memcpy(np, cp, ep - cp);			    /* put in name */
+	np += ep - cp;					    /* and point past */
+
+	partnamelen = MAXPATHLEN - (np - partname);	    /* remaining length in partition name */
+	/* first try the partition table */
+	for (slice = 1; slice < 5; slice++)
+	    for (part = 'a'; part < 'i'; part++) {
+		if (part != 'c') {			    /* don't do the c partition */
+		    snprintf(np, partnamelen, "s%d%c", slice, part);
+		    drive = check_drive(partname);	    /* try to open it */
+		    if (drive) {			    /* got something, */
+			if (drive->flags & VF_CONFIGURED)   /* already read this config, */
+			    log(LOG_WARNING,
+				"vinum: already read config from %s\n",	/* say so */
+				drive->label.name);
+			else {
+			    if (gooddrives == drives)	    /* ran out of entries */
+				EXPAND(drivelist, int, drives, drives);	/* double the size */
+			    drivelist[gooddrives] = drive->driveno; /* keep the drive index */
+			    drive->flags &= ~VF_NEWBORN;    /* which is no longer newly born */
+			    gooddrives++;
+			}
+		    }
+		}
+	    }
+	/* This is a kludge.  Probably none of this should be here. */
+	if (gooddrives == 0) {				    /* didn't find anything, */
+	    for (part = 'a'; part < 'i'; part++)	    /* try the compatibility partition */
+		if (part != 'c') {			    /* don't do the c partition */
+		    snprintf(np, partnamelen, "%c", part);
+		    drive = check_drive(partname);	    /* try to open it */
+		    if (drive) {			    /* got something, */
+			if (drive->flags & VF_CONFIGURED)   /* already read this config, */
+			    log(LOG_WARNING,
+				"vinum: already read config from %s\n",	/* say so */
+				drive->label.name);
+			else {
+			    if (gooddrives == drives)	    /* ran out of entries */
+				EXPAND(drivelist, int, drives, drives);	/* double the size */
+			    drivelist[gooddrives] = drive->driveno; /* keep the drive index */
+			    drive->flags &= ~VF_NEWBORN;    /* which is no longer newly born */
+			    gooddrives++;
+			}
+		    }
+		}
+	}
+    }
+    Free(partname);
+
+    if (gooddrives == 0) {
+	if (firsttime)
+	    log(LOG_WARNING, "vinum: no drives found\n");
+	else
+	    log(LOG_INFO, "vinum: no additional drives found\n");
+	Free(drivelist);				    /* we won't need the list */
+	vinum_conf.flags &= ~VF_READING_CONFIG;		    /* and we're not reading any more */
+	if (malloced)
+	    Free(devicename);
+	unlock_config();
+	return ENOENT;
+    }
+    /*
+     * We now have at least one drive open.  Sort them in order of
+     * config time and merge the config info with what we have.
+     */
+    qsort(drivelist, gooddrives, sizeof(int), drivecmp);
+    config_text = (char *) Malloc(MAXCONFIG * 2);	    /* allocate buffers */
+    CHECKALLOC(config_text, "Can't allocate memory");
+    config_line = (char *) Malloc(MAXCONFIGLINE * 2);	    /* allocate buffers */
+    CHECKALLOC(config_line, "Can't allocate memory");
+    for (driveno = 0; driveno < gooddrives; driveno++) {    /* now include the config */
+	drive = &DRIVE[drivelist[driveno]];		    /* point to the drive */
+
+	if (firsttime && (driveno == 0))		    /* we've never configured before, */
+	    log(LOG_INFO, "vinum: reading configuration from %s\n", drive->devicename);
+	else
+	    log(LOG_INFO, "vinum: updating configuration from %s\n", drive->devicename);
+
+	if (drive->state == drive_up)
+	    /* Read in both copies of the configuration information */
+	    error = read_drive(drive, config_text, MAXCONFIG * 2, VINUM_CONFIG_OFFSET);
+	else {
+	    error = EIO;
+	    printf("vinum_scandisk: %s is %s\n", drive->devicename, drive_state(drive->state));
+	}
+
+	if (error != 0) {
+	    log(LOG_ERR, "vinum: Can't read device %s, error %d\n", drive->devicename, error);
+	    free_drive(drive);				    /* give it back */
+	    status = error;
+	}
+	/*
+	 * At this point, check that the two copies are the same, and do
+	 * something useful if not.  In particular, consider which is newer,
+	 * and what this means for the integrity of the data on the drive.
+	 */
+	else {
+	    vinum_conf.drives_used++;			    /* another drive in use */
+	    config_text[MAXCONFIG * 2 - 1] = '\0';	    /* the scan below must end inside the buffer */
+	    /* Parse the configuration, and add it to the global configuration */
+	    for (cptr = config_text; *cptr != '\0';) {	    /* love this style(9) */
+		volatile int parse_status;		    /* return value from parse_config */
+
+		for (eptr = config_line; (*cptr != '\n') && (*cptr != '\0') /* until the end of the line */
+		    && (eptr < &config_line[MAXCONFIGLINE * 2 - 1]);) /* or of the line buffer */
+		    *eptr++ = *cptr++;
+		*eptr = '\0';				    /* and delimit */
+		if (setjmp(command_fail) == 0) {	    /* come back here on error and continue */
+		    parse_status = parse_config(config_line, &keyword_set, 1); /* parse the config line */
+		    /*
+		     * parse_config builds drive entries for referenced drives.  This
+		     * may expand the drive table, thus invalidating the pointer.
+		     */
+		    drive = &DRIVE[drivelist[driveno]];	    /* point to the drive */
+
+		    if (parse_status < 0) {		    /* error in config */
+			/*
+			 * This config should have been parsed in user space, so
+			 * something serious is afoot.  Complain and let the user
+			 * snarf the config to see what's wrong.
+			 */
+			log(LOG_ERR,
+			    "vinum: Config error on %s, aborting integration\n",
+			    drive->devicename);
+			free_drive(drive);		    /* give it back */
+			status = EINVAL;
+		    }
+		}
+		while (*cptr == '\n')
+		    cptr++;				    /* skip to next line */
+	    }
+	}
+	drive->flags |= VF_CONFIGURED;			    /* this drive's configuration is complete */
+    }
+
+    Free(config_text);
+    Free(config_line);
+    Free(drivelist);
+    vinum_conf.flags &= ~VF_READING_CONFIG;		    /* no longer reading from disk */
+    if (status != 0)
+	printf("vinum: couldn't read configuration");
+    else
+	updateconfig(VF_READING_CONFIG);		    /* update from disk config */
+    if (malloced)
+	Free(devicename);
+    unlock_config();
+    return status;
+}
+
+/*
+ * Compare the modification dates of the drives, for qsort.
+ * Return 1 if a < b, 0 if a == b, -1 if a > b: in other
+ * words, sort backwards.
+ */
+int
+drivecmp(const void *va, const void *vb)
+{
+    const struct drive *a = &DRIVE[*(const int *) va];	    /* va and vb point to drive indices */
+    const struct drive *b = &DRIVE[*(const int *) vb];
+
+    /* The seconds decide, unless they are equal */
+    if (a->label.last_update.tv_sec != b->label.last_update.tv_sec)
+	return (a->label.last_update.tv_sec > b->label.last_update.tv_sec) ? -1 : 1;
+    /* Same second: the microseconds decide */
+    if (a->label.last_update.tv_usec > b->label.last_update.tv_usec)
+	return -1;
+    if (a->label.last_update.tv_usec < b->label.last_update.tv_usec)
+	return 1;
+    return 0;
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumio.h b/sys/dev/vinum/vinumio.h
new file mode 100644
index 0000000..bf5134a
--- /dev/null
+++ b/sys/dev/vinum/vinumio.h
@@ -0,0 +1,154 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumio.h,v 1.23 2003/05/04 05:25:46 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#define L 'F'						    /* ID letter of our ioctls */
+
+#define MAX_IOCTL_REPLY 1024				    /* size of the msg field in struct _ioctl_reply */
+
+#ifdef VINUMDEBUG
+struct debuginfo {					    /* argument of the VINUM_DEBUG ioctl */
+    int changeit;					    /* presumably set to change the debug setting -- TODO confirm */
+    int param;						    /* presumably the value which goes with it -- TODO confirm */
+};
+
+#endif
+
+enum objecttype {					    /* kind of object an ioctl message refers to */
+    drive_object,
+    sd_object,						    /* subdisk */
+    plex_object,
+    volume_object,
+    invalid_object
+};
+
+/*
+ * The state to set with VINUM_SETSTATE.  Since each object has a
+ * different set of states, we need to translate later.
+ */
+enum objectstate {					    /* the state field of struct vinum_ioctl_msg */
+    object_down,
+    object_initializing,
+    object_initialized,
+    object_up
+};
+
+/*
+ * This structure is used for modifying objects
+ * (VINUM_SETSTATE, VINUM_REMOVE, VINUM_RESETSTATS, VINUM_ATTACH,
+ * VINUM_DETACH, VINUM_REPLACE).
+ */
+struct vinum_ioctl_msg {
+    int index;						    /* presumably the index of the object in vinum_conf -- TODO confirm */
+    enum objecttype type;				    /* kind of object */
+    enum objectstate state;				    /* state to set (VINUM_SETSTATE) */
+    enum parityop op;					    /* for parity ops */
+    int force;						    /* do it even if it doesn't make sense */
+    int recurse;					    /* recurse (VINUM_REMOVE) */
+    int verify;						    /* verify (initsd, rebuildparity) */
+    int otherobject;					    /* superordinate object (attach),
+							    * replacement object (replace) */
+    int rename;						    /* rename object (attach) */
+    int64_t offset;					    /* offset of subdisk (for attach) */
+    int blocksize;					    /* size of block to revive (bytes) */
+};
+
+/* VINUM_CREATE returns a buffer of this kind */
+struct _ioctl_reply {
+    int error;						    /* error indication */
+    char msg[MAX_IOCTL_REPLY];				    /* message text which goes with it */
+};
+
+struct vinum_rename_msg {				    /* presumably the argument of VINUM_RENAME -- TODO confirm */
+    int index;						    /* presumably the index of the object in vinum_conf -- TODO confirm */
+    int recurse;					    /* rename subordinate objects too */
+    enum objecttype type;				    /* kind of object */
+    char newname[MAXNAME];				    /* new name to give to object */
+};
+
+/* ioctl requests */
+#define BUFSIZE 1024					    /* size of buffer, including continuations */
+#define VINUM_CREATE		_IOC(IOC_IN | IOC_OUT, L, 64, BUFSIZE) /* configure vinum */
+#define VINUM_GETCONFIG		_IOR(L, 65, struct __vinum_conf) /* get global config */
+#define VINUM_DRIVECONFIG	_IOWR(L, 66, struct _drive) /* get drive config */
+#define VINUM_SDCONFIG		_IOWR(L, 67, struct _sd)    /* get subdisk config */
+#define VINUM_PLEXCONFIG	_IOWR(L, 68, struct _plex)  /* get plex config */
+#define VINUM_VOLCONFIG		_IOWR(L, 69, struct _volume) /* get volume config */
+#define VINUM_PLEXSDCONFIG	_IOWR(L, 70, struct _sd)    /* get sd config for plex (plex, sdno) */
+#define VINUM_GETFREELIST	_IOWR(L, 71, struct drive_freelist) /* get freelist element (drive, fe) */
+#define VINUM_SAVECONFIG	_IOW(L, 72, int)	    /* write config to disk */
+#define VINUM_RESETCONFIG	_IOC(0, L, 73, 0)	    /* trash config on disk */
+#define VINUM_INIT		_IOC(0, L, 74, 0)	    /* read config from disk */
+#define VINUM_READCONFIG	_IOC(IOC_IN | IOC_OUT, L, 75, BUFSIZE) /* read config from disk */
+#ifdef VINUMDEBUG
+#define VINUM_DEBUG		_IOWR(L, 127, struct debuginfo)	/* call the debugger from ioctl () */
+#endif
+
+/*
+ * Start an object.  Pass two integers:
+ * msg [0] index in vinum_conf.<object>
+ * msg [1] type of object (enum objecttype, above)
+ *
+ * Return ioctl_reply
+ */
+#define VINUM_SETSTATE 		_IOC(IOC_IN | IOC_OUT, L, 76, MAX_IOCTL_REPLY) /* start an object */
+#define VINUM_RELEASECONFIG	_IOC(0, L, 77, 0)	    /* release locks and write config to disk */
+#define VINUM_STARTCONFIG	_IOW(L, 78, int)	    /* start a configuration operation */
+#define VINUM_MEMINFO 		_IOR(L, 79, struct meminfo) /* get memory usage summary */
+#define VINUM_MALLOCINFO	_IOWR(L, 80, struct mc)	    /* get specific malloc information [i] */
+#define VINUM_INITSD 		_IOW(L, 82, int)	    /* initialize a subdisk */
+#define VINUM_REMOVE 		_IOWR(L, 83, struct _ioctl_reply) /* remove an object */
+#define VINUM_READPOL 		_IOWR(L, 84, struct _ioctl_reply) /* set read policy */
+#define VINUM_SETSTATE_FORCE	_IOC(IOC_IN | IOC_OUT, L, 85, MAX_IOCTL_REPLY) /* diddle object state */
+#define VINUM_RESETSTATS	_IOWR(L, 86, struct _ioctl_reply) /* reset object stats */
+#define VINUM_ATTACH		_IOWR(L, 87, struct _ioctl_reply) /* attach an object */
+#define VINUM_DETACH		_IOWR(L, 88, struct _ioctl_reply) /* detach an object */
+
+#define VINUM_RENAME		_IOWR(L, 89, struct _ioctl_reply) /* rename an object */
+#define VINUM_REPLACE		_IOWR(L, 90, struct _ioctl_reply) /* replace an object */
+
+#ifdef VINUMDEBUG
+#define VINUM_RQINFO		_IOWR(L, 91, struct rqinfo) /* get request info [i] from trace buffer */
+#endif
+
+#define VINUM_DAEMON		_IOC(0, L, 92, 0)	    /* perform the kernel part of Vinum daemon */
+#define VINUM_FINDDAEMON	_IOC(0, L, 93, 0)	    /* check for presence of Vinum daemon */
+#define VINUM_SETDAEMON		_IOW(L, 94, int)	    /* set daemon flags */
+#define VINUM_GETDAEMON		_IOR(L, 95, int)	    /* get daemon flags */
+#define VINUM_PARITYOP   	_IOWR(L, 96, struct _ioctl_reply) /* check/rebuild RAID-4/5 parity */
+#define VINUM_MOVE		_IOWR(L, 98, struct _ioctl_reply) /* move an object */
diff --git a/sys/dev/vinum/vinumioctl.c b/sys/dev/vinum/vinumioctl.c
new file mode 100644
index 0000000..2f7b876
--- /dev/null
+++ b/sys/dev/vinum/vinumioctl.c
@@ -0,0 +1,958 @@
+/*
+ * XXX replace all the checks on object validity with
+ * calls to valid<object>
+ */
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumioctl.c,v 1.23 2003/05/23 01:02:22 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+#ifdef VINUMDEBUG
+#include <sys/reboot.h>
+#endif
+
+void attachobject(struct vinum_ioctl_msg *);		    /* attach a subdisk to a plex or a plex to a volume */
+void detachobject(struct vinum_ioctl_msg *);		    /* detach a subdisk from its plex or a plex from its volume */
+void renameobject(struct vinum_rename_msg *);		    /* rename a drive, subdisk, plex or volume */
+void replaceobject(struct vinum_ioctl_msg *);		    /* not implemented, always replies ENODEV */
+void moveobject(struct vinum_ioctl_msg *);		    /* move a subdisk to another drive */
+void setreadpol(struct vinum_ioctl_msg *);		    /* set the preferred plex of a volume */
+
+jmp_buf command_fail;					    /* setjmp target for a failed command, armed by VINUM_CREATE */
+
+/* ioctl routine: pass super/daemon device requests to vinum_super_ioctl, answer disk ioctls for sd, plex and volume devices */
+int
+vinumioctl(dev_t dev,
+    u_long cmd,
+    caddr_t data,
+    int flag,
+    struct thread *td)
+{
+    unsigned int objno;
+    struct sd *sd;
+    struct plex *plex;
+    struct volume *vol;
+
+    /* First, decide what we're looking at */
+    if ((minor(dev) == VINUM_SUPERDEV_MINOR)
+	|| (minor(dev) == VINUM_DAEMON_MINOR))
+	return vinum_super_ioctl(dev, cmd, data);
+    else						    /* real device */
+	switch (DEVTYPE(dev)) {
+	case VINUM_SD_TYPE:
+	case VINUM_SD2_TYPE:				    /* second half of sd namespace */
+	    objno = Sdno(dev);
+	    if (objno >= (unsigned) vinum_conf.subdisks_allocated) /* not a valid subdisk */
+		return ENXIO;
+	    sd = &SD[objno];
+	    switch (cmd) {
+	    case DIOCGSECTORSIZE:
+		*(u_int *) data = sd->sectorsize;
+		return 0;
+
+	    case DIOCGMEDIASIZE:
+		*(u_int64_t *) data = sd->sectors * sd->sectorsize;
+		return 0;
+
+		/*
+		 * We don't have this stuff on hardware,
+		 * so just pretend to do it so that
+		 * utilities don't get upset.
+		 */
+	    case DIOCWDINFO:				    /* write partition info */
+	    case DIOCSDINFO:				    /* set partition info */
+		return 0;				    /* claim success */
+
+	    default:
+		return ENOTTY;				    /* not my kind of ioctl */
+	    }
+
+	    return 0;					    /* not reached: every case above returns */
+
+	case VINUM_PLEX_TYPE:
+	    objno = Plexno(dev);
+	    if (objno >= (unsigned) vinum_conf.plexes_allocated) /* not a valid plex */
+		return ENXIO;
+	    plex = &PLEX[objno];
+	    switch (cmd) {
+	    case DIOCGSECTORSIZE:
+		*(u_int *) data = plex->sectorsize;	    /* same width as the sd and volume cases */
+		return 0;
+
+	    case DIOCGMEDIASIZE:
+		*(u_int64_t *) data = plex->length * plex->sectorsize;
+		return 0;
+
+		/*
+		 * We don't have this stuff on hardware,
+		 * so just pretend to do it so that
+		 * utilities don't get upset.
+		 */
+	    case DIOCWDINFO:				    /* write partition info */
+	    case DIOCSDINFO:				    /* set partition info */
+		return 0;				    /* claim success */
+
+	    default:
+		return ENOTTY;				    /* not my kind of ioctl */
+	    }
+
+	    return 0;					    /* not reached: every case above returns */
+
+	case VINUM_VOLUME_TYPE:
+	    objno = Volno(dev);
+
+	    if ((unsigned) objno >= (unsigned) vinum_conf.volumes_allocated) /* not a valid volume */
+		return ENXIO;
+	    vol = &VOL[objno];
+	    if (vol->state != volume_up)		    /* not up, */
+		return EIO;				    /* I/O error */
+
+	    switch (cmd) {
+	    case DIOCGSECTORSIZE:
+		*(u_int *) data = vol->sectorsize;
+		return 0;
+
+	    case DIOCGMEDIASIZE:
+		*(u_int64_t *) data = vol->size * vol->sectorsize;
+		return 0;
+
+		/*
+		 * We don't have this stuff on hardware,
+		 * so just pretend to do it so that
+		 * utilities don't get upset.
+		 */
+	    case DIOCWDINFO:				    /* write partition info */
+	    case DIOCSDINFO:				    /* set partition info */
+		return 0;				    /* claim success */
+
+	    default:
+		return ENOTTY;				    /* not my kind of ioctl */
+	    }
+	    break;
+	}
+    return 0;						    /* XXX unknown device type reports success */
+}
+
+/* Handle ioctls for the super device: configuration, state changes and daemon control selected by cmd */
+int
+vinum_super_ioctl(dev_t dev,
+    u_long cmd,
+    caddr_t data)
+{
+    int error = 0;
+    unsigned int index;					    /* for transferring config info */
+    unsigned int sdno;					    /* for transferring config info */
+    int fe;						    /* free list element number */
+    struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) data; /* struct to return */
+
+    /*
+     * Commands that answer via ioctl_reply return 0 and put the real status in ioctl_reply->error.
+     */
+    switch (cmd) {
+#ifdef VINUMDEBUG
+    case VINUM_DEBUG:
+	if (((struct debuginfo *) data)->changeit)	    /* change debug settings */
+	    debug = (((struct debuginfo *) data)->param);
+	else {
+	    if (debug & DEBUG_REMOTEGDB)
+		boothowto |= RB_GDB;			    /* serial debug line */
+	    else
+		boothowto &= ~RB_GDB;			    /* local ddb */
+	    Debugger("vinum debug");
+	}
+	ioctl_reply = (struct _ioctl_reply *) data;	    /* reinstate the address to reply to */
+	ioctl_reply->error = 0;
+	return 0;
+#endif
+
+    case VINUM_CREATE:					    /* create a vinum object */
+	error = lock_config();				    /* get the config for us alone */
+	if (error)					    /* can't do it, */
+	    return error;				    /* give up */
+	error = setjmp(command_fail);			    /* come back here on error */
+	if (error == 0)					    /* first time, */
+	    ioctl_reply->error = parse_user_config((char *) data, /* update the config */
+		&keyword_set);
+	else if (ioctl_reply->error == 0) {		    /* longjmp, but no error status */
+	    ioctl_reply->error = EINVAL;		    /* note that something's up */
+	    ioctl_reply->msg[0] = '\0';			    /* no message? */
+	}
+	unlock_config();
+	return 0;					    /* must be 0 to return the real error info */
+
+    case VINUM_GETCONFIG:				    /* get the configuration information */
+	bcopy(&vinum_conf, data, sizeof(vinum_conf));
+	return 0;
+
+	/* start configuring the subsystem */
+    case VINUM_STARTCONFIG:
+	return start_config(*(int *) data);		    /* just lock it.  Parameter is 'force' */
+
+	/*
+	 * Move the individual parts of the config to user space.
+	 *
+	 * Specify the index of the object in the first word of data,
+	 * and return the object there
+	 */
+    case VINUM_DRIVECONFIG:
+	index = *(int *) data;				    /* get the index */
+	if (index >= (unsigned) vinum_conf.drives_allocated) /* can't do it */
+	    return ENXIO;				    /* bang */
+	bcopy(&DRIVE[index], data, sizeof(struct _drive));  /* copy the config item out */
+	return 0;
+
+    case VINUM_SDCONFIG:
+	index = *(int *) data;				    /* get the index */
+	if (index >= (unsigned) vinum_conf.subdisks_allocated) /* can't do it */
+	    return ENXIO;				    /* bang */
+	bcopy(&SD[index], data, sizeof(struct _sd));	    /* copy the config item out */
+	return 0;
+
+    case VINUM_PLEXCONFIG:
+	index = *(int *) data;				    /* get the index */
+	if (index >= (unsigned) vinum_conf.plexes_allocated) /* can't do it */
+	    return ENXIO;				    /* bang */
+	bcopy(&PLEX[index], data, sizeof(struct _plex));    /* copy the config item out */
+	return 0;
+
+    case VINUM_VOLCONFIG:
+	index = *(int *) data;				    /* get the index */
+	if (index >= (unsigned) vinum_conf.volumes_allocated) /* can't do it */
+	    return ENXIO;				    /* bang */
+	bcopy(&VOL[index], data, sizeof(struct _volume));   /* copy the config item out */
+	return 0;
+
+    case VINUM_PLEXSDCONFIG:
+	index = *(int *) data;				    /* get the plex index */
+	sdno = ((int *) data)[1];			    /* and the sd index */
+	if ((index >= (unsigned) vinum_conf.plexes_allocated) /* plex doesn't exist */
+	||(sdno >= PLEX[index].subdisks))		    /* or it doesn't have this many subdisks */
+	    return ENXIO;				    /* bang */
+	bcopy(&SD[PLEX[index].sdnos[sdno]],		    /* copy the config item out */
+	    data,
+	    sizeof(struct _sd));
+	return 0;
+
+	/*
+	 * We get called in two places: one from the
+	 * userland config routines, which call us
+	 * to complete the config and save it.  This
+	 * call supplies the value 0 as a parameter.
+	 *
+	 * The other place is from the user "saveconfig"
+	 * routine, which can only work if we're *not*
+	 * configuring.  In this case, supply parameter 1.
+	 */
+    case VINUM_SAVECONFIG:
+	if (VFLAGS & VF_CONFIGURING) {			    /* must be us, the others are asleep */
+	    if (*(int *) data == 0)			    /* finish config */
+		finish_config(1);			    /* finish the configuration and update it */
+	    else
+		return EBUSY;				    /* can't do it now */
+	}
+	save_config();					    /* save configuration to disk */
+	return 0;
+
+    case VINUM_RELEASECONFIG:				    /* release the config */
+	if (VFLAGS & VF_CONFIGURING) {			    /* must be us, the others are asleep */
+	    finish_config(0);				    /* finish the configuration, don't change it */
+	    save_config();				    /* save configuration to disk */
+	} else
+	    error = EINVAL;				    /* release what config? */
+	return error;
+
+    case VINUM_READCONFIG:
+	if (((char *) data)[0] == '\0')
+	    ioctl_reply->error = vinum_scandisk(NULL);	    /* built your own list */
+	else
+	    ioctl_reply->error = vinum_scandisk((char *) data);
+	if (ioctl_reply->error == ENOENT) {
+	    if (vinum_conf.drives_used > 0)
+		strcpy(ioctl_reply->msg, "no additional drives found");
+	    else
+		strcpy(ioctl_reply->msg, "no drives found");
+	} else if (ioctl_reply->error)
+	    strcpy(ioctl_reply->msg, "can't read configuration information, see log file");
+	return 0;					    /* must be 0 to return the real error info */
+
+    case VINUM_INIT:
+	ioctl_reply = (struct _ioctl_reply *) data;	    /* reinstate the address to reply to */
+	ioctl_reply->error = 0;
+	return 0;
+
+    case VINUM_RESETCONFIG:
+	if (vinum_inactive(0)) {			    /* if the volumes are not active */
+	    /*
+	     * Note the open count.  We may be called from v, so we'll be open.
+	     * Keep the count so we don't underflow
+	     */
+	    free_vinum(1);				    /* clean up everything */
+	    log(LOG_NOTICE, "vinum: CONFIGURATION OBLITERATED\n");
+	    ioctl_reply = (struct _ioctl_reply *) data;	    /* reinstate the address to reply to */
+	    ioctl_reply->error = 0;
+	    return 0;
+	}
+	return EBUSY;
+
+    case VINUM_SETSTATE:
+	setstate((struct vinum_ioctl_msg *) data);	    /* set an object state */
+	return 0;
+
+	/*
+	 * Set state by force, without changing
+	 * anything else.
+	 */
+    case VINUM_SETSTATE_FORCE:
+	setstate_by_force((struct vinum_ioctl_msg *) data); /* set an object state */
+	return 0;
+
+#ifdef VINUMDEBUG
+    case VINUM_MEMINFO:
+	vinum_meminfo(data);
+	return 0;
+
+    case VINUM_MALLOCINFO:
+	return vinum_mallocinfo(data);
+
+    case VINUM_RQINFO:
+	return vinum_rqinfo(data);
+#endif
+
+    case VINUM_REMOVE:
+	remove((struct vinum_ioctl_msg *) data);	    /* remove an object */
+	return 0;
+
+    case VINUM_GETFREELIST:				    /* get a drive free list element */
+	index = *(int *) data;				    /* get the drive index */
+	fe = ((int *) data)[1];				    /* and the free list element */
+	if ((index >= (unsigned) vinum_conf.drives_allocated) /* drive doesn't exist */
+	||(DRIVE[index].state == drive_unallocated))
+	    return ENODEV;
+	if ((fe < 0) || (fe >= DRIVE[index].freelist_entries)) /* no such entry; fe is signed and caller supplied */
+	    return ENOENT;
+	bcopy(&DRIVE[index].freelist[fe],
+	    data,
+	    sizeof(struct drive_freelist));
+	return 0;
+
+    case VINUM_RESETSTATS:
+	resetstats((struct vinum_ioctl_msg *) data);	    /* reset object stats */
+	return 0;
+
+	/* attach an object to a superordinate object */
+    case VINUM_ATTACH:
+	attachobject((struct vinum_ioctl_msg *) data);
+	return 0;
+
+	/* detach an object from a superordinate object */
+    case VINUM_DETACH:
+	detachobject((struct vinum_ioctl_msg *) data);
+	return 0;
+
+	/* rename an object */
+    case VINUM_RENAME:
+	renameobject((struct vinum_rename_msg *) data);
+	return 0;
+
+	/* replace an object */
+    case VINUM_REPLACE:
+	replaceobject((struct vinum_ioctl_msg *) data);
+	return 0;
+
+    case VINUM_DAEMON:
+	vinum_daemon();					    /* perform the daemon */
+	return 0;
+
+    case VINUM_FINDDAEMON:				    /* check for presence of daemon */
+	error = vinum_finddaemon();
+	return error;
+
+    case VINUM_SETDAEMON:				    /* set daemon flags */
+	return vinum_setdaemonopts(*(int *) data);
+
+    case VINUM_GETDAEMON:				    /* get daemon flags */
+	*(int *) data = daemon_options;
+	return 0;
+
+    case VINUM_PARITYOP:				    /* check/rebuild RAID-4/5 parity */
+	parityops((struct vinum_ioctl_msg *) data);
+	return 0;
+
+	/* move an object */
+    case VINUM_MOVE:
+	moveobject((struct vinum_ioctl_msg *) data);
+	return 0;
+
+    case VINUM_READPOL:
+	setreadpol((struct vinum_ioctl_msg *) data);
+	return 0;
+
+    default:
+	/* NOTE(review): unknown commands end up reporting success; confirm whether ENOTTY is wanted */
+	break;
+    }
+    return 0;						    /* only reached for an unknown command */
+}
+
+/*
+ * The following four functions check the supplied
+ * object index and return a pointer to the object
+ * if it exists.  Otherwise they store ENOENT and a
+ * message in the reply and return NULL.
+ */
+struct drive *
+validdrive(int driveno, struct _ioctl_reply *reply)
+{
+    if ((driveno >= 0) && (driveno < vinum_conf.drives_allocated) /* index lies inside the drive table */
+	&& (DRIVE[driveno].state > drive_referenced))	    /* and the entry is beyond drive_referenced */
+	return &DRIVE[driveno];
+    strcpy(reply->msg, "No such drive");		    /* explain the failure in the reply */
+    reply->error = ENOENT;
+    return NULL;
+}
+
+struct sd *
+validsd(int sdno, struct _ioctl_reply *reply)
+{
+    if ((sdno >= 0) && (sdno < vinum_conf.subdisks_allocated) /* index lies inside the subdisk table */
+	&& (SD[sdno].state > sd_referenced))		    /* and the entry is beyond sd_referenced */
+	return &SD[sdno];
+    strcpy(reply->msg, "No such subdisk");		    /* explain the failure in the reply */
+    reply->error = ENOENT;
+    return NULL;
+}
+
+struct plex *
+validplex(int plexno, struct _ioctl_reply *reply)
+{
+    if ((plexno >= 0) && (plexno < vinum_conf.plexes_allocated) /* index lies inside the plex table */
+	&& (PLEX[plexno].state > plex_referenced))	    /* and the entry is beyond plex_referenced */
+	return &PLEX[plexno];
+    strcpy(reply->msg, "No such plex");			    /* explain the failure in the reply */
+    reply->error = ENOENT;
+    return NULL;
+}
+
+struct volume *
+validvol(int volno, struct _ioctl_reply *reply)
+{
+    if ((volno >= 0) && (volno < vinum_conf.volumes_allocated) /* index lies inside the volume table */
+	&& (VOL[volno].state > volume_uninit))		    /* and the entry is beyond volume_uninit */
+	return &VOL[volno];
+    strcpy(reply->msg, "No such volume");		    /* explain the failure in the reply */
+    reply->error = ENOENT;
+    return NULL;
+}
+
+/* reset an object's stats; the reply overlaying msg gets 0 on success, EINVAL for a bad type, index or state */
+void
+resetstats(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    int objno = msg->index;				    /* fetch it before the reply overwrites msg */
+
+    switch (msg->type) {
+    case drive_object:
+	if ((objno >= 0) && (objno < vinum_conf.drives_allocated)) {
+	    struct drive *drive = &DRIVE[objno];
+
+	    if (drive->state > drive_referenced) {
+		drive->reads = 0;			    /* number of reads on this drive */
+		drive->writes = 0;			    /* number of writes on this drive */
+		drive->bytes_read = 0;			    /* number of bytes read */
+		drive->bytes_written = 0;		    /* number of bytes written */
+		reply->error = 0;
+		return;
+	    }
+	}
+	break;						    /* don't fall into the subdisk case */
+
+    case sd_object:
+	if ((objno >= 0) && (objno < vinum_conf.subdisks_allocated)) {
+	    struct sd *sd = &SD[objno];
+
+	    if (sd->state > sd_referenced) {
+		sd->reads = 0;				    /* number of reads on this subdisk */
+		sd->writes = 0;				    /* number of writes on this subdisk */
+		sd->bytes_read = 0;			    /* number of bytes read */
+		sd->bytes_written = 0;			    /* number of bytes written */
+		reply->error = 0;
+		return;
+	    }
+	}
+	break;
+
+    case plex_object:
+	if ((objno >= 0) && (objno < vinum_conf.plexes_allocated)) {
+	    struct plex *plex = &PLEX[objno];
+
+	    if (plex->state > plex_referenced) {
+		plex->reads = 0;			    /* number of reads on this plex */
+		plex->writes = 0;			    /* number of writes on this plex */
+		plex->bytes_read = 0;			    /* number of bytes read */
+		plex->bytes_written = 0;		    /* number of bytes written */
+		plex->recovered_reads = 0;		    /* number of recovered read operations */
+		plex->degraded_writes = 0;		    /* number of degraded writes */
+		plex->parityless_writes = 0;		    /* number of parityless writes */
+		plex->multiblock = 0;			    /* requests that needed more than one block */
+		plex->multistripe = 0;			    /* requests that needed more than one stripe */
+		reply->error = 0;
+		return;
+	    }
+	}
+	break;
+
+    case volume_object:
+	if ((objno >= 0) && (objno < vinum_conf.volumes_allocated)) {
+	    struct volume *vol = &VOL[objno];
+
+	    if (vol->state > volume_uninit) {
+		vol->bytes_read = 0;			    /* number of bytes read */
+		vol->bytes_written = 0;			    /* number of bytes written */
+		vol->reads = 0;				    /* number of reads on this volume */
+		vol->writes = 0;			    /* number of writes on this volume */
+		vol->recovered_reads = 0;		    /* reads recovered from another plex */
+		reply->error = 0;
+		return;
+	    }
+	}
+	break;
+    case invalid_object:				    /* can't get this */
+	break;
+    }
+    reply->error = EINVAL;				    /* every failure ends up here */
+}
+
+/* attach an object to a superior object: a subdisk to a plex, or a plex to a volume; status goes to the reply */
+void
+attachobject(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    int sdno;
+    struct sd *sd;
+    struct plex *plex;
+    struct volume *vol;
+
+    switch (msg->type) {
+    case drive_object:					    /* you can't attach a drive to anything */
+    case volume_object:					    /* nor a volume */
+    case invalid_object:				    /* "this can't happen" */
+	reply->error = EINVAL;
+	reply->msg[0] = '\0';				    /* vinum(8) doesn't do this */
+	return;
+
+    case sd_object:
+	sd = validsd(msg->index, reply);
+	if (sd == NULL)					    /* not a valid subdisk  */
+	    return;
+	plex = validplex(msg->otherobject, reply);
+	if (plex) {
+	    /*
+	     * We should be more intelligent about this.
+	     * We should be able to reattach a dead
+	     * subdisk, but if we want to increase the total
+	     * number of subdisks, we have a lot of reshuffling
+	     * to do. XXX
+	     */
+	    if ((plex->organization != plex_concat)	    /* can't attach to striped and RAID-4/5 */
+	    &&(!msg->force)) {				    /* without using force */
+		reply->error = EINVAL;
+		strcpy(reply->msg, "Can't attach to this plex organization");
+	    } else if (sd->plexno >= 0) {		    /* already belong to a plex */
+		reply->error = EBUSY;
+		sprintf(reply->msg,			    /* name the owning plex, looked up in the plex table */
+		    "%s is already attached to %s",
+		    sd->name,
+		    PLEX[sd->plexno].name);
+	    } else {
+		sd->plexoffset = msg->offset;		    /* this is where we want it */
+		set_sd_state(sd->sdno, sd_stale, setstate_force); /* make sure it's stale */
+		give_sd_to_plex(plex->plexno, sd->sdno);    /* and give it to the plex */
+		update_sd_config(sd->sdno, 0);
+		save_config();
+		if (sd->state == sd_reviving)
+		    reply->error = EAGAIN;		    /* need to revive it */
+		else
+		    reply->error = 0;
+	    }
+	}
+	break;
+
+    case plex_object:
+	plex = validplex(msg->index, reply);		    /* get plex */
+	if (plex == NULL)
+	    return;
+	vol = validvol(msg->otherobject, reply);	    /* and volume information */
+	if (vol) {
+	    if (vol->plexes == MAXPLEX) {		    /* we have too many already */
+		reply->error = ENOSPC;			    /* nowhere to put it */
+		strcpy(reply->msg, "Too many plexes");
+	    } else if (plex->volno >= 0) {		    /* the plex has an owner */
+		reply->error = EBUSY;
+		sprintf(reply->msg, "%s is already attached to %s",
+		    plex->name,
+		    VOL[plex->volno].name);
+	    } else {
+		for (sdno = 0; sdno < plex->subdisks; sdno++) {
+		    sd = &SD[plex->sdnos[sdno]];
+
+		    if (sd->state > sd_down)		    /* real subdisk, vaguely accessible */
+			set_sd_state(plex->sdnos[sdno], sd_stale, setstate_force); /* make it stale */
+		}
+		set_plex_state(plex->plexno, plex_up, setstate_none); /* update plex state */
+		give_plex_to_volume(msg->otherobject, msg->index, 0); /* and give it to the volume */
+		update_plex_config(plex->plexno, 0);
+		save_config();
+		reply->error = 0;			    /* all went well */
+	    }
+	}
+    }
+}
+
+/* detach an object from a superior object: a subdisk from its plex, or a plex from its volume; status goes to the reply */
+void
+detachobject(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    struct sd *sd;
+    struct plex *plex;
+    struct volume *vol;
+    int sdno;
+    int plexno;
+
+    switch (msg->type) {
+    case drive_object:					    /* you can't detach a drive from anything */
+    case volume_object:					    /* nor a volume */
+    case invalid_object:				    /* "this can't happen" */
+	reply->error = EINVAL;
+	reply->msg[0] = '\0';				    /* vinum(8) doesn't do this */
+	return;
+
+    case sd_object:
+	sd = validsd(msg->index, reply);
+	if (sd == NULL)
+	    return;
+	if (sd->plexno < 0) {				    /* doesn't belong to a plex */
+	    reply->error = ENOENT;
+	    strcpy(reply->msg, "Subdisk is not attached");
+	    return;
+	} else {					    /* non-negative plex number */
+	    plex = &PLEX[sd->plexno];
+	    if ((!msg->force)				    /* don't force things */
+	    &&((plex->state == plex_up)			    /* and the plex is up */
+	    ||((plex->state == plex_flaky) && sd->state == sd_up))) { /* or flaky with this sd up */
+		reply->error = EBUSY;			    /* we need this sd */
+		reply->msg[0] = '\0';
+		return;
+	    }
+	    sd->plexno = -1;				    /* anonymous sd */
+	    if (plex->subdisks == 1) {			    /* this was the only subdisk */
+		Free(plex->sdnos);			    /* free the subdisk array */
+		plex->sdnos = NULL;			    /* and note the fact */
+		plex->subdisks_allocated = 0;		    /* no subdisk space */
+	    } else {
+		for (sdno = 0; sdno < plex->subdisks; sdno++) {
+		    if (plex->sdnos[sdno] == msg->index)    /* found our subdisk */
+			break;
+		}
+		if (sdno < (plex->subdisks - 1))	    /* not the last one, compact */
+		    bcopy(&plex->sdnos[sdno + 1],
+			&plex->sdnos[sdno],
+			(plex->subdisks - 1 - sdno) * sizeof(int));
+	    }
+	    plex->subdisks--;
+	    if (!bcmp(plex->name, sd->name, strlen(plex->name) + 1))
+		/* NOTE(review): the +1 compares the NUL too, so only a subdisk named exactly like the plex is renamed */
+	    {
+		bcopy(sd->name,
+		    &sd->name[3],
+		    min(strlen(sd->name) + 1, MAXSDNAME - 3));
+		bcopy("ex-", sd->name, 3);
+		sd->name[MAXSDNAME - 1] = '\0';
+	    }
+	    update_plex_config(plex->plexno, 0);
+	    if (isstriped(plex))			    /* we've just mutilated our plex, */
+		set_plex_state(plex->plexno,
+		    plex_down,
+		    setstate_force | setstate_configuring);
+	    if (plex->volno >= 0)			    /* plex attached to volume, */
+		update_volume_config(plex->volno);
+	    save_config();
+	    reply->error = 0;
+	}
+	return;
+
+    case plex_object:
+	plex = validplex(msg->index, reply);		    /* get plex */
+	if (plex == NULL)
+	    return;
+	if (plex->volno >= 0) {
+	    int volno = plex->volno;
+
+	    vol = &VOL[volno];
+	    if ((!msg->force)				    /* don't force things */
+	    &&((vol->state == volume_up)		    /* and the volume is up */
+	    &&(vol->plexes == 1))) {			    /* and this is the last plex */
+		/*
+		 * XXX As elsewhere, check whether we will lose
+		 * mapping by removing this plex
+		 */
+		reply->error = EBUSY;			    /* we need this plex */
+		reply->msg[0] = '\0';
+		return;
+	    }
+	    plex->volno = -1;				    /* anonymous plex */
+	    for (plexno = 0; plexno < vol->plexes; plexno++) {
+		if (vol->plex[plexno] == msg->index)	    /* found our plex */
+		    break;
+	    }
+	    if (plexno < (vol->plexes - 1))		    /* not the last one, compact */
+		bcopy(&vol->plex[plexno + 1],
+		    &vol->plex[plexno],
+		    (vol->plexes - 1 - plexno) * sizeof(int));
+	    vol->plexes--;
+	    vol->last_plex_read = 0;			    /* don't go beyond the end */
+	    if (!bcmp(vol->name, plex->name, strlen(vol->name) + 1))
+		/* NOTE(review): as above, this only matches a plex named exactly like the volume */
+	    {
+		/* First, if asked to recurse, give matching subdisks the "ex-" prefix as well */
+		if (msg->recurse) {
+		    int sdno;
+
+		    for (sdno = 0; sdno < plex->subdisks; sdno++) {
+			struct sd *sd = &SD[plex->sdnos[sdno]];
+
+			if (!bcmp(plex->name, sd->name, strlen(plex->name) + 1))
+							    /* subdisk has the same name as the plex */
+			{
+			    bcopy(sd->name,
+				&sd->name[3],
+				min(strlen(sd->name) + 1, MAXSDNAME - 3));
+			    bcopy("ex-", sd->name, 3);
+			    sd->name[MAXSDNAME - 1] = '\0';
+			}
+		    }
+		}
+		bcopy(plex->name,
+		    &plex->name[3],
+		    min(strlen(plex->name) + 1, MAXPLEXNAME - 3));
+		bcopy("ex-", plex->name, 3);
+		plex->name[MAXPLEXNAME - 1] = '\0';
+	    }
+	    update_volume_config(volno);
+	    save_config();
+	    reply->error = 0;
+	} else {
+	    reply->error = ENOENT;
+	    strcpy(reply->msg, "Plex is not attached");
+	}
+    }
+}
+
+/*
+ * Rename a drive, subdisk, plex or volume to msg->newname; status goes to the reply overlaying msg.
+ */
+void
+renameobject(struct vinum_rename_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    struct drive *drive;
+    struct sd *sd;
+    struct plex *plex;
+    struct volume *vol;
+
+    switch (msg->type) {
+    case drive_object:
+	if (find_drive(msg->newname, 0) >= 0)		    /* we have that name already, */
+	    goto exists;
+	drive = validdrive(msg->index, reply);
+	if (drive) {
+	    bcopy(msg->newname, drive->label.name, MAXDRIVENAME);
+	    drive->label.name[MAXDRIVENAME - 1] = '\0';	    /* the copy alone doesn't guarantee a terminator */
+	    save_config();
+	    reply->error = 0;
+	}
+	return;
+
+    case sd_object:
+	if (find_subdisk(msg->newname, 0) >= 0)		    /* we have that name already, */
+	    goto exists;
+	sd = validsd(msg->index, reply);
+	if (sd) {
+	    bcopy(msg->newname, sd->name, MAXSDNAME);
+	    sd->name[MAXSDNAME - 1] = '\0';		    /* the copy alone doesn't guarantee a terminator */
+	    update_sd_config(sd->sdno, 0);
+	    save_config();
+	    reply->error = 0;
+	}
+	return;
+
+    case plex_object:
+	if (find_plex(msg->newname, 0) >= 0)		    /* we have that name already, */
+	    goto exists;
+	plex = validplex(msg->index, reply);
+	if (plex) {
+	    bcopy(msg->newname, plex->name, MAXPLEXNAME);
+	    plex->name[MAXPLEXNAME - 1] = '\0';		    /* the copy alone doesn't guarantee a terminator */
+	    update_plex_config(plex->plexno, 0);
+	    save_config();
+	    reply->error = 0;
+	}
+	return;
+
+    case volume_object:
+	if (find_volume(msg->newname, 0) >= 0)		    /* we have that name already, */
+	    goto exists;
+	vol = validvol(msg->index, reply);
+	if (vol) {
+	    bcopy(msg->newname, vol->name, MAXVOLNAME);
+	    vol->name[MAXVOLNAME - 1] = '\0';		    /* the copy alone doesn't guarantee a terminator */
+	    update_volume_config(msg->index);
+	    save_config();
+	    reply->error = 0;
+	}
+	return;
+
+    case invalid_object:
+	reply->error = EINVAL;
+	reply->msg[0] = '\0';
+    }
+    return;
+
+exists:							    /* the new name is already in use */
+    reply->error = EEXIST;
+    reply->msg[0] = '\0';
+}
+
+/*
+ * Replace one object with another (drives only).
+ * Not implemented yet: always replies ENODEV.
+ * msg->index is the drive number of the old drive,
+ * msg->otherobject that of the new drive.
+ */
+void
+replaceobject(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *answer = (struct _ioctl_reply *) msg; /* the reply overlays the request */
+
+    strcpy(answer->msg, "replace not implemented yet");
+    answer->error = ENODEV;				    /* until I know how to do this */
+    return;						    /* nothing changed, so no save_config */
+}
+
+/* move subdisk msg->otherobject to drive msg->index; status goes to the reply overlaying msg */
+void
+moveobject(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    struct sd *sd;
+
+    /* Check that our objects are valid (i.e. they exist) */
+    if (validdrive(msg->index, reply) == NULL)
+	return;
+    sd = validsd(msg->otherobject, reply);
+    if (sd == NULL)
+	return;
+    if (sd->driveno == msg->index) {			    /* sd already belongs to drive */
+	reply->error = 0;				    /* nothing to do, but give a defined status */
+	return;
+    }
+
+    if (sd->state > sd_stale)
+	set_sd_state(sd->sdno, sd_stale, setstate_force);   /* make the subdisk stale */
+    else
+	sd->state = sd_empty;
+    if (sd->plexno >= 0)				    /* part of a plex, */
+	update_plex_state(sd->plexno);			    /* update its state */
+
+    /* Return the space on the old drive */
+    if ((sd->driveno >= 0)				    /* we have a drive, */
+    &&(sd->sectors > 0))				    /* and some space on it */
+	return_drive_space(sd->driveno,			    /* return the space */
+	    sd->driveoffset, sd->sectors);
+
+    /* Reassign the old subdisk */
+    sd->driveno = msg->index;
+    sd->driveoffset = -1;				    /* let the drive decide where to put us */
+    give_sd_to_drive(sd->sdno);
+    reply->error = 0;
+}
+
+/* set preferred_plex of volume msg->index to plex msg->otherobject, or to -1 if otherobject is negative */
+void
+setreadpol(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    struct volume *vol;
+    int myplexno = -1;
+
+    /* Check that our objects are valid (i.e. they exist) */
+    vol = validvol(msg->index, reply);
+    if (vol == NULL)
+	return;
+
+    /* If a plex was specified, check that it is valid */
+    if (msg->otherobject >= 0) {
+	if (validplex(msg->otherobject, reply) == NULL)	    /* the reply already says why */
+	    return;
+
+	/* Is it attached to this volume? */
+	myplexno = my_plex(msg->index, msg->otherobject);
+	if (myplexno < 0) {
+	    strcpy(reply->msg, "Plex is not attached to volume");
+	    reply->error = ENOENT;
+	    return;
+	}
+    }
+
+    lock_config();					    /* NOTE(review): return value is not checked */
+    vol->preferred_plex = myplexno;
+    save_config();
+    unlock_config();
+    reply->error = 0;
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumkw.h b/sys/dev/vinum/vinumkw.h
new file mode 100644
index 0000000..d7bc7a5
--- /dev/null
+++ b/sys/dev/vinum/vinumkw.h
@@ -0,0 +1,152 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumkw.h,v 1.20 2003/05/07 03:32:09 grog Exp grog $
+ * $FreeBSD$
+ */
+
+/*
+ * Command keywords that vinum knows.  These include both user-level
+ * and kernel-level stuff
+ */
+
+/*
+ * Our complete vocabulary.  The names of the commands are
+ * the same as the identifier without the kw_ at the beginning
+ * (i.e. kw_create defines the "create" keyword).  Preprocessor
+ * magic in parser.c does the rest.
+ *
+ * To add a new word: put it in the table below and one of the
+ * lists in vinumparser.c (probably keywords).
+ */
+enum keyword {
+    kw_create,
+    kw_modify,
+    kw_list,
+    kw_l = kw_list,					    /* synonym for list */
+    kw_ld,						    /* list drive */
+    kw_ls,						    /* list subdisk */
+    kw_lp,						    /* list plex */
+    kw_lv,						    /* list volume */
+    kw_set,
+    kw_rm,
+    kw_mv,						    /* move object */
+    kw_move,						    /* synonym for mv */
+    kw_start,
+    kw_stop,
+    kw_makedev,						    /* make /dev/vinum devices */
+    kw_setdaemon,					    /* set daemon flags */
+    kw_getdaemon,					    /* get daemon flags */
+    kw_help,
+    kw_drive,
+    kw_partition,
+    kw_sd,
+    kw_subdisk = kw_sd,					    /* synonym for sd */
+    kw_plex,
+    kw_volume,
+    kw_vol = kw_volume,					    /* synonym for volume */
+    kw_read,
+    kw_readpol,
+    kw_org,
+    kw_name,
+    kw_concat,
+    kw_striped,
+    kw_raid4,
+    kw_raid5,
+    kw_driveoffset,
+    kw_plexoffset,
+    kw_len,
+    kw_length = kw_len,					    /* synonym for len */
+    kw_size = kw_len,					    /* synonym for len */
+    kw_state,
+    kw_setupstate,
+    kw_d,						    /* flag names */
+    kw_f,
+    kw_r,
+    kw_s,
+    kw_v,
+    kw_w,
+    kw_round,						    /* round robin */
+    /*
+     * The first of these is a volume attribute ("prefer plex"), and the
+     * second is a plex attribute ("preferred" means that the volume
+     * prefers this plex).
+     */
+    kw_prefer,						    /* prefer plex */
+    kw_preferred,					    /* preferred plex */
+    kw_device,
+    kw_init,
+    kw_resetconfig,
+    kw_writethrough,
+    kw_writeback,
+    kw_replace,
+    kw_resetstats,
+    kw_attach,
+    kw_detach,
+    kw_rename,
+    kw_printconfig,
+    kw_saveconfig,
+    kw_hotspare,
+    kw_detached,
+    kw_debug,						    /* go into debugger */
+    kw_stripe,
+    kw_mirror,
+    kw_info,
+    kw_quit,
+    kw_max,
+    kw_setstate,
+    kw_checkparity,
+    kw_rebuildparity,
+    kw_dumpconfig,
+    kw_retryerrors,
+    kw_invalid_keyword = -1				    /* not a keyword; the only negative value */
+};
+
+/* One entry of a keyword table: the text of a word and its keyword value */
+struct _keywords {
+    char *name;						    /* the word as text */
+    enum keyword keyword;				    /* and the corresponding enum value */
+};
+
+/* A keyword table together with its size, as passed to get_keyword */
+struct keywordset {
+    int size;						    /* presumably the number of entries in k -- confirm */
+    struct _keywords *k;				    /* the table itself */
+};
+
+extern struct _keywords keywords[];
+extern struct _keywords flag_keywords[];
+
+extern struct keywordset keyword_set;
+extern struct keywordset flag_set;
+
+/* Parser functions */
+
+enum keyword get_keyword(char *, struct keywordset *);
+int tokenize(char *, char *[], int);
diff --git a/sys/dev/vinum/vinumlock.c b/sys/dev/vinum/vinumlock.c
new file mode 100644
index 0000000..33d9578
--- /dev/null
+++ b/sys/dev/vinum/vinumlock.c
@@ -0,0 +1,264 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumlock.c,v 1.19 2003/05/23 01:07:18 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+/*
+ * Lock a drive, wait if it's in use.
+ *
+ * Returns 0 when the lock is held.  This includes the case where the
+ * calling process already holds the lock: we log a warning and return
+ * without changing anything.  If the wait for the lock fails, the
+ * error from tsleep is returned and the drive is not locked.
+ *
+ * With VINUMDEBUG, file and line identify the caller.  They are
+ * recorded in the drive so that the warning above can show where the
+ * lock was taken.
+ */
+#ifdef VINUMDEBUG
+int
+lockdrive(struct drive *drive, char *file, int line)
+#else
+int
+lockdrive(struct drive *drive)
+#endif
+{
+    int error;
+
+    /* XXX get rid of     drive->flags |= VF_LOCKING; */
+    if ((drive->flags & VF_LOCKED)			    /* it's locked */
+    &&(drive->pid == curproc->p_pid)) {			    /* by us! */
+#ifdef VINUMDEBUG
+	log(LOG_WARNING,
+	    "vinum lockdrive: already locking %s from %s:%d, called from %s:%d\n",
+	    drive->label.name,
+	    drive->lockfilename,
+	    drive->lockline,
+	    basename(file),
+	    line);
+#else
+	log(LOG_WARNING,
+	    "vinum lockdrive: already locking %s\n",
+	    drive->label.name);
+#endif
+	return 0;
+    }
+    while ((drive->flags & VF_LOCKED) != 0) {
+	/*
+	 * There are problems sleeping on a unique identifier,
+	 * since the drive structure can move, and the unlock
+	 * function can be called after killing the drive.
+	 * Solve this by waiting on this function; the number
+	 * of conflicts is negligible.
+	 */
+	if ((error = tsleep(&lockdrive,
+		    PRIBIO,
+		    "vindrv",
+		    0)) != 0)
+	    return error;
+    }
+    drive->flags |= VF_LOCKED;
+    drive->pid = curproc->p_pid;			    /* it's a panic error if curproc is null */
+#ifdef VINUMDEBUG
+    /*
+     * Remember where we were locked from.  The base name is usually
+     * shorter than the buffer, so a fixed-length copy would read past
+     * the end of the string.  strlcpy stops at the terminator, and
+     * truncates and terminates if the name is too long.
+     */
+    strlcpy(drive->lockfilename, basename(file), sizeof(drive->lockfilename));
+    drive->lockline = line;
+#endif
+    return 0;
+}
+
+/*
+ * Release the lock on a drive and wake up whoever is sleeping in
+ * lockdrive waiting for a drive lock.
+ */
+void
+unlockdrive(struct drive *drive)
+{
+    /* drive->pid is deliberately left alone: it's of hysterical interest */
+    drive->flags = drive->flags & ~VF_LOCKED;
+    wakeup(&lockdrive);					    /* the channel lockdrive sleeps on */
+}
+
+/*
+ * Lock a stripe of a plex, wait if it's in use.
+ *
+ * stripe is the address to lock, bp the buffer header of the request
+ * which wants the lock.  An existing lock on the same stripe only
+ * makes us wait if it belongs to a different bp.
+ *
+ * We sleep while the lock table of the plex is full or while another
+ * request holds the stripe.  Returns a pointer to the lock table
+ * entry which now describes our lock; hand it to unlockrange to
+ * release it.
+ */
+struct rangelock *
+lockrange(daddr_t stripe, struct buf *bp, struct plex *plex)
+{
+    struct rangelock *lock;
+    struct rangelock *pos;				    /* position of first free lock */
+    int foundlocks;					    /* number of locks found */
+
+    /*
+     * we can't use 0 as a valid address, so
+     * increment all addresses by 1.
+     */
+    stripe++;
+    mtx_lock(plex->lockmtx);
+
+    /*
+     * We come back here every time we have slept waiting for a
+     * conflicting lock.  The mutex is released while we sleep, so
+     * the table can have changed completely.  In particular it can
+     * have filled up, in which case a scan would find no free entry
+     * and we would end up using the entry beyond the end of the
+     * table.  So repeat the check for a full table as well as the
+     * search.
+     */
+retry:
+    /* Wait here if the table is full */
+    while (plex->usedlocks == PLEX_LOCKS)		    /* all in use */
+	msleep(&plex->usedlocks, plex->lockmtx, PRIBIO, "vlock", 0);
+
+#ifdef DIAGNOSTIC
+    if (plex->usedlocks >= PLEX_LOCKS)
+	panic("lockrange: Too many locks in use");
+#endif
+
+    /*
+     * We could get by without counting the number
+     * of locks we find, but we have a linear search
+     * through a table which in most cases will be
+     * empty.  It's faster to stop when we've found
+     * all the locks that are there.  This is also
+     * the reason why we put pos at the beginning
+     * instead of the end, though it requires an
+     * extra test.
+     */
+    pos = NULL;
+    foundlocks = 0;
+
+    /*
+     * Search the lock table for our stripe.  If nothing is locked,
+     * the loop body is never executed and lock stays at the first
+     * entry.
+     */
+    for (lock = plex->lock;				    /* pointer in lock table */
+	lock < &plex->lock[PLEX_LOCKS]
+	&& foundlocks < plex->usedlocks;
+	lock++) {
+	if (lock->stripe) {				    /* in use */
+	    foundlocks++;				    /* found another one in use */
+	    if ((lock->stripe == stripe)		    /* it's our stripe */
+	    &&(lock->bp != bp)) {			    /* but not our request */
+#ifdef VINUMDEBUG
+		if (debug & DEBUG_LOCKREQS) {
+		    struct rangelockinfo lockinfo;
+
+		    lockinfo.stripe = stripe;
+		    lockinfo.bp = bp;
+		    lockinfo.plexno = plex->plexno;
+		    logrq(loginfo_lockwait, (union rqinfou) &lockinfo, bp);
+		}
+#endif
+		plex->lockwaits++;			    /* waited one more time */
+		msleep(lock, plex->lockmtx, PRIBIO, "vrlock", 0);
+		goto retry;				    /* start again */
+	    }
+	} else if (pos == NULL)				    /* still looking for somewhere? */
+	    pos = lock;					    /* a place to put this one */
+    }
+    /*
+     * This untidy looking code ensures that we'll
+     * always end up pointing to the first free lock
+     * entry, thus minimizing the number of
+     * iterations necessary.  If we didn't pass a
+     * free entry, every entry we looked at was in
+     * use.  The table isn't full, so lock points to
+     * an entry inside the table.
+     */
+    if (pos == NULL)					    /* didn't find one on the way, */
+	pos = lock;					    /* use the one we're pointing to */
+
+    /*
+     * The address range is free, and we're pointing
+     * to the first unused entry.  Make it ours.
+     */
+    pos->stripe = stripe;
+    pos->bp = bp;
+    plex->usedlocks++;					    /* one more lock */
+    mtx_unlock(plex->lockmtx);
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LOCKREQS) {
+	struct rangelockinfo lockinfo;
+
+	lockinfo.stripe = stripe;
+	lockinfo.bp = bp;
+	lockinfo.plexno = plex->plexno;
+	logrq(loginfo_lock, (union rqinfou) &lockinfo, bp);
+    }
+#endif
+    return pos;
+}
+
+/*
+ * Release a range lock of a plex and let the next one at it.
+ *
+ * plexno is the index of the plex in vinum_conf, lock the lock table
+ * entry to release (as returned by lockrange).  We wake up anybody
+ * waiting for this entry and, if the table was full, anybody waiting
+ * for a free entry.
+ *
+ * NOTE(review): unlike lockrange, this function does not take
+ * plex->lockmtx before changing the lock table -- confirm that this
+ * is intended.
+ */
+void
+unlockrange(int plexno, struct rangelock *lock)
+{
+    struct plex *plex;
+
+    plex = &PLEX[plexno];
+#ifdef DIAGNOSTIC
+    /* The lock must be an entry of this plex's lock table */
+    if (lock < &plex->lock[0] || lock >= &plex->lock[PLEX_LOCKS])
+	panic("vinum: rangelock %p on plex %d invalid, not between %p and %p",
+	    lock,
+	    plexno,
+	    &plex->lock[0],
+	    &plex->lock[PLEX_LOCKS]);
+#endif
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LOCKREQS) {
+	struct rangelockinfo lockinfo;
+
+	lockinfo.stripe = lock->stripe;
+	lockinfo.bp = lock->bp;
+	lockinfo.plexno = plex->plexno;
+	/*
+	 * NOTE(review): this logs loginfo_lockwait, the same code
+	 * which lockrange logs when it has to wait.  It looks as if
+	 * an unlock-specific code was intended -- confirm against
+	 * request.h.
+	 */
+	logrq(loginfo_lockwait, (union rqinfou) &lockinfo, lock->bp);
+    }
+#endif
+    lock->stripe = 0;					    /* no longer used */
+    plex->usedlocks--;					    /* one less lock */
+    if (plex->usedlocks == PLEX_LOCKS - 1)		    /* we were full, */
+	wakeup(&plex->usedlocks);			    /* get a waiter if one's there */
+    wakeup((void *) lock);				    /* and anybody waiting for this stripe */
+}
+
+/*
+ * Obtain the lock on the global configuration, sleeping until it is
+ * free.  Returns 0 with the lock held, or the error from tsleep if
+ * the sleep fails, in which case the lock is not held.
+ */
+int
+lock_config(void)
+{
+    int error;
+
+    for (;;) {
+	if ((vinum_conf.flags & VF_LOCKED) == 0)	    /* nobody has it, */
+	    break;					    /* it's ours for the taking */
+	vinum_conf.flags |= VF_LOCKING;			    /* tell the holder we're waiting */
+	error = tsleep(&vinum_conf, PRIBIO, "vincfg", 0);
+	if (error != 0)
+	    return error;
+    }
+    vinum_conf.flags |= VF_LOCKED;
+    return 0;
+}
+
+/*
+ * Release the lock on the global configuration.  If somebody has
+ * announced that he is waiting for it (VF_LOCKING), wake him up.
+ */
+void
+unlock_config(void)
+{
+    vinum_conf.flags &= ~VF_LOCKED;
+    if ((vinum_conf.flags & VF_LOCKING) == 0)		    /* nobody waiting, */
+	return;						    /* nothing more to do */
+    vinum_conf.flags &= ~VF_LOCKING;
+    wakeup(&vinum_conf);
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinummemory.c b/sys/dev/vinum/vinummemory.c
new file mode 100644
index 0000000..b4e9a43
--- /dev/null
+++ b/sys/dev/vinum/vinummemory.c
@@ -0,0 +1,288 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinummemory.c,v 1.31 2003/05/23 01:08:36 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+
+#ifdef VINUMDEBUG
+#include <dev/vinum/request.h>
+extern struct rqinfo rqinfo[];
+extern struct rqinfo *rqip;
+int rqinfo_size = RQINFO_SIZE;				    /* for debugger */
+
+#undef longjmp						    /* this was defined as LongJmp */
+#define strrchr	rindex
+#ifdef __i386__						    /* check for validity */
+/*
+ * Debug wrapper for longjmp: check that the jump buffer looks
+ * plausible before using it, and panic if it doesn't.
+ */
+void
+LongJmp(jmp_buf buf, int retval)
+{
+    /*
+     * longjmp is not documented, not even jmp_buf.
+     * This is what's in i386/i386/support.s:
+     *
+     * ENTRY(longjmp)
+     *    movl    4(%esp),%eax
+     *    movl    (%eax),%ebx                      restore ebx
+     *    movl    4(%eax),%esp                     restore esp
+     *    movl    8(%eax),%ebp                     restore ebp
+     *    movl    12(%eax),%esi                    restore esi
+     *    movl    16(%eax),%edi                    restore edi
+     *    movl    20(%eax),%edx                    get rta
+     *    movl    %edx,(%esp)                      put in return frame
+     *    xorl    %eax,%eax                        return(1);
+     *    incl    %eax
+     *    ret
+     *
+     * from which we deduce the structure of jmp_buf:
+     */
+    struct JmpBuf {
+	int jb_ebx;
+	int jb_esp;
+	int jb_ebp;
+	int jb_esi;
+	int jb_edi;
+	int jb_eip;
+    };
+    struct JmpBuf *jb = (struct JmpBuf *) buf;
+    int bogus;
+
+    /*
+     * Stack pointer, frame pointer and return address must all be
+     * at or above 0xc0000000 (presumably the start of the kernel
+     * address space on i386 -- confirm).
+     */
+    bogus = (jb->jb_esp < 0xc0000000)
+	|| (jb->jb_ebp < 0xc0000000)
+	|| (jb->jb_eip < 0xc0000000);
+    if (bogus)
+	panic("Invalid longjmp");
+    longjmp(buf, retval);
+}
+
+#else /* not i386 */
+#define LongJmp longjmp					    /* just use the kernel function */
+#endif /* i386 */
+#endif /* VINUMDEBUG */
+
+/*
+ * Find the base name of a path name: the part after the last '/', or
+ * the complete name if it doesn't contain a '/'.  The result points
+ * into the string passed; nothing is copied.
+ */
+char *
+basename(char *file)
+{
+    char *slash = strrchr(file, '/');			    /* last / in the name, if any */
+
+    return (slash != NULL) ? slash + 1 : file;		    /* skip the / if present */
+}
+
+/*
+ * Expand a table from oldsize to newsize bytes.
+ *
+ * table points to the pointer to the table.  We allocate a new,
+ * zeroed table of newsize bytes, copy the first oldsize bytes of the
+ * old table (if there is one) into it, free the old table and store
+ * the address of the new one in *table.  If newsize is not greater
+ * than oldsize, nothing is done.
+ *
+ * With VINUMDEBUG, file and line identify the caller for the
+ * allocation trace.
+ */
+#ifdef VINUMDEBUG
+void
+expand_table(void **table, int oldsize, int newsize, char *file, int line)
+#else
+void
+expand_table(void **table, int oldsize, int newsize)
+#endif
+{
+    int *newtable;
+    int s;
+
+    if (newsize <= oldsize)				    /* nothing to expand */
+	return;
+
+    s = splhigh();
+#ifdef VINUMDEBUG
+    newtable = (int *) MMalloc(newsize, file, line);	    /* allocate a new table */
+#else
+    newtable = (int *) Malloc(newsize);			    /* allocate a new table */
+#endif
+    CHECKALLOC(newtable, "vinum: Can't expand table\n");
+    bzero((char *) newtable, newsize);			    /* clean it all out */
+    if (*table != NULL) {				    /* already something there, */
+	bcopy((char *) *table, (char *) newtable, oldsize); /* copy the old contents to the new table */
+#ifdef VINUMDEBUG
+	FFree(*table, file, line);
+#else
+	Free(*table);
+#endif
+    }
+    *table = newtable;
+    splx(s);
+}
+
+#ifdef VINUMDEBUG
+#define MALLOCENTRIES 16384				    /* size of the allocation trace table */
+int malloccount = 0;					    /* number of entries of malloced in use */
+int highwater = 0;					    /* highest value malloccount has reached */
+struct mc malloced[MALLOCENTRIES];			    /* one entry per outstanding MMalloc allocation */
+
+#define FREECOUNT 64					    /* size of the table of recent frees */
+int freecount = FREECOUNT;				    /* for debugger */
+int lastfree = 0;					    /* next entry of freeinfo to fill, wraps around */
+struct mc freeinfo[FREECOUNT];				    /* recent frees, kept by FFree if DEBUG_MEMFREE is set */
+
+int total_malloced;					    /* bytes currently allocated via MMalloc */
+static int mallocseq = 0;				    /* sequence number for the next allocation */
+
+/*
+ * Debug version of Malloc: allocate size bytes and record the
+ * allocation in the trace table malloced.
+ *
+ * file and line identify the caller; the base name of file is stored
+ * in the trace entry.  Returns the address of the memory, or NULL (0)
+ * if the trace table is full or the allocation fails.  Both failures
+ * are logged.
+ */
+caddr_t
+MMalloc(int size, char *file, int line)
+{
+    int s;
+    caddr_t result;
+    int i;
+
+    if (malloccount >= MALLOCENTRIES) {			    /* too many */
+	log(LOG_ERR, "vinum: can't allocate table space to trace memory allocation\n");
+	return 0;					    /* can't continue */
+    }
+    /* Wait for malloc if we can */
+    result = malloc(size,
+	M_DEVBUF,
+	curthread->td_intr_nesting_level == 0 ? M_WAITOK : M_NOWAIT);
+    if (result == NULL) {
+	log(LOG_ERR, "vinum: can't allocate %d bytes from %s:%d\n", size, file, line);
+	return result;
+    }
+    s = splhigh();
+
+    /* The new block must not overlap anything we think is still allocated */
+    for (i = 0; i < malloccount; i++) {
+	if (((result + size) > malloced[i].address)
+	    && (result < malloced[i].address + malloced[i].size)) /* overlap */
+	    Debugger("Malloc overlap");
+    }
+
+    /* Add an entry at the end of the trace table */
+    i = malloccount++;
+    total_malloced += size;
+    microtime(&malloced[i].time);
+    malloced[i].seq = mallocseq++;
+    malloced[i].size = size;
+    malloced[i].line = line;
+    malloced[i].address = result;
+    strlcpy(malloced[i].file, basename(file), MCFILENAMELEN);
+    if (malloccount > highwater)
+	highwater = malloccount;
+    splx(s);
+    return result;
+}
+
+/*
+ * Debug version of Free: free memory allocated by MMalloc and remove
+ * it from the trace table malloced.
+ *
+ * file and line identify the caller.  If DEBUG_MEMFREE is set, we
+ * also record the free in the circular table freeinfo.  If mem is not
+ * in the trace table, it is not freed: we log the fact and enter the
+ * debugger.
+ */
+void
+FFree(void *mem, char *file, int line)
+{
+    int s;
+    int i;
+
+    s = splhigh();
+    for (i = 0; i < malloccount; i++) {
+	if ((caddr_t) mem == malloced[i].address) {	    /* found it */
+	    bzero(mem, malloced[i].size);		    /* XXX */
+	    free(mem, M_DEVBUF);
+	    malloccount--;
+	    total_malloced -= malloced[i].size;
+	    if (debug & DEBUG_MEMFREE) {		    /* keep track of recent frees */
+		microtime(&freeinfo[lastfree].time);
+		freeinfo[lastfree].seq = malloced[i].seq;
+		freeinfo[lastfree].size = malloced[i].size;
+		freeinfo[lastfree].line = line;
+		freeinfo[lastfree].address = mem;
+		/*
+		 * Store the base name the same way as MMalloc
+		 * does.  A fixed-length bcopy would read past the
+		 * end of a short name and leave a long one without
+		 * a terminator.
+		 */
+		strlcpy(freeinfo[lastfree].file, basename(file), MCFILENAMELEN);
+		if (++lastfree == FREECOUNT)
+		    lastfree = 0;
+	    }
+	    /* Close the gap in the trace table; malloccount is already decremented */
+	    if (i < malloccount)			    /* more coming after */
+		bcopy(&malloced[i + 1], &malloced[i], (malloccount - i) * sizeof(struct mc));
+	    splx(s);
+	    return;
+	}
+    }
+    splx(s);
+    log(LOG_ERR,
+	"Freeing unallocated data at 0x%p from %s, line %d\n",
+	mem,
+	file,
+	line);
+    Debugger("Free");
+}
+
+/*
+ * Fill in the struct meminfo at data with a summary of the allocation
+ * trace: the number of entries in use, the number of bytes allocated,
+ * the high water mark and the address of the trace table.
+ */
+void
+vinum_meminfo(caddr_t data)
+{
+    struct meminfo *info = (struct meminfo *) data;
+
+    info->malloced = malloced;				    /* the trace table itself */
+    info->mallocs = malloccount;
+    info->highwater = highwater;
+    info->total_malloced = total_malloced;
+}
+
+/*
+ * Return one entry of the allocation trace table.
+ *
+ * data points to a struct mc.  On entry its seq field is the index of
+ * the entry wanted; on successful return the address, size, line, seq
+ * and file fields are copied from that entry.  Returns 0 on success,
+ * or ENOENT if the index is beyond the entries in use.
+ */
+int
+vinum_mallocinfo(caddr_t data)
+{
+    struct mc *m = (struct mc *) data;
+    unsigned int ent = m->seq;				    /* index of entry to return */
+    struct mc *entry;
+
+    if (ent >= malloccount)
+	return ENOENT;
+
+    entry = &malloced[ent];
+    m->address = entry->address;
+    m->size = entry->size;
+    m->line = entry->line;
+    m->seq = entry->seq;
+    strlcpy(m->file, entry->file, MCFILENAMELEN);
+    return 0;
+}
+
+/*
+ * Return the nth request trace buffer entry.  This
+ * is indexed back from the current entry (which
+ * has index 0).
+ *
+ * data is used for both input and output: on entry its first word
+ * is the index, on successful return it contains a copy of the
+ * selected struct rqinfo.  Returns 0 on success, or ENOENT if the
+ * index is negative or not less than RQINFO_SIZE.
+ */
+int
+vinum_rqinfo(caddr_t data)
+{
+    struct rqinfo *rq = (struct rqinfo *) data;
+    int ent = *(int *) data;				    /* 1st word is index */
+    int lastent = rqip - rqinfo;			    /* entry number of current entry */
+
+    /*
+     * The index comes from the caller.  Check it in both
+     * directions: a negative index would otherwise give us an
+     * entry number beyond the end of the table below.
+     */
+    if ((ent < 0) || (ent >= RQINFO_SIZE))		    /* out of the table */
+	return ENOENT;
+    if ((ent = lastent - ent - 1) < 0)
+	ent += RQINFO_SIZE;				    /* roll over backwards */
+    bcopy(&rqinfo[ent], rq, sizeof(struct rqinfo));
+    return 0;
+}
+#endif
diff --git a/sys/dev/vinum/vinumobj.h b/sys/dev/vinum/vinumobj.h
new file mode 100644
index 0000000..81087f3
--- /dev/null
+++ b/sys/dev/vinum/vinumobj.h
@@ -0,0 +1,320 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *	Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumobj.h,v 1.7 2003/05/23 01:08:58 grog Exp $
+ * $FreeBSD$
+ */
+
+/*
+ * Definitions of Vinum objects: drive, subdisk, plex and volume.
+ * This file is included both by userland programs and by kernel code.
+ * The userland structures are a subset of the kernel structures, and
+ * all userland fields are at the beginning, so that a simple copy in
+ * the length of the userland structure will be sufficient.  In order
+ * to perform this copy, vinumioctl must know both structures, so it
+ * includes this file again with _KERNEL reset.
+ */
+
+#ifndef _KERNEL
+/*
+ * Flags for all objects.  Most of them only apply
+ * to specific objects, but we currently have
+ * space for all in any 32 bit flags word.
+ */
+enum objflags {
+    VF_LOCKED = 1,					    /* somebody has locked access to this object */
+    VF_LOCKING = 2,					    /* we want access to this object */
+    VF_OPEN = 4,					    /* object has openers */
+    VF_WRITETHROUGH = 8,				    /* volume: write through */
+    VF_INITED = 0x10,					    /* unit has been initialized */
+    VF_WLABEL = 0x20,					    /* label area is writable */
+    VF_LABELLING = 0x40,				    /* unit is currently being labelled */
+    VF_WANTED = 0x80,					    /* someone is waiting to obtain a lock */
+    VF_RAW = 0x100,					    /* raw volume (no file system) */
+    VF_LOADED = 0x200,					    /* module is loaded */
+    VF_CONFIGURING = 0x400,				    /* somebody is changing the config */
+    VF_WILL_CONFIGURE = 0x800,				    /* somebody wants to change the config */
+    VF_CONFIG_INCOMPLETE = 0x1000,			    /* haven't finished changing the config */
+    VF_CONFIG_SETUPSTATE = 0x2000,			    /* set a volume up if all plexes are empty */
+    VF_READING_CONFIG = 0x4000,				    /* we're reading config database from disk */
+    VF_FORCECONFIG = 0x8000,				    /* configure drives even with different names */
+    VF_NEWBORN = 0x10000,				    /* for objects: we've just created it */
+    VF_CONFIGURED = 0x20000,				    /* for drives: we read the config */
+    VF_STOPPING = 0x40000,				    /* for vinum_conf: stop on last close */
+    VF_DAEMONOPEN = 0x80000,				    /* the daemon has us open (only superdev) */
+    VF_CREATED = 0x100000,				    /* for volumes: freshly created, more than new */
+    VF_HOTSPARE = 0x200000,				    /* for drives: use as hot spare */
+    VF_RETRYERRORS = 0x400000,				    /* don't down subdisks on I/O errors */
+    VF_HASDEBUG = 0x800000,				    /* set if we support debug */
+};
+
+#endif
+
+/*
+ * Global configuration information for the vinum subsystem.
+ *
+ * The kernel version (struct _vinum_conf) and the userland version
+ * (struct __vinum_conf) differ in the types of the object pointers
+ * and in the debug fields at the end, which only the kernel has.
+ */
+#ifdef _KERNEL
+struct _vinum_conf
+#else
+struct __vinum_conf
+#endif
+{
+    int version;					    /* version of structures */
+#ifdef _KERNEL
+    /* Pointers to vinum structures */
+    struct drive *drive;
+    struct sd *sd;
+    struct plex *plex;
+    struct volume *volume;
+#else
+    /* Pointers to vinum structures */
+    struct _drive *drive;
+    struct _sd *sd;
+    struct _plex *plex;
+    struct _volume *volume;
+#endif
+
+    /* the number allocated of each object */
+    int drives_allocated;
+    int subdisks_allocated;
+    int plexes_allocated;
+    int volumes_allocated;
+
+    /* and the number currently in use */
+    /*
+     * Note that drives_used is not valid during drive recognition
+     * (vinum_scandisk and friends).  Many invalid drives are added and
+     * later removed; the count isn't correct until we leave
+     * vinum_scandisk.
+     */
+    int drives_used;
+    int subdisks_used;
+    int plexes_used;
+    int volumes_used;
+
+    int flags;						    /* see above */
+
+#define VINUM_MAXACTIVE  30000				    /* maximum number of active requests */
+    int active;						    /* current number of requests outstanding */
+    int maxactive;					    /* maximum number of requests ever outstanding */
+#ifdef _KERNEL
+#ifdef VINUMDEBUG
+    /* Debug only; presumably the most recent request and buffer header -- confirm */
+    struct request *lastrq;
+    struct buf *lastbuf;
+#endif
+#endif
+};
+
+/* Use these defines to simplify code */
+#define DRIVE vinum_conf.drive
+#define SD vinum_conf.sd
+#define PLEX vinum_conf.plex
+#define VOL vinum_conf.volume
+#define VFLAGS vinum_conf.flags
+
+/*
+ * A drive corresponds to a disk slice.  We use a different term to show
+ * the difference in usage: it doesn't have to be a slice, and could
+ * theoretically be a complete, unpartitioned disk
+ */
+
+/*
+ * The kernel version is struct drive, the userland version struct
+ * _drive.  The fields inside #ifdef _KERNEL at the end exist only in
+ * the kernel version.
+ */
+#ifdef _KERNEL
+struct drive
+#else
+struct _drive
+#endif
+{
+    char devicename[MAXDRIVENAME];			    /* name of the slice it's on */
+    struct vinum_label label;				    /* and the label information */
+    enum drivestate state;				    /* current state */
+    int flags;						    /* flags */
+    int subdisks_allocated;				    /* number of entries in sd */
+    int subdisks_used;					    /* and the number used */
+    int blocksize;					    /* size of fs blocks */
+    int pid;						    /* of locker */
+    u_int64_t sectors_available;			    /* number of sectors still available */
+    int secsperblock;					    /* presumably sectors per fs block -- confirm */
+    int lasterror;					    /* last error on drive */
+    int driveno;					    /* index of drive in vinum_conf */
+    int opencount;					    /* number of up subdisks */
+    u_int64_t reads;					    /* number of reads on this drive */
+    u_int64_t writes;					    /* number of writes on this drive */
+    u_int64_t bytes_read;				    /* number of bytes read */
+    u_int64_t bytes_written;				    /* number of bytes written */
+#define DRIVE_MAXACTIVE  30000				    /* maximum number of active requests */
+    int active;						    /* current number of requests outstanding */
+    int maxactive;					    /* maximum number of requests ever outstanding */
+    int freelist_size;					    /* number of entries alloced in free list */
+    int freelist_entries;				    /* number of entries used in free list */
+    struct drive_freelist *freelist;			    /* sorted list of free space on drive */
+#ifdef _KERNEL
+    u_int sectorsize;					    /* presumably sector size of the device -- confirm */
+    off_t mediasize;					    /* presumably size of the device in bytes -- confirm */
+    dev_t dev;						    /* device information */
+#ifdef VINUMDEBUG
+    char lockfilename[16];				    /* name of file from which we were locked */
+    int lockline;					    /* and the line number */
+#endif
+#endif
+};
+
+/*
+ * A subdisk: a range of sectors (driveoffset, sectors) on a drive,
+ * which may belong to a plex.  The kernel version is struct sd, the
+ * userland version struct _sd.  The fields inside #ifdef _KERNEL at
+ * the end exist only in the kernel version.
+ */
+#ifdef _KERNEL
+struct sd
+#else
+struct _sd
+#endif
+{
+    char name[MAXSDNAME];				    /* name of subdisk */
+    enum sdstate state;					    /* state */
+    int flags;						    /* presumably enum objflags values -- confirm */
+    int lasterror;					    /* last error occurred */
+    /* offsets in blocks */
+    int64_t driveoffset;				    /* offset on drive */
+    /*
+     * plexoffset is the offset from the beginning
+     * of the plex to the very first part of the
+     * subdisk, in sectors.  For striped, RAID-4 and
+     * RAID-5 plexes, only the first stripe is
+     * located at this offset
+     */
+    int64_t plexoffset;					    /* offset in plex */
+    u_int64_t sectors;					    /* and length in sectors */
+    int sectorsize;					    /* sector size for DIOCGSECTORSIZE */
+    int plexno;						    /* index of plex, if it belongs */
+    int driveno;					    /* index of the drive on which it is located */
+    int sdno;						    /* our index in vinum_conf */
+    int plexsdno;					    /* and our number in our plex */
+    /* (undefined if no plex) */
+    u_int64_t reads;					    /* number of reads on this subdisk */
+    u_int64_t writes;					    /* number of writes on this subdisk */
+    u_int64_t bytes_read;				    /* number of bytes read */
+    u_int64_t bytes_written;				    /* number of bytes written */
+    /* revive parameters */
+    u_int64_t revived;					    /* block number of current revive request */
+    int revive_blocksize;				    /* revive block size (bytes) */
+    int revive_interval;				    /* and time to wait between transfers */
+    pid_t reviver;					    /* PID of reviving process */
+    /* init parameters */
+    u_int64_t initialized;				    /* block number of current init request */
+    int init_blocksize;					    /* init block size (bytes) */
+    int init_interval;					    /* and time to wait between transfers */
+#ifdef _KERNEL
+    struct request *waitlist;				    /* list of requests waiting on revive op */
+    dev_t dev;						    /* associated device */
+#endif
+};
+
+/*
+ * A plex: a collection of subdisks (sdnos) with a given organization,
+ * which may belong to a volume.  The kernel version is struct plex,
+ * the userland version struct _plex.  The fields inside #ifdef
+ * _KERNEL at the end exist only in the kernel version.
+ */
+#ifdef _KERNEL
+struct plex
+#else
+struct _plex
+#endif
+{
+    enum plexorg organization;				    /* Plex organization */
+    enum plexstate state;				    /* and current state */
+    u_int64_t length;					    /* total length of plex (sectors) */
+    int flags;						    /* presumably enum objflags values -- confirm */
+    int stripesize;					    /* size of stripe or raid band, in sectors */
+    int sectorsize;					    /* sector size for DIOCGSECTORSIZE */
+    int subdisks;					    /* number of associated subdisks */
+    int subdisks_allocated;				    /* number of subdisks allocated space for */
+    int *sdnos;						    /* list of component subdisks */
+    int plexno;						    /* index of plex in vinum_conf */
+    int volno;						    /* index of volume */
+    int volplexno;					    /* number of plex in volume */
+    /* Statistics */
+    u_int64_t reads;					    /* number of reads on this plex */
+    u_int64_t writes;					    /* number of writes on this plex */
+    u_int64_t bytes_read;				    /* number of bytes read */
+    u_int64_t bytes_written;				    /* number of bytes written */
+    u_int64_t recovered_reads;				    /* number of recovered read operations */
+    u_int64_t degraded_writes;				    /* number of degraded writes */
+    u_int64_t parityless_writes;			    /* number of parityless writes */
+    u_int64_t multiblock;				    /* requests that needed more than one block */
+    u_int64_t multistripe;				    /* requests that needed more than one stripe */
+    int sddowncount;					    /* number of subdisks down */
+    /* Lock information */
+    int usedlocks;					    /* number currently in use */
+    int lockwaits;					    /* and number of waits for locks */
+    off_t checkblock;					    /* block number for parity op */
+    char name[MAXPLEXNAME];				    /* name of plex */
+#ifdef _KERNEL
+    struct rangelock *lock;				    /* ranges of locked addresses */
+    struct mtx *lockmtx;				    /* lock mutex, one of plexmutex [] */
+    dev_t dev;						    /* associated device */
+#endif
+};
+
+/*
+ * A volume: up to MAXPLEX plexes, referred to by their index.  The
+ * kernel version is struct volume, the userland version struct
+ * _volume.  The field inside #ifdef _KERNEL at the end exists only in
+ * the kernel version.
+ */
+#ifdef _KERNEL
+struct volume
+#else
+struct _volume
+#endif
+{
+    char name[MAXVOLNAME];				    /* name of volume */
+    enum volumestate state;				    /* current state */
+    int plexes;						    /* number of plexes */
+    int preferred_plex;					    /* index of plex to read from,
+							    * -1 for round-robin */
+    /*
+     * index of plex used for last read, for
+     * round-robin.
+     */
+    int last_plex_read;
+    int volno;						    /* volume number */
+    int flags;						    /* status and configuration flags */
+    int openflags;					    /* flags supplied to last open(2) */
+    u_int64_t size;					    /* size of volume */
+    int blocksize;					    /* logical block size */
+    int sectorsize;					    /* sector size for DIOCGSECTORSIZE */
+    int active;						    /* number of outstanding requests active */
+    int subops;						    /* and the number of suboperations */
+    /* Statistics */
+    u_int64_t bytes_read;				    /* number of bytes read */
+    u_int64_t bytes_written;				    /* number of bytes written */
+    u_int64_t reads;					    /* number of reads on this volume */
+    u_int64_t writes;					    /* number of writes on this volume */
+    u_int64_t recovered_reads;				    /* reads recovered from another plex */
+    /*
+     * Unlike subdisks in the plex, space for the
+     * plex pointers is static.
+     */
+    int plex[MAXPLEX];					    /* index of plexes */
+#ifdef _KERNEL
+    dev_t dev;						    /* associated device */
+#endif
+};
diff --git a/sys/dev/vinum/vinumparser.c b/sys/dev/vinum/vinumparser.c
new file mode 100644
index 0000000..2820ffd
--- /dev/null
+++ b/sys/dev/vinum/vinumparser.c
@@ -0,0 +1,234 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumparser.c,v 1.25 2003/05/07 03:33:28 grog Exp grog $
+ * $FreeBSD$
+ */
+
+/*
+ * This file contains the parser for the configuration routines.  It's used
+ * both in the kernel and in the user interface program, thus the separate file.
+ */
+
+/*
+ * Go through a text and split up into text tokens.  These are either non-blank
+ * sequences, or any sequence (except \0) enclosed in ' or ".  Embedded ' or
+ * " characters may be escaped by \, which otherwise has no special meaning.
+ *
+ * Delimit by following with a \0, and return pointers to the starts at token [].
+ * Return the number of tokens found as the return value.
+ *
+ * This method has the restriction that a closing " or ' must be followed by
+ * grey space.
+ *
+ * Error conditions are end of line before end of quote, or no space after
+ * a closing quote.  In this case, tokenize() returns -1.
+ */
+
+#include <sys/param.h>
+#include <dev/vinum/vinumkw.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <machine/setjmp.h>
+/* All this mess for a single struct definition */
+#include <sys/uio.h>
+#include <sys/namei.h>
+#include <sys/mount.h>
+
+#include <dev/vinum/vinumvar.h>
+#include <dev/vinum/vinumio.h>
+#include <dev/vinum/vinumext.h>
+#define iswhite(c) (((c) == ' ') || ((c) == '\t'))	    /* blank or tab; note that c is evaluated twice */
+#else /* userland */
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#define iswhite(c) isspace((unsigned char) (c))		    /* ctype wants an unsigned char value */
+#endif
+
+/* enum keyword is defined in vinumvar.h */
+
+#define keypair(x) { #x, kw_##x }			    /* expands to { "foo", kw_foo } */
+#define flagkeypair(x) { "-" #x, kw_##x }		    /* expands to { "-foo", kw_foo } */
+#define KEYWORDSET(x) { sizeof (x) / sizeof ((x)[0]), (x) } /* { number of entries, table } */
+
+/*
+ * Normal keywords: all the words that vinum knows, each listed once.
+ */
+struct _keywords keywords[] =
+{keypair(drive),
+    keypair(partition),
+    keypair(sd),
+    keypair(subdisk),
+    keypair(plex),
+    keypair(volume),
+    keypair(vol),
+    keypair(setupstate),
+    keypair(readpol),
+    keypair(org),
+    keypair(name),
+    keypair(writethrough),
+    keypair(writeback),
+    keypair(device),
+    keypair(concat),
+    keypair(raid4),
+    keypair(raid5),
+    keypair(striped),
+    keypair(plexoffset),
+    keypair(driveoffset),
+    keypair(length),
+    keypair(len),
+    keypair(size),
+    keypair(state),
+    keypair(round),
+    keypair(prefer),
+    keypair(preferred),
+    keypair(rename),
+    keypair(detached),
+#ifndef _KERNEL						    /* for vinum(8) only */
+    keypair(debug),
+    keypair(stripe),
+    keypair(mirror),
+#endif
+    keypair(attach),
+    keypair(detach),
+    keypair(printconfig),
+    keypair(saveconfig),
+    keypair(replace),
+    keypair(create),
+    keypair(read),
+    keypair(modify),
+    keypair(list),
+    keypair(l),
+    keypair(ld),
+    keypair(ls),
+    keypair(lp),
+    keypair(lv),
+    keypair(info),
+    keypair(set),
+    keypair(rm),
+    keypair(mv),
+    keypair(move),
+    keypair(init),
+    keypair(resetconfig),
+    keypair(start),
+    keypair(stop),
+    keypair(makedev),
+    keypair(help),
+    keypair(quit),
+    keypair(setdaemon),
+    keypair(getdaemon),
+    keypair(max),
+    keypair(resetstats),
+    keypair(setstate),
+    keypair(checkparity),
+    keypair(rebuildparity),
+    keypair(dumpconfig),
+    keypair(retryerrors)
+};
+struct keywordset keyword_set = KEYWORDSET(keywords);
+
+#ifndef _KERNEL
+/*
+ * Flag keywords ("-f", "-d", "-v", "-s", "-r", "-w").  The table
+ * is only built outside the kernel.
+ */
+struct _keywords flag_keywords[] = {
+    flagkeypair(f), flagkeypair(d), flagkeypair(v),
+    flagkeypair(s), flagkeypair(r), flagkeypair(w)
+};
+
+struct keywordset flag_set = KEYWORDSET(flag_keywords);
+#endif /* !_KERNEL */
+
+/*
+ * Take a blank separated list of tokens and turn it into a list of
+ * individual nul-delimited strings.  Build a list of pointers at
+ * token, which must have room for maxtoken entries.  A quoted token
+ * keeps its quotes.  Return the number of tokens, or -1 on error
+ * (unterminated quote, or no white space after a closing quote).
+ */
+int
+tokenize(char *cptr, char *token[], int maxtoken)
+{
+    char delim;						    /* delimiter for searching for the partner */
+    int tokennr;					    /* index of this token */
+
+    for (tokennr = 0; tokennr < maxtoken;) {
+	while (iswhite(*cptr))
+	    cptr++;					    /* skip initial white space */
+	if ((*cptr == '\0') || (*cptr == '\n') || (*cptr == '#')) /* end of line */
+	    return tokennr;				    /* return number of tokens found */
+	delim = *cptr;
+	token[tokennr++] = cptr;			    /* point to it and count it */
+	if (tokennr == maxtoken)			    /* table full: the last token is left undelimited */
+	    return tokennr;
+	if ((delim == '\'') || (delim == '"')) {	    /* delimitered */
+	    for (;;) {
+		cptr++;
+		if ((*cptr == delim) && (cptr[-1] != '\\')) { /* found the partner */
+		    cptr++;				    /* move on past */
+		    if (!iswhite(*cptr))		    /* error, no space after closing quote */
+			return -1;
+		    *cptr++ = '\0';			    /* delimit */
+		    break;				    /* token complete: stop scanning, go on to the next */
+		} else if ((*cptr == '\0') || (*cptr == '\n')) /* end of line */
+		    return -1;
+	    }
+	} else {					    /* not quoted */
+	    while ((*cptr != '\0') && (!iswhite(*cptr)) && (*cptr != '\n'))
+		cptr++;
+	    if (*cptr != '\0')				    /* not end of the line, */
+		*cptr++ = '\0';				    /* delimit and move to the next */
+	}
+    }
+    return maxtoken;					    /* only reached if maxtoken <= 0 */
+}
+
+/* Map name to its keyword in keywordset; kw_invalid_keyword if NULL or unknown */
+enum keyword
+get_keyword(char *name, struct keywordset *keywordset)
+{
+    struct _keywords *table = keywordset->k;		    /* table to search */
+    int idx = 0;
+    if (name == NULL)					    /* no parameter supplied */
+	return kw_invalid_keyword;
+    for (; idx < keywordset->size; idx++)
+	if (strcmp(name, table[idx].name) == 0)		    /* first match wins */
+	    return (enum keyword) table[idx].keyword;
+    return kw_invalid_keyword;
+}
diff --git a/sys/dev/vinum/vinumraid5.c b/sys/dev/vinum/vinumraid5.c
new file mode 100644
index 0000000..73b024f
--- /dev/null
+++ b/sys/dev/vinum/vinumraid5.c
@@ -0,0 +1,698 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Cybernet Corporation and Nan Yang Computer Services Limited.
+ *      All rights reserved.
+ *
+ *  This software was developed as part of the NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Cybernet Corporation
+ *      and Nan Yang Computer Services Limited
+ * 4. Neither the name of the Companies nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumraid5.c,v 1.23 2003/02/08 03:32:45 grog Exp $
+ * $FreeBSD$
+ */
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+#include <sys/resourcevar.h>
+
+/*
+ * Parameters which describe the current transfer.
+ * These are only used for calculation, but they
+ * need to be passed to other functions, so it's
+ * tidier to put them in a struct
+ */
+struct metrics {
+    daddr_t stripebase;					    /* base address of stripe (1st subdisk) */
+    int stripeoffset;					    /* offset in stripe */
+    int stripesectors;					    /* total sectors to transfer in this stripe */
+    daddr_t sdbase;					    /* offset in subdisk of stripe base */
+    int sdcount;					    /* number of disks involved in this transfer */
+    daddr_t diskstart;					    /* remember where this transfer starts */
+    int psdno;						    /* number of parity subdisk */
+    int badsdno;					    /* number of down subdisk, if there is one */
+    int firstsdno;					    /* first data subdisk number */
+    /* These correspond to the fields in rqelement, sort of */
+    int useroffset;					    /* offset of the current block in the user buffer */
+    /*
+     * Offset and length of the first data block.  initlen is an
+     * int, like the other lengths, so it is not cut to 16 bits.
+     */
+    int initoffset;					    /* start address of first block to transfer */
+    int initlen;					    /* length in sectors of first data transfer */
+    /* Define a normal operation */
+    int dataoffset;					    /* start address of block to transfer */
+    int datalen;					    /* length in sectors of data transfer */
+    /* Define a group operation */
+    int groupoffset;					    /* subdisk offset of group operation */
+    int grouplen;					    /* length in sectors of group operation */
+    /* Define a normal write operation */
+    int writeoffset;					    /* subdisk offset of normal write */
+    int writelen;					    /* length in sectors of write operation */
+    enum xferinfo flags;				    /* to check what we're doing */
+    int rqcount;					    /* number of elements in request */
+};
+
+enum requeststatus bre5(struct request *rq,
+    int plexno,
+    daddr_t *diskaddr,					    /* in/out; named as in the definition */
+    daddr_t diskend);
+void complete_raid5_write(struct rqelement *);
+enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
+void setrqebounds(struct rqelement *rqe, struct metrics *mp);
+
+/*
+ * define the low-level requests needed to perform
+ * a high-level I/O operation for a specific plex
+ * 'plexno'.
+ *
+ * Return REQUEST_OK on success, REQUEST_EOF if
+ * the address is beyond the end of the plex,
+ * REQUEST_ENOMEM if a group or buffer can't be
+ * built, or a failure status if a second subdisk
+ * is unavailable.
+ *
+ * Advance *diskaddr past the sectors handled.  On
+ * read, return REQUEST_DOWN on the second bad subdisk,
+ * so the caller (build_read_request) can try alternatives.
+ *
+ * On entry to this routine, the prq structures
+ * are not assigned.  The assignment is performed
+ * by expandrq().  Strictly speaking, the elements
+ * rqe->sdno of all entries should be set to -1,
+ * since 0 (from bzero) is a valid subdisk number.
+ * We avoid this problem by initializing the ones
+ * we use, and not looking at the others (index >=
+ * prq->requests).
+ */
+enum requeststatus
+bre5(struct request *rq,				    /* request being built; rq->bp is the user's buffer */
+    int plexno,						    /* index into PLEX [] */
+    daddr_t * diskaddr,					    /* in/out: current address, advanced past what is built */
+    daddr_t diskend)					    /* address at which the transfer ends */
+{
+    struct metrics m;					    /* most of the information */
+    struct sd *sd;
+    struct plex *plex;
+    struct buf *bp;					    /* user's bp */
+    struct rqgroup *rqg;				    /* the request group that we will create */
+    struct rqelement *rqe;				    /* point to this request information */
+    int rsectors;					    /* sectors remaining in this stripe */
+    int mysdno;						    /* another sd index in loops */
+    int rqno;						    /* request number */
+
+    rqg = NULL;						    /* keep the compiler quiet; assigned in the loop */
+    m.diskstart = *diskaddr;				    /* start of transfer */
+    bp = rq->bp;					    /* buffer pointer */
+    plex = &PLEX[plexno];				    /* point to the plex */
+
+
+    while (*diskaddr < diskend) {			    /* until we get it all sorted out */
+	if (*diskaddr >= plex->length)			    /* beyond the end of the plex */
+	    return REQUEST_EOF;				    /* can't continue */
+
+	m.badsdno = -1;					    /* no bad subdisk yet */
+
+	/* Part A: Define the request */
+	/*
+	 * First, calculate some sizes:
+	 * The offset of the start address from
+	 * the start of the stripe.
+	 */
+	m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1)); /* NOTE(review): divisor is 0 unless subdisks > 1 */
+
+	/*
+	 * The plex-relative address of the
+	 * start of the stripe.
+	 */
+	m.stripebase = *diskaddr - m.stripeoffset;
+
+	/* subdisk containing the parity stripe */
+	if (plex->organization == plex_raid5)
+	    m.psdno = plex->subdisks - 1
+		- (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
+		% plex->subdisks;
+	else						    /* RAID-4 */
+	    m.psdno = plex->subdisks - 1;
+
+	/*
+	 * The number of the subdisk in which
+	 * the start is located.
+	 */
+	m.firstsdno = m.stripeoffset / plex->stripesize;
+	if (m.firstsdno >= m.psdno)			    /* at or past parity sd */
+	    m.firstsdno++;				    /* increment it */
+
+	/*
+	 * The offset from the beginning of
+	 * the stripe on this subdisk.
+	 */
+	m.initoffset = m.stripeoffset % plex->stripesize;
+
+	/* The offset of the stripe start relative to this subdisk */
+	m.sdbase = m.stripebase / (plex->subdisks - 1);
+
+	m.useroffset = *diskaddr - m.diskstart;		    /* The offset of the start in the user buffer */
+
+	/*
+	 * The number of sectors to transfer in the
+	 * current (first) subdisk.
+	 */
+	m.initlen = min(diskend - *diskaddr,		    /* the amount remaining to transfer */
+	    plex->stripesize - m.initoffset);		    /* and the amount left in this block */
+
+	/*
+	 * The number of sectors to transfer in this stripe
+	 * is the minimum of the amount remaining to transfer
+	 * and the amount left in this stripe.
+	 */
+	m.stripesectors = min(diskend - *diskaddr,
+	    plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);
+
+	/* The number of data subdisks involved in this request */
+	m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
+
+	/* Part B: decide what kind of transfer this will be.
+
+	 * start and end addresses of the transfer in
+	 * the current block.
+	 *
+	 * There are a number of different kinds of
+	 * transfer, each of which relates to a
+	 * specific subdisk:
+	 *
+	 * 1. Normal read.  All participating subdisks
+	 *    are up, and the transfer can be made
+	 *    directly to the user buffer.  The bounds
+	 *    of the transfer are described by
+	 *    m.dataoffset and m.datalen.  We have
+	 *    already calculated m.initoffset and
+	 *    m.initlen, which define the parameters
+	 *    for the first data block.
+	 *
+	 * 2. Recovery read.  One participating
+	 *    subdisk is down.  To recover data, all
+	 *    the other subdisks, including the parity
+	 *    subdisk, must be read.  The data is
+	 *    recovered by exclusive-oring all the
+	 *    other blocks.  The bounds of the
+	 *    transfer are described by m.groupoffset
+	 *    and m.grouplen.
+	 *
+	 * 3. A read request may request reading both
+	 *    available data (normal read) and
+	 *    non-available data (recovery read).
+	 *    This can be a problem if the address
+	 *    ranges of the two reads do not coincide:
+	 *    in this case, the normal read needs to
+	 *    be extended to cover the address range
+	 *    of the recovery read, and must thus be
+	 *    performed out of malloced memory.
+	 *
+	 * 4. Normal write.  All the participating
+	 *    subdisks are up.  The bounds of the
+	 *    transfer are described by m.dataoffset
+	 *    and m.datalen.  Since these values
+	 *    differ for each block, we calculate the
+	 *    bounds for the parity block
+	 *    independently as the maximum of the
+	 *    individual blocks and store these values
+	 *    in m.writeoffset and m.writelen.  This
+	 *    write proceeds in four phases:
+	 *
+	 *    i.  Read the old contents of each block
+	 *        and the parity block.
+	 *    ii.  ``Remove'' the old contents from
+	 *         the parity block with exclusive or.
+	 *    iii. ``Insert'' the new contents of the
+	 *          block in the parity block, again
+	 *          with exclusive or.
+	 *
+	 *    iv.  Write the new contents of the data
+	 *         blocks and the parity block.  The data
+	 *         block transfers can be made directly from
+	 *         the user buffer.
+	 *
+	 * 5. Degraded write where the data block is
+	 *    not available.  The bounds of the
+	 *    transfer are described by m.groupoffset
+	 *    and m.grouplen. This requires the
+	 *    following steps:
+	 *
+	 *    i.  Read in all the other data blocks,
+	 *        excluding the parity block.
+	 *
+	 *    ii.  Recreate the parity block from the
+	 *         other data blocks and the data to be
+	 *         written.
+	 *
+	 *    iii. Write the parity block.
+	 *
+	 * 6. Parityless write, a write where the
+	 *    parity block is not available.  This is
+	 *    in fact the simplest: just write the
+	 *    data blocks.  This can proceed directly
+	 *    from the user buffer.  The bounds of the
+	 *    transfer are described by m.dataoffset
+	 *    and m.datalen.
+	 *
+	 * 7. Combination of degraded data block write
+	 *    and normal write.  In this case the
+	 *    address ranges of the reads may also
+	 *    need to be extended to cover all
+	 *    participating blocks.
+	 *
+	 * All requests in a group transfer transfer
+	 * the same address range relative to their
+	 * subdisk.  The individual transfers may
+	 * vary, but since our group of requests is
+	 * all in a single slice, we can define a
+	 * range in which they all fall.
+	 *
+	 * In the following code section, we determine
+	 * which kind of transfer we will perform.  If
+	 * there is a group transfer, we also decide
+	 * its bounds relative to the subdisks.  At
+	 * the end, we have the following values:
+	 *
+	 *  m.flags indicates the kinds of transfers
+	 *    we will perform.
+	 *  m.initoffset indicates the offset of the
+	 *    beginning of any data operation relative
+	 *    to the beginning of the stripe base.
+	 *  m.initlen specifies the length of any data
+	 *    operation.
+	 *  m.dataoffset contains the same value as
+	 *    m.initoffset.
+	 *  m.datalen contains the same value as
+	 *    m.initlen.  Initially dataoffset and
+	 *    datalen describe the parameters for the
+	 *    first data block; while building the data
+	 *    block requests, they are updated for each
+	 *    block.
+	 *  m.groupoffset indicates the offset of any
+	 *    group operation relative to the beginning
+	 *    of the stripe base.
+	 *  m.grouplen specifies the length of any
+	 *    group operation.
+	 *  m.writeoffset indicates the offset of a
+	 *    normal write relative to the beginning of
+	 *    the stripe base.  This value differs from
+	 *    m.dataoffset in that it applies to the
+	 *    entire operation, and not just the first
+	 *    block.
+	 *  m.writelen specifies the total span of a
+	 *    normal write operation.  writeoffset and
+	 *    writelen are used to define the parity
+	 *    block.
+	 */
+	m.groupoffset = 0;				    /* assume no group... */
+	m.grouplen = 0;					    /* until we know we have one */
+	m.writeoffset = m.initoffset;			    /* start offset of transfer */
+	m.writelen = 0;					    /* nothing to write yet */
+	m.flags = 0;					    /* no flags yet */
+	rsectors = m.stripesectors;			    /* remaining sectors to examine */
+	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
+	m.datalen = m.initlen;
+
+	if (m.sdcount > 1) {
+	    plex->multiblock++;				    /* more than one block for the request */
+	    /*
+	     * If we have two transfers that don't overlap,
+	     * (one at the end of the first block, the other
+	     * at the beginning of the second block),
+	     * it's cheaper to split them.
+	     */
+	    if (rsectors < plex->stripesize) {
+		m.sdcount = 1;				    /* just one subdisk */
+		m.stripesectors = m.initlen;		    /* and just this many sectors */
+		rsectors = m.initlen;			    /* and in the loop counter */
+	    }
+	}
+	if (SD[plex->sdnos[m.psdno]].state < sd_reborn)	    /* is our parity subdisk down? */
+	    m.badsdno = m.psdno;			    /* note that it's down */
+	if (bp->b_iocmd == BIO_READ) {			    /* read operation */
+	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
+		if (mysdno == m.psdno)			    /* ignore parity on read */
+		    mysdno++;
+		if (mysdno == plex->subdisks)		    /* wraparound */
+		    mysdno = 0;
+		if (mysdno == m.psdno)			    /* parity, */
+		    mysdno++;				    /* we've given already */
+
+		if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
+		    if (m.badsdno >= 0)			    /* we had one already, */
+			return REQUEST_DOWN;		    /* we can't take a second */
+		    m.badsdno = mysdno;			    /* got the first */
+		    m.groupoffset = m.dataoffset;	    /* define the bounds */
+		    m.grouplen = m.datalen;
+		    m.flags |= XFR_RECOVERY_READ;	    /* we need recovery */
+		    plex->recovered_reads++;		    /* count another one */
+		} else
+		    m.flags |= XFR_NORMAL_READ;		    /* normal read */
+
+		/* Update the pointers for the next block */
+		m.dataoffset = 0;			    /* back to the start of the stripe */
+		rsectors -= m.datalen;			    /* remaining sectors to examine */
+		m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
+	    }
+	} else {					    /* write operation */
+	    for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
+		if (mysdno == m.psdno)			    /* parity stripe, we've dealt with that */
+		    mysdno++;
+		if (mysdno == plex->subdisks)		    /* wraparound */
+		    mysdno = 0;
+		if (mysdno == m.psdno)			    /* parity, */
+		    mysdno++;				    /* we've given already */
+
+		sd = &SD[plex->sdnos[mysdno]];
+		if (sd->state != sd_up) {
+		    enum requeststatus s;
+
+		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+		    if (s && (m.badsdno >= 0)) {	    /* second bad disk, */
+			int sdno;
+			/*
+			 * If the parity disk is down, there's
+			 * no recovery.  We make all involved
+			 * subdisks stale.  Otherwise, we
+			 * should be able to recover, but it's
+			 * like pulling teeth.  Fix it later.
+			 */
+			for (sdno = 0; sdno < m.sdcount; sdno++) { /* NOTE(review): walks sdnos[0..sdcount-1], not from m.firstsdno -- confirm */
+			    struct sd *sd = &SD[plex->sdnos[sdno]]; /* shadows the outer sd */
+			    if (sd->state >= sd_reborn)	    /* sort of up, */
+				set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
+			}
+			return s;			    /* and crap out */
+		    }
+		    m.badsdno = mysdno;			    /* note which one is bad */
+		    m.flags |= XFR_DEGRADED_WRITE;	    /* we need recovery */
+		    plex->degraded_writes++;		    /* count another one */
+		    m.groupoffset = m.dataoffset;	    /* define the bounds */
+		    m.grouplen = m.datalen;
+		} else {
+		    m.flags |= XFR_NORMAL_WRITE;	    /* normal write operation */
+		    if (m.writeoffset > m.dataoffset) {	    /* move write operation lower */
+			m.writelen = max(m.writeoffset + m.writelen,
+			    m.dataoffset + m.datalen)
+			    - m.dataoffset;
+			m.writeoffset = m.dataoffset;
+		    } else
+			m.writelen = max(m.writeoffset + m.writelen,
+			    m.dataoffset + m.datalen)
+			    - m.writeoffset;
+		}
+
+		/* Update the pointers for the next block */
+		m.dataoffset = 0;			    /* back to the start of the stripe */
+		rsectors -= m.datalen;			    /* remaining sectors to examine */
+		m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
+	    }
+	    if (m.badsdno == m.psdno) {			    /* got a bad parity block, */
+		struct sd *psd = &SD[plex->sdnos[m.psdno]];
+
+		if (psd->state == sd_down)
+		    set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
+		else if (psd->state == sd_crashed)
+		    set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
+		m.flags &= ~XFR_NORMAL_WRITE;		    /* this write isn't normal, */
+		m.flags |= XFR_PARITYLESS_WRITE;	    /* it's parityless */
+		plex->parityless_writes++;		    /* count another one */
+	    }
+	}
+
+	/* reset the initial transfer values */
+	m.dataoffset = m.initoffset;			    /* start at the beginning of the transfer */
+	m.datalen = m.initlen;
+
+	/* decide how many requests we need */
+	if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
+	    /* doing a recovery read or degraded write, */
+	    m.rqcount = plex->subdisks;			    /* all subdisks */
+	else if (m.flags & XFR_NORMAL_WRITE)		    /* normal write, */
+	    m.rqcount = m.sdcount + 1;			    /* all data blocks and the parity block */
+	else						    /* parityless write or normal read */
+	    m.rqcount = m.sdcount;			    /* just the data blocks */
+
+	/* Part C: build the requests */
+	rqg = allocrqg(rq, m.rqcount);			    /* get a request group */
+	if (rqg == NULL) {				    /* malloc failed */
+	    bp->b_error = ENOMEM;
+	    bp->b_ioflags |= BIO_ERROR;
+	    return REQUEST_ENOMEM;
+	}
+	rqg->plexno = plexno;
+	rqg->flags = m.flags;
+	rqno = 0;					    /* index in the request group */
+
+	/* 1: PARITY BLOCK */
+	/*
+	 * Are we performing an operation which requires parity?  In that case,
+	 * work out the parameters and define the parity block.
+	 * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
+	 */
+	if (m.flags & XFR_PARITYOP) {			    /* need parity */
+	    rqe = &rqg->rqe[rqno];			    /* point to element */
+	    sd = &SD[plex->sdnos[m.psdno]];		    /* the subdisk in question */
+	    rqe->rqg = rqg;				    /* point back to group */
+	    rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
+	    &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE);	    /* transfer flags without data op stuff */
+	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
+	    rqe->sdno = sd->sdno;			    /* subdisk number */
+	    rqe->driveno = sd->driveno;
+	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
+		return REQUEST_ENOMEM;			    /* can't do it */
+	    rqe->b.b_iocmd = BIO_READ;			    /* we must read first */
+	    m.sdcount++;				    /* adjust the subdisk count */
+	    rqno++;					    /* and point to the next request */
+	}
+	/*
+	 * 2: DATA BLOCKS
+	 * Now build up requests for the blocks required
+	 * for individual transfers
+	 */
+	for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
+	    if (mysdno == m.psdno)			    /* parity, */
+		mysdno++;				    /* we've given already */
+	    if (mysdno == plex->subdisks)		    /* got to the end, */
+		mysdno = 0;				    /* wrap around */
+	    if (mysdno == m.psdno)			    /* parity, */
+		mysdno++;				    /* we've given already */
+
+	    rqe = &rqg->rqe[rqno];			    /* point to element */
+	    sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
+	    rqe->rqg = rqg;				    /* point to group */
+	    if (m.flags & XFR_NEEDS_MALLOC)		    /* we need a malloced buffer first */
+		rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
+	    else
+		rqe->flags = m.flags | XFR_DATA_BLOCK;	    /* transfer flags */
+	    if (mysdno == m.badsdno) {			    /* this is the bad subdisk */
+		rqg->badsdno = rqno;			    /* note which one */
+		rqe->flags |= XFR_BAD_SUBDISK;		    /* note that it's dead */
+		/*
+		 * we can't read or write from/to it,
+		 * but we don't need to malloc
+		 */
+		rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
+	    }
+	    setrqebounds(rqe, &m);			    /* set up the bounds of the transfer */
+	    rqe->useroffset = m.useroffset;		    /* offset in user buffer */
+	    rqe->sdno = sd->sdno;			    /* subdisk number */
+	    rqe->driveno = sd->driveno;
+	    if (build_rq_buffer(rqe, plex))		    /* build the buffer */
+		return REQUEST_ENOMEM;			    /* can't do it */
+	    if ((m.flags & XFR_PARITYOP)		    /* parity operation, */
+	    &&((m.flags & XFR_BAD_SUBDISK) == 0))	    /* and not the bad subdisk; NOTE(review): bit is only set in rqe->flags here -- confirm */
+		rqe->b.b_iocmd = BIO_READ;		    /* we must read first */
+
+	    /* Now update pointers for the next block */
+	    *diskaddr += m.datalen;			    /* skip past what we've done */
+	    m.stripesectors -= m.datalen;		    /* deduct from what's left */
+	    m.useroffset += m.datalen;			    /* and move on in the user buffer */
+	    m.datalen = min(m.stripesectors, plex->stripesize);	/* and recalculate */
+	    m.dataoffset = 0;				    /* start at the beginning of next block */
+	}
+
+	/*
+	 * 3: REMAINING BLOCKS FOR RECOVERY
+	 * Finally, if we have a recovery operation, build
+	 * up transfers for the other subdisks.  Follow the
+	 * subdisks around until we get to where we started.
+	 * These requests use only the group parameters.
+	 */
+	if ((rqno < m.rqcount)				    /* haven't done them all already */
+	&&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
+	    for (; rqno < m.rqcount; rqno++, mysdno++) {
+		if (mysdno == m.psdno)			    /* parity, */
+		    mysdno++;				    /* we've given already */
+		if (mysdno == plex->subdisks)		    /* got to the end, */
+		    mysdno = 0;				    /* wrap around */
+		if (mysdno == m.psdno)			    /* parity, */
+		    mysdno++;				    /* we've given already */
+
+		rqe = &rqg->rqe[rqno];			    /* point to element */
+		sd = &SD[plex->sdnos[mysdno]];		    /* the subdisk in question */
+		rqe->rqg = rqg;				    /* point to group */
+
+		rqe->sdoffset = m.sdbase + m.groupoffset;   /* start of transfer */
+		rqe->dataoffset = 0;			    /* for tidiness' sake */
+		rqe->groupoffset = 0;			    /* group starts at the beginning */
+		rqe->datalen = 0;
+		rqe->grouplen = m.grouplen;
+		rqe->buflen = m.grouplen;
+		rqe->flags = (m.flags | XFR_MALLOCED)	    /* transfer flags without data op stuff */
+		&~XFR_DATAOP;
+		rqe->sdno = sd->sdno;			    /* subdisk number */
+		rqe->driveno = sd->driveno;
+		if (build_rq_buffer(rqe, plex))		    /* build the buffer */
+		    return REQUEST_ENOMEM;		    /* can't do it */
+		rqe->b.b_iocmd = BIO_READ;		    /* we must read first */
+	    }
+	}
+	/*
+	 * We need to lock the address range before
+	 * doing anything.  We don't have to be
+	 * performing a recovery operation: somebody
+	 * else could be doing so, and the results could
+	 * influence us.  Note the fact here, we'll perform
+	 * the lock in launch_requests.
+	 */
+	rqg->lockbase = m.stripebase;
+	if (*diskaddr < diskend)			    /* didn't finish the request on this stripe */
+	    plex->multistripe++;			    /* count another one */
+    }
+    return REQUEST_OK;
+}
+
+/*
+ * Helper function for rqe5: adjust the bounds of
+ * the transfers to minimize the buffer
+ * allocation.
+ *
+ * Each request can handle two of three different
+ * data ranges:
+ *
+ * 1.  The range described by the parameters
+ *     dataoffset and datalen, for normal read or
+ *     parityless write.
+ * 2.  The range described by the parameters
+ *     groupoffset and grouplen, for recovery read
+ *     and degraded write.
+ * 3.  For normal write, the range depends on the
+ *     kind of block.  For data blocks, the range
+ *     is defined by dataoffset and datalen.  For
+ *     parity blocks, it is defined by writeoffset
+ *     and writelen.
+ *
+ * In order not to allocate more memory than
+ * necessary, this function adjusts the bounds
+ * parameter for each request to cover just the
+ * minimum necessary for the function it performs.
+ * This will normally vary from one request to the
+ * next.
+ *
+ * Things are slightly different for the parity
+ * block.  In this case, the bounds defined by
+ * mp->writeoffset and mp->writelen also play a
+ * role.  This case is selected when rqe->flags has
+ * both XFR_NORMAL_WRITE and XFR_PARITY_BLOCK set.
+ */
+void
+setrqebounds(struct rqelement *rqe, struct metrics *mp)
+{
+    /* Classify the request up front; rqe->flags is only read here. */
+    int parity_write = (rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
+	== (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK);	    /* parity block of a normal write */
+    int data_op = (rqe->flags & XFR_DATAOP) != 0;	    /* request carries individual data */
+
+    if (!parity_write && !data_op) {			    /* group operation only (case 2) */
+	rqe->sdoffset = mp->sdbase + mp->groupoffset;	    /* transfer begins with the group */
+	rqe->groupoffset = 0;
+	rqe->grouplen = mp->grouplen;
+	rqe->dataoffset = 0;				    /* no individual data range */
+	rqe->datalen = 0;
+    } else if (parity_write) {				    /* parity block (case 3) */
+	if (rqe->flags & XFR_DEGRADED_WRITE) {		    /* degraded write as well */
+	    /*
+	     * In the combined case the degraded write area
+	     * is zeroed in the second phase, so there is no
+	     * real need to read it in.  Unfortunately the
+	     * buffer size is currently conveyed by the
+	     * length of the read, so everything gets read,
+	     * including the part that will be nuked.
+	     * FIXME XXX
+	     */
+	    if (mp->writeoffset <= mp->groupoffset) {	    /* write range starts no later than group */
+		rqe->sdoffset = mp->sdbase + mp->writeoffset;
+		rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group lies further in */
+		rqe->dataoffset = 0;
+	    } else {					    /* group range starts first */
+		rqe->sdoffset = mp->sdbase + mp->groupoffset;
+		rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* write data lies further in */
+		rqe->groupoffset = 0;
+	    }
+	    rqe->grouplen = mp->grouplen;
+	    rqe->datalen = mp->writelen;
+	} else {					    /* plain normal write */
+	    rqe->sdoffset = mp->sdbase + mp->writeoffset;
+	    rqe->datalen = mp->writelen;
+	    rqe->dataoffset = 0;
+	    rqe->groupoffset = 0;			    /* no group range */
+	    rqe->grouplen = 0;
+	}
+    } else if (rqe->flags & XFR_GROUPOP) {		    /* data and group ranges (cases 1 and 2) */
+	if (mp->dataoffset <= mp->groupoffset) {	    /* data range starts no later than group */
+	    rqe->sdoffset = mp->sdbase + mp->dataoffset;
+	    rqe->groupoffset = mp->groupoffset - mp->dataoffset;
+	    rqe->dataoffset = 0;
+	} else {					    /* group range starts first */
+	    rqe->sdoffset = mp->sdbase + mp->groupoffset;
+	    rqe->dataoffset = mp->dataoffset - mp->groupoffset;
+	    rqe->groupoffset = 0;
+	}
+	rqe->grouplen = mp->grouplen;
+	rqe->datalen = mp->datalen;
+    } else {						    /* data range only (case 1) */
+	rqe->sdoffset = mp->sdbase + mp->dataoffset;
+	rqe->datalen = mp->datalen;
+	rqe->dataoffset = 0;
+	rqe->groupoffset = 0;				    /* no group range */
+	rqe->grouplen = 0;
+    }
+    /* The buffer must span whichever of the two ranges ends later. */
+    rqe->buflen = max(rqe->dataoffset + rqe->datalen,
+	rqe->groupoffset + rqe->grouplen);
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumrequest.c b/sys/dev/vinum/vinumrequest.c
new file mode 100644
index 0000000..f74fc89
--- /dev/null
+++ b/sys/dev/vinum/vinumrequest.c
@@ -0,0 +1,1112 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *  Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumrequest.c,v 1.36 2003/05/08 04:34:55 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+#include <sys/resourcevar.h>
+
+enum requeststatus bre(struct request *rq,		    /* build low-level requests for one plex */
+    int plexno,
+    daddr_t * diskstart,
+    daddr_t diskend);
+enum requeststatus bre5(struct request *rq,		    /* RAID-4/RAID-5 variant, called from bre */
+    int plexno,
+    daddr_t * diskstart,
+    daddr_t diskend);
+enum requeststatus build_read_request(struct request *rq, int volplexno); /* volume reads */
+enum requeststatus build_write_request(struct request *rq); /* volume writes */
+enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
+int find_alternate_sd(struct request *rq);
+int check_range_covered(struct request *);
+void complete_rqe(struct buf *bp);
+void complete_raid5_write(struct rqelement *);
+int abortrequest(struct request *rq, int error);
+void sdio_done(struct buf *bp);
+int vinum_bounds_check(struct buf *bp, struct volume *vol);
+caddr_t allocdatabuf(struct rqelement *rqe);
+void freedatabuf(struct rqelement *rqe);
+
+#ifdef VINUMDEBUG
+struct rqinfo rqinfo[RQINFO_SIZE];			    /* circular event log */
+struct rqinfo *rqip = rqinfo;				    /* next slot to fill */
+
+/* Record one event of the given type in the rqinfo log, wrapping at the end. */
+void
+logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)
+{
+    int oldspl = splhigh();				    /* restored with splx() on the way out */
+    struct rqinfo *slot = rqip;				    /* the entry we fill in */
+
+    microtime(&slot->timestamp);			    /* time of the event */
+    slot->bp = ubp;					    /* user buffer */
+    slot->type = type;
+    switch (type) {
+    case loginfo_unused:				    /* nothing more to record */
+	break;
+
+    case loginfo_lockwait:
+    case loginfo_lock:
+    case loginfo_unlock:				    /* range lock events */
+	bcopy(info.lockinfo, &slot->info.lockinfo, sizeof(struct rangelock));
+	break;
+
+    case loginfo_iodone:
+    case loginfo_rqe:
+    case loginfo_raid5_data:
+    case loginfo_raid5_parity:				    /* request element events */
+	bcopy(info.rqe, &slot->info.rqe, sizeof(struct rqelement));
+	slot->devmajor = major(info.rqe->b.b_dev);
+	slot->devminor = minor(info.rqe->b.b_dev);
+	break;
+
+    case loginfo_user_bp:
+    case loginfo_user_bpl:
+    case loginfo_sdio:					    /* subdisk I/O */
+    case loginfo_sdiol:					    /* subdisk I/O launch */
+    case loginfo_sdiodone:				    /* subdisk I/O complete */
+	bcopy(info.bp, &slot->info.b, sizeof(struct buf));
+	slot->devmajor = major(info.bp->b_dev);
+	slot->devminor = minor(info.bp->b_dev);
+	break;
+    }
+    if (++rqip >= &rqinfo[RQINFO_SIZE])			    /* ran off the end, */
+	rqip = rqinfo;					    /* start again at the first slot */
+    splx(oldspl);
+}
+
+#endif
+
+void
+vinumstrategy(struct bio *biop)
+{
+    struct buf *bp = (struct buf *) biop;
+    struct volume *vol = NULL;
+    int volno;
+
+    switch (DEVTYPE(bp->b_dev)) {
+    case VINUM_SD_TYPE:
+    case VINUM_SD2_TYPE:				    /* subdisk I/O has its own path */
+	sdio(bp);
+	return;
+
+    case VINUM_VOLUME_TYPE:				    /* volume I/O */
+	volno = Volno(bp->b_dev);
+	vol = &VOL[volno];
+	if (vol->state != volume_up)			    /* can't access this volume */
+	    goto ioerror;
+	if (vinum_bounds_check(bp, vol) <= 0) {		    /* don't like them bounds */
+	    bufdone(bp);
+	    return;
+	}
+	break;
+
+    case VINUM_PLEX_TYPE:				    /* plex I/O: nothing to check here */
+	break;
+
+    default:						    /* not a device type we do I/O to */
+	goto ioerror;
+    }
+    /*
+     * Volume and plex I/O share the rest of the path:
+     * vinumstart works out which one it is from the
+     * device number.
+     */
+    bp->b_resid = bp->b_bcount;				    /* transfer everything */
+    vinumstart(bp, 0);					    /* reviveok = 0 */
+    return;
+
+ioerror:
+    bp->b_error = EIO;					    /* I/O error */
+    bp->b_io.bio_flags |= BIO_ERROR;
+    bufdone(bp);
+}
+
+/*
+ * Start a transfer for the user buffer bp.  Return
+ * -1 if the request could not be set up (bp is then
+ * completed here via bufdone()), otherwise the
+ * result of launch_requests().  Parameter reviveok
+ * is passed on to launch_requests(): when set, the
+ * request is started immediately even if it is
+ * marked as conflicting with a revive.
+ */
+int
+vinumstart(struct buf *bp, int reviveok)
+{
+    int plexno;
+    int maxplex;					    /* maximum number of plexes to handle */
+    struct volume *vol;
+    struct request *rq;					    /* build up our request here */
+    enum requeststatus status;
+
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LASTREQS)
+	logrq(loginfo_user_bp, (union rqinfou) bp, bp);
+#endif
+
+    if ((bp->b_bcount % DEV_BSIZE) != 0) {		    /* bad length */
+	bp->b_error = EINVAL;				    /* invalid size */
+	bp->b_io.bio_flags |= BIO_ERROR;
+	bufdone(bp);
+	return -1;
+    }
+    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
+    if (rq == NULL) {					    /* can't do it */
+	bp->b_error = ENOMEM;				    /* can't get memory */
+	bp->b_io.bio_flags |= BIO_ERROR;
+	bufdone(bp);
+	return -1;
+    }
+    bzero(rq, sizeof(struct request));
+
+    /* Note the user buffer, then whether this is volume or single-plex I/O. */
+    rq->bp = bp;					    /* and the user buffer struct */
+
+    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) {	    /* it's a volume, */
+	rq->volplex.volno = Volno(bp->b_dev);		    /* get the volume number */
+	vol = &VOL[rq->volplex.volno];			    /* and point to it */
+	vol->active++;					    /* one more active request */
+	maxplex = vol->plexes;				    /* consider all its plexes */
+    } else {
+	vol = NULL;					    /* no volume */
+	rq->volplex.plexno = Plexno(bp->b_dev);		    /* point to the plex */
+	rq->isplex = 1;					    /* note that it's a plex */
+	maxplex = 1;					    /* just the one plex */
+    }
+
+    if (bp->b_iocmd == BIO_READ) {
+	/*
+	 * This is a read request.  Decide which plex to
+	 * read from.  We're not locked here, so racing
+	 * requests can increment the round-robin counter
+	 * more than once.  This doesn't have any serious
+	 * effects, however.
+	 */
+	if (vol != NULL) {
+	    plexno = vol->preferred_plex;		    /* get the plex to use */
+	    if (plexno < 0) {				    /* round robin */
+		plexno = vol->last_plex_read;
+		vol->last_plex_read++;
+		if (vol->last_plex_read >= vol->plexes)	    /* got to the end? */
+		    vol->last_plex_read = 0;		    /* wrap around */
+	    }
+	    status = build_read_request(rq, plexno);	    /* build a request */
+	} else {
+	    daddr_t diskaddr = bp->b_blkno;		    /* start offset of transfer */
+	    status = bre(rq,				    /* build a request list */
+		rq->volplex.plexno,
+		&diskaddr,
+		diskaddr + (bp->b_bcount / DEV_BSIZE));
+	}
+
+	if (status > REQUEST_RECOVERED) {		    /* can't satisfy it */
+	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
+		bp->b_error = EIO;			    /* I/O error */
+		bp->b_io.bio_flags |= BIO_ERROR;
+	    }
+	    bufdone(bp);
+	    freerq(rq);
+	    return -1;
+	}
+	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
+    } else {
+	/*
+	 * This is a write operation.  We write to all plexes.  If this is
+	 * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
+	 */
+	if (vol != NULL) {
+	    if ((vol->plexes > 0)			    /* volume has a plex (NOTE(review): "> 1" meant?) */
+	    ||(isparity((&PLEX[vol->plex[0]])))) {	    /* or RAID-[45], */
+		rq->save_data = bp->b_data;		    /* save the data buffer address */
+		bp->b_data = Malloc(bp->b_bcount);
+		if (bp->b_data == NULL) {		    /* no memory for the copy */
+		    bp->b_data = rq->save_data;		    /* hand back the caller's buffer */
+		    bp->b_error = ENOMEM;		    /* can't get memory */
+		    bp->b_io.bio_flags |= BIO_ERROR;
+		    bufdone(bp);
+		    freerq(rq);
+		    return -1;
+		}
+		bcopy(rq->save_data, bp->b_data, bp->b_bcount);	/* make a copy */
+		rq->flags |= XFR_COPYBUF;		    /* and note that we did it */
+	    }
+	    status = build_write_request(rq);
+	} else {					    /* plex I/O */
+	    daddr_t diskstart;
+
+	    diskstart = bp->b_blkno;			    /* start offset of transfer */
+	    status = bre(rq,
+		Plexno(bp->b_dev),
+		&diskstart,
+		bp->b_blkno + (bp->b_bcount / DEV_BSIZE));  /* build requests for the plex */
+	}
+	if (status > REQUEST_RECOVERED) {		    /* can't satisfy it */
+	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
+		bp->b_error = EIO;			    /* I/O error */
+		bp->b_io.bio_flags |= BIO_ERROR;
+	    }
+	    if (rq->flags & XFR_COPYBUF) {		    /* we made a private copy, */
+		Free(bp->b_data);			    /* get rid of it */
+		bp->b_data = rq->save_data;		    /* and restore the caller's buffer */
+	    }
+	    bufdone(bp);
+	    freerq(rq);
+	    return -1;
+	}
+	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
+    }
+}
+
+/*
+ * Issue the low-level requests of a struct request.  Return 0 once they
+ * are launched or queued behind a revive, -1 (VINUMDEBUG) if rq has none.
+ */
+int
+launch_requests(struct request *rq, int reviveok)
+{
+    struct rqgroup *rqg;
+    int rqno;						    /* loop index */
+    struct rqelement *rqe;				    /* current element */
+    struct drive *drive;
+    int rcount;						    /* request count */
+
+    /*
+     * First find out whether we're reviving, and
+     * the request contains a conflict.  If so, we
+     * append the request to the waitlist of the
+     * subdisk rq->sdno and return without starting it.
+     */
+
+    if ((rq->flags & XFR_REVIVECONFLICT)		    /* possible revive conflict */
+    &&(!reviveok)) {					    /* and we don't want to do it now, */
+	struct sd *sd;
+	struct request *waitlist;			    /* point to the waitlist */
+
+	sd = &SD[rq->sdno];
+	if (sd->waitlist != NULL) {			    /* something there already, */
+	    waitlist = sd->waitlist;
+	    while (waitlist->next != NULL)		    /* find the end */
+		waitlist = waitlist->next;
+	    waitlist->next = rq;			    /* hook our request there */
+	} else
+	    sd->waitlist = rq;				    /* hook our request at the front */
+
+#ifdef VINUMDEBUG
+	if (debug & DEBUG_REVIVECONFLICT)
+	    log(LOG_DEBUG,
+		"Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n",
+		rq->sdno,
+		rq,
+		rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
+		major(rq->bp->b_dev),
+		minor(rq->bp->b_dev),
+		(intmax_t) rq->bp->b_blkno,
+		rq->bp->b_bcount);
+#endif
+	return 0;					    /* and get out of here */
+    }
+    rq->active = 0;					    /* nothing yet */
+#ifdef VINUMDEBUG
+    /* XXX This is probably due to a bug */
+    if (rq->rqg == NULL) {				    /* no request */
+	log(LOG_ERR, "vinum: null rqg\n");
+	abortrequest(rq, EINVAL);
+	return -1;
+    }
+#endif
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_ADDRESSES)
+	log(LOG_DEBUG,
+	    "Request: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n",
+	    rq,
+	    rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
+	    major(rq->bp->b_dev),
+	    minor(rq->bp->b_dev),
+	    (intmax_t) rq->bp->b_blkno,
+	    rq->bp->b_bcount);
+    vinum_conf.lastrq = rq;
+    vinum_conf.lastbuf = rq->bp;
+    if (debug & DEBUG_LASTREQS)
+	logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
+#endif
+
+    /*
+     * We used to have an splbio() here anyway, out
+     * of superstition.  With the division of labour
+     * below (first count the requests, then issue
+     * them), it looks as if we don't need this
+     * splbio() protection.  In fact, as dillon
+     * points out, there's a race condition
+     * incrementing and decrementing rq->active and
+     * rqg->active.  This splbio() didn't help
+     * there, because the device strategy routine
+     * can sleep.  Solve this by putting shorter
+     * duration locks on the code.
+     */
+    /*
+     * This loop happens without any participation
+     * of the bottom half, so it requires no
+     * protection.  It counts, per group, the elements
+     * that will really be issued.
+     */
+    for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) {	    /* through the whole request chain */
+	rqg->active = rqg->count;			    /* they're all active */
+	for (rqno = 0; rqno < rqg->count; rqno++) {
+	    rqe = &rqg->rqe[rqno];
+	    if (rqe->flags & XFR_BAD_SUBDISK)		    /* this subdisk is bad, */
+		rqg->active--;				    /* one less active request */
+	}
+	if (rqg->active)				    /* we have at least one active request, */
+	    rq->active++;				    /* one more active request group */
+    }
+
+    /*
+     * Now fire off the requests.  In this loop the
+     * bottom half could be completing requests
+     * before we finish.  We avoid splbio()
+     * protection by ensuring we don't tread in the
+     * same places that the bottom half does.
+     */
+    for (rqg = rq->rqg; rqg != NULL;) {			    /* through the whole request chain */
+	if (rqg->lockbase >= 0)				    /* this rqg needs a lock first */
+	    rqg->lock = lockrange(rqg->lockbase, rqg->rq->bp, &PLEX[rqg->plexno]);
+	rcount = rqg->count;				    /* local copy: rqg moves on below */
+	for (rqno = 0; rqno < rcount;) {
+	    rqe = &rqg->rqe[rqno];
+
+	    /*
+	     * Point to next rqg before the bottom half
+	     * changes the structures.
+	     */
+	    if (++rqno >= rcount)			    /* this is the group's last element */
+		rqg = rqg->next;
+	    if ((rqe->flags & XFR_BAD_SUBDISK) == 0) {	    /* this subdisk is good, */
+		drive = &DRIVE[rqe->driveno];		    /* look at drive */
+		drive->active++;
+		if (drive->active >= drive->maxactive)	    /* keep the high-water mark */
+		    drive->maxactive = drive->active;
+		vinum_conf.active++;
+		if (vinum_conf.active >= vinum_conf.maxactive)
+		    vinum_conf.maxactive = vinum_conf.active;
+
+#ifdef VINUMDEBUG
+		if (debug & DEBUG_ADDRESSES)
+		    log(LOG_DEBUG,
+			"  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%jx, length %ld\n",
+			rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
+			major(rqe->b.b_dev),
+			minor(rqe->b.b_dev),
+			rqe->sdno,
+			(u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
+			(intmax_t) rqe->b.b_blkno,
+			rqe->b.b_bcount);
+		if (debug & DEBUG_LASTREQS) {
+		    microtime(&rqe->launchtime);	    /* time we launched this request */
+		    logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);
+		}
+#endif
+		/* fire off the request */
+		DEV_STRATEGY(&rqe->b);
+	    }
+	}
+    }
+    return 0;
+}
+
+/*
+ * define the low-level requests needed to perform a
+ * high-level I/O operation for a specific plex 'plexno'.
+ *
+ * Return REQUEST_OK if all subdisks involved are up, REQUEST_DEGRADED
+ * if one is down or missing, REQUEST_EOF if the request extends beyond
+ * the end of the plex, REQUEST_ENOMEM if an allocation fails, and
+ * REQUEST_DOWN for an unknown plex organization.
+ *
+ * Modify the pointer *diskaddr to point to the end address.  On
+ * read, return on the first bad subdisk, so that the caller
+ * (build_read_request) can try alternatives.
+ *
+ * On entry to this routine, the rqg structures are not assigned; they
+ * are allocated here with allocrqg().  Strictly speaking, the
+ * elements rqe->sdno of all entries should be set to -1, since 0
+ * (from bzero) is a valid subdisk number.  We avoid this problem by
+ * initializing the ones we use, and not looking at the others (index
+ * >= rqg->requests).
+ */
+enum requeststatus
+bre(struct request *rq,
+    int plexno,
+    daddr_t * diskaddr,
+    daddr_t diskend)
+{
+    int sdno;
+    struct sd *sd;
+    struct rqgroup *rqg;
+    struct buf *bp;					    /* user's bp */
+    struct plex *plex;
+    enum requeststatus status;				    /* return value */
+    daddr_t plexoffset;					    /* offset of transfer in plex */
+    daddr_t stripebase;					    /* base address of stripe (1st subdisk) */
+    daddr_t stripeoffset;				    /* offset in stripe */
+    daddr_t blockoffset;				    /* offset in stripe on subdisk */
+    struct rqelement *rqe;				    /* point to this request information */
+    daddr_t diskstart = *diskaddr;			    /* remember where this transfer starts */
+    enum requeststatus s;				    /* temp return value */
+
+    bp = rq->bp;					    /* buffer pointer */
+    status = REQUEST_OK;				    /* return value: OK until proven otherwise */
+    plex = &PLEX[plexno];				    /* point to the plex */
+
+    switch (plex->organization) {
+    case plex_concat:
+	sd = NULL;					    /* (keep compiler quiet) */
+	for (sdno = 0; sdno < plex->subdisks; sdno++) {
+	    sd = &SD[plex->sdnos[sdno]];
+	    if (*diskaddr < sd->plexoffset)		    /* we must have a hole, */
+		status = REQUEST_DEGRADED;		    /* note the fact */
+	    if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */
+		rqg = allocrqg(rq, 1);			    /* space for the request */
+		if (rqg == NULL) {			    /* malloc failed */
+		    bp->b_error = ENOMEM;
+		    bp->b_io.bio_flags |= BIO_ERROR;
+		    return REQUEST_ENOMEM;
+		}
+		rqg->plexno = plexno;
+
+		rqe = &rqg->rqe[0];			    /* point to the element */
+		rqe->rqg = rqg;				    /* group */
+		rqe->sdno = sd->sdno;			    /* put in the subdisk number */
+		plexoffset = *diskaddr;			    /* start offset in plex */
+		rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */
+		rqe->useroffset = plexoffset - diskstart;   /* start offset in user buffer */
+		rqe->dataoffset = 0;
+		rqe->datalen = min(diskend - *diskaddr,	    /* number of sectors to transfer in this sd */
+		    sd->sectors - rqe->sdoffset);
+		rqe->groupoffset = 0;			    /* no groups for concatenated plexes */
+		rqe->grouplen = 0;
+		rqe->buflen = rqe->datalen;		    /* buffer length is data buffer length */
+		rqe->flags = 0;
+		rqe->driveno = sd->driveno;
+		if (sd->state != sd_up) {		    /* *now* we find the sd is down */
+		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+		    if (s == REQUEST_DOWN) {		    /* down? */
+			rqe->flags = XFR_BAD_SUBDISK;	    /* yup */
+			if (rq->bp->b_iocmd == BIO_READ)    /* read request, */
+			    return REQUEST_DEGRADED;	    /* give up here */
+			/*
+			 * If we're writing, don't give up
+			 * because of a bad subdisk.  Go
+			 * through to the bitter end, but note
+			 * which ones we can't access.
+			 */
+			status = REQUEST_DEGRADED;	    /* can't do it all */
+		    }
+		}
+		*diskaddr += rqe->datalen;		    /* bump the address */
+		if (build_rq_buffer(rqe, plex)) {	    /* build the buffer */
+		    deallocrqg(rqg);
+		    bp->b_error = ENOMEM;
+		    bp->b_io.bio_flags |= BIO_ERROR;
+		    return REQUEST_ENOMEM;		    /* can't do it */
+		}
+	    }
+	    if (*diskaddr == diskend)			    /* we're finished, */
+		break;					    /* get out of here */
+	}
+	/*
+	 * We've got to the end of the plex.  Have we got to the end of
+	 * the transfer?  It would seem that having an offset beyond the
+	 * end of the subdisk is an error, but in fact it can happen if
+	 * the volume has another plex of different size.  There's a valid
+	 * question as to why you would want to do this, but currently
+	 * it's allowed.
+	 *
+	 * In a previous version, I returned REQUEST_DOWN here.  I think
+	 * REQUEST_EOF is more appropriate now.
+	 */
+	if (diskend > sd->sectors + sd->plexoffset)	    /* pointing beyond EOF? */
+	    status = REQUEST_EOF;
+	break;
+
+    case plex_striped:
+	{
+	    while (*diskaddr < diskend) {		    /* until we get it all sorted out */
+		if (*diskaddr >= plex->length)		    /* beyond the end of the plex */
+		    return REQUEST_EOF;			    /* can't continue */
+
+		/* The offset of the start address from the start of the stripe. */
+		stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks);
+
+		/* The plex-relative address of the start of the stripe. */
+		stripebase = *diskaddr - stripeoffset;
+
+		/* The number of the subdisk in which the start is located. */
+		sdno = stripeoffset / plex->stripesize;
+
+		/* The offset from the beginning of the stripe on this subdisk. */
+		blockoffset = stripeoffset % plex->stripesize;
+
+		sd = &SD[plex->sdnos[sdno]];		    /* the subdisk in question */
+		rqg = allocrqg(rq, 1);			    /* space for the request */
+		if (rqg == NULL) {			    /* malloc failed */
+		    bp->b_error = ENOMEM;
+		    bp->b_io.bio_flags |= BIO_ERROR;
+		    return REQUEST_ENOMEM;
+		}
+		rqg->plexno = plexno;
+
+		rqe = &rqg->rqe[0];			    /* point to the element */
+		rqe->rqg = rqg;
+		rqe->sdoffset = stripebase / plex->subdisks + blockoffset; /* start offset in this subdisk */
+		rqe->useroffset = *diskaddr - diskstart;    /* The offset of the start in the user buffer */
+		rqe->dataoffset = 0;
+		rqe->datalen = min(diskend - *diskaddr,	    /* the amount remaining to transfer */
+		    plex->stripesize - blockoffset);	    /* and the amount left in this stripe */
+		rqe->groupoffset = 0;			    /* no groups for striped plexes */
+		rqe->grouplen = 0;
+		rqe->buflen = rqe->datalen;		    /* buffer length is data buffer length */
+		rqe->flags = 0;
+		rqe->sdno = sd->sdno;			    /* put in the subdisk number */
+		rqe->driveno = sd->driveno;
+
+		if (sd->state != sd_up) {		    /* *now* we find the sd is down */
+		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+		    if (s == REQUEST_DOWN) {		    /* down? */
+			rqe->flags = XFR_BAD_SUBDISK;	    /* yup */
+			if (rq->bp->b_iocmd == BIO_READ)    /* read request, */
+			    return REQUEST_DEGRADED;	    /* give up here */
+			/*
+			 * If we're writing, don't give up
+			 * because of a bad subdisk.  Go through
+			 * to the bitter end, but note which
+			 * ones we can't access.
+			 */
+			status = REQUEST_DEGRADED;	    /* can't do it all */
+		    }
+		}
+		/*
+		 * It would seem that having an offset
+		 * beyond the end of the subdisk is an
+		 * error, but in fact it can happen if the
+		 * volume has another plex of different
+		 * size.  There's a valid question as to why
+		 * you would want to do this, but currently
+		 * it's allowed.
+		 */
+		if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */
+		    rqe->datalen = sd->sectors - rqe->sdoffset;	/* truncate */
+#ifdef VINUMDEBUG
+		    if (debug & DEBUG_EOFINFO) {	    /* tell on the request */
+			log(LOG_DEBUG,
+			    "vinum: EOF on plex %s, sd %s offset %x (user offset 0x%jx)\n",
+			    plex->name,
+			    sd->name,
+			    (u_int) sd->sectors,
+			    (intmax_t) bp->b_blkno);
+			log(LOG_DEBUG,
+			    "vinum: stripebase %#jx, stripeoffset %#jx, blockoffset %#jx\n",
+			    (intmax_t) stripebase,
+			    (intmax_t) stripeoffset,
+			    (intmax_t) blockoffset);
+		    }
+#endif
+		}
+		if (build_rq_buffer(rqe, plex)) {	    /* build the buffer */
+		    deallocrqg(rqg);
+		    bp->b_error = ENOMEM;
+		    bp->b_io.bio_flags |= BIO_ERROR;
+		    return REQUEST_ENOMEM;		    /* can't do it */
+		}
+		*diskaddr += rqe->datalen;		    /* look at the remainder */
+		if ((*diskaddr < diskend)		    /* didn't finish the request on this stripe */
+		&&(*diskaddr < plex->length)) {		    /* and there's more to come */
+		    plex->multiblock++;			    /* count another one */
+		    if (sdno == plex->subdisks - 1)	    /* last subdisk, */
+			plex->multistripe++;		    /* another stripe as well */
+		}
+	    }
+	}
+	break;
+
+	/*
+	 * RAID-4 and RAID-5 are complicated enough to have their own
+	 * function.  Its status is returned unchanged.
+	 */
+    case plex_raid4:
+    case plex_raid5:
+	status = bre5(rq, plexno, diskaddr, diskend);
+	break;
+
+    default:
+	log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization);
+	status = REQUEST_DOWN;				    /* can't access it */
+    }
+
+    return status;
+}
+
+/*
+ * Build up a request structure for reading volumes.
+ * This function is not needed for plex reads, since there's
+ * no recovery if a plex read can't be satisfied.  The read
+ * starts on plex plexindex of the volume; parts that plex
+ * can't supply are retried on the volume's other plexes.
+ */
+enum requeststatus
+build_read_request(struct request *rq,			    /* request */
+    int plexindex)					    /* index in the volume's plex table */
+{
+    struct buf *bp;
+    daddr_t startaddr;					    /* offset of previous part of transfer */
+    daddr_t diskaddr;					    /* offset of current part of transfer */
+    daddr_t diskend;					    /* and end offset of transfer */
+    int plexno;						    /* plex index in vinum_conf */
+    struct volume *vol;					    /* volume in question */
+    int recovered = 0;					    /* set if we recover a read */
+    enum requeststatus status = REQUEST_OK;
+    int plexmask;					    /* bit mask of plexes, for recovery */
+
+    bp = rq->bp;					    /* buffer pointer */
+    diskaddr = bp->b_blkno;				    /* start offset of transfer */
+    diskend = diskaddr + (bp->b_bcount / DEV_BSIZE);	    /* and end offset of transfer */
+    vol = &VOL[rq->volplex.volno];			    /* point to volume */
+
+    while (diskaddr < diskend) {			    /* build up request components */
+	startaddr = diskaddr;
+	status = bre(rq, vol->plex[plexindex], &diskaddr, diskend); /* build up a request */
+	switch (status) {
+	case REQUEST_OK:
+	    continue;
+
+	case REQUEST_RECOVERED:
+	    /*
+	     * XXX FIXME if we have more than one plex, and we can
+	     * satisfy the request from another, don't use the
+	     * recovered request, since it's more expensive.
+	     */
+	    recovered = 1;
+	    break;
+
+	case REQUEST_ENOMEM:
+	    return status;
+	    /*
+	     * If we get here, our request is not complete.  Try
+	     * to fill in the missing parts from another plex.
+	     * This can happen multiple times in this function,
+	     * and we reinitialize the plex mask each time, since
+	     * we could have a hole in our plexes.
+	     */
+	case REQUEST_EOF:
+	case REQUEST_DOWN:				    /* can't access the plex */
+	case REQUEST_DEGRADED:				    /* can't access the plex */
+	    plexmask = ((1 << vol->plexes) - 1)		    /* all plexes in the volume */
+	    &~(1 << plexindex);				    /* except for the one we were looking at */
+	    for (plexno = 0; plexno < vol->plexes; plexno++) {
+		if (plexmask == 0)			    /* no plexes left to try */
+		    return REQUEST_DOWN;		    /* failed */
+		diskaddr = startaddr;			    /* start at the beginning again */
+		if (plexmask & (1 << plexno)) {		    /* we haven't tried this plex yet */
+		    bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */
+		    if (diskaddr > startaddr) {		    /* we satisfied another part */
+			recovered = 1;			    /* we recovered from the problem */
+			status = REQUEST_OK;		    /* don't complain about it */
+			break;
+		    }
+		}
+	    }
+	    if (diskaddr == startaddr)			    /* didn't get any further, */
+		return status;
+	}
+	if (recovered)
+	    vol->recovered_reads += recovered;		    /* adjust our recovery count */
+    }
+    return status;
+}
+
+/*
+ * Build up a request structure for writes.
+ *
+ * Every plex of the volume is passed to bre() for
+ * the complete extent of the user transfer, and the
+ * smallest status value seen is returned.  With no
+ * plexes at all the result is REQUEST_DOWN.
+ */
+enum requeststatus
+build_write_request(struct request *rq)
+{
+    struct buf *ubp = rq->bp;				    /* user buffer header */
+    struct volume *vol = &VOL[rq->volplex.volno];	    /* volume being written */
+    daddr_t endblk;					    /* first block past the transfer */
+    daddr_t curblk;					    /* cursor handed to bre() */
+    enum requeststatus best = REQUEST_DOWN;		    /* assume the worst */
+    int plexno = 0;					    /* index into vol->plex */
+
+    endblk = ubp->b_blkno + (ubp->b_bcount / DEV_BSIZE);
+    while (plexno < vol->plexes) {
+	/* every plex gets the whole transfer, so rewind the cursor */
+	curblk = ubp->b_blkno;
+	/*
+	 * Keep the best result (min, not max): we're
+	 * happy if we can write at all.
+	 */
+	best = min(best, bre(rq,
+		vol->plex[plexno],
+		&curblk,
+		endblk));
+	plexno++;
+    }
+    return best;
+}
+
+/*
+ * Fill in the struct buf part of a request element.  Returns 0, or
+ * REQUEST_ENOMEM after aborting the request if Malloc() fails.
+ */
+enum requeststatus
+build_rq_buffer(struct rqelement *rqe, struct plex *plex)
+{
+    struct sd *sd;					    /* point to subdisk */
+    struct buf *bp;					    /* buffer header embedded in rqe */
+    struct buf *ubp;					    /* user (high level) buffer header */
+
+    sd = &SD[rqe->sdno];				    /* point to subdisk */
+    bp = &rqe->b;
+    ubp = rqe->rqg->rq->bp;				    /* pointer to user buffer header */
+
+    /* Initialize the buf struct, copying only these flags from the user bp */
+    bp->b_flags = ubp->b_flags & (B_NOCACHE | B_ASYNC);
+    bp->b_io.bio_flags = 0;
+    bp->b_iocmd = ubp->b_iocmd;
+#ifdef VINUMDEBUG
+    if (rqe->flags & XFR_BUFLOCKED)			    /* paranoia */
+	panic("build_rq_buffer: rqe already locked");	    /* XXX remove this when we're sure */
+#endif
+    BUF_LOCKINIT(bp);					    /* get a lock for the buffer */
+    BUF_LOCK(bp, LK_EXCLUSIVE, NULL);			    /* and lock it */
+    BUF_KERNPROC(bp);
+    rqe->flags |= XFR_BUFLOCKED;			    /* remember that we hold the lock */
+    bp->b_iodone = complete_rqe;
+    /*
+     * You'd think that we wouldn't need to even
+     * build the request buffer for a dead subdisk,
+     * but in some cases we need information like
+     * the user buffer address.  Err on the side of
+     * generosity and supply what we can.  That
+     * obviously doesn't include drive information
+     * when the drive is dead.
+     */
+    if ((rqe->flags & XFR_BAD_SUBDISK) == 0)		    /* subdisk is accessible, */
+	bp->b_dev = DRIVE[rqe->driveno].dev;		    /* drive device */
+    bp->b_blkno = rqe->sdoffset + sd->driveoffset;	    /* start address */
+    bp->b_bcount = rqe->buflen << DEV_BSHIFT;		    /* number of bytes to transfer */
+    bp->b_resid = bp->b_bcount;				    /* and it's still all waiting */
+    bp->b_bufsize = bp->b_bcount;			    /* and buffer size */
+    bp->b_rcred = FSCRED;				    /* we have the file system credentials */
+    bp->b_wcred = FSCRED;				    /* we have the file system credentials */
+
+    if (rqe->flags & XFR_MALLOCED) {			    /* this operation requires a malloced buffer */
+	bp->b_data = Malloc(bp->b_bcount);		    /* get a buffer to put it in */
+	if (bp->b_data == NULL) {			    /* failed */
+	    abortrequest(rqe->rqg->rq, ENOMEM);
+	    return REQUEST_ENOMEM;			    /* no memory */
+	}
+    } else
+	/*
+	 * Point directly to user buffer data.  This means
+	 * that we don't need to do anything when we have
+	 * finished the transfer
+	 */
+	bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;
+    /*
+     * On a recovery read, we perform an XOR of
+     * all blocks to the user buffer.  To make
+     * this work, we first clean out the buffer
+     */
+    if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))
+	== (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) {	    /* bad subdisk of a recovery read */
+	int length = rqe->grouplen << DEV_BSHIFT;	    /* and count involved */
+	char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */
+
+	bzero(data, length);				    /* clean it out */
+    }
+    return 0;						    /* NOTE(review): presumably REQUEST_OK - confirm */
+}
+
+/*
+ * Abort a request: hand the error to the user buffer and free the request.
+ */
+int
+abortrequest(struct request *rq, int error)
+{
+    struct buf *ubp;					    /* user buffer header */
+
+    ubp = rq->bp;					    /* pick it up before rq is freed */
+    ubp->b_error = error;
+    freerq(rq);						    /* release everything hanging off rq */
+    ubp->b_io.bio_flags |= BIO_ERROR;			    /* flag the buffer as failed */
+    return error;					    /* caller gets the same code back */
+}
+
+/*
+ * Check that our transfer will cover the
+ * complete address space of the user request.
+ *
+ * Return 1 if it can, otherwise 0.
+ */
+int
+check_range_covered(struct request *rq)
+{
+    return 1;						    /* stub: no check is performed, rq is not examined */
+}
+
+/*
+ * Perform I/O on a subdisk: copy the caller's buffer header into a
+ * private sdbuf aimed at the drive and hand that to DEV_STRATEGY.
+ */
+void
+sdio(struct buf *bp)
+{
+    struct sd *sd;
+    struct drive *drive;
+    struct sdbuf *sbp;					    /* private header sent to the drive */
+    daddr_t lastblk;					    /* subdisk-relative end of transfer */
+    int error;						    /* reported through the fail path */
+    int s;						    /* spl */
+
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LASTREQS)
+	logrq(loginfo_sdio, (union rqinfou) bp, bp);
+#endif
+    sd = &SD[Sdno(bp->b_dev)];				    /* subdisk named by the device number */
+    drive = &DRIVE[sd->driveno];
+
+    if (drive->state != drive_up) {			    /* drive is not usable */
+	/* a write we cannot perform leaves the subdisk stale, a read only crashed */
+	if (sd->state >= sd_crashed)
+	    set_sd_state(sd->sdno,
+		(bp->b_iocmd == BIO_WRITE) ? sd_stale : sd_crashed,
+		setstate_force);
+	error = EIO;
+	goto fail;
+    }
+    /* Any state from sd_empty up is good enough to attempt the I/O. */
+    if (sd->state < sd_empty) {				    /* nothing to talk to, */
+	error = EIO;
+	goto fail;
+    }
+    sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
+    if (sbp == NULL) {
+	error = ENOMEM;
+	goto fail;
+    }
+    bzero(sbp, sizeof(struct sdbuf));			    /* start with nothing */
+    sbp->bp = bp;					    /* original header, for completion */
+    sbp->sdno = sd->sdno;				    /* note for statistics */
+    sbp->driveno = sd->driveno;
+    /* Describe the same transfer, rebased onto the drive. */
+    sbp->b.b_iocmd = bp->b_iocmd;
+    sbp->b.b_flags = bp->b_flags;
+    sbp->b.b_data = bp->b_data;				    /* share the data buffer */
+    sbp->b.b_bcount = bp->b_bcount;			    /* number of bytes to transfer */
+    sbp->b.b_bufsize = bp->b_bcount;			    /* buffer size */
+    sbp->b.b_resid = bp->b_resid;			    /* and amount waiting */
+    sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
+    sbp->b.b_dev = DRIVE[sd->driveno].dev;		    /* device */
+    sbp->b.b_iodone = sdio_done;			    /* come here on completion */
+    BUF_LOCKINIT(&sbp->b);				    /* get a lock for the buffer */
+    BUF_LOCK(&sbp->b, LK_EXCLUSIVE, NULL);		    /* and lock it */
+    BUF_KERNPROC(&sbp->b);
+    lastblk = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE;    /* final sector offset */
+    if (lastblk > sd->sectors) {			    /* runs off the end of the subdisk */
+	sbp->b.b_bcount -= (lastblk - sd->sectors) * DEV_BSIZE;	/* trim */
+	if (sbp->b.b_bcount <= 0) {			    /* nothing left to transfer */
+	    bp->b_resid = bp->b_bcount;			    /* nothing transferred */
+	    bufdone(bp);
+	    BUF_UNLOCK(&sbp->b);
+	    BUF_LOCKFREE(&sbp->b);
+	    Free(sbp);
+	    return;
+	}
+    }
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_ADDRESSES)
+	log(LOG_DEBUG,
+	    "  %s dev %d.%d, sd %d, offset 0x%jx, devoffset 0x%jx, length %ld\n",
+	    sbp->b.b_iocmd == BIO_READ ? "Read" : "Write",
+	    major(sbp->b.b_dev),
+	    minor(sbp->b.b_dev),
+	    sbp->sdno,
+	    (intmax_t) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
+	    (intmax_t) sbp->b.b_blkno,
+	    sbp->b.b_bcount);
+#endif
+    s = splbio();
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LASTREQS)
+	logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
+#endif
+    DEV_STRATEGY(&sbp->b);
+    splx(s);
+    return;
+
+fail:							    /* complete the caller's buffer with an error */
+    bp->b_error = error;
+    bp->b_io.bio_flags |= BIO_ERROR;
+    bufdone(bp);
+}
+
+/*
+ * Simplified version of bounds_check_with_label: make sure the
+ * transfer lies within the volume, truncating it if necessary.
+ * Returns 1 if the transfer should proceed, 0 on EOF or an empty
+ * transfer, and -1 (b_error and BIO_ERROR set) if it is refused.
+ *
+ * Volumes are simpler than disk slices: they only contain
+ * one component (though we call them a, b and c to make
+ * system utilities happy), and they always take up the
+ * complete space of the "partition".
+ *
+ * I'm still not happy with this: why should the label be
+ * protected?  If it weren't so damned difficult to write
+ * one in the first place (because it's protected), it wouldn't
+ * be a problem.
+ */
+int
+vinum_bounds_check(struct buf *bp, struct volume *vol)
+{
+    int maxsize = vol->size;				    /* size of the partition (sectors) */
+    int size = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* size of this request (sectors) */
+
+#ifdef LABELSECTOR
+    /* Would this transfer overwrite the disk label? */
+    if (bp->b_blkno <= LABELSECTOR			    /* starts before or at the label */
+#if LABELSECTOR != 0
+	&& bp->b_blkno + size > LABELSECTOR		    /* and finishes after */
+#endif
+	&& (bp->b_iocmd == BIO_WRITE)			    /* and it's a write */
+	&& !(vol->flags & (VF_WLABEL | VF_LABELLING))) {    /* and neither flag permits label writes */
+	bp->b_error = EROFS;				    /* read-only */
+	bp->b_io.bio_flags |= BIO_ERROR;
+	return -1;
+    }
+#endif
+    if (size == 0)					    /* no transfer specified, */
+	return 0;					    /* treat as EOF */
+    /* beyond partition? */
+    if (bp->b_blkno < 0					    /* negative start */
+	|| bp->b_blkno + size > maxsize) {		    /* or goes beyond the end of the partition */
+	/* if exactly at end of disk, return an EOF */
+	if (bp->b_blkno == maxsize) {
+	    bp->b_resid = bp->b_bcount;
+	    return 0;
+	}
+	/* or truncate if part of it fits */
+	size = maxsize - bp->b_blkno;
+	if (size <= 0) {				    /* nothing to transfer */
+	    bp->b_error = EINVAL;
+	    bp->b_io.bio_flags |= BIO_ERROR;
+	    return -1;
+	}
+	bp->b_bcount = size << DEV_BSHIFT;
+    }
+    bp->b_pblkno = bp->b_blkno;
+    return 1;
+}
+
+/*
+ * Allocate a zeroed request group for `elements' elements and append
+ * it to rq's chain of groups.  Returns NULL if Malloc() fails.
+ */
+struct rqgroup *
+allocrqg(struct request *rq, int elements)
+{
+    int bytes;						    /* group header plus element array */
+    struct rqgroup *newrqg;				    /* the group being created */
+
+    bytes = sizeof(struct rqgroup) + elements * sizeof(struct rqelement);
+    newrqg = (struct rqgroup *) Malloc(bytes);
+    if (newrqg == NULL)					    /* out of memory, */
+	return NULL;					    /* and nothing was linked */
+    bzero(newrqg, bytes);				    /* no old junk */
+    newrqg->rq = rq;					    /* point back to the parent request */
+    newrqg->count = elements;				    /* number of requests in the group */
+    newrqg->lockbase = -1;				    /* no lock required yet */
+    if (rq->rqg == NULL)				    /* chain is empty, */
+	rq->rqg = newrqg;				    /* so we become its head */
+    else						    /* otherwise */
+	rq->lrqg->next = newrqg;			    /* hang it off the current tail */
+    rq->lrqg = newrqg;					    /* either way we are the new tail */
+    return newrqg;
+}
+
+/*
+ * Deallocate a request group out of a chain.  We do
+ * this by linear search: the chain is short, this
+ * almost never happens, and currently it can only
+ * happen to the first member of the chain.
+ */
+void
+deallocrqg(struct rqgroup *rqg)
+{
+    struct rqgroup *rqgc = rqg->rq->rqg;		    /* point to the request chain */
+
+    if (rqg->lock)					    /* got a lock? */
+	unlockrange(rqg->plexno, rqg->lock);		    /* yes, free it */
+    if (rqgc == rqg)					    /* we're first in line */
+	rqg->rq->rqg = rqg->next;			    /* unhook ourselves */
+    else {
+	while ((rqgc->next != NULL)			    /* find the group */
+	&&(rqgc->next != rqg))
+	    rqgc = rqgc->next;
+	if (rqgc->next == NULL)				    /* ran off the end of the chain */
+	    log(LOG_ERR,
+		"vinum deallocrqg: rqg %p not found in request %p\n",
+		rqg,					    /* the group first, as the format says */
+		rqg->rq);				    /* then the request it claims to be in */
+	else
+	    rqgc->next = rqg->next;			    /* make the chain jump over us */
+    }
+    Free(rqg);						    /* freed even if it was not found */
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumrevive.c b/sys/dev/vinum/vinumrevive.c
new file mode 100644
index 0000000..03e16f9
--- /dev/null
+++ b/sys/dev/vinum/vinumrevive.c
@@ -0,0 +1,622 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumrevive.c,v 1.18 2003/04/28 02:54:43 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+/*
+ * Revive a block of a subdisk.  Return an error
+ * indication.  EAGAIN means successful copy, but
+ * that more blocks remain to be copied; 0 means
+ * the subdisk is now completely revived.  EINVAL
+ * means that the subdisk isn't associated with a
+ * plex (which means a programming error if we get
+ * here at all; FIXME).
+ */
+int
+revive_block(int sdno)
+{
+    int s;						    /* priority level */
+    struct sd *sd;
+    struct plex *plex;
+    struct volume *vol;
+    struct buf *bp;
+    int error = EAGAIN;
+    int size;						    /* size of revive block, bytes */
+    daddr_t plexblkno;					    /* lblkno in plex */
+    int psd;						    /* parity subdisk number */
+    u_int64_t stripe;					    /* stripe number */
+    int paritysd = 0;					    /* set if this is the parity stripe */
+    struct rangelock *lock;				    /* for locking */
+    daddr_t stripeoffset;				    /* offset in stripe */
+
+    plexblkno = 0;					    /* to keep the compiler happy */
+    sd = &SD[sdno];
+    lock = NULL;
+    if (sd->plexno < 0)					    /* no plex? */
+	return EINVAL;
+    plex = &PLEX[sd->plexno];				    /* point to plex */
+    if (plex->volno >= 0)
+	vol = &VOL[plex->volno];
+    else
+	vol = NULL;
+
+    if ((sd->revive_blocksize == 0)			    /* no block size */
+    ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1)))	    /* or invalid block size */
+	sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
+    else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE)
+	sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE;
+    size = min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT;
+    sd->reviver = curproc->p_pid;			    /* note who last had a bash at it */
+
+    /* Now decide where to read from */
+    switch (plex->organization) {
+    case plex_concat:
+	plexblkno = sd->revived + sd->plexoffset;	    /* corresponding address in plex */
+	break;
+
+    case plex_striped:
+	stripeoffset = sd->revived % plex->stripesize;	    /* offset from beginning of stripe */
+	if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize)
+	    size = (plex->stripesize - stripeoffset) << DEV_BSHIFT;
+	plexblkno = sd->plexoffset			    /* base */
+	    + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */
+	    + stripeoffset;				    /* offset from beginning of stripe */
+	break;
+
+    case plex_raid4:
+    case plex_raid5:
+	stripeoffset = sd->revived % plex->stripesize;	    /* offset from beginning of stripe */
+	plexblkno = sd->plexoffset			    /* base */
+	    + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
+	    +stripeoffset;				    /* offset from beginning of stripe */
+	stripe = (sd->revived / plex->stripesize);	    /* stripe number */
+
+	/* Make sure we don't go beyond the end of the band. */
+	size = min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT);
+	if (plex->organization == plex_raid4)
+	    psd = plex->subdisks - 1;			    /* parity subdisk for this stripe */
+	else
+	    psd = plex->subdisks - 1 - stripe % plex->subdisks;	/* parity subdisk for this stripe */
+	paritysd = plex->sdnos[psd] == sdno;		    /* note if it's the parity subdisk */
+
+	/*
+	 * Now adjust for the strangenesses
+	 * in RAID-4 and RAID-5 striping.
+	 */
+	if (sd->plexsdno > psd)				    /* beyond the parity stripe, */
+	    plexblkno -= plex->stripesize;		    /* one stripe less */
+	else if (paritysd)
+	    plexblkno -= plex->stripesize * sd->plexsdno;   /* go back to the beginning of the band */
+	break;
+
+    case plex_disorg:					    /* to keep the compiler happy */
+	break;						    /* to keep the pedants happy */
+    }
+
+    if (paritysd) {					    /* we're reviving a parity block, */
+	bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */
+	if (bp == NULL)					    /* no buffer space */
+	    return ENOMEM;				    /* chicken out */
+    } else {						    /* data block */
+	s = splbio();
+	bp = geteblk(size);				    /* Get a buffer */
+	splx(s);
+	if (bp == NULL)
+	    return ENOMEM;
+
+	/*
+	 * Amount to transfer: block size, unless it
+	 * would overlap the end.
+	 */
+	bp->b_bcount = size;
+	bp->b_resid = bp->b_bcount;
+	bp->b_blkno = plexblkno;			    /* start here */
+	if (isstriped(plex))				    /* we need to lock striped plexes */
+	    lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */
+	if (vol != NULL)				    /* it's part of a volume, */
+	    /*
+	     * First, read the data from the volume.  We
+	     * don't care which plex, that's bre's job.
+	     */
+	    bp->b_dev = VINUM_VOL(plex->volno);		    /* create the device number */
+	else						    /* it's an unattached plex */
+	    bp->b_dev = VINUM_PLEX(sd->plexno);		    /* create the device number */
+
+	bp->b_iocmd = BIO_READ;				    /* either way, read it */
+	bp->b_flags = 0;
+	vinumstart(bp, 1);
+	bufwait(bp);
+    }
+
+    if (bp->b_ioflags & BIO_ERROR) {
+	error = bp->b_error;
+	if (lock)					    /* we took a lock, */
+	    unlockrange(sd->plexno, lock);		    /* give it back */
+    } else
+	/* Now write to the subdisk */
+    {
+	bp->b_dev = VINUM_SD(sdno);			    /* create the device number */
+	bp->b_flags &= ~B_DONE;				    /* no longer done */
+	bp->b_ioflags = 0;
+	bp->b_iocmd = BIO_WRITE;
+	bp->b_resid = bp->b_bcount;
+	bp->b_blkno = sd->revived;			    /* write it to here */
+	sdio(bp);					    /* perform the I/O */
+	bufwait(bp);
+	if (bp->b_ioflags & BIO_ERROR)
+	    error = bp->b_error;
+	else {
+	    sd->revived += bp->b_bcount >> DEV_BSHIFT;	    /* moved this much further down */
+	    if (sd->revived >= sd->sectors) {		    /* finished */
+		sd->revived = 0;
+		set_sd_state(sdno, sd_up, setstate_force);  /* bring the sd up */
+		log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
+		save_config();				    /* and save the updated configuration */
+		error = 0;				    /* we're done */
+	    }
+	}
+	if (lock)					    /* we took a lock, */
+	    unlockrange(sd->plexno, lock);		    /* give it back */
+	while (sd->waitlist) {				    /* we have waiting requests */
+	    struct request *rq = sd->waitlist;		    /* the one to relaunch */
+	    struct request *nextrq = rq->next;		    /* read the link before rq can complete */
+#ifdef VINUMDEBUG
+	    if (debug & DEBUG_REVIVECONFLICT)
+		log(LOG_DEBUG,
+		    "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n",
+		    rq->sdno,
+		    rq,
+		    rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
+		    major(rq->bp->b_dev),
+		    minor(rq->bp->b_dev),
+		    (intmax_t)rq->bp->b_blkno,
+		    rq->bp->b_bcount);
+#endif
+	    launch_requests(rq, 1);			    /* do them now */
+	    sd->waitlist = nextrq;			    /* and move on to the next */
+	}
+    }
+    if (bp->b_qindex == 0) {				    /* not on a queue, */
+	bp->b_flags |= B_INVAL;
+	bp->b_ioflags &= ~BIO_ERROR;
+	brelse(bp);					    /* is this kosher? */
+    }
+    return error;
+}
+
+/*
+ * Check or rebuild the parity blocks of a RAID-4
+ * or RAID-5 plex.
+ *
+ * The variables plex->checkblock and
+ * plex->rebuildblock represent the
+ * subdisk-relative address of the stripe we're
+ * looking at, not the plex-relative address.  We
+ * store it in the plex and not as a local
+ * variable because this function could be
+ * stopped, and we don't want to repeat the part
+ * we've already done.  This is also the reason
+ * why we don't initialize it here except at the
+ * end.  It gets initialized with the plex on
+ * creation.
+ *
+ * Each call to this function processes at most
+ * one stripe.  We can't loop in this function,
+ * because we're unstoppable, so we have to be
+ * called repeatedly from userland.
+ *
+ * On return the reply overlaid on the message
+ * holds 0 when the whole plex has been processed,
+ * EAGAIN when more calls are needed, or an error.
+ */
+void
+parityops(struct vinum_ioctl_msg *data)
+{
+    int plexno;
+    struct plex *plex;
+    int size;						    /* I/O transfer size, bytes */
+    struct rangelock *lock;				    /* lock on stripe */
+    struct _ioctl_reply *reply;
+    off_t pstripe;					    /* pointer to our stripe counter */
+    struct buf *pbp;
+    off_t errorloc;					    /* offset of parity error */
+    enum parityop op;					    /* operation to perform */
+
+    plexno = data->index;
+    op = data->op;
+    pbp = NULL;
+    reply = (struct _ioctl_reply *) data;		    /* the reply shares the message storage */
+    reply->error = EAGAIN;				    /* expect to repeat this call */
+    plex = &PLEX[plexno];
+    if (!isparity(plex)) {				    /* not RAID-4 or RAID-5 */
+	reply->error = EINVAL;
+	return;
+    } else if (plex->state < plex_flaky) {
+	reply->error = EIO;
+	strcpy(reply->msg, "Plex is not completely accessible\n");
+	return;
+    }
+    pstripe = data->offset;
+    size = min(DEFAULT_REVIVE_BLOCKSIZE,		    /* one block at a time */
+	plex->stripesize << DEV_BSHIFT);
+
+    pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */
+    if (pbp == NULL) {					    /* no buffer space */
+	reply->error = ENOMEM;
+	return;						    /* chicken out */
+    }
+    /*
+     * Now we have a result in the data buffer of
+     * the parity buffer header, which we have kept.
+     * Decide what to do with it.
+     */
+    reply->msg[0] = '\0';				    /* until shown otherwise */
+    if ((pbp->b_ioflags & BIO_ERROR) == 0) {		    /* no error */
+	if ((op == rebuildparity)
+	    || (op == rebuildandcheckparity)) {
+	    pbp->b_iocmd = BIO_WRITE;
+	    pbp->b_resid = pbp->b_bcount;
+	    sdio(pbp);					    /* write the parity block */
+	    bufwait(pbp);
+	}
+	if (((op == checkparity)
+		|| (op == rebuildandcheckparity))
+	    && (errorloc != -1)) {
+	    if (op == checkparity)
+		reply->error = EIO;
+	    sprintf(reply->msg,
+		"Parity incorrect at offset 0x%jx\n",
+		(intmax_t)errorloc);
+	}
+	if (reply->error == EAGAIN) {			    /* still OK, */
+	    plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT);	/* moved this much further down */
+	    if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */
+		plex->checkblock = 0;
+		reply->error = 0;
+	    }
+	}
+    }
+    if (pbp->b_ioflags & BIO_ERROR)			    /* read or write failed, */
+	reply->error = pbp->b_error;			    /* which overrides anything set above */
+    pbp->b_flags |= B_INVAL;
+    pbp->b_ioflags &= ~BIO_ERROR;
+    brelse(pbp);
+    unlockrange(plexno, lock);
+}
+
+/*
+ * Rebuild a parity stripe.  Return pointer to
+ * parity bp, or NULL if out of memory.  On return,
+ *
+ * 1.  The band is locked.  The caller must unlock
+ *     the band and release the buffer header.
+ *
+ * 2.  All buffer headers except pbp have been
+ *     released.  The caller must release pbp.
+ *
+ * 3.  For checkparity and rebuildandcheckparity,
+ *     the parity is compared with the current
+ *     parity block.  If it's different, the
+ *     offset of the error is returned to
+ *     errorloc.  The caller can set the value of
+ *     the pointer to NULL if this is called for
+ *     rebuilding parity.
+ *
+ * pstripe is the subdisk-relative base address of
+ * the data to be reconstructed, size is the size
+ * of the transfer in bytes.
+ */
+struct buf *
+parityrebuild(struct plex *plex,
+    u_int64_t pstripe,
+    int size,
+    enum parityop op,
+    struct rangelock **lockp,
+    off_t * errorloc)
+{
+    int error;
+    int s;
+    int sdno;
+    u_int64_t stripe;					    /* stripe number */
+    int *parity_buf;					    /* buffer address for current parity block */
+    int *newparity_buf;					    /* and for new parity block */
+    int mysize;						    /* I/O transfer size for this transfer */
+    int isize;						    /* mysize in ints */
+    int i;
+    int psd;						    /* parity subdisk number */
+    int newpsd;						    /* and "subdisk number" of new parity */
+    struct buf **bpp;					    /* pointers to our bps */
+    struct buf *pbp;					    /* buffer header for parity stripe */
+    int *sbuf;
+    int bufcount;					    /* number of buffers we need */
+
+    stripe = pstripe / plex->stripesize;		    /* stripe number */
+    if (plex->organization == plex_raid4)
+	psd = plex->subdisks - 1;			    /* RAID-4: parity is always the last subdisk */
+    else
+	psd = plex->subdisks - 1 - stripe % plex->subdisks; /* RAID-5: parity rotates with the stripe */
+    parity_buf = NULL;					    /* to keep the compiler happy */
+    error = 0;
+
+    /*
+     * The default transfer size need not be a factor of
+     * the stripe size, but we *must* stay within a single
+     * stripe, since the RAID-5 parity subdisk changes from
+     * stripe to stripe.  mysize is the clipped transfer size.
+     */
+    mysize = min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT);
+    isize = mysize / (sizeof(int));			    /* number of ints in the buffer */
+    bufcount = plex->subdisks + 1;			    /* sd buffers plus result buffer */
+    newpsd = plex->subdisks;
+    bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */
+    if (bpp == NULL)					    /* no memory for the pointer array */
+	return NULL;					    /* nothing allocated or locked yet */
+
+    /* First, build requests for all subdisks */
+    for (sdno = 0; sdno < bufcount; sdno++) {		    /* for each subdisk */
+	if ((sdno != psd) || (op != rebuildparity)) {
+	    /* Get a buffer header and initialize it. */
+	    s = splbio();
+	    bpp[sdno] = geteblk(mysize);		    /* Get a buffer */
+	    if (bpp[sdno] == NULL) {
+		while (sdno-- > 0) {			    /* release the ones we got */
+		    if ((sdno == psd) && (op == rebuildparity))
+			continue;			    /* this slot never got a buffer */
+		    bpp[sdno]->b_flags |= B_INVAL;
+		    brelse(bpp[sdno]);			    /* give back our resources */
+		}
+		splx(s);
+		Free(bpp);				    /* the pointer array too */
+		printf("vinum: can't allocate buffer space for parity op.\n");
+		return NULL;				    /* no bpps */
+	    }
+	    splx(s);
+	    if (sdno == psd)
+		parity_buf = (int *) bpp[sdno]->b_data;
+	    if (sdno == newpsd)				    /* the new one? */
+		bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[psd]); /* write back to the parity SD */
+	    else
+		bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[sdno]);	/* device number */
+	    bpp[sdno]->b_iocmd = BIO_READ;		    /* either way, read it */
+	    bpp[sdno]->b_flags = 0;
+	    bpp[sdno]->b_bcount = mysize;
+	    bpp[sdno]->b_resid = bpp[sdno]->b_bcount;
+	    bpp[sdno]->b_blkno = pstripe;		    /* transfer from here */
+	}
+    }
+
+    /* Initialize result buffer */
+    pbp = bpp[newpsd];
+    newparity_buf = (int *) bpp[newpsd]->b_data;
+    bzero(newparity_buf, mysize);
+
+    /*
+     * Now lock the stripe with the first non-parity
+     * bp as locking bp.
+     */
+    *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1),
+	bpp[psd ? 0 : 1],
+	plex);
+
+    /*
+     * Then issue requests for all subdisks in
+     * parallel.  Don't transfer the parity stripe
+     * if we're rebuilding parity, unless we also
+     * want to check it.
+     */
+    for (sdno = 0; sdno < plex->subdisks; sdno++) {	    /* for each real subdisk */
+	if ((sdno != psd) || (op != rebuildparity)) {
+	    sdio(bpp[sdno]);
+	}
+    }
+
+    /*
+     * Next, wait for the requests to complete, in the
+     * order of issue; the extra delay is minimal.
+     */
+    for (sdno = 0; sdno < plex->subdisks; sdno++) {	    /* for each subdisk */
+	if ((sdno != psd) || (op != rebuildparity)) {
+	    bufwait(bpp[sdno]);
+	    if (bpp[sdno]->b_ioflags & BIO_ERROR)	    /* can't read, */
+		error = bpp[sdno]->b_error;
+	    else if (sdno != psd) {			    /* update parity */
+		sbuf = (int *) bpp[sdno]->b_data;
+		for (i = 0; i < isize; i++)
+		    ((int *) newparity_buf)[i] ^= sbuf[i];  /* xor in the buffer */
+	    }
+	}
+	if (sdno != psd) {				    /* release all bps except parity */
+	    bpp[sdno]->b_flags |= B_INVAL;
+	    brelse(bpp[sdno]);				    /* give back our resources */
+	}
+    }
+
+    /*
+     * If we're checking, compare the calculated
+     * and the read parity block.  If they're
+     * different, return the plex-relative offset;
+     * otherwise return -1.
+     */
+    if ((op == checkparity)
+	|| (op == rebuildandcheckparity)) {
+	*errorloc = -1;					    /* no error yet */
+	for (i = 0; i < isize; i++) {
+	    if (parity_buf[i] != newparity_buf[i]) {
+		*errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1)
+		    + i * sizeof(int);
+		break;
+	    }
+	}
+	bpp[psd]->b_flags |= B_INVAL;
+	brelse(bpp[psd]);				    /* give back our resources */
+    }
+    /* release our resources */
+    Free(bpp);
+    if (error) {
+	pbp->b_ioflags |= BIO_ERROR;
+	pbp->b_error = error;
+    }
+    return pbp;
+}
+
+/*
+ * Initialize a subdisk by writing zeroes to the
+ * complete address space.  If verify is set,
+ * check each transfer for correctness.
+ *
+ * Each call to this function writes (and maybe
+ * checks) a single block.  Returns EAGAIN if
+ * more blocks remain, 0 once the whole subdisk
+ * has been initialized, EINVAL if the subdisk has
+ * no plex, ENOMEM if no buffer is available, or
+ * the error reported by the failed transfer.
+ */
+int
+initsd(int sdno, int verify)
+{
+    int s;						    /* priority level */
+    struct sd *sd;
+    struct plex *plex;
+    struct buf *bp;
+    int error;
+    int size;						    /* size of init block, bytes */
+    int verified;					    /* set when we're happy with what we wrote */
+
+    error = 0;
+    sd = &SD[sdno];
+    if (sd->plexno < 0)					    /* no plex? */
+	return EINVAL;
+    plex = &PLEX[sd->plexno];				    /* point to plex */
+
+    if (sd->init_blocksize == 0) {
+	if (plex->stripesize != 0)			    /* we're striped, don't init more than */
+	    sd->init_blocksize = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
+		plex->stripesize << DEV_BSHIFT);
+	else
+	    sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
+    } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE)
+	sd->init_blocksize = MAX_REVIVE_BLOCKSIZE;
+
+    size = min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT;
+
+    verified = 0;
+    /*
+     * Write, optionally read back; redo only on a clean readback mismatch.
+     */
+    while (!verified && (error == 0)) {			    /* until we're happy with it, or it fails */
+	s = splbio();
+	bp = geteblk(size);				    /* Get a buffer */
+	splx(s);
+	if (bp == NULL)
+	    return ENOMEM;
+
+	bp->b_bcount = size;
+	bp->b_resid = bp->b_bcount;
+	bp->b_blkno = sd->initialized;			    /* write it to here */
+	bzero(bp->b_data, bp->b_bcount);
+	bp->b_dev = VINUM_SD(sdno);			    /* create the device number */
+	bp->b_iocmd = BIO_WRITE;
+	sdio(bp);					    /* perform the I/O */
+	bufwait(bp);
+	if (bp->b_ioflags & BIO_ERROR)
+	    error = bp->b_error;
+	if (bp->b_qindex == 0) {			    /* not on a queue, */
+	    bp->b_flags |= B_INVAL;
+	    bp->b_ioflags &= ~BIO_ERROR;
+	    brelse(bp);					    /* is this kosher? */
+	}
+	if ((error == 0) && verify) {			    /* check that it got there */
+	    s = splbio();
+	    bp = geteblk(size);				    /* get a buffer */
+	    if (bp == NULL) {
+		splx(s);
+		error = ENOMEM;
+	    } else {
+		bp->b_bcount = size;
+		bp->b_resid = bp->b_bcount;
+		bp->b_blkno = sd->initialized;		    /* read from here */
+		bp->b_dev = VINUM_SD(sdno);		    /* create the device number */
+		bp->b_iocmd = BIO_READ;			    /* read it back */
+		splx(s);
+		sdio(bp);
+		bufwait(bp);
+		/*
+		 * XXX Bug fix code.  This is hopefully no
+		 * longer needed (21 February 2000).
+		 */
+		if (bp->b_ioflags & BIO_ERROR)
+		    error = bp->b_error;
+		else if ((*bp->b_data != 0)		    /* first word spammed */
+		||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */
+		    printf("vinum: init error on %s, offset 0x%llx sectors\n",
+			sd->name,
+			(long long) sd->initialized);
+		    verified = 0;
+		} else
+		    verified = 1;
+		if (bp->b_qindex == 0) {		    /* not on a queue, */
+		    bp->b_flags |= B_INVAL;
+		    bp->b_ioflags &= ~BIO_ERROR;
+		    brelse(bp);				    /* is this kosher? */
+		}
+	    }
+	} else
+	    verified = 1;
+    }
+    if (error == 0) {					    /* did it, */
+	sd->initialized += size >> DEV_BSHIFT;		    /* moved this much further down */
+	if (sd->initialized >= sd->sectors) {		    /* finished */
+	    sd->initialized = 0;
+	    set_sd_state(sdno, sd_initialized, setstate_force);	/* mark the sd initialized */
+	    log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
+	    save_config();				    /* and save the updated configuration */
+	} else						    /* more to go, */
+	    error = EAGAIN;				    /* ya'll come back, see? */
+    }
+    return error;
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumstate.c b/sys/dev/vinum/vinumstate.c
new file mode 100644
index 0000000..59c9860
--- /dev/null
+++ b/sys/dev/vinum/vinumstate.c
@@ -0,0 +1,1093 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumstate.c,v 2.21 2003/04/28 02:54:43 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+/*
+ * Move a drive to a new state.
+ *
+ * Returns 0 when nothing is done: the drive slot is
+ * unallocated, or the caller asked for an open drive
+ * to be taken down without setstate_force.  Returns
+ * 1 in every other case, including when the drive is
+ * already in the requested state.
+ *
+ * Unless setstate_configuring is set in flags, the
+ * configuration is saved after the change.
+ */
+int
+set_drive_state(int driveno, enum drivestate newstate, enum setstateflags flags)
+{
+    struct drive *drv = &DRIVE[driveno];
+    int previous = drv->state;
+    int i;
+
+    if (previous == drive_unallocated)			    /* nothing there to change */
+	return 0;
+    if (previous == newstate)				    /* nothing to do */
+	return 1;
+
+    /* An open drive only goes down if the caller insists. */
+    if (newstate == drive_down) {
+	if (((flags & setstate_force) == 0)
+	    && (drv->opencount != 0))
+	    return 0;
+    }
+
+    drv->state = newstate;
+    if (drv->label.name[0] != '\0') {			    /* only report named drives */
+	log(LOG_INFO,
+	    "vinum: drive %s is %s\n",
+	    drv->label.name,
+	    drive_state(drv->state));
+    }
+
+    /* Let every subdisk which lives on this drive react to the change. */
+    if (drv->state != previous) {
+	for (i = 0; i < vinum_conf.subdisks_allocated; i++) {
+	    if (SD[i].state < sd_referenced)		    /* slot not in use */
+		continue;
+	    if (SD[i].driveno != driveno)		    /* somebody else's subdisk */
+		continue;
+	    update_sd_state(i);
+	}
+    }
+
+    /*
+     * Coming up: make sure the drive is open.
+     * Anything else: have the daemon close it.
+     */
+    if (newstate != drive_up)
+	queue_daemon_request(daemonrq_closedrive,
+	    (union daemoninfo) drv);
+    else if ((drv->flags & VF_OPEN) == 0)
+	init_drive(drv, 1);
+
+    if ((flags & setstate_configuring) == 0)		    /* not in the middle of configuring */
+	save_config();
+    return 1;
+}
+
+/*
+ * Try to set the subdisk state.
+ *
+ * Return values:
+ *
+ *   1       the subdisk is now in the requested state
+ *           (or already was)
+ *   0       no change was made
+ *   -1      the subdisk changed to some other state
+ *   EAGAIN  the subdisk is being revived; the caller
+ *           needs to come back
+ *
+ * This routine is called both from the user (up,
+ * down states only) and internally.
+ *
+ * The setstate_force bit in the flags enables the
+ * state change even if it could be dangerous to
+ * data consistency.  It shouldn't allow nonsense.
+ *
+ * When the end of the function is reached, the
+ * owning plex (if any) is updated and, unless
+ * setstate_configuring is set, the configuration is
+ * saved.  The early returns do neither.
+ */
+int
+set_sd_state(int sdno, enum sdstate newstate, enum setstateflags flags)
+{
+    struct sd *sd = &SD[sdno];
+    struct plex *plex;
+    struct volume *vol;
+    int oldstate = sd->state;
+    int status = 1;					    /* status to return */
+
+    if (newstate == oldstate)				    /* already there, */
+	return 1;
+    else if (sd->state == sd_unallocated)		    /* no subdisk to do anything with, */
+	return 0;					    /* can't do it */
+
+    if (sd->driveoffset < 0) {				    /* not allocated space */
+	sd->state = sd_down;
+	if (newstate != sd_down) {
+	    if (sd->plexno >= 0)
+		sdstatemap(&PLEX[sd->plexno]);		    /* count up subdisks */
+	    return -1;
+	}
+    } else {						    /* space allocated */
+	switch (newstate) {
+	case sd_down:					    /* take it down? */
+	    /*
+	     * If we're attached to a plex, and we're
+	     * not reborn, we won't go down without
+	     * use of force.
+	     *
+	     * The force bit must be masked out of
+	     * flags before testing it: "!flags &
+	     * setstate_force" negates the whole flags
+	     * word first, so the test fails as soon
+	     * as any other flag bit is set.
+	     */
+	    if (((flags & setstate_force) == 0)
+		&& (sd->plexno >= 0)
+		&& (sd->state != sd_reborn))
+		return 0;				    /* don't do it */
+	    break;
+
+	case sd_initialized:
+	    if ((sd->state == sd_initializing)		    /* we were initializing */
+	    ||(flags & setstate_force))			    /* or we forced it */
+		break;
+	    return 0;					    /* can't do it otherwise */
+
+	case sd_up:
+	    if (DRIVE[sd->driveno].state != drive_up)	    /* can't bring the sd up if the drive isn't, */
+		return 0;				    /* not even by force */
+	    if (flags & setstate_force)			    /* forcing it, */
+		break;					    /* just do it, and damn the consequences */
+	    switch (sd->state) {
+		/*
+		 * Perform the necessary tests.  To allow
+		 * the state transition, just break out of
+		 * the switch.
+		 */
+	    case sd_crashed:
+	    case sd_reborn:
+	    case sd_down:				    /* been down, no data lost */
+		/*
+		 * If we're associated with a plex, and
+		 * the plex isn't up, or the plex has
+		 * more than one subdisk, we can do it.
+		 */
+		if ((sd->plexno >= 0)
+		    && (((PLEX[sd->plexno].state < plex_firstup)
+			    || (PLEX[sd->plexno].subdisks > 1))))
+		    break;				    /* do it */
+		if (oldstate != sd_reborn) {
+		    sd->state = sd_reborn;		    /* here it is again */
+		    log(LOG_INFO,
+			"vinum: %s is %s, not %s\n",
+			sd->name,
+			sd_state(sd->state),
+			sd_state(newstate));
+		}
+		status = -1;
+		break;
+
+	    case sd_init:				    /* brand new */
+		if (flags & setstate_configuring)	    /* we're doing this while configuring */
+		    break;
+		/* otherwise it's like being empty */
+		/* FALLTHROUGH */
+
+	    case sd_empty:
+	    case sd_initialized:
+		/*
+		 * If we're not part of a plex, or the
+		 * plex is not part of a volume with other
+		 * plexes which are up, we can come up
+		 * without being inconsistent.
+		 *
+		 * If we're part of a parity plex, we'll
+		 * come up if the caller uses force.  This
+		 * is the way we bring them up after
+		 * initialization.
+		 */
+		if ((sd->plexno < 0)
+		    || ((vpstate(&PLEX[sd->plexno]) & volplex_otherup) == 0)
+		    || (isparity((&PLEX[sd->plexno]))
+			&& (flags & setstate_force)))
+		    break;
+
+		/* Otherwise it's just out of date */
+		/* FALLTHROUGH */
+
+	    case sd_stale:				    /* out of date info, need reviving */
+	    case sd_obsolete:
+		/*
+		 * 1.  If the subdisk is not part of a
+		 *     plex, bring it up, don't revive.
+		 *
+		 * 2.  If the subdisk is part of a
+		 *     one-plex volume or an unattached
+		 *     plex, and it's not RAID-4 or
+		 *     RAID-5, we *can't revive*.  The
+		 *     subdisk doesn't change its state.
+		 *
+		 * 3.  If the subdisk is part of a
+		 *     one-plex volume or an unattached
+		 *     plex, and it's RAID-4 or RAID-5,
+		 *     but more than one subdisk is down,
+		 *     we *still can't revive*.  The
+		 *     subdisk doesn't change its state.
+		 *
+		 * 4.  If the subdisk is part of a
+		 *     multi-plex volume, we'll change to
+		 *     reviving and let the revive
+		 *     routines find out whether it will
+		 *     work or not.  If they don't, the
+		 *     revive stops with an error message,
+		 *     but the state doesn't change
+		 *     (FWIW).
+		 */
+		if (sd->plexno < 0)			    /* no plex associated, */
+		    break;				    /* bring it up */
+		plex = &PLEX[sd->plexno];
+		if (plex->volno >= 0)			    /* have a volume */
+		    vol = &VOL[plex->volno];
+		else
+		    vol = NULL;
+		/*
+		 * We can't do it if:
+		 *
+		 * 1: we don't have a volume
+		 * 2: we're the only plex in the volume
+		 * 3: we're a RAID-4 or RAID-5 plex, and
+		 *    more than one subdisk is down.
+		 */
+		if (((vol == NULL)
+			|| (vol->plexes == 1))
+		    && ((!isparity(plex))
+			|| (plex->sddowncount > 1))) {
+		    if (sd->state == sd_initializing)	    /* it's finished initializing  */
+			sd->state = sd_initialized;
+		    else
+			return 0;			    /* can't do it */
+		} else {
+		    sd->state = sd_reviving;		    /* put in reviving state */
+		    sd->revived = 0;			    /* nothing done yet */
+		    status = EAGAIN;			    /* need to repeat */
+		}
+		break;
+
+	    case sd_reviving:
+		if (flags & setstate_force)		    /* insist, */
+		    break;
+		return EAGAIN;				    /* no, try again */
+
+	    default:					    /* can't do it */
+		/*
+		 * There's no way to bring subdisks up directly from
+		 * other states.  First they need to be initialized
+		 * or revived.
+		 */
+		return 0;
+	    }
+	    break;
+
+	default:					    /* other ones, only internal with force */
+	    if ((flags & setstate_force) == 0)		    /* no force?  What's this? */
+		return 0;				    /* don't do it */
+	}
+    }
+    if (status == 1) {					    /* we can do it, */
+	sd->state = newstate;
+	if (flags & setstate_force)
+	    log(LOG_INFO, "vinum: %s is %s by force\n", sd->name, sd_state(sd->state));
+	else
+	    log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
+    } else						    /* we don't get here with status 0 */
+	log(LOG_INFO,
+	    "vinum: %s is %s, not %s\n",
+	    sd->name,
+	    sd_state(sd->state),
+	    sd_state(newstate));
+    if (sd->plexno >= 0)				    /* we belong to a plex */
+	update_plex_state(sd->plexno);			    /* update plex state */
+    if ((flags & setstate_configuring) == 0)		    /* save config now */
+	save_config();
+    return status;
+}
+
+/*
+ * Request a state change for a plex.
+ *
+ * Plex state mostly reflects the aggregate state of
+ * the subdisks, so a request to come up is simply
+ * handed to update_plex_state, which decides whether
+ * the plex is ready.  The other accepted requests
+ * are:
+ *
+ *   plex_down          needs setstate_force if vpstate
+ *                      reports volplex_onlyus or
+ *                      volplex_onlyusup; the up
+ *                      subdisks are taken down too
+ *   plex_faulty        always honoured; the up
+ *                      subdisks are marked crashed
+ *   plex_initializing  only with setstate_force
+ *
+ * Returns 0 if the plex is unallocated, the request
+ * is refused or the requested state is not one of
+ * the above, otherwise 1.
+ */
+int
+set_plex_state(int plexno, enum plexstate state, enum setstateflags flags)
+{
+    struct plex *plex = &PLEX[plexno];
+    enum plexstate before = plex->state;
+    enum volplexstate relation;				    /* our standing among the volume's plexes */
+    int forced = (flags & setstate_force) != 0;
+
+    if (before == plex_unallocated)			    /* no plex here */
+	return 0;
+
+    /*
+     * Already in the state we want?  Then we're done,
+     * unless that state is up: in that case there is
+     * still some housekeeping to do.
+     */
+    if ((state != plex_up)
+	&& (state == before))
+	return 1;
+
+    relation = vpstate(plex);
+    if (state == plex_up) {
+	/*
+	 * Not even force brings up a plex which isn't
+	 * ready; update_plex_state checks that.
+	 */
+	update_plex_state(plex->plexno);
+    } else if (state == plex_down) {
+	if (!forced
+	    && ((relation == volplex_onlyus)
+		|| (relation == volplex_onlyusup)))
+	    return 0;					    /* needs force */
+	plex->state = state;
+	invalidate_subdisks(plex, sd_down);
+    } else if (state == plex_faulty) {
+	/* only requested internally, so no questions asked */
+	plex->state = state;
+	invalidate_subdisks(plex, sd_crashed);
+    } else if (state == plex_initializing) {
+	/* XXX consider what safeguards we need here */
+	if (!forced)
+	    return 0;
+	plex->state = state;
+    } else						    /* not a state we can be asked for */
+	return 0;
+
+    if (plex->state != before) {
+	log(LOG_INFO,
+	    "vinum: %s is %s\n",
+	    plex->name,
+	    plex_state(plex->state));
+    }
+
+    /* The volume may have come up or gone down with us. */
+    if (plex->volno >= 0)
+	update_volume_state(plex->volno);
+    if ((flags & setstate_configuring) == 0)
+	save_config();
+    return 1;
+}
+
+/*
+ * Request a state change for a volume.
+ *
+ * A request for volume_up is passed to
+ * update_volume_state, which derives the state from
+ * the plexes; this function then returns 0.  A
+ * request for volume_down is honoured if the volume
+ * is not open or if setstate_force is set, and
+ * returns 1.  Returns 1 if the volume is already in
+ * the requested state, and 0 in all other cases.
+ */
+int
+set_volume_state(int volno, enum volumestate state, enum setstateflags flags)
+{
+    struct volume *vol = &VOL[volno];
+
+    if (vol->state == volume_unallocated)		    /* nothing there */
+	return 0;
+    if (vol->state == state)				    /* nothing to do */
+	return 1;
+
+    switch (state) {
+    case volume_up:
+	update_volume_state(volno);
+	return 0;
+
+    case volume_down:
+	/* an open volume only goes down by force */
+	if (((vol->flags & VF_OPEN) != 0)
+	    && ((flags & setstate_force) == 0))
+	    return 0;
+	vol->state = volume_down;
+	log(LOG_INFO,
+	    "vinum: volume %s is %s\n",
+	    vol->name,
+	    volume_state(vol->state));
+	if ((flags & setstate_configuring) == 0)
+	    save_config();
+	return 1;
+
+    default:
+	return 0;					    /* no change */
+    }
+}
+
+/*
+ * Bring a subdisk's state into line with the state of
+ * the drive it lives on, then let the owning plex (if
+ * there is one) re-evaluate its own state.
+ */
+void
+update_sd_state(int sdno)
+{
+    struct sd *sd = &SD[sdno];
+    enum sdstate before = sd->state;
+    int drive_is_up = (DRIVE[sd->driveno].state == drive_up);
+
+    if (drive_is_up) {
+	/* the drive is back: nothing was lost while we were away */
+	if ((before == sd_down)
+	    || (before == sd_crashed))
+	    sd->state = sd_reborn;
+    } else if ((before == sd_up)
+	    || (before == sd_reborn)
+	    || (before == sd_reviving)
+	    || (before == sd_empty))
+	sd->state = sd_crashed;				    /* the drive went away under us */
+
+    if (sd->state != before) {
+	log(LOG_INFO,
+	    "vinum: %s is %s\n",
+	    sd->name,
+	    sd_state(sd->state));
+    }
+    if (sd->plexno >= 0)
+	update_plex_state(sd->plexno);
+}
+
+/*
+ * Helper for update_plex_state: mark a plex and
+ * every one of its subdisks as up, without any
+ * checks, and log each subdisk.
+ */
+void
+forceup(int plexno)
+{
+    struct plex *plex = &PLEX[plexno];
+    int i;
+
+    plex->state = plex_up;
+
+    for (i = 0; i < plex->subdisks; i++) {
+	struct sd *sd = &SD[plex->sdnos[i]];
+
+	sd->state = sd_up;
+	log(LOG_INFO, "vinum: %s is up\n", sd->name);
+    }
+}
+
+/*
+ * Set the state of a plex based on its environment.
+ *
+ * The new state is derived from the map of subdisk
+ * states returned by sdstatemap (which also refreshes
+ * plex->sddowncount) and from the standing of this
+ * plex among the other plexes of its volume, as
+ * reported by vpstate.  The tests below are evaluated
+ * in order, and the first one which matches wins.
+ *
+ * A change of state is logged.  If the plex belongs
+ * to a volume, the volume state is updated as well.
+ */
+void
+update_plex_state(int plexno)
+{
+    struct plex *plex;					    /* point to our plex */
+    enum plexstate oldstate;
+    enum sdstates statemap;				    /* get a map of the subdisk states */
+    enum volplexstate vps;				    /* how do we compare with the other plexes? */
+
+    plex = &PLEX[plexno];				    /* point to our plex */
+    oldstate = plex->state;
+    statemap = sdstatemap(plex);			    /* get a map of the subdisk states */
+    vps = vpstate(plex);				    /* how do we compare with the other plexes? */
+
+    if (statemap & sd_initstate)			    /* something initializing? */
+	plex->state = plex_initializing;		    /* yup, that makes the plex the same */
+    else if (statemap == sd_upstate)
+	/*
+	 * All the subdisks are up.  This also means that
+	 * they are consistent, so we can just bring
+	 * the plex up
+	 */
+	plex->state = plex_up;
+    else if (isparity(plex)				    /* RAID-4 or RAID-5 plex */
+    &&(plex->sddowncount == 1))				    /* and exactly one subdisk down */
+	plex->state = plex_degraded;			    /* limping a bit */
+    else if (((statemap & ~sd_downstate) == sd_emptystate)  /* all subdisks empty */
+    ||((statemap & ~sd_downstate)
+	    == (statemap & ~sd_downstate & (sd_initializedstate | sd_upstate)))) {
+	/*
+	 * Ignoring subdisks which are down, the rest are
+	 * either all empty, or made up only of
+	 * initialized and up subdisks.
+	 */
+	if ((vps & volplex_otherup) == 0) {		    /* no other plex is up */
+	    /*
+	     * NOTE(review): vol is formed before
+	     * plex->volno is checked; it is only
+	     * dereferenced after the volno >= 0 test
+	     * below.
+	     */
+	    struct volume *vol = &VOL[plex->volno];	    /* possible volume to which it points */
+
+	    /*
+	     * If we're a striped or concat plex
+	     * associated with a volume, none of whose
+	     * plexes are up, and we're new and untested,
+	     * and the volume has the setupstate bit set,
+	     * we can pretend to be in a consistent state.
+	     *
+	     * We need to do this in one swell foop: on
+	     * the next call we will no longer be just
+	     * empty.
+	     *
+	     * This code assumes that all the other plexes
+	     * are also capable of coming up (i.e. all the
+	     * sds are up), but that's OK: we'll come back
+	     * to this function for the remaining plexes
+	     * in the volume.
+	     */
+	    if ((plex->state == plex_init)
+		&& (plex->volno >= 0)
+		&& (vol->flags & VF_CONFIG_SETUPSTATE)) {
+		/*
+		 * Note that this loop reuses the plexno
+		 * parameter as its index, so plexno no
+		 * longer identifies our plex afterwards.
+		 * The rest of the function only uses
+		 * the plex pointer.
+		 */
+		for (plexno = 0; plexno < vol->plexes; plexno++)
+		    forceup(VOL[plex->volno].plex[plexno]);
+	    } else if ((statemap == sd_initializedstate)    /* if it's initialized (not empty) */
+	    ||(plex->organization == plex_concat)	    /* and we're not RAID-4 or RAID-5 */
+	    ||(plex->organization == plex_striped))
+		forceup(plexno);			    /* we'll do it */
+	    /*
+	     * This leaves a case where things don't get
+	     * done: the plex is RAID-4 or RAID-5, and
+	     * the subdisks are all empty.  They need to
+	     * be initialized first.
+	     */
+	} else {
+	    /*
+	     * NOTE(review): statemap == sd_upstate was
+	     * already handled by an earlier branch of
+	     * the chain, so this test looks as if it
+	     * can never succeed here.
+	     */
+	    if (statemap == sd_upstate)			    /* all subdisks up */
+		plex->state = plex_up;			    /* we can come up too */
+	    else
+		plex->state = plex_faulty;
+	}
+    } else if ((statemap & (sd_upstate | sd_rebornstate)) == statemap) /* all up or reborn */
+	plex->state = plex_flaky;
+    else if (statemap & (sd_upstate | sd_rebornstate))	    /* some up or reborn */
+	plex->state = plex_corrupt;			    /* corrupt */
+    else if (statemap & (sd_initstate | sd_emptystate))	    /* some subdisks empty or initializing */
+	plex->state = plex_initializing;
+    else						    /* nothing at all up */
+	plex->state = plex_faulty;
+
+    if (plex->state != oldstate)			    /* state has changed, */
+	log(LOG_INFO,					    /* tell them about it */
+	    "vinum: %s is %s\n",
+	    plex->name,
+	    plex_state(plex->state));
+    if (plex->volno >= 0)				    /* we're part of a volume, */
+	update_volume_state(plex->volno);		    /* update its state */
+}
+
+/*
+ * Derive the state of a volume from its plexes: the
+ * volume is up if at least one plex is in state
+ * plex_corrupt or better, otherwise it is down.  A
+ * change of state is logged and the configuration
+ * saved.
+ */
+void
+update_volume_state(int volno)
+{
+    struct volume *vol = &VOL[volno];
+    enum volumestate before = vol->state;
+    enum volumestate after = volume_down;		    /* until we find an accessible plex */
+    int i;
+
+    for (i = 0; i < vol->plexes; i++) {
+	if (PLEX[vol->plex[i]].state >= plex_corrupt) {	    /* something accessible */
+	    after = volume_up;
+	    break;
+	}
+    }
+    vol->state = after;
+
+    if (after != before) {
+	log(LOG_INFO, "vinum: %s is %s\n", vol->name, volume_state(vol->state));
+	save_config();
+    }
+}
+
+/*
+ * Called from the request routines when they come
+ * across a subdisk which is not simply up.  Decide
+ * whether the subdisk can be used for the request,
+ * and whether the request should change the state of
+ * the subdisk.
+ *
+ * Returns REQUEST_OK if the subdisk can be used and
+ * REQUEST_DOWN if it can't.
+ *
+ * Only the subdisk state is considered.  A prior
+ * version of this function checked the plex state as
+ * well; at the moment plex states are treated as
+ * information for the user only.  The last version
+ * of this file with the old logic was 2.7. XXX
+ */
+enum requeststatus
+checksdstate(struct sd *sd, struct request *rq, daddr_t diskaddr, daddr_t diskend)
+{
+    struct plex *plex = &PLEX[sd->plexno];
+    int is_write = (rq->bp->b_iocmd == BIO_WRITE);
+
+    /* We shouldn't get called if the subdisk is up */
+    if (sd->state == sd_up)
+	return REQUEST_OK;
+
+    /*
+     * Always write to a reborn subdisk, but don't
+     * read from it.  XXX We don't want to reject a
+     * read if the reborn subdisk is all we have.
+     */
+    if (sd->state == sd_reborn)
+	return is_write ? REQUEST_OK : REQUEST_DOWN;
+
+    /*
+     * Down and crashed subdisks are consistent until
+     * somebody writes to the plex without them.
+     */
+    if (sd->state == sd_down) {
+	if (is_write)
+	    set_sd_state(sd->sdno, sd_obsolete, setstate_force);
+	return REQUEST_DOWN;
+    }
+    if (sd->state == sd_crashed) {
+	if (is_write)
+	    set_sd_state(sd->sdno, sd_stale, setstate_force);
+	return REQUEST_DOWN;
+    }
+    if (sd->state != sd_reviving)			    /* nothing else is usable */
+	return REQUEST_DOWN;
+
+    /*
+     * What we can do with a reviving subdisk depends
+     * on the organization of the plex:
+     *
+     * - striped: we can't use it (we could do some
+     *   hairy calculations, but it's unlikely to
+     *   work).
+     * - RAID-4 or RAID-5: we can use it as long as
+     *   no more than one subdisk is down.
+     * - concatenated: we can use it up to the
+     *   current revive point.  A write which
+     *   overlaps the block currently being revived
+     *   gets the conflict flag set in the request,
+     *   asking the caller to put the request on the
+     *   wait list, which revive_block attends to
+     *   when it's done.
+     */
+    if (plex->organization == plex_striped)
+	return REQUEST_DOWN;
+
+    /*
+     * XXX For the parity case we shouldn't say OK if
+     * we can find a better way.  Check the other
+     * plexes first, and return a DOWN if another plex
+     * will do it better.
+     */
+    if (isparity(plex))
+	return (plex->sddowncount > 1) ? REQUEST_DOWN : REQUEST_OK;
+
+    if (diskaddr > (sd->revived
+	    + sd->plexoffset
+	    + (sd->revive_blocksize >> DEV_BSHIFT)))	    /* starts beyond the revived part */
+	return REQUEST_DOWN;
+    if (diskend > (sd->revived + sd->plexoffset)) {	    /* ends beyond the revived part */
+	if (!is_write)
+	    return REQUEST_DOWN;
+	rq->flags |= XFR_REVIVECONFLICT;		    /* note a potential conflict */
+	rq->sdno = sd->sdno;				    /* and which sd last caused it */
+    }
+    return REQUEST_OK;
+}
+
+/*
+ * Build a bit map of the states found among the
+ * subdisks of a plex and return it.  As a side
+ * effect, recount plex->sddowncount: every subdisk
+ * which is neither up nor reborn counts as down.
+ */
+enum sdstates
+sdstatemap(struct plex *plex)
+{
+    enum sdstates seen = 0;				    /* states found so far */
+    int i;
+
+    plex->sddowncount = 0;
+    for (i = 0; i < plex->subdisks; i++) {
+	enum sdstates bit;
+	int usable = 0;					    /* can this subdisk take I/O? */
+
+	switch (SD[plex->sdnos[i]].state) {
+	case sd_up:
+	    bit = sd_upstate;
+	    usable = 1;
+	    break;
+
+	case sd_reborn:
+	    bit = sd_rebornstate;
+	    usable = 1;
+	    break;
+
+	case sd_empty:
+	    bit = sd_emptystate;
+	    break;
+
+	case sd_init:
+	case sd_initializing:				    /* both count as "init" */
+	    bit = sd_initstate;
+	    break;
+
+	case sd_initialized:
+	    bit = sd_initializedstate;
+	    break;
+
+	case sd_down:
+	    bit = sd_downstate;
+	    break;
+
+	case sd_crashed:
+	    bit = sd_crashedstate;
+	    break;
+
+	case sd_obsolete:
+	    bit = sd_obsoletestate;
+	    break;
+
+	case sd_stale:
+	    bit = sd_stalestate;
+	    break;
+
+	case sd_unallocated:
+	case sd_uninit:
+	case sd_reviving:
+	case sd_referenced:
+	    bit = sd_otherstate;
+	    break;
+
+	default:					    /* not a state we know: ignore it */
+	    continue;
+	}
+
+	seen |= bit;
+	if (!usable)
+	    (plex->sddowncount)++;			    /* another unusable subdisk */
+    }
+    return seen;
+}
+
+/*
+ * Determine the state of the volume relative to this
+ * plex.
+ *
+ * For a plex without a volume the result is
+ * volplex_onlyus if its state is better than
+ * plex_degraded, otherwise volplex_onlyusdown.
+ *
+ * For a plex in a volume the result starts as
+ * volplex_onlyusdown and collects one bit per plex
+ * of the volume: volplex_onlyus if we ourselves are
+ * at least degraded, and volplex_otherup or
+ * volplex_alldown for each other plex, depending on
+ * whether it is at least degraded or not.
+ */
+enum volplexstate
+vpstate(struct plex *plex)
+{
+    struct volume *vol;
+    enum volplexstate result = volplex_onlyusdown;
+    int i;
+
+    if (plex->volno < 0)				    /* on our own: assume the worst unless we're up */
+	return (plex->state > plex_degraded) ? volplex_onlyus : volplex_onlyusdown;
+
+    vol = &VOL[plex->volno];
+    for (i = 0; i < vol->plexes; i++) {
+	struct plex *member = &PLEX[vol->plex[i]];
+	int usable = (member->state >= plex_degraded);
+
+	if (member == plex) {				    /* that's us */
+	    if (usable)
+		result |= volplex_onlyus;
+	} else if (usable)				    /* somebody else, and up */
+	    result |= volplex_otherup;
+	else						    /* somebody else, and down */
+	    result |= volplex_alldown;
+    }
+    return result;
+}
+
+/*
+ * Return non-zero if every bit which is set in b is
+ * also set in a.
+ */
+int allset(int a, int b);
+
+int
+allset(int a, int b)
+{
+    /* b must have no bits left over outside a */
+    return (b & ~a) == 0;
+}
+
+/*
+ * Invalidate the subdisks belonging to a plex: force
+ * every subdisk which is currently reviving, reborn
+ * or up into the given state.  Subdisks in any other
+ * state are left alone.
+ */
+void
+invalidate_subdisks(struct plex *plex, enum sdstate state)
+{
+    int i;
+
+    for (i = 0; i < plex->subdisks; i++) {
+	int sdno = plex->sdnos[i];
+	enum sdstate current = SD[sdno].state;
+
+	if ((current == sd_reviving)
+	    || (current == sd_reborn)
+	    || (current == sd_up))
+	    set_sd_state(sdno, state, setstate_force);
+    }
+}
+
+/*
+ * Start an object, in other words do what we can to
+ * get it up.  This is called from vinumioctl
+ * (VINUMSTART).
+ *
+ * The reply is written over the request: error
+ * indications are returned via the ioctl reply
+ * structure which shares storage with data.  For
+ * that reason everything needed from the request is
+ * read before the reply is filled in.
+ *
+ * For a subdisk which is reviving, stale or
+ * initializing, each call processes another block
+ * and returns the result of revive_block or initsd
+ * as the error.
+ */
+void
+start_object(struct vinum_ioctl_msg *data)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) data;
+    int objindex = data->index;				    /* data gets overwritten */
+    enum setstateflags flags = (data->force != 0) ? setstate_force : setstate_none;
+    struct sd *sd;
+    int status;
+
+    switch (data->type) {
+    case drive_object:
+	(void) set_drive_state(objindex, drive_up, flags);
+	/* what counts is whether we really did it */
+	reply->error = (DRIVE[objindex].state != drive_up) ? EBUSY : 0;
+	break;
+
+    case sd_object:
+	sd = &SD[objindex];
+	if (DRIVE[sd->driveno].state != drive_up) {	    /* no drive, no subdisk */
+	    reply->error = EIO;
+	    strcpy(reply->msg, "Drive is down");
+	    return;
+	}
+	if (data->blocksize)
+	    sd->revive_blocksize = data->blocksize;
+
+	if ((sd->state == sd_reviving)
+	    || (sd->state == sd_stale)) {		    /* stale will revive too */
+	    sd->state = sd_reviving;			    /* make sure we're reviving */
+	    reply->error = revive_block(objindex);	    /* revive another block */
+	    reply->msg[0] = '\0';
+	    return;
+	}
+	if (sd->state == sd_initializing) {
+	    if (data->blocksize)
+		sd->init_blocksize = data->blocksize;
+	    reply->error = initsd(objindex, data->verify);  /* initialize another block */
+	    reply->msg[0] = '\0';
+	    return;
+	}
+
+	status = set_sd_state(objindex, sd_up, flags);
+	if (status == EAGAIN)				    /* first revive or initialize */
+	    reply->error = status;
+	else						    /* what counts is whether we really did it */
+	    reply->error = (sd->state != sd_up) ? EBUSY : 0;
+	break;
+
+    case plex_object:
+	(void) set_plex_state(objindex, plex_up, flags);
+	reply->error = (PLEX[objindex].state != plex_up) ? EBUSY : 0;
+	break;
+
+    case volume_object:
+	(void) set_volume_state(objindex, volume_up, flags);
+	reply->error = (VOL[objindex].state != volume_up) ? EBUSY : 0;
+	break;
+
+    default:
+	reply->error = EINVAL;
+	strcpy(reply->msg, "Invalid object type");
+	return;
+    }
+
+    /*
+     * There's no point in saying anything here: the
+     * userland program does it better.
+     */
+    reply->msg[0] = '\0';
+}
+
+/*
+ * Stop an object, in other words do what we can to
+ * get it down.  This is called from vinumioctl
+ * (VINUMSTOP).
+ *
+ * The reply is written over the request.  The error
+ * is EINVAL for an unknown object type, EBUSY if the
+ * set state function returned 0, and otherwise 0.
+ */
+void
+stop_object(struct vinum_ioctl_msg *data)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) data;
+    int objindex = data->index;				    /* save the number from change */
+    int changed;
+
+    if (data->type == drive_object)
+	changed = set_drive_state(objindex, drive_down, data->force);
+    else if (data->type == sd_object)
+	changed = set_sd_state(objindex, sd_down, data->force);
+    else if (data->type == plex_object)
+	changed = set_plex_state(objindex, plex_down, data->force);
+    else if (data->type == volume_object)
+	changed = set_volume_state(objindex, volume_down, data->force);
+    else {
+	reply->error = EINVAL;
+	strcpy(reply->msg, "Invalid object type");
+	return;
+    }
+
+    reply->msg[0] = '\0';
+    reply->error = (changed == 0) ? EBUSY : 0;		    /* 0 means we couldn't do it */
+}
+
+/*
+ * VINUM_SETSTATE ioctl: set an object state.
+ * msg is the message passed by the user.
+ *
+ * The reply is written over the message.  Requests
+ * for object_down and object_up are handed to
+ * stop_object and start_object.  Requests for
+ * object_initializing (subdisks and plexes) and
+ * object_initialized (subdisks only) are handled
+ * here:
+ *
+ *   EFAULT  the index does not refer to a usable
+ *           subdisk or plex
+ *   EINVAL  the object type can't be put in the
+ *           requested state
+ *   EBUSY   the object did not end up in the
+ *           requested state
+ *   0       success
+ *
+ * The index comes from userland, so it is checked
+ * against both ends of the table before the table
+ * is touched.
+ */
+void
+setstate(struct vinum_ioctl_msg *msg)
+{
+    int sdno;
+    int objindex = msg->index;				    /* the reply overwrites the message */
+    struct sd *sd;
+    struct plex *plex;
+    struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) msg; /* format for returning replies */
+
+    switch (msg->state) {
+    case object_down:
+	stop_object(msg);
+	break;
+
+    case object_initializing:
+	switch (msg->type) {
+	case sd_object:
+	    if ((objindex < 0)
+		|| (objindex >= vinum_conf.subdisks_allocated)
+		|| (SD[objindex].state <= sd_referenced)) {
+		sprintf(ioctl_reply->msg, "Invalid subdisk %d", objindex);
+		ioctl_reply->error = EFAULT;
+		return;
+	    }
+	    sd = &SD[objindex];
+	    set_sd_state(objindex, sd_initializing, msg->force);
+	    if (sd->state != sd_initializing) {
+		strcpy(ioctl_reply->msg, "Can't set state");
+		ioctl_reply->error = EBUSY;
+	    } else
+		ioctl_reply->error = 0;
+	    break;
+
+	case plex_object:
+	    if ((objindex < 0)
+		|| (objindex >= vinum_conf.plexes_allocated)
+		|| (PLEX[objindex].state <= plex_unallocated)) {
+		sprintf(ioctl_reply->msg, "Invalid plex %d", objindex);
+		ioctl_reply->error = EFAULT;
+		return;
+	    }
+	    plex = &PLEX[objindex];
+	    set_plex_state(objindex, plex_initializing, msg->force);
+	    if (plex->state != plex_initializing) {
+		strcpy(ioctl_reply->msg, "Can't set state");
+		ioctl_reply->error = EBUSY;
+	    } else {
+		/* the plex went: now take all its subdisks along */
+		ioctl_reply->error = 0;
+		for (sdno = 0; sdno < plex->subdisks; sdno++) {
+		    sd = &SD[plex->sdnos[sdno]];
+		    set_sd_state(plex->sdnos[sdno], sd_initializing, msg->force);
+		    if (sd->state != sd_initializing) {
+			strcpy(ioctl_reply->msg, "Can't set state");
+			ioctl_reply->error = EBUSY;
+			break;
+		    }
+		}
+	    }
+	    break;
+
+	default:
+	    strcpy(ioctl_reply->msg, "Invalid object");
+	    ioctl_reply->error = EINVAL;
+	}
+	break;
+
+    case object_initialized:
+	if (msg->type == sd_object) {
+	    if ((objindex < 0)
+		|| (objindex >= vinum_conf.subdisks_allocated)
+		|| (SD[objindex].state <= sd_referenced)) {
+		sprintf(ioctl_reply->msg, "Invalid subdisk %d", objindex);
+		ioctl_reply->error = EFAULT;
+		return;
+	    }
+	    sd = &SD[objindex];
+	    set_sd_state(objindex, sd_initialized, msg->force);
+	    /*
+	     * Check for the state we asked for.
+	     * Comparing with sd_initializing here
+	     * would report a successful change as a
+	     * failure, and a refused one as success.
+	     */
+	    if (sd->state != sd_initialized) {
+		strcpy(ioctl_reply->msg, "Can't set state");
+		ioctl_reply->error = EBUSY;
+	    } else
+		ioctl_reply->error = 0;
+	} else {
+	    strcpy(ioctl_reply->msg, "Invalid object");
+	    ioctl_reply->error = EINVAL;
+	}
+	break;
+
+    case object_up:
+	start_object(msg);
+    }
+}
+
+/*
+ * Brute force set state function: store the
+ * requested state directly in the object, without
+ * looking at any dependencies and without checking
+ * the index.  This is mainly intended for testing
+ * and recovery.  Unknown object types are ignored.
+ * The reply error is always 0.
+ */
+void
+setstate_by_force(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; /* the reply overlays the message */
+
+    if (msg->type == drive_object)
+	DRIVE[msg->index].state = msg->state;
+    else if (msg->type == sd_object)
+	SD[msg->index].state = msg->state;
+    else if (msg->type == plex_object)
+	PLEX[msg->index].state = msg->state;
+    else if (msg->type == volume_object)
+	VOL[msg->index].state = msg->state;
+
+    reply->error = 0;
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumstate.h b/sys/dev/vinum/vinumstate.h
new file mode 100644
index 0000000..572f317
--- /dev/null
+++ b/sys/dev/vinum/vinumstate.h
@@ -0,0 +1,257 @@
+/*-
+ * Copyright (c) 1997, 1998
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This file gets read by makestatetext to create text files
+ * with the names of the states, so don't change the file
+ * format
+ */
+
+enum volumestate {
+    volume_unallocated,
+    /* present but unused.  Must be 0 */
+
+    volume_uninit,
+    /* mentioned elsewhere but not known to the configuration */
+
+    volume_down,
+
+    /* The volume is up and functional, but not all plexes may be available */
+    volume_up,
+    volume_laststate = volume_up			    /* last value, for table dimensions */
+};
+
+enum plexstate {
+    /* An empty entry, not a plex at all.   */
+    plex_unallocated,
+
+    /* The plex has been referenced by a volume */
+    plex_referenced,
+    /*
+     * The plex has been allocated, but its configuration
+     * is not complete
+     */
+    plex_init,
+
+    /*
+     * A plex which has gone completely down because of
+     * I/O errors.
+     */
+    plex_faulty,
+
+    /*
+     * A plex which has been taken down by the
+     * administrator.
+     */
+    plex_down,
+
+    /* A plex which is being initialized */
+    plex_initializing,
+
+    /*
+     * *** The remaining states represent plexes which are
+     * at least partially up.  Keep these separate so that
+     * they can be checked more easily.
+     */
+
+    /*
+     * A plex entry which is at least partially up.  Not
+     * all subdisks are available, and an inconsistency
+     * has occurred.  If no other plex is uncorrupted,
+     * the volume is no longer consistent.
+     */
+    plex_corrupt,
+
+    plex_firstup = plex_corrupt,			    /* first "up" state */
+
+    /*
+     * A RAID-5 plex entry which is accessible, but one
+     * subdisk is down, requiring recovery for many
+     * I/O requests.
+     */
+    plex_degraded,
+
+    /*
+     * A plex which is really up, but which has a reborn
+     * subdisk which we don't completely trust, and
+     * which we don't want to read if we can avoid it
+     */
+    plex_flaky,
+
+    /*
+     * A plex entry which is completely up.  All subdisks
+     * are up.
+     */
+    plex_up,
+
+    plex_laststate = plex_up				    /* last value, for table dimensions */
+};
+
+/* subdisk states */
+enum sdstate {
+    /* An empty entry, not a subdisk at all. */
+    sd_unallocated,
+
+    /*
+     * A subdisk entry which has not been created
+     * completely.  Some fields may be empty.
+     */
+    sd_uninit,
+
+    /* The subdisk has been referenced by a plex */
+    sd_referenced,
+
+    /*
+     * A subdisk entry which has been created completely.
+     * All fields are correct, but the disk hasn't
+     * been updated.
+     */
+    sd_init,
+
+    /*
+     * A subdisk entry which has been created completely.
+     * All fields are correct, and the disk has been
+     * updated, but there is no data on the disk.
+     */
+    sd_empty,
+
+    /*
+     * A subdisk entry which has been created completely and
+     * which is currently being initialized
+     */
+    sd_initializing,
+
+    /*
+     * A subdisk entry which has been initialized,
+     * but which can't come up because it would
+     * cause inconsistencies.
+     */
+    sd_initialized,
+
+    /* *** The following states represent invalid data */
+    /*
+     * A subdisk entry which has been created completely.
+     * All fields are correct, the config on disk has been
+     * updated, and the data was valid, but since then the
+     * drive has been taken down, and as a result updates
+     * have been missed.
+     */
+    sd_obsolete,
+
+    /*
+     * A subdisk entry which has been created completely.
+     * All fields are correct, the disk has been updated,
+     * and the data was valid, but since then the drive
+     * has crashed and updates have been lost.
+     */
+    sd_stale,
+
+    /* *** The following states represent valid, inaccessible data */
+
+    /*
+     * A subdisk entry which has been created completely.
+     * All fields are correct, the disk has been updated,
+     * and the data was valid, but since then the drive
+     * has gone down.   No attempt has been made to write
+     * to the subdisk since the crash, so the data is valid.
+     */
+    sd_crashed,
+
+    /*
+     * A subdisk entry which was up, which contained
+     * valid data, and which was taken down by the
+     * administrator.  The data is valid.
+     */
+    sd_down,
+
+    /*
+     * *** This is invalid data (the subdisk previously had
+     * a numerically lower state), but it is currently in the
+     * process of being revived.  We can write but not read.
+     */
+    sd_reviving,
+
+    /*
+     * *** The following states represent accessible subdisks
+     * with valid data
+     */
+
+    /*
+     * A subdisk entry which has been created completely.
+     * All fields are correct, the disk has been updated,
+     * and the data was valid, but since then the drive
+     * has gone down and up again.  No updates were lost,
+     * but it is possible that the subdisk has been
+     * damaged.  We won't read from this subdisk if we
+     * have a choice.  If this is the only subdisk which
+     * covers this address space in the plex, we set its
+     * state to sd_up under these circumstances, so this
+     * status implies that there is another subdisk to
+     * fulfil the request.
+     */
+    sd_reborn,
+
+    /*
+     * A subdisk entry which has been created completely.
+     * All fields are correct, the disk has been updated,
+     * and the data is valid.
+     */
+    sd_up,
+
+    sd_laststate = sd_up				    /* last value, for table dimensions */
+};
+
+enum drivestate {
+    drive_unallocated,
+    /* present but unused.  Must be 0 */
+
+    drive_referenced,
+    /* just mentioned in some other config entry */
+
+    drive_down,
+    /* not accessible */
+
+    drive_up,
+    /* up and running */
+
+    drive_laststate = drive_up				    /* last value, for table dimensions */
+};
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumutil.c b/sys/dev/vinum/vinumutil.c
new file mode 100644
index 0000000..5d3fe82
--- /dev/null
+++ b/sys/dev/vinum/vinumutil.c
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumutil.c,v 1.17 2003/04/28 02:54:43 grog Exp $
+ * $FreeBSD$
+ */
+
+/* This file contains utility routines used both in kernel and user context */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/statetexts.h>
+#ifndef _KERNEL
+#include <stdio.h>
+#include <string.h>
+extern jmp_buf command_fail;				    /* return on a failed command */
+#endif
+
+static char numeric_state[32];				    /* shared buffer for the "Invalid ..." texts; each use overwrites the last */
+#define STATECOUNT(x) (sizeof (x##statetext) / sizeof (char *))	/* number of entries in the table <x>statetext */
+/* Return the name of a drive state, or "Invalid state <n>" if it has no name */
+char *
+drive_state(enum drivestate state)
+{
+    if (((unsigned) state) < STATECOUNT(drive))		    /* we have a text for it */
+	return drivestatetext[state];
+
+    sprintf(numeric_state, "Invalid state %d", (int) state);
+    return numeric_state;
+}
+
+/* Return the name of a volume state, or "Invalid state <n>" if it has no name */
+char *
+volume_state(enum volumestate state)
+{
+    if (((unsigned) state) < STATECOUNT(vol))		    /* we have a text for it */
+	return volstatetext[state];
+
+    sprintf(numeric_state, "Invalid state %d", (int) state);
+    return numeric_state;
+}
+
+/* Return the name of a plex state, or "Invalid state <n>" if it has no name */
+char *
+plex_state(enum plexstate state)
+{
+    if (((unsigned) state) < STATECOUNT(plex))		    /* we have a text for it */
+	return plexstatetext[state];
+
+    sprintf(numeric_state, "Invalid state %d", (int) state);
+    return numeric_state;
+}
+
+/* Return the name of a plex organization, or "Invalid org <n>" if it is unknown */
+char *
+plex_org(enum plexorg org)
+{
+    char *text;						    /* the name we hand back */
+
+    switch (org) {
+    case plex_disorg:					    /* disorganized */
+	text = "disorg";
+	break;
+    case plex_concat:					    /* concatenated plex */
+	text = "concat";
+	break;
+    case plex_striped:					    /* striped plex */
+	text = "striped";
+	break;
+    case plex_raid4:					    /* RAID-4 plex */
+	text = "raid4";
+	break;
+    case plex_raid5:					    /* RAID-5 plex */
+	text = "raid5";
+	break;
+
+    default:						    /* format the number instead */
+	sprintf(numeric_state, "Invalid org %d", (int) org);
+	text = numeric_state;
+    }
+    return text;
+}
+
+/* Return the name of a subdisk state, or "Invalid state <n>" if it has no name */
+char *
+sd_state(enum sdstate state)
+{
+    if (((unsigned) state) < STATECOUNT(sd))		    /* we have a text for it */
+	return sdstatetext[state];
+
+    sprintf(numeric_state, "Invalid state %d", (int) state);
+    return numeric_state;
+}
+
+/* Now convert in the other direction */
+/*
+ * These are currently used only internally,
+ * so we don't do too much error checking
+ */
+enum drivestate
+DriveState(char *text)
+{
+    int state = 0;					    /* candidate state, index into drivestatetext */
+    while ((state < STATECOUNT(drive))			    /* names left to try, */
+	&& (strcmp(text, drivestatetext[state]) != 0))	    /* and this one doesn't match */
+	state++;
+    return (state < STATECOUNT(drive)) ? (enum drivestate) state : -1; /* -1: no such name */
+}
+
+enum sdstate
+SdState(char *text)
+{
+    int state = 0;					    /* candidate state, index into sdstatetext */
+    while ((state < STATECOUNT(sd))			    /* names left to try, */
+	&& (strcmp(text, sdstatetext[state]) != 0))	    /* and this one doesn't match */
+	state++;
+    return (state < STATECOUNT(sd)) ? (enum sdstate) state : -1; /* -1: no such name */
+}
+
+enum plexstate
+PlexState(char *text)
+{
+    int state = 0;					    /* candidate state, index into plexstatetext */
+    while ((state < STATECOUNT(plex))			    /* names left to try, */
+	&& (strcmp(text, plexstatetext[state]) != 0))	    /* and this one doesn't match */
+	state++;
+    return (state < STATECOUNT(plex)) ? (enum plexstate) state : -1; /* -1: no such name */
+}
+
+enum volumestate
+VolState(char *text)
+{
+    int state = 0;					    /* candidate state, index into volstatetext */
+    while ((state < STATECOUNT(vol))			    /* names left to try, */
+	&& (strcmp(text, volstatetext[state]) != 0))	    /* and this one doesn't match */
+	state++;
+    return (state < STATECOUNT(vol)) ? (enum volumestate) state : -1; /* -1: no such name */
+}
+
+/*
+ * Take a number with an optional scale factor and convert
+ * it to a number of bytes.  A leading '-' negates the result
+ * (modulo 2^64, since the return type is unsigned).
+ * The scale factors (in either case) are:
+ *
+ * s    sectors (of 512 bytes)
+ * b    blocks (of 512 bytes).  This unit is deprecated,
+ *      because it's confusing, but maintained to avoid
+ *      confusing Veritas users.
+ * k    kilobytes (1024 bytes)
+ * m    megabytes (of 1024 * 1024 bytes)
+ * g    gigabytes (of 1024 * 1024 * 1024 bytes)
+ */
+u_int64_t
+sizespec(char *spec)
+{
+    u_int64_t size;
+    char *s;
+    int sign = 1;					    /* -1 if negative */
+
+    size = 0;
+    if (spec != NULL) {					    /* we have a parameter */
+	s = spec;
+	if (*s == '-') {				    /* negative, */
+	    sign = -1;
+	    s++;					    /* skip the sign */
+	}
+	if ((*s >= '0') && (*s <= '9')) {		    /* it starts with a digit */
+	    while ((*s >= '0') && (*s <= '9'))		    /* for each decimal digit, */
+		size = size * 10 + *s++ - '0';		    /* accumulate it (overflow is not checked) */
+	    switch (*s) {				    /* only the first character after the digits is examined */
+	    case '\0':					    /* no scale factor: plain bytes */
+		return size * sign;
+
+	    case 'B':
+	    case 'b':
+	    case 'S':
+	    case 's':
+		return size * sign * 512;
+
+	    case 'K':
+	    case 'k':
+		return size * sign * 1024;
+
+	    case 'M':
+	    case 'm':
+		return size * sign * 1024 * 1024;
+
+	    case 'G':
+	    case 'g':
+		return size * sign * 1024 * 1024 * 1024;
+	    }						    /* any other suffix drops out to the error below */
+	}
+#ifdef _KERNEL
+	throw_rude_remark(EINVAL, "Invalid length specification: %s", spec);
+#else
+	fprintf(stderr, "Invalid length specification: %s", spec);
+	longjmp(command_fail, 1);			    /* give up on this command */
+#endif
+    }
+#ifdef _KERNEL
+    throw_rude_remark(EINVAL, "Missing length specification");
+#else
+    fprintf(stderr, "Missing length specification");
+    longjmp(command_fail, 1);				    /* give up on this command */
+#endif
+    /* NOTREACHED */
+    return -1;
+}
+
+/*
+ * Return the volume number encoded in a device number, or -1 if
+ * the device is not a volume or is one of the two superdevs.
+ */
+int
+Volno(dev_t dev)
+{
+    int devminor = minor(dev);				    /* raw minor number */
+    int volno;
+
+    if (OBJTYPE(dev) != VINUM_VOLUME_TYPE)		    /* not a volume at all */
+	return -1;
+
+    /* Join the two number fields: bits 16 to 29 move down next to bits 0 to 7 */
+    volno = ((devminor & 0x3fff0000) >> 8) | (devminor & 0xff);
+    if ((volno != VINUM_SUPERDEV_VOL) && (volno != VINUM_DAEMON_VOL))
+	return volno;
+    return -1;						    /* the superdevs don't count as volumes */
+}
+
+/*
+ * Return the plex number encoded in a device
+ * number, or -1 if the device is not of plex
+ * type.  The major number is not examined.
+ */
+int
+Plexno(dev_t dev)
+{
+    int devminor = minor(dev);				    /* raw minor number */
+
+    if (OBJTYPE(dev) == VINUM_PLEX_TYPE)		    /* it's a plex, */
+	return ((devminor & 0x3fff0000) >> 8) | (devminor & 0xff); /* join the two number fields */
+
+    return -1;						    /* some other kind of object */
+}
+
+/*
+ * Return the subdisk number encoded in a device
+ * number, or -1 if the device is not of subdisk
+ * type.  The major number is not examined.
+ */
+int
+Sdno(dev_t dev)
+{
+    int devminor = minor(dev);				    /* raw minor number */
+
+    /*
+     * Subdisks own both type 2 (VINUM_SD_TYPE) and
+     * type 3, so any type below VINUM_SD_TYPE belongs
+     * to some other kind of object.
+     */
+    if (OBJTYPE(dev) < VINUM_SD_TYPE)
+	return -1;
+
+    /*
+     * The mask takes in bit 30, the low-order bit of the type
+     * field, as well.  This gives us twice as many potential
+     * subdisks as plexes or volumes.
+     */
+    return ((devminor & 0x7fff0000) >> 8) | (devminor & 0xff);
+}
diff --git a/sys/dev/vinum/vinumutil.h b/sys/dev/vinum/vinumutil.h
new file mode 100644
index 0000000..2efa42c
--- /dev/null
+++ b/sys/dev/vinum/vinumutil.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *      Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumutil.h,v 1.1 2001/05/22 04:07:22 grog Exp grog $
+ * $FreeBSD$
+ */
+
+/*
+ * Functions defined in vinumutil.c, which is used both in userland
+ * and in the kernel.
+ */
+char *drive_state(enum drivestate);			    /* drive state -> name */
+char *volume_state(enum volumestate);			    /* volume state -> name */
+char *plex_state(enum plexstate);			    /* plex state -> name */
+char *plex_org(enum plexorg);				    /* plex organization -> name */
+char *sd_state(enum sdstate);				    /* subdisk state -> name */
+enum drivestate DriveState(char *text);			    /* name -> drive state, -1 if unknown */
+enum sdstate SdState(char *text);			    /* name -> subdisk state, -1 if unknown */
+enum plexstate PlexState(char *text);			    /* name -> plex state, -1 if unknown */
+enum volumestate VolState(char *text);			    /* name -> volume state, -1 if unknown */
diff --git a/sys/dev/vinum/vinumvar.h b/sys/dev/vinum/vinumvar.h
new file mode 100644
index 0000000..8c6a07b
--- /dev/null
+++ b/sys/dev/vinum/vinumvar.h
@@ -0,0 +1,400 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ *	Nan Yang Computer Services Limited.  All rights reserved.
+ *
+ *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ *  Written by Greg Lehey
+ *
+ *  This software is distributed under the so-called ``Berkeley
+ *  License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Nan Yang Computer
+ *	Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumvar.h,v 1.33 2003/05/23 01:09:23 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <sys/time.h>
+#include <dev/vinum/vinumstate.h>
+#include <sys/mutex.h>
+
+/* Directory for device nodes. */
+#define VINUM_DIR   "/dev/vinum"
+
+/*
+ * Some configuration maxima.  They're an enum because
+ * we can't define global constants.  Sorry about that.
+ *
+ * These aren't as bad as they look: most of them are soft limits.
+ */
+
+#define VINUMROOT
+enum constants {
+    /*
+     * Current version of the data structures.  This
+     * is used to ensure synchronization between
+     * kernel module and userland vinum(8).
+     */
+    VINUMVERSION = 1,
+    VINUM_HEADER = 512,					    /* size of header on disk */
+    MAXCONFIGLINE = 1024,				    /* maximum size of a single config line */
+    MINVINUMSLICE = 1048576,				    /* minimum size of a slice */
+
+    VINUM_CDEV_MAJOR = 91,				    /* major number for character device */
+
+    ROUND_ROBIN_READPOL = -1,				    /* round robin read policy */
+
+    /*
+     * Type field in high-order two bits of minor
+     * number.  Subdisks are in fact both type 2 and
+     * type 3, giving twice the number of subdisks.
+     * This causes some ugliness in the code.
+     */
+    VINUM_VOLUME_TYPE = 0,
+    VINUM_PLEX_TYPE = 1,
+    VINUM_SD_TYPE = 2,
+    VINUM_SD2_TYPE = 3,
+
+
+    /*
+     * Define a minor device number from object number o
+     * and type t.  This is not used directly; instead,
+     * it's called by the other macros.
+     */
+#define VINUMMINOR(o,t)  (((o) & 0xff) | (((o) & 0x3fff00) << 8) | ((t) << VINUM_TYPE_SHIFT))
+
+    VINUM_TYPE_SHIFT = 30,
+    VINUM_MAXVOL = 0x3ffffd,				    /* highest numbered volume */
+
+    /*
+     * The super device and the daemon device are
+     * magic: they're the two highest-numbered
+     * volumes.
+     */
+    VINUM_SUPERDEV_VOL = 0x3ffffe,
+    VINUM_DAEMON_VOL = 0x3fffff,
+    VINUM_MAXPLEX = 0x3fffff,
+    VINUM_MAXSD = 0x7fffff,
+
+#define VINUM_SUPERDEV_MINOR VINUMMINOR (VINUM_SUPERDEV_VOL, VINUM_VOLUME_TYPE)
+#define VINUM_DAEMON_MINOR   VINUMMINOR (VINUM_DAEMON_VOL, VINUM_VOLUME_TYPE)
+
+    /*
+     * Mask for the number part of each object.
+     * Plexes and volumes are the same, subdisks use
+     * the low-order bit of the type field and thus
+     * have twice the number.
+     */
+
+    MAJORDEV_SHIFT = 8,
+
+    MAXPLEX = 8,					    /* maximum number of plexes in a volume */
+    MAXSD = 256,					    /* maximum number of subdisks in a plex */
+    MAXDRIVENAME = 32,					    /* maximum length of a device name */
+    MAXSDNAME = 64,					    /* maximum length of a subdisk name */
+    MAXPLEXNAME = 64,					    /* maximum length of a plex name */
+    MAXVOLNAME = 64,					    /* maximum length of a volume name */
+    MAXNAME = 64,					    /* maximum length of any name */
+
+
+#define OBJTYPE(x)	((minor(x) >> VINUM_TYPE_SHIFT) & 3)
+
+    /* Create device minor numbers */
+#define VINUMDEV(o, t)		makedev (VINUM_CDEV_MAJOR, VINUMMINOR (o, t))
+
+#define VINUM_VOL(v)		makedev (VINUM_CDEV_MAJOR, \
+					 VINUMMINOR (v, VINUM_VOLUME_TYPE))
+#define VINUM_PLEX(p)		makedev (VINUM_CDEV_MAJOR, \
+					 VINUMMINOR (p, VINUM_PLEX_TYPE))
+#define VINUM_SD(s)		makedev (VINUM_CDEV_MAJOR, \
+					 VINUMMINOR (s, VINUM_SD_TYPE))
+
+    /* extract device type */
+#define DEVTYPE(x) ((minor (x) >> VINUM_TYPE_SHIFT) & 3)
+
+#define VINUM_SUPERDEV_NAME VINUM_DIR"/control"		    /* normal super device */
+#define VINUM_DAEMON_DEV_NAME VINUM_DIR"/controld"	    /* super device for daemon only */
+
+    /*
+     * the number of object entries to cater for initially, and also the
+     * value by which they are incremented.  It doesn't take long
+     * to extend them, so theoretically we could start with 1 of each, but
+     * it's untidy to allocate such small areas.  These values are
+     * probably too small.
+     */
+
+    INITIAL_DRIVES = 4,
+    INITIAL_VOLUMES = 4,
+    INITIAL_PLEXES = 8,
+    INITIAL_SUBDISKS = 16,
+    INITIAL_SUBDISKS_IN_PLEX = 4,			    /* number of subdisks to allocate to a plex */
+    INITIAL_SUBDISKS_IN_DRIVE = 4,			    /* number of subdisks to allocate to a drive */
+    INITIAL_DRIVE_FREELIST = 16,			    /* number of entries in drive freelist */
+    PLEX_REGION_TABLE_SIZE = 8,				    /* number of entries in plex region tables */
+    PLEX_LOCKS = 256,					    /* number of locks to allocate to a plex */
+    PLEXMUTEXES = 32,
+    MAX_REVIVE_BLOCKSIZE = MAXPHYS,			    /* maximum revive block size */
+    DEFAULT_REVIVE_BLOCKSIZE = 65536,			    /* default revive block size */
+    VINUMHOSTNAMELEN = 32,				    /* host name field in label */
+};
+
+/*
+ * Slice header
+ *
+ * Vinum drives start with this structure:
+ *
+ *\                                            Sector
+ * |--------------------------------------|
+ * |   PDP-11 memorial boot block         |      0
+ * |--------------------------------------|
+ * |   Disk label, maybe                  |      1
+ * |--------------------------------------|
+ * |   Slice definition  (vinum_hdr)      |      8
+ * |--------------------------------------|
+ * |                                      |
+ * |   Configuration info, first copy     |      9
+ * |                                      |
+ * |--------------------------------------|
+ * |                                      |
+ * |   Configuration info, second copy    |      9 + size of config
+ * |                                      |
+ * |--------------------------------------|
+ */
+
+/* Sizes and offsets of our information */
+enum {
+    VINUM_LABEL_OFFSET = 4096,				    /* offset of vinum label */
+    VINUMHEADERLEN = 512,				    /* size of vinum label */
+    VINUM_CONFIG_OFFSET = 4608,				    /* offset of first config copy */
+    MAXCONFIG = 65536,					    /* and size of config copy */
+    DATASTART = (MAXCONFIG * 2 + VINUM_CONFIG_OFFSET) / DEV_BSIZE /* this is where the data starts */
+};
+
+/*
+ * hostname is 256 bytes long, but we don't need to shlep
+ * multiple copies in vinum.  We use the host name just
+ * to identify this system, and 32 bytes should be ample
+ * for that purpose
+ */
+
+struct vinum_label {
+    char sysname[VINUMHOSTNAMELEN];			    /* system name at time of creation */
+    char name[MAXDRIVENAME];				    /* our name of the drive */
+    struct timeval date_of_birth;			    /* the time it was created */
+    struct timeval last_update;				    /* and the time of last update */
+    /*
+     * total size in bytes of the drive.  This value
+     * includes the headers.
+     */
+    off_t drive_size;
+};
+
+struct vinum_hdr {
+    uint64_t magic;					    /* we're long on magic numbers */
+#define VINUM_MAGIC    22322600044678729LL		    /* should be this */
+#define VINUM_NOMAGIC  22322600044678990LL		    /* becomes this after obliteration */
+    /*
+     * Size in bytes of each copy of the
+     * configuration info.  This must be a multiple
+     * of the sector size.
+     */
+    int config_length;
+    struct vinum_label label;				    /* unique label */
+};
+
+/* Information returned from read_drive_label */
+enum drive_label_info {
+    DL_CANT_OPEN,					    /* invalid partition */
+    DL_NOT_OURS,					    /* valid partition, but no vinum label */
+    DL_DELETED_LABEL,					    /* valid partition, deleted label found */
+    DL_WRONG_DRIVE,					    /* drive name doesn't match */
+    DL_OURS						    /* valid partition and label found */
+};
+
+/* kinds of plex organization */
+enum plexorg {
+    plex_disorg,					    /* disorganized */
+    plex_concat,					    /* concatenated plex */
+    plex_striped,					    /* striped plex */
+    plex_raid4,						    /* RAID4 plex */
+    plex_raid5						    /* RAID5 plex */
+};
+
+/* Recognize plex organizations.  These rely on the order of enum plexorg. */
+#define isstriped(p) ((p)->organization >= plex_striped)    /* striped (RAID-0), RAID-4 or RAID-5 */
+#define isparity(p) ((p)->organization >= plex_raid4)	    /* RAID-4 or RAID-5 */
+
+/* Address range definitions, for locking volumes */
+struct rangelock {
+    daddr_t stripe;					    /* address + 1 of the range being locked  */
+    struct buf *bp;					    /* user's buffer pointer */
+};
+
+struct drive_freelist {					    /* sorted list of free space on drive */
+    u_int64_t offset;					    /* offset of entry */
+    u_int64_t sectors;					    /* and length in sectors */
+};
+
+/*
+ * Include the structure definitions shared between userland and kernel.  The kernel reads
+ * them twice, with and without _KERNEL (presumably to get both flavours -- TODO confirm).
+ */
+
+#ifdef _KERNEL
+#include <dev/vinum/vinumobj.h>
+#undef _KERNEL
+#include <dev/vinum/vinumobj.h>
+#define _KERNEL
+#else
+#include <dev/vinum/vinumobj.h>
+#endif
+
+/*
+ * Table expansion.  Expand table, which contains oldcount entries of type
+ * element, by increment entries, and change oldcount accordingly.  oldcount
+ * and increment are evaluated more than once: pass nothing with side effects.
+ */
+#ifdef VINUMDEBUG
+#define EXPAND(table, element, oldcount, increment)         \
+{							    \
+  expand_table ((void **) &(table),			    \
+		(oldcount) * sizeof (element),		    \
+		((oldcount) + (increment)) * sizeof (element),  \
+		__FILE__,				    \
+		__LINE__ );				    \
+  (oldcount) += (increment);				    \
+  }
+#else
+#define EXPAND(table, element, oldcount, increment)         \
+{							    \
+  expand_table ((void **) &(table),			    \
+		(oldcount) * sizeof (element),		    \
+		((oldcount) + (increment)) * sizeof (element)); \
+  (oldcount) += (increment);				    \
+  }
+#endif
+
+/* Information on vinum's memory usage */
+struct meminfo {
+    int mallocs;					    /* number of malloced blocks */
+    int total_malloced;					    /* total amount malloced */
+    int highwater;					    /* maximum number of mallocs */
+    struct mc *malloced;				    /* pointer to kernel table */
+};
+
+#define MCFILENAMELEN	16
+struct mc {
+    struct timeval time;
+    int seq;
+    int size;
+    short line;
+    caddr_t address;
+    char file[MCFILENAMELEN];
+};
+
+/*
+ * These enums are used by the state transition
+ * routines.  They're in bit map format:
+ *
+ * Bit 0: Other plexes in the volume are down
+ * Bit 1: Other plexes in the volume are up
+ * Bit 2: The current plex is up
+ * Maybe they should be local to
+ * state.c
+ */
+enum volplexstate {
+    volplex_onlyusdown = 0,				    /* 0: we're the only plex, and we're down */
+    volplex_alldown,					    /* 1: another plex is down, and so are we */
+    volplex_otherup,					    /* 2: another plex is up */
+    volplex_otherupdown,				    /* 3: other plexes are up and down */
+    volplex_onlyus,					    /* 4: we're up and alone */
+    volplex_onlyusup,					    /* 5: only we are up, others are down */
+    volplex_allup,					    /* 6: all plexes are up */
+    volplex_someup					    /* 7: some plexes are up, including us */
+};
+
+/* state map for plex */
+enum sdstates {
+    sd_emptystate = 1,
+    sd_downstate = 2,					    /* SD is down */
+    sd_crashedstate = 4,				    /* SD is crashed */
+    sd_obsoletestate = 8,				    /* SD is obsolete */
+    sd_stalestate = 16,					    /* SD is stale */
+    sd_rebornstate = 32,				    /* SD is reborn */
+    sd_upstate = 64,					    /* SD is up */
+    sd_initstate = 128,					    /* SD is initializing */
+    sd_initializedstate = 256,				    /* SD is initialized */
+    sd_otherstate = 512,				    /* SD is in some other state */
+};
+
+/*
+ * This is really just a parameter to pass to
+ * set_<foo>_state, but since it needs to be known
+ * in the external definitions, we need to define
+ * it here
+ */
+enum setstateflags {
+    setstate_none = 0,					    /* no flags */
+    setstate_force = 1,					    /* force the state change */
+    setstate_configuring = 2,				    /* we're currently configuring, don't save */
+};
+
+/* Operations for parityops to perform. */
+enum parityop {
+    checkparity,
+    rebuildparity,
+    rebuildandcheckparity,				    /* rebuildparity with the -v option */
+};
+
+#ifdef VINUMDEBUG
+/* Debugging stuff */
+enum debugflags {
+    DEBUG_ADDRESSES = 1,				    /* show buffer information during requests */
+    DEBUG_NUMOUTPUT = 2,				    /* show the value of vp->v_numoutput */
+    DEBUG_RESID = 4,					    /* go into debugger in complete_rqe */
+    DEBUG_LASTREQS = 8,					    /* keep a circular buffer of last requests */
+    DEBUG_REVIVECONFLICT = 16,				    /* print info about revive conflicts */
+    DEBUG_EOFINFO = 32,					    /* print info about EOF detection */
+    DEBUG_MEMFREE = 64,					    /* keep info about Frees */
+    DEBUG_BIGDRIVE = 128,				    /* pretend our drives are 100 times the size */
+    DEBUG_REMOTEGDB = 256,				    /* go into remote gdb */
+    DEBUG_WARNINGS = 512,				    /* log various relatively harmless warnings  */
+    DEBUG_LOCKREQS = 1024,				    /* log locking requests  */
+};
+
+#ifdef _KERNEL
+#ifdef __i386__
+#define longjmp LongJmp					    /* test our longjmps */
+#endif
+#endif
+#endif
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */