summaryrefslogtreecommitdiffstats
path: root/sys/dev/vinum
diff options
context:
space:
mode:
Diffstat (limited to 'sys/dev/vinum')
-rw-r--r--sys/dev/vinum/COPYRIGHT37
-rwxr-xr-xsys/dev/vinum/makestatetext78
-rw-r--r--sys/dev/vinum/request.h273
-rw-r--r--sys/dev/vinum/statetexts.h91
-rw-r--r--sys/dev/vinum/vinum.c531
-rw-r--r--sys/dev/vinum/vinumconfig.c2148
-rw-r--r--sys/dev/vinum/vinumdaemon.c281
-rw-r--r--sys/dev/vinum/vinumext.h263
-rw-r--r--sys/dev/vinum/vinumhdr.h80
-rw-r--r--sys/dev/vinum/vinuminterrupt.c467
-rw-r--r--sys/dev/vinum/vinumio.c959
-rw-r--r--sys/dev/vinum/vinumio.h154
-rw-r--r--sys/dev/vinum/vinumioctl.c958
-rw-r--r--sys/dev/vinum/vinumkw.h152
-rw-r--r--sys/dev/vinum/vinumlock.c264
-rw-r--r--sys/dev/vinum/vinummemory.c288
-rw-r--r--sys/dev/vinum/vinumobj.h320
-rw-r--r--sys/dev/vinum/vinumparser.c234
-rw-r--r--sys/dev/vinum/vinumraid5.c698
-rw-r--r--sys/dev/vinum/vinumrequest.c1112
-rw-r--r--sys/dev/vinum/vinumrevive.c622
-rw-r--r--sys/dev/vinum/vinumstate.c1093
-rw-r--r--sys/dev/vinum/vinumstate.h257
-rw-r--r--sys/dev/vinum/vinumutil.c304
-rw-r--r--sys/dev/vinum/vinumutil.h54
-rw-r--r--sys/dev/vinum/vinumvar.h400
26 files changed, 12118 insertions, 0 deletions
diff --git a/sys/dev/vinum/COPYRIGHT b/sys/dev/vinum/COPYRIGHT
new file mode 100644
index 0000000..f0295e6
--- /dev/null
+++ b/sys/dev/vinum/COPYRIGHT
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $FreeBSD$
+ */
diff --git a/sys/dev/vinum/makestatetext b/sys/dev/vinum/makestatetext
new file mode 100755
index 0000000..c5a7da2
--- /dev/null
+++ b/sys/dev/vinum/makestatetext
@@ -0,0 +1,78 @@
+#!/bin/sh
+# Make statetexts.h from vinumstate.h
+# $FreeBSD$
+# $Id: makestatetext,v 1.7 1999/12/29 07:24:54 grog Exp grog $
+infile=vinumstate.h
+ofile=statetexts.h
+echo >$ofile "/* Created by $0 on" `date`. "Do not edit */"
+echo >>$ofile
+cat >> $ofile <<FOO
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called \`\`Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided \`\`as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ */
+
+FOO
+
+echo >>$ofile "/* Drive state texts */"
+echo >>$ofile "char *drivestatetext [] =
+ { "
+egrep -e 'drive_[A-z0-9]*,' <$infile | grep -v = | sed 's: *drive_\([^,]*\).*: \"\1\",:' >>$ofile # keep drive_xxx, lines that contain no '=', strip the prefix and emit the name as a quoted string
+cat <<FOO >> $ofile # close this table and open the next one
+ };
+
+/* Subdisk state texts */
+char *sdstatetext [] =
+ {
+FOO
+egrep -e 'sd_[A-z0-9]*,' $infile | grep -v = | sed 's: *sd_\([^,]*\).*: \"\1\",:' >>$ofile # NOTE(review): [A-z] also matches the ASCII punctuation between Z and a; [A-Za-z0-9_] was probably intended (applies to all four patterns)
+cat <<FOO >> $ofile
+ };
+
+/* Plex state texts */
+char *plexstatetext [] =
+ {
+FOO
+egrep -e 'plex_[A-z0-9]*,' $infile | grep -v = | sed 's: *plex_\([^,]*\).*: \"\1\",:' >>$ofile # same extraction for the plex_ names
+cat <<FOO >> $ofile
+ };
+
+/* Volume state texts */
+char *volstatetext [] =
+ {
+FOO
+egrep -e 'volume_[A-z0-9]*,' $infile | grep -v = | sed 's: *volume_\([^,]*\).*: \"\1\",:' >>$ofile # and for the volume_ names
+cat <<FOO >> $ofile
+ };
+FOO
diff --git a/sys/dev/vinum/request.h b/sys/dev/vinum/request.h
new file mode 100644
index 0000000..600130f
--- /dev/null
+++ b/sys/dev/vinum/request.h
@@ -0,0 +1,273 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: request.h,v 1.22 2003/04/24 04:37:08 grog Exp $
+ * $FreeBSD$
+ */
+
+/* Information needed to set up a transfer */
+
+enum xferinfo { /* bit flags; the composite values at the end OR several of them together */
+ XFR_NORMAL_READ = 1, /* read request in normal mode */
+ XFR_NORMAL_WRITE = 2, /* write request in normal mode */
+ XFR_RECOVERY_READ = 4,
+ XFR_DEGRADED_WRITE = 8,
+ XFR_PARITYLESS_WRITE = 0x10,
+ XFR_NO_PARITY_STRIPE = 0x20, /* parity stripe is not available */
+ XFR_DATA_BLOCK = 0x40, /* data block in request */
+ XFR_PARITY_BLOCK = 0x80, /* parity block in request */
+ XFR_BAD_SUBDISK = 0x100, /* this subdisk is dead */
+ XFR_MALLOCED = 0x200, /* this buffer is malloced */
+#ifdef VINUMDEBUG
+ XFR_PHASE2 = 0x800, /* documentation only: 2nd phase write */
+#endif
+ XFR_REVIVECONFLICT = 0x1000, /* possible conflict with a revive operation */
+ XFR_BUFLOCKED = 0x2000, /* BUF_LOCK performed on this buffer */
+ XFR_COPYBUF = 0x4000, /* data buffer was copied */
+ /* operations that need a parity block */
+ XFR_PARITYOP = (XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE),
+ /* operations that use the group parameters */
+ XFR_GROUPOP = (XFR_DEGRADED_WRITE | XFR_RECOVERY_READ),
+ /* operations that use the data parameters */
+ XFR_DATAOP = (XFR_NORMAL_READ | XFR_NORMAL_WRITE | XFR_PARITYLESS_WRITE),
+ /* operations requiring read before write */
+ XFR_RBW = (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE),
+ /* operations that need a malloced buffer (same set as XFR_PARITYOP) */
+ XFR_NEEDS_MALLOC = (XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE)
+};
+
+/*
+ * Describe one low-level request, part of a
+ * high-level request. This is an extended
+ * struct buf buffer, and the first element
+ * *must* be a struct buf. We pass this
+ * structure to the I/O routines instead of a
+ * struct buf in order to be able to locate the
+ * high-level request when it completes.
+ *
+ * All offsets and lengths are in sectors.
+ */
+
+struct rqelement {
+ struct buf b; /* buf structure */
+ struct rqgroup *rqg; /* pointer to our group */
+ /* Information about the transfer */
+ daddr_t sdoffset; /* offset in subdisk */
+ int useroffset; /* offset in user buffer of normal data */
+ /*
+ * dataoffset and datalen refer to "individual" data
+ * transfers which involve only this drive (normal read,
+ * parityless write) and also degraded write.
+ * groupoffset and grouplen refer to the other "group"
+ * operations (normal write, recovery read) which involve
+ * more than one drive. Both offsets are relative to the
+ * start of the local buffer. NOTE(review): XFR_DATAOP and
+ * XFR_GROUPOP list normal/degraded write the other way round.
+ */
+ int dataoffset; /* offset in buffer of the normal data */
+ int groupoffset; /* offset in buffer of group data */
+ short datalen; /* length of normal data (sectors) */
+ short grouplen; /* length of group data (sectors) */
+ short buflen; /* total buffer length to allocate */
+ short flags; /* really enum xferinfo (see above) */
+ /* Ways to find other components */
+ short sdno; /* subdisk number */
+ short driveno; /* drive number */
+ struct timeval launchtime; /* time of launch, for info function */
+};
+
+/*
+ * A group of requests built to satisfy an I/O
+ * transfer on a single plex.
+ */
+struct rqgroup {
+ struct rqgroup *next; /* pointer to next group */
+ struct request *rq; /* pointer to the request */
+ short count; /* number of requests in this group */
+ short active; /* and number active */
+ short plexno; /* index of plex */
+ int badsdno; /* index of bad subdisk or -1 */
+ enum xferinfo flags; /* description of transfer */
+ struct rangelock *lock; /* lock for this transfer */
+ daddr_t lockbase; /* and lock address */
+ struct rqelement rqe[0]; /* and the elements of this request (zero-length trailing array; presumably count entries are allocated with the group) */
+};
+
+/*
+ * Describe one high-level request and the
+ * work we have to do to satisfy it.
+ */
+struct request {
+ struct buf *bp; /* pointer to the high-level request */
+ caddr_t save_data; /* for copied write buffers */
+ enum xferinfo flags; /* enum xferinfo bits for the whole request */
+ union {
+ int volno; /* volume index */
+ int plexno; /* or plex index */
+ } volplex; /* presumably isplex (below) says which member is valid */
+ int error; /* current error indication */
+ int sdno; /* reviving subdisk (XFR_REVIVECONFLICT) */
+ short isplex; /* set if this is a plex request */
+ short active; /* number of subrequests still active */
+ struct rqgroup *rqg; /* pointer to the first group of requests */
+ struct rqgroup *lrqg; /* and to the last group of requests */
+ struct request *next; /* link of waiting requests */
+};
+
+/*
+ * Extended buffer header for subdisk I/O. Includes
+ * a pointer to the user I/O request.
+ */
+struct sdbuf {
+ struct buf b; /* our buffer (first member, as in struct rqelement) */
+ struct buf *bp; /* and pointer to parent */
+ short driveno; /* drive index */
+ short sdno; /* and subdisk index */
+};
+
+/*
+ * Values returned by rqe and friends. Be careful
+ * with these: they are in order of increasing
+ * seriousness. Some routines check for
+ * > REQUEST_RECOVERED to indicate a failed request. XXX
+ */
+enum requeststatus {
+ REQUEST_OK, /* request built OK */
+ REQUEST_RECOVERED, /* request OK, but involves RAID5 recovery */
+ REQUEST_DEGRADED, /* parts of request failed (first value above REQUEST_RECOVERED) */
+ REQUEST_EOF, /* parts of request failed: outside plex */
+ REQUEST_DOWN, /* all of request failed: subdisk(s) down */
+ REQUEST_ENOMEM /* all of request failed: ran out of memory */
+};
+
+#ifdef VINUMDEBUG
+/* Trace entry for request info (DEBUG_LASTREQS) */
+enum rqinfo_type {
+ loginfo_unused, /* entry has never been used */
+ loginfo_user_bp, /* this is the bp when strategy is called */
+ loginfo_user_bpl, /* and this is the bp at launch time */
+ loginfo_rqe, /* user RQE */
+ loginfo_iodone, /* iodone */
+ loginfo_raid5_data, /* write RAID-5 data block */
+ loginfo_raid5_parity, /* write RAID-5 parity block */
+ loginfo_sdio, /* subdisk I/O */
+ loginfo_sdiol, /* subdisk I/O launch */
+ loginfo_sdiodone, /* subdisk iodone */
+ loginfo_lockwait, /* wait for range lock */
+ loginfo_lock, /* lock range */
+ loginfo_unlock, /* unlock range */
+};
+
+/*
+ * This is the rangelock structure with an added
+ * buffer pointer and plex number. We don't need
+ * the plex number for the locking protocol, but
+ * it does help a lot when logging.
+ */
+struct rangelockinfo {
+ daddr_t stripe; /* address + 1 of the range being locked */
+ struct buf *bp; /* user's buffer pointer */
+ int plexno; /* plex number: not needed for locking, kept for logging */
+};
+
+union rqinfou { /* info to pass to logrq */
+ struct buf *bp; /* a buffer header */
+ struct rqelement *rqe; /* address of request, for correlation */
+ struct rangelockinfo *lockinfo; /* range lock details (struct rangelockinfo) */
+};
+
+struct rqinfo {
+ enum rqinfo_type type; /* kind of event */
+ struct timeval timestamp; /* time it happened */
+ struct buf *bp; /* point to user buffer */
+ int devmajor; /* major and minor device info */
+ int devminor; /* (the minor half of the above) */
+ union {
+ struct buf b; /* yup, the *whole* buffer header */
+ struct rqelement rqe; /* and the whole rqe */
+ struct rangelock lockinfo; /* NOTE(review): union rqinfou carries a struct rangelockinfo *, not a struct rangelock; confirm the type */
+ } info;
+};
+
+#define RQINFO_SIZE 128 /* number of info slots in buffer */
+
+void logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp);
+#endif
+
+/* Structures for the daemon */
+
+/* types of request to the daemon */
+enum daemonrq {
+ daemonrq_none, /* dummy to catch bugs */
+ daemonrq_ioerror, /* error occurred on I/O */
+ daemonrq_saveconfig, /* save configuration */
+ daemonrq_return, /* return to userland (free_vinum() queues this to stop the daemon) */
+ daemonrq_ping, /* show sign of life */
+ daemonrq_init, /* initialize a plex */
+ daemonrq_revive, /* revive a subdisk */
+ daemonrq_closedrive, /* close a drive */
+};
+
+/* info field for daemon requests */
+union daemoninfo { /* argument of a daemon request; the member used depends on the request type */
+ struct request *rq; /* for daemonrq_ioerror */
+ struct sd *sd; /* for daemonrq_revive */
+ struct plex *plex; /* for daemonrq_init */
+ struct drive *drive; /* for daemonrq_closedrive */
+ int nothing; /* for passing NULL */
+};
+
+struct daemonq { /* one entry in the daemon's work queue */
+ struct daemonq *next; /* pointer to next element in queue */
+ enum daemonrq type; /* type of request */
+ int privateinuse; /* private element, being used */
+ union daemoninfo info; /* and the request information */
+};
+
+void queue_daemon_request(enum daemonrq type, union daemoninfo info);
+
+extern int daemon_options;
+
+enum daemon_option { /* distinct bit values; presumably ORed into daemon_options -- confirm */
+ daemon_verbose = 1, /* talk about what we're doing */
+ daemon_stopped = 2,
+ daemon_noupdate = 4, /* don't update the disk config, for recovery */
+};
+
+void freerq(struct request *rq);
+void unlockrange(int plexno, struct rangelock *);
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/statetexts.h b/sys/dev/vinum/statetexts.h
new file mode 100644
index 0000000..88cfc17
--- /dev/null
+++ b/sys/dev/vinum/statetexts.h
@@ -0,0 +1,91 @@
+/* Created by ./makestatetext on Wed Jan 5 10:05:30 CST 2000. Do not edit */
+
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $FreeBSD$
+ */
+
+/* Drive state texts (generated by makestatetext from the drive_ names in vinumstate.h) */
+char *drivestatetext[] =
+{
+ "unallocated",
+ "referenced",
+ "down",
+ "up",
+};
+
+/* Subdisk state texts (generated by makestatetext from the sd_ names in vinumstate.h) */
+char *sdstatetext[] =
+{
+ "unallocated",
+ "uninit",
+ "referenced",
+ "init",
+ "empty",
+ "initializing",
+ "initialized",
+ "obsolete",
+ "stale",
+ "crashed",
+ "down",
+ "reviving",
+ "reborn",
+ "up",
+};
+
+/* Plex state texts (generated by makestatetext from the plex_ names in vinumstate.h) */
+char *plexstatetext[] =
+{
+ "unallocated",
+ "referenced",
+ "init",
+ "faulty",
+ "down",
+ "initializing",
+ "corrupt",
+ "degraded",
+ "flaky",
+ "up",
+};
+
+/* Volume state texts (generated by makestatetext from the volume_ names in vinumstate.h) */
+char *volstatetext[] =
+{
+ "unallocated",
+ "uninit",
+ "down",
+ "up",
+};
diff --git a/sys/dev/vinum/vinum.c b/sys/dev/vinum/vinum.c
new file mode 100644
index 0000000..36dfa98
--- /dev/null
+++ b/sys/dev/vinum/vinum.c
@@ -0,0 +1,531 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinum.c,v 1.44 2003/05/23 00:50:55 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#define STATIC static /* nothing while we're testing */
+
+#include <dev/vinum/vinumhdr.h>
+#include <sys/sysproto.h> /* for sync(2) */
+#ifdef VINUMDEBUG
+#include <sys/reboot.h>
+int debug = 0; /* debug flags */
+extern int total_malloced;
+extern int malloccount;
+extern struct mc malloced[];
+#endif
+#include <dev/vinum/request.h>
+
+struct cdevsw vinum_cdevsw = /* device switch handed to make_dev() in vinumattach() */
+{
+ .d_open = vinumopen,
+ .d_close = vinumclose,
+ .d_read = physread, /* read and write go through the generic physread/physwrite */
+ .d_write = physwrite,
+ .d_ioctl = vinumioctl,
+ .d_strategy = vinumstrategy,
+ .d_name = "vinum",
+ .d_maj = VINUM_CDEV_MAJOR,
+ .d_flags = D_DISK
+};
+
+/* Called by main() during pseudo-device attachment. */
+void vinumattach(void *);
+STATIC int vinum_modevent(module_t mod, modeventtype_t type, void *unused);
+STATIC void vinum_clone(void *arg, char *name, int namelen, dev_t * dev);
+
+struct _vinum_conf vinum_conf; /* configuration information */
+
+dev_t vinum_daemon_dev;
+dev_t vinum_super_dev;
+
+static eventhandler_tag dev_clone_tag;
+
+/*
+ * Mutexes for plex synchronization. Ideally each plex
+ * should have its own mutex, but the fact that the plex
+ * struct can move makes that very complicated. Instead,
+ * plexes share these mutexes, selected by plex number modulo
+ * the number of mutexes.
+ */
+struct mtx plexmutex[PLEXMUTEXES];
+
+/*
+ * Called by main() during pseudo-device attachment. All we need
+ * to do is allocate enough space for devices to be configured later, and
+ * add devsw entries.
+ */
+void
+vinumattach(void *dummy)
+{
+ char *envp; /* value of an environment variable passed by the loader */
+ int i; /* index into plexmutex[] */
+#define MUTEXNAMELEN 16 /* holds "vinumplex", up to 4 digits and the terminating NUL */
+ char mutexname[MUTEXNAMELEN];
+#if PLEXMUTEXES > 10000
+#error Increase size of MUTEXNAMELEN
+#endif
+/* modload should prevent multiple loads, so this is worth a panic */
+ if ((vinum_conf.flags & VF_LOADED) != 0)
+ panic("vinum: already loaded");
+
+ log(LOG_INFO, "vinum: loaded\n");
+#ifdef VINUMDEBUG
+ vinum_conf.flags |= VF_LOADED | VF_HASDEBUG; /* we're loaded now, and we support debug */
+#else
+ vinum_conf.flags |= VF_LOADED; /* we're loaded now */
+#endif
+
+ daemonq = NULL; /* initialize daemon's work queue */
+ dqend = NULL;
+
+ vinum_daemon_dev = make_dev(&vinum_cdevsw, /* device node "vinum/controld", root read/write only */
+ VINUM_DAEMON_MINOR,
+ UID_ROOT,
+ GID_WHEEL,
+ S_IRUSR | S_IWUSR,
+ "vinum/controld");
+ vinum_super_dev = make_dev(&vinum_cdevsw, /* device node "vinum/control", root read/write only */
+ VINUM_SUPERDEV_MINOR,
+ UID_ROOT,
+ GID_WHEEL,
+ S_IRUSR | S_IWUSR,
+ "vinum/control");
+
+ vinum_conf.version = VINUMVERSION; /* note what version we are */
+
+ /* allocate space: drives... */
+ DRIVE = (struct drive *) Malloc(sizeof(struct drive) * INITIAL_DRIVES);
+ CHECKALLOC(DRIVE, "vinum: no memory\n");
+ bzero(DRIVE, sizeof(struct drive) * INITIAL_DRIVES);
+ vinum_conf.drives_allocated = INITIAL_DRIVES; /* number of drive slots allocated */
+ vinum_conf.drives_used = 0; /* and number in use */
+
+ /* volumes, ... */
+ VOL = (struct volume *) Malloc(sizeof(struct volume) * INITIAL_VOLUMES);
+ CHECKALLOC(VOL, "vinum: no memory\n");
+ bzero(VOL, sizeof(struct volume) * INITIAL_VOLUMES);
+ vinum_conf.volumes_allocated = INITIAL_VOLUMES; /* number of volume slots allocated */
+ vinum_conf.volumes_used = 0; /* and number in use */
+
+ /* plexes, ... */
+ PLEX = (struct plex *) Malloc(sizeof(struct plex) * INITIAL_PLEXES);
+ CHECKALLOC(PLEX, "vinum: no memory\n");
+ bzero(PLEX, sizeof(struct plex) * INITIAL_PLEXES);
+ vinum_conf.plexes_allocated = INITIAL_PLEXES; /* number of plex slots allocated */
+ vinum_conf.plexes_used = 0; /* and number in use */
+
+ for (i = 0; i < PLEXMUTEXES; i++) { /* set up the shared plex mutexes */
+ snprintf(mutexname, MUTEXNAMELEN, "vinumplex%d", i);
+ mtx_init(&plexmutex[i], mutexname, "plex", MTX_DEF); /* NOTE(review): mutexname is a stack buffer reused for every mutex; confirm mtx_init() does not keep the pointer */
+ }
+
+ /* and subdisks */
+ SD = (struct sd *) Malloc(sizeof(struct sd) * INITIAL_SUBDISKS);
+ CHECKALLOC(SD, "vinum: no memory\n");
+ bzero(SD, sizeof(struct sd) * INITIAL_SUBDISKS);
+ vinum_conf.subdisks_allocated = INITIAL_SUBDISKS; /* number of sd slots allocated */
+ vinum_conf.subdisks_used = 0; /* and number in use */
+ dev_clone_tag = EVENTHANDLER_REGISTER(dev_clone, vinum_clone, 0, 1000); /* tag is kept so that unload can deregister */
+
+ /*
+ * See if the loader has passed us any of the autostart
+ * options. vinum.autostart takes precedence over vinum.drives.
+ */
+ envp = NULL; /* redundant: assigned again in the test below */
+ if ((envp = getenv("vinum.autostart")) != NULL) { /* start all drives now */
+ vinum_scandisk(NULL);
+ freeenv(envp);
+ } else if ((envp = getenv("vinum.drives")) != NULL) { /* else hand the vinum.drives value to the scan */
+ vinum_scandisk(envp);
+ freeenv(envp);
+ }
+}
+
+/*
+ * Check whether any volume above state volume_down is open.
+ * If confopen is != 0, an open super device (VF_OPEN set in
+ * vinum_conf.flags) also counts. Open plexes and subdisks
+ * are not examined here.
+ * Return 0 if something is open (active), 1 if inactive.
+ */
+int
+vinum_inactive(int confopen)
+{
+ int i;
+ int can_do = 1; /* assume we can do it */
+
+ if (confopen && (vinum_conf.flags & VF_OPEN)) /* open by vinum(8)? */
+ return 0; /* can't do it while we're open */
+ lock_config();
+ for (i = 0; i < vinum_conf.volumes_allocated; i++) {
+ if ((VOL[i].state > volume_down)
+ && (VOL[i].flags & VF_OPEN)) { /* volume is open */
+ can_do = 0;
+ break; /* one open volume is enough */
+ }
+ }
+ unlock_config();
+ return can_do;
+}
+
+/*
+ * Free all structures.
+ * If cleardrive is 0, save the configuration; otherwise
+ * remove the configuration from the drive.
+ *
+ * Before coming here, ensure that no volumes are open.
+ */
+void
+free_vinum(int cleardrive)
+{
+ int i;
+ int drives_allocated = vinum_conf.drives_allocated; /* snapshot taken before anything is freed */
+
+ while ((vinum_conf.flags & (VF_STOPPING | VF_DAEMONOPEN))
+ == (VF_STOPPING | VF_DAEMONOPEN)) { /* at least one daemon open, we're stopping */
+ queue_daemon_request(daemonrq_return, (union daemoninfo) 0); /* stop the daemon */
+ tsleep(&vinumclose, PUSER, "vstop", 1); /* and wait for it: vinumclose() wakes this channel, else we retry after the timeout */
+ }
+ if (DRIVE != NULL) {
+ if (cleardrive) { /* remove the vinum config */
+ for (i = 0; i < drives_allocated; i++)
+ remove_drive(i); /* remove the drive */
+ } else { /* keep the config */
+ for (i = 0; i < drives_allocated; i++)
+ free_drive(&DRIVE[i]); /* close files and things */
+ }
+ Free(DRIVE);
+ }
+ if (SD != NULL) {
+ for (i = 0; i < vinum_conf.subdisks_allocated; i++) {
+ struct sd *sd = &SD[i];
+
+ if (sd->state != sd_unallocated) /* slot is in use */
+ free_sd(i);
+ }
+ Free(SD);
+ }
+ if (PLEX != NULL) {
+ for (i = 0; i < vinum_conf.plexes_allocated; i++) {
+ struct plex *plex = &PLEX[i];
+
+ if (plex->state != plex_unallocated) /* we have real data there */
+ free_plex(i);
+ }
+ Free(PLEX);
+ }
+ if (VOL != NULL) {
+ for (i = 0; i < vinum_conf.volumes_allocated; i++) {
+ struct volume *volume = &VOL[i];
+
+ if (volume->state != volume_unallocated) /* slot is in use */
+ free_volume(i);
+ }
+ Free(VOL);
+ }
+ bzero(&vinum_conf, sizeof(vinum_conf)); /* also clears every flag, VF_LOADED and VF_STOPPING included */
+ vinum_conf.version = VINUMVERSION; /* reinstate version number */
+}
+
+STATIC int
+vinum_modevent(module_t mod, modeventtype_t type, void *unused)
+{
+ struct sync_args dummyarg = /* zeroed argument block for the sync() call below */
+ {0};
+ int i; /* index for the plex mutex loop */
+
+ switch (type) {
+ case MOD_LOAD:
+ vinumattach(NULL);
+ return 0; /* OK */
+ case MOD_UNLOAD:
+ if (!vinum_inactive(1)) /* is anything open? */
+ return EBUSY; /* yes, we can't do it */
+ vinum_conf.flags |= VF_STOPPING; /* note that we want to stop */
+ sync(curthread, &dummyarg); /* write out buffers */
+ free_vinum(0); /* clean up */
+#ifdef VINUMDEBUG
+ if (total_malloced) { /* something is still allocated after the cleanup */
+ int i; /* NOTE(review): shadows the outer i */
+#ifdef INVARIANTS
+ int *poke;
+#endif
+
+ for (i = 0; i < malloccount; i++) {
+ if (debug & DEBUG_WARNINGS) /* want to hear about them */
+ log(LOG_WARNING,
+ "vinum: exiting with %d bytes malloced from %s:%d\n",
+ malloced[i].size,
+ malloced[i].file,
+ malloced[i].line);
+#ifdef INVARIANTS
+ poke = &((int *) malloced[i].address)
+ [malloced[i].size / (2 * sizeof(int))]; /* middle of the area */
+ if (*poke == 0xdeadc0de) /* already freed */
+ log(LOG_ERR,
+ "vinum: exiting with malloc table inconsistency at %p from %s:%d\n",
+ malloced[i].address,
+ malloced[i].file,
+ malloced[i].line);
+#endif
+ Free(malloced[i].address);
+ }
+ }
+#endif
+ destroy_dev(vinum_daemon_dev); /* daemon device */
+ destroy_dev(vinum_super_dev);
+ for (i = 0; i < PLEXMUTEXES; i++)
+ mtx_destroy(&plexmutex[i]);
+ log(LOG_INFO, "vinum: unloaded\n"); /* tell the world */
+ EVENTHANDLER_DEREGISTER(dev_clone, dev_clone_tag); /* undo the registration made in vinumattach() */
+ return 0;
+ default: /* any other module event: nothing to do */
+ break;
+ }
+ return 0;
+}
+
+static moduledata_t vinum_mod =
+{
+ "vinum", /* module name */
+ (modeventhand_t) vinum_modevent, /* event handler, see vinum_modevent() */
+ 0 /* no extra data for the handler */
+};
+DECLARE_MODULE(vinum, vinum_mod, SI_SUB_RAID, SI_ORDER_MIDDLE);
+
+/* ARGSUSED */
+/* Open a vinum object (control device, volume, plex or subdisk): validate it and mark it VF_OPEN */
+int
+vinumopen(dev_t dev,
+ int flags,
+ int fmt,
+ struct thread *td)
+{
+ int error;
+ unsigned int index; /* object index decoded from dev */
+ struct volume *vol;
+ struct plex *plex;
+ struct sd *sd;
+ int devminor; /* minor number */
+
+ devminor = minor(dev);
+ error = 0;
+ /* First, decide what we're looking at */
+ switch (DEVTYPE(dev)) {
+ case VINUM_VOLUME_TYPE:
+ /*
+ * The super device and daemon device are the last two
+ * volume numbers, so check for them first.
+ */
+ if ((devminor == VINUM_DAEMON_MINOR) /* daemon device */
+ ||(devminor == VINUM_SUPERDEV_MINOR)) { /* or normal super device */
+ error = suser(td); /* are we root? */
+
+ if (error == 0) { /* yes, can do */
+ if (devminor == VINUM_DAEMON_MINOR) /* daemon device */
+ vinum_conf.flags |= VF_DAEMONOPEN; /* we're open */
+ else /* superdev */
+ vinum_conf.flags |= VF_OPEN; /* we're open */
+ }
+ return error; /* 0, or the error from suser() */
+ }
+ /* Must be a real volume. Check. */
+ index = Volno(dev);
+ if (index >= vinum_conf.volumes_allocated)
+ return ENXIO; /* no such device */
+ vol = &VOL[index];
+
+ switch (vol->state) {
+ case volume_unallocated:
+ case volume_uninit:
+ return ENXIO;
+
+ case volume_up: /* the only state in which a volume can be opened */
+ vol->flags |= VF_OPEN; /* note we're open */
+ return 0;
+
+ case volume_down:
+ return EIO;
+
+ default:
+ return EINVAL;
+ }
+
+ case VINUM_PLEX_TYPE:
+ index = Plexno(dev); /* get plex index in vinum_conf */
+ if (index >= vinum_conf.plexes_allocated)
+ return ENXIO; /* no such device */
+ plex = &PLEX[index];
+
+ switch (plex->state) {
+ case plex_unallocated:
+ return ENXIO;
+
+ case plex_referenced:
+ return EINVAL;
+
+ default: /* every other plex state may be opened */
+ plex->flags |= VF_OPEN; /* note we're open */
+ return 0;
+ }
+
+ case VINUM_SD_TYPE:
+ case VINUM_SD2_TYPE:
+ index = Sdno(dev); /* get the subdisk number */
+ if (index >= vinum_conf.subdisks_allocated) /* not a valid SD entry */
+ return ENXIO; /* no such device */
+ sd = &SD[index];
+
+ /*
+ * Opening a subdisk is always a special operation, so
+ * we ignore the state as long as it represents a real
+ * subdisk.
+ */
+ switch (sd->state) {
+ case sd_unallocated:
+ return ENXIO;
+
+ case sd_uninit:
+ case sd_referenced:
+ return EINVAL;
+
+ default:
+ sd->flags |= VF_OPEN; /* note we're open */
+ return 0;
+ }
+ }
+ return 0; /* to keep the compiler happy */
+}
+
+/* ARGSUSED */ /* Close a vinum object: clear the open flag for the control devices and for volumes */
+int
+vinumclose(dev_t dev,
+ int flags,
+ int fmt,
+ struct thread *td)
+{
+ unsigned int index; /* volume index decoded from dev */
+ struct volume *vol;
+ int devminor;
+
+ devminor = minor(dev);
+ /* First, decide what we're looking at */
+ switch (DEVTYPE(dev)) {
+ case VINUM_VOLUME_TYPE:
+ /*
+ * The super device and daemon device are the last two
+ * volume numbers, so check for them first.
+ */
+ if ((devminor == VINUM_DAEMON_MINOR) /* daemon device */
+ ||(devminor == VINUM_SUPERDEV_MINOR)) { /* or normal super device */
+ /*
+ * don't worry about whether we're root:
+ * nobody else would get this far.
+ */
+ if (devminor == VINUM_SUPERDEV_MINOR) /* normal superdev */
+ vinum_conf.flags &= ~VF_OPEN; /* no longer open */
+ else { /* the daemon device */
+ vinum_conf.flags &= ~VF_DAEMONOPEN; /* no longer open */
+ if (vinum_conf.flags & VF_STOPPING) /* we're trying to stop, */
+ wakeup(&vinumclose); /* we can continue now */
+ }
+ return 0;
+ }
+ /* Real volume */
+ index = Volno(dev);
+ if (index >= vinum_conf.volumes_allocated)
+ return ENXIO; /* no such device */
+ vol = &VOL[index];
+
+ switch (vol->state) {
+ case volume_unallocated:
+ case volume_uninit:
+ return ENXIO;
+
+ case volume_up:
+ vol->flags &= ~VF_OPEN; /* reset our flags */
+ return 0;
+
+ case volume_down: /* NOTE(review): VF_OPEN is left set if the volume went down while it was open */
+ return EIO;
+
+ default:
+ return EINVAL;
+ }
+
+ case VINUM_PLEX_TYPE: /* NOTE(review): plex and subdisk closes only range-check and end in ENODEV; the VF_OPEN set by vinumopen() is never cleared */
+ if (Volno(dev) >= vinum_conf.volumes_allocated)
+ return ENXIO;
+ /* FALLTHROUGH */
+
+ case VINUM_SD_TYPE:
+ if ((Volno(dev) >= vinum_conf.volumes_allocated) || /* no such volume */
+ (Plexno(dev) >= vinum_conf.plexes_allocated)) /* or no such plex */
+ return ENXIO; /* no such device */
+ /* FALLTHROUGH */
+
+ default: /* also reached for VINUM_SD2_TYPE, which has no case of its own here */
+ return ENODEV; /* don't know what to do with these */
+ }
+}
+
+void /* the earlier prototype is STATIC, so this definition has internal linkage */
+vinum_clone(void *arg, char *name, int namelen, dev_t * dev)
+{
+ struct volume *vol;
+ int i; /* volume index returned by find_volume() */
+
+ if (*dev != NODEV) /* a device has already been supplied */
+ return;
+ if (strncmp(name, "vinum/", sizeof("vinum/") - 1) != 0) /* name does not start with "vinum/" */
+ return;
+
+ name += sizeof("vinum/") - 1; /* skip the "vinum/" prefix */
+ if ((i = find_volume(name, 0)) == -1) /* find_volume() reported failure */
+ return;
+
+ vol = &VOL[i];
+ *dev = vol->dev; /* hand back the device of the volume found */
+}
+
+
+/* Local Variables: */
+/* fill-column: 60 */
+/* End: */
diff --git a/sys/dev/vinum/vinumconfig.c b/sys/dev/vinum/vinumconfig.c
new file mode 100644
index 0000000..2c00921
--- /dev/null
+++ b/sys/dev/vinum/vinumconfig.c
@@ -0,0 +1,2148 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumconfig.c,v 1.41 2003/05/23 00:57:34 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#define STATIC static
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+#define MAXTOKEN 64 /* maximum number of tokens in a line */
+
+/*
+ * We can afford the luxury of global variables here,
+ * since start_config ensures that these functions
+ * are single-threaded.
+ */
+
+/* These are indices in vinum_conf of the last-mentioned of each kind of object */
+static int current_drive; /* note the last drive we mention, for
+ * some defaults */
+static int current_plex; /* and the same for the last plex */
+static int current_volume; /* and the last volume */
+/*
+ * Reply structure handed back via ioctl.  throw_rude_remark treats a
+ * NULL value as "not called from the user" and prints to the console
+ * instead of filling in the reply.
+ */
+static struct _ioctl_reply *ioctl_reply; /* struct to return via ioctl */
+
+
+/* These values are used by most of these routines, so set them as globals */
+static char *token[MAXTOKEN]; /* pointers to individual tokens */
+static int tokens; /* number of tokens */
+
+/*
+ * NOTE(review): the TO* bits and struct putchar_arg are not referenced
+ * in the part of the file reviewed here; presumably the TO* values are
+ * flag bits for putchar_arg.flags.  Confirm they are still needed.
+ */
+#define TOCONS 0x01
+#define TOTTY 0x02
+#define TOLOG 0x04
+
+struct putchar_arg {
+ int flags;
+ struct tty *tty;
+};
+
+/* Size of the temporary buffer throw_rude_remark formats messages into */
+#define MSG_MAX 1024 /* maximum length of a formatted message */
+/*
+ * Format an error message and return to the user
+ * in the reply.  CARE: This routine is designed
+ * to be called only from the configuration
+ * routines, so it assumes it's the owner of the
+ * configuration lock, and unlocks it on exit.
+ *
+ * error is the errno value to report, msg a printf-style format
+ * followed by its arguments.
+ *
+ * If an ioctl reply is present and we are not reading the
+ * configuration from disk, the message and error go into the reply;
+ * otherwise the message is printed on the console.
+ *
+ * While reading the configuration from disk (VF_READING_CONFIG) the
+ * function RETURNS to its caller after disabling configuration
+ * updates.  In all other cases it calls finish_config and leaves via
+ * longjmp(command_fail, error), so it does not return.
+ */
+void
+throw_rude_remark(int error, char *msg,...)
+{
+    va_list ap;
+    char *text;
+    static int finishing; /* don't recurse */
+    int was_finishing;
+
+    if ((vinum_conf.flags & VF_LOCKED) == 0) /* bug catcher */
+        panic("throw_rude_remark: called without config lock");
+    va_start(ap, msg);
+    if ((ioctl_reply != NULL) /* we're called from the user */
+        &&(!(vinum_conf.flags & VF_READING_CONFIG))) { /* and not reading from disk: return msg */
+        /*
+         * We can't just format to ioctl_reply, since it
+         * may contain our input parameters
+         */
+        text = Malloc(MSG_MAX);
+        if (text == NULL) {
+            log(LOG_ERR, "vinum: can't allocate error message buffer\n");
+            printf("vinum: ");
+            vprintf(msg, ap); /* print to the console */
+            printf("\n");
+        } else {
+            /*
+             * Bound the output to the buffer: an over-long message
+             * is truncated (and NUL-terminated) rather than written
+             * past the end of the allocation.
+             */
+            vsnprintf(text, MSG_MAX, msg, ap);
+            strlcpy(ioctl_reply->msg, text, sizeof(ioctl_reply->msg));
+            ioctl_reply->error = error; /* first byte is the error number */
+            Free(text);
+        }
+    } else {
+        printf("vinum: ");
+        vprintf(msg, ap); /* print to the console */
+        printf("\n");
+    }
+    va_end(ap);
+
+    if (vinum_conf.flags & VF_READING_CONFIG) { /* reading from disk: go through to the bitter end */
+        if ((daemon_options & daemon_noupdate) == 0) {
+            log(LOG_NOTICE, "Disabling configuration updates\n");
+            daemon_options |= daemon_noupdate;
+        }
+        return;
+    }
+    /*
+     * We have a problem here: we want to unlock the
+     * configuration, which implies tidying up, but
+     * if we find an error while tidying up, we
+     * could recurse for ever.  Use this kludge to
+     * only try once.
+     */
+    was_finishing = finishing;
+    finishing = 1;
+    finish_config(was_finishing); /* unlock anything we may be holding */
+    finishing = was_finishing;
+    longjmp(command_fail, error);
+}
+
+/*
+ * Look for plex number plexno among the plexes of volume volno.
+ * Returns its position in the volume's plex table, or -1 if the
+ * volume does not own the plex.
+ */
+int
+my_plex(int volno, int plexno)
+{
+    struct volume *vol = &VOL[volno];
+    int slot = 0;
+
+    while (slot < vol->plexes) {
+        if (vol->plex[slot] == plexno)
+            return slot; /* found: this is its position */
+        slot++;
+    }
+    return -1; /* the volume doesn't have it */
+}
+
+/*
+ * Look for subdisk number sdno among the subdisks of plex plexno.
+ * Returns its position in the plex's sdnos table, or -1 if the plex
+ * does not own the subdisk.
+ */
+int
+my_sd(int plexno, int sdno)
+{
+    struct plex *plex = &PLEX[plexno];
+    int slot = 0;
+
+    while (slot < plex->subdisks) {
+        if (plex->sdnos[slot] == sdno)
+            return slot; /* found: this is its position */
+        slot++;
+    }
+    return -1; /* the plex doesn't have it */
+}
+
+/*
+ * Add plex plexno to volume volno if possible.
+ *
+ * If the plex already belongs to the volume, nothing is added.
+ * Otherwise it is appended to the volume's plex table, the plex is
+ * told which volume it belongs to, and the volume size is raised to
+ * the length of its longest plex.  If the volume already has plexes
+ * and VF_CONFIG_SETUPSTATE is not set, the new plex's subdisks are
+ * marked stale first.
+ *
+ * If preferme is non-zero, the plex becomes the volume's preferred
+ * plex.
+ *
+ * Returns the index of the plex in the volume's plex table.  If the
+ * table is full, throw_rude_remark(ENOSPC) is called; should that
+ * return (it does while the configuration is being read from disk),
+ * -1 is returned and the volume is left unchanged.
+ */
+int
+give_plex_to_volume(int volno, int plexno, int preferme)
+{
+    struct volume *vol;
+    int i;
+    int volplexno;
+
+    /*
+     * It's not an error for the plex to already
+     * belong to the volume, but we need to check a
+     * number of things to make sure it's done right.
+     * Some day.
+     */
+    volplexno = my_plex(volno, plexno);
+    vol = &VOL[volno]; /* point to volume */
+    if (volplexno < 0) {
+        if (vol->plexes == MAXPLEX) { /* all plexes allocated */
+            throw_rude_remark(ENOSPC,
+                "Too many plexes for volume %s",
+                vol->name);
+            /*
+             * throw_rude_remark comes back here when reading the
+             * config from disk.  Don't store into plex[MAXPLEX],
+             * which is beyond the end of the table.
+             */
+            return -1;
+        }
+        if ((vol->plexes > 0) /* we have other plexes */
+            &&((vol->flags & VF_CONFIG_SETUPSTATE) == 0)) /* and we're not setting up state */
+            invalidate_subdisks(&PLEX[plexno], sd_stale); /* make our subdisks invalid */
+        vol->plex[vol->plexes] = plexno; /* this one */
+        vol->plexes++; /* add another plex */
+        PLEX[plexno].volno = volno; /* note the number of our volume */
+
+        /* Find out how big our volume is */
+        for (i = 0; i < vol->plexes; i++)
+            vol->size = max(vol->size, PLEX[vol->plex[i]].length);
+        volplexno = vol->plexes - 1; /* number of plex in volume */
+    }
+    if (preferme) {
+        if (vol->preferred_plex >= 0) /* already had a favourite, */
+            printf("vinum: changing preferred plex for %s from %s to %s\n",
+                vol->name,
+                PLEX[vol->plex[vol->preferred_plex]].name,
+                PLEX[plexno].name);
+        vol->preferred_plex = volplexno;
+    }
+    return volplexno;
+}
+
+/*
+ * Add subdisk sdno to plex plexno if possible.
+ *
+ * If the subdisk already belongs to the plex, its existing index is
+ * returned and nothing changes.  Otherwise:
+ *  - a subdisk without a plex offset (plexoffset < 0) is placed
+ *    after the last subdisk (concat) or at stripesize * subdisks
+ *    (other organizations), or at 0 if it is the first one;
+ *  - the plex length, and the size of the owning volume if any, are
+ *    adjusted;
+ *  - the subdisk is inserted into plex->sdnos so that the table
+ *    stays sorted by plex offset.
+ *
+ * Returns the index of the subdisk in plex->sdnos.  A full plex
+ * (MAXSD subdisks) is reported with throw_rude_remark(ENOSPC).
+ */
+int
+give_sd_to_plex(int plexno, int sdno)
+{
+    int i;
+    int j;
+    struct plex *plex;
+    struct sd *sd;
+
+    /*
+     * It's not an error for the sd to already
+     * belong to the plex, but we need to check a
+     * number of things to make sure it's done right.
+     * Some day.
+     */
+    i = my_sd(plexno, sdno);
+    if (i >= 0) /* does it already belong to us? */
+        return i; /* that's it */
+
+    plex = &PLEX[plexno]; /* point to the plex */
+    sd = &SD[sdno]; /* and the subdisk */
+
+    /* Do we have an offset?  Otherwise put it after the last one */
+    if (sd->plexoffset < 0) { /* no offset specified */
+        if (plex->subdisks > 0) {
+            struct sd *lastsd = &SD[plex->sdnos[plex->subdisks - 1]]; /* last subdisk */
+
+            if (plex->organization == plex_concat) /* concat, */
+                sd->plexoffset = lastsd->sectors + lastsd->plexoffset; /* starts here */
+            else /* striped, RAID-4 or RAID-5 */
+                sd->plexoffset = plex->stripesize * plex->subdisks; /* starts here */
+        } else /* first subdisk */
+            sd->plexoffset = 0; /* start at the beginning */
+    }
+    if (plex->subdisks == MAXSD) /* we already have our maximum */
+        throw_rude_remark(ENOSPC, /* crap out */
+            "Can't add %s to %s: plex full",
+            sd->name,
+            plex->name);
+
+    plex->subdisks++; /* another entry */
+    if (plex->subdisks >= plex->subdisks_allocated) /* need more space */
+        EXPAND(plex->sdnos, int, plex->subdisks_allocated, INITIAL_SUBDISKS_IN_PLEX);
+
+    /* Adjust size of plex and volume. */
+    if (isparity(plex)) /* RAID-4 or RAID-5 */
+        plex->length = (plex->subdisks - 1) * sd->sectors; /* size is one disk short */
+    else
+        plex->length += sd->sectors; /* plex gets this much bigger */
+    if (plex->volno >= 0) /* we have a volume */
+        VOL[plex->volno].size = max(VOL[plex->volno].size, plex->length); /* adjust its size */
+
+    /*
+     * We need to check that the subdisks don't overlap,
+     * but we can't do that until a point where we *must*
+     * know the size of all the subdisks.  That's not
+     * here.  But we need to sort them by offset.
+     *
+     * Find the first existing subdisk with a larger offset.  If
+     * there is none, i ends up at the new last slot.
+     */
+    for (i = 0; i < plex->subdisks - 1; i++) {
+        if (sd->plexoffset < SD[plex->sdnos[i]].plexoffset) /* it fits before this one */
+            break;
+    }
+
+    /*
+     * Make room: move the following subdisks up by one slot, and
+     * keep their own idea of their position in step with the table.
+     */
+    for (j = plex->subdisks - 1; j > i; j--) {
+        plex->sdnos[j] = plex->sdnos[j - 1];
+        SD[plex->sdnos[j]].plexsdno = j;
+    }
+
+    plex->sdnos[i] = sdno;
+    sd->plexsdno = i; /* note where we are in the plex */
+    sd->plexno = plex->plexno; /* and who we belong to, whichever slot we got */
+    return i;
+}
+
+/*
+ * Add a subdisk to drive if possible.  The
+ * pointer to the drive must already be stored in
+ * the sd structure, but the drive doesn't know
+ * about the subdisk yet.
+ *
+ * Space is taken from the drive's free list in one of three ways:
+ *  - sd->sectors == 0: take the largest free chunk;
+ *  - sd->driveoffset < 0: take the first chunk that is big enough;
+ *  - otherwise: take exactly [driveoffset, driveoffset + sectors),
+ *    which must lie inside a single free list entry.
+ *
+ * If the drive is not up, only update_sd_state is called.  Lack of
+ * space is reported with throw_rude_remark(ENOSPC); should that
+ * return, this function returns without attaching the subdisk.
+ */
+void
+give_sd_to_drive(int sdno)
+{
+    struct sd *sd; /* pointer to subdisk */
+    struct drive *drive; /* and drive */
+    int fe; /* index in free list */
+    int sfe; /* and index of subdisk when assigning max */
+    char sdname[sizeof(sd->name)]; /* private copy: free_sd zeroes the sd */
+
+    sd = &SD[sdno]; /* point to sd */
+    drive = &DRIVE[sd->driveno]; /* and drive */
+    strlcpy(sdname, sd->name, sizeof(sdname)); /* keep the name for error messages */
+
+    if (drive->state != drive_up) {
+        update_sd_state(sdno); /* that crashes the subdisk */
+        return;
+    }
+    sd->sectorsize = drive->sectorsize; /* get sector size from drive */
+    if (drive->flags & VF_HOTSPARE) /* the drive is a hot spare, */
+        throw_rude_remark(ENOSPC,
+            "Can't place %s on hot spare drive %s",
+            sd->name,
+            drive->label.name);
+    if ((drive->sectors_available == 0) /* no space left */
+        ||(sd->sectors > drive->sectors_available)) { /* or too big, */
+        sd->driveoffset = -1; /* don't be confusing */
+        free_sd(sd->sdno);
+        throw_rude_remark(ENOSPC, "No space for %s on %s", sdname, drive->label.name);
+        return; /* in case we come back here */
+    }
+    /*
+     * NOTE(review): subdisks_used is not decremented again on the
+     * failure paths below; confirm whether that is intended.
+     */
+    drive->subdisks_used++; /* one more subdisk */
+
+    if (sd->sectors == 0) { /* take the largest chunk */
+        sfe = 0; /* to keep the compiler happy */
+        for (fe = 0; fe < drive->freelist_entries; fe++) {
+            if (drive->freelist[fe].sectors >= sd->sectors) { /* more space here */
+                sd->sectors = drive->freelist[fe].sectors; /* take it */
+                sd->driveoffset = drive->freelist[fe].offset;
+                sfe = fe; /* and note the index for later */
+            }
+        }
+        if (sd->sectors == 0) { /* no luck, */
+            sd->driveoffset = -1; /* don't be confusing */
+            free_sd(sd->sdno);
+            throw_rude_remark(ENOSPC, /* give up */
+                "No space for %s on %s",
+                sdname,
+                drive->label.name);
+            return; /* don't touch the free list for a freed sd */
+        }
+        /* The whole entry is used up: close the gap behind it */
+        if (sfe < (drive->freelist_entries - 1)) /* not the last one, */
+            bcopy(&drive->freelist[sfe + 1],
+                &drive->freelist[sfe],
+                (drive->freelist_entries - sfe - 1) * sizeof(struct drive_freelist));
+        drive->freelist_entries--; /* one less entry */
+        drive->sectors_available -= sd->sectors; /* and note how much less space we have */
+    } else if (sd->driveoffset < 0) { /* no offset specified, find one */
+        for (fe = 0; fe < drive->freelist_entries; fe++) {
+            if (drive->freelist[fe].sectors >= sd->sectors) { /* it'll fit here */
+                sd->driveoffset = drive->freelist[fe].offset;
+                if (sd->sectors == drive->freelist[fe].sectors) { /* used up the entire entry */
+                    if (fe < (drive->freelist_entries - 1)) /* not the last one, */
+                        bcopy(&drive->freelist[fe + 1],
+                            &drive->freelist[fe],
+                            (drive->freelist_entries - fe - 1) * sizeof(struct drive_freelist));
+                    drive->freelist_entries--; /* one less entry */
+                } else {
+                    drive->freelist[fe].sectors -= sd->sectors; /* this much less space */
+                    drive->freelist[fe].offset += sd->sectors; /* this much further on */
+                }
+                drive->sectors_available -= sd->sectors; /* and note how much less space we have */
+                break;
+            }
+        }
+        if (sd->driveoffset < 0) {
+            /*
+             * Didn't find anything.  Although the drive has
+             * enough space, it's too fragmented
+             */
+            free_sd(sd->sdno);
+            throw_rude_remark(ENOSPC, "No space for %s on %s", sdname, drive->label.name);
+            return; /* in case we come back here */
+        }
+    } else { /* specific offset */
+        /*
+         * For a specific offset to work, the space must be
+         * entirely in a single freelist entry.  Look for it.
+         *
+         * NOTE(review): if no entry ends at or after the end of
+         * the subdisk, the loop finishes without allocating
+         * anything and without reporting an error.
+         */
+        u_int64_t sdend = sd->driveoffset + sd->sectors; /* end of our subdisk */
+        for (fe = 0; fe < drive->freelist_entries; fe++) {
+            u_int64_t dend = drive->freelist[fe].offset + drive->freelist[fe].sectors; /* end of entry */
+            if (dend >= sdend) { /* fits before here */
+                if (drive->freelist[fe].offset > sd->driveoffset) { /* starts after the beginning of sd area */
+                    long long wanted = (long long) sd->driveoffset; /* report what was asked for */
+
+                    sd->driveoffset = -1; /* don't be confusing */
+                    set_sd_state(sd->sdno, sd_down, setstate_force);
+                    throw_rude_remark(ENOSPC,
+                        "No space for %s on drive %s at offset %lld",
+                        sdname,
+                        drive->label.name,
+                        wanted);
+                    return;
+                }
+                /*
+                 * We've found the space, and we can allocate it.
+                 * We don't need to say that to the subdisk, which
+                 * already knows about it.  We need to tell it to
+                 * the free list, though.  We have four possibilities:
+                 *
+                 * 1. The subdisk exactly eats up the entry.  That's the
+                 *    same as above.
+                 * 2. The subdisk starts at the beginning and leaves space
+                 *    at the end.
+                 * 3. The subdisk starts after the beginning and leaves
+                 *    space at the end as well: we end up with another
+                 *    fragment.
+                 * 4. The subdisk leaves space at the beginning and finishes
+                 *    at the end.
+                 */
+                drive->sectors_available -= sd->sectors; /* note how much less space we have */
+                if (sd->driveoffset == drive->freelist[fe].offset) { /* 1 or 2 */
+                    if (sd->sectors == drive->freelist[fe].sectors) { /* 1: used up the entire entry */
+                        if (fe < (drive->freelist_entries - 1)) /* not the last one, */
+                            bcopy(&drive->freelist[fe + 1],
+                                &drive->freelist[fe],
+                                (drive->freelist_entries - fe - 1) * sizeof(struct drive_freelist));
+                        drive->freelist_entries--; /* one less entry */
+                    } else { /* 2: space at the end */
+                        drive->freelist[fe].sectors -= sd->sectors; /* this much less space */
+                        drive->freelist[fe].offset += sd->sectors; /* this much further on */
+                    }
+                } else { /* 3 or 4 */
+                    drive->freelist[fe].sectors = sd->driveoffset - drive->freelist[fe].offset;
+                    if (dend > sdend) { /* 3: space at the end as well */
+                        /*
+                         * NOTE(review): nothing here checks that the
+                         * free list has room for one more entry.
+                         */
+                        if (fe < (drive->freelist_entries - 1)) /* not the last one */
+                            bcopy(&drive->freelist[fe], /* move the rest up */
+                                &drive->freelist[fe + 1],
+                                (drive->freelist_entries - fe) * sizeof(struct drive_freelist));
+                        drive->freelist_entries++; /* one more entry */
+                        drive->freelist[fe + 1].offset = sdend; /* second entry starts after sd */
+                        drive->freelist[fe + 1].sectors = dend - sdend; /* and is this long */
+                    }
+                }
+                break;
+            }
+        }
+    }
+    drive->opencount++; /* one more subdisk attached */
+}
+
+/*
+ * Get an empty drive entry from the drive table.
+ *
+ * An entry in state drive_unallocated is reused if there is one;
+ * otherwise the table is expanded.  The entry is zeroed, given its
+ * own index, flagged VF_NEWBORN and named "unknown".
+ * Returns the index of the entry.
+ */
+int
+get_empty_drive(void)
+{
+    int driveno = 0;
+    struct drive *drive;
+
+    /* skip over the entries which are in use */
+    while ((driveno < vinum_conf.drives_allocated)
+        && (DRIVE[driveno].state != drive_unallocated))
+        driveno++;
+
+    if (driveno >= vinum_conf.drives_allocated) /* nothing free: grow the table */
+        EXPAND(DRIVE, struct drive, vinum_conf.drives_allocated, INITIAL_DRIVES);
+
+    /* start with a clean entry */
+    drive = &DRIVE[driveno];
+    bzero(drive, sizeof(*drive));
+    drive->driveno = driveno; /* the entry knows its own index */
+    drive->flags |= VF_NEWBORN; /* just created */
+    strcpy(drive->devicename, "unknown"); /* no device name yet */
+    return driveno;
+}
+
+/*
+ * Look up a drive by its vinum (label) name.
+ *
+ * Returns the index in vinum_conf.drive of an allocated drive with
+ * that name.  If there is none (or name is NULL) and create is 0,
+ * returns -1.  If create is non-zero, a fresh entry is obtained from
+ * get_empty_drive, given the name (if any) and put into state
+ * drive_referenced, and its index is returned.
+ */
+/* XXX check if we have it open from attach */
+int
+find_drive(const char *name, int create)
+{
+    int driveno;
+    struct drive *drive;
+
+    if (name != NULL) {
+        for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) {
+            drive = &DRIVE[driveno];
+            if (drive->state <= drive_unallocated) /* not a real drive */
+                continue;
+            if (drive->label.name[0] == '\0') /* it has no name */
+                continue;
+            if (strcmp(drive->label.name, name) == 0) /* this is the one */
+                return driveno;
+        }
+    }
+    /* not known: only carry on if the caller wants a new entry */
+    if (create == 0)
+        return -1;
+
+    driveno = get_empty_drive();
+    drive = &DRIVE[driveno];
+    if (name != NULL)
+        strlcpy(drive->label.name, name, sizeof(drive->label.name));
+    drive->state = drive_referenced; /* in use, nothing worthwhile there */
+    return driveno;
+}
+
+/*
+ * Find a drive given its device name.
+ * devname must be valid.
+ *
+ * Returns the index in vinum_conf.drive of an allocated drive whose
+ * device name matches.  If there is none and create is 0, returns
+ * -1.  If create is non-zero, a fresh entry is obtained from
+ * get_empty_drive, given the device name and put into state
+ * drive_referenced, and its index is returned.
+ */
+int
+find_drive_by_name(const char *devname, int create)
+{
+    int driveno;
+    struct drive *drive;
+
+    for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) {
+        drive = &DRIVE[driveno]; /* point to drive */
+        if ((strcmp(drive->devicename, devname) == 0) /* it's this device */
+            &&(drive->state > drive_unallocated)) /* and it's a real one: found */
+            return driveno;
+    }
+
+    /* the drive isn't in the list.  Add it if he wants */
+    if (create == 0) /* don't want to create */
+        return -1; /* give up */
+
+    driveno = get_empty_drive();
+    drive = &DRIVE[driveno];
+    /*
+     * get_empty_drive has preset the device name to "unknown", so
+     * the copy must write a terminator: otherwise a shorter name
+     * keeps the tail of "unknown", and an over-long one is left
+     * unterminated.  strlcpy truncates and always terminates.
+     */
+    strlcpy(drive->devicename, devname, sizeof(drive->devicename));
+    drive->state = drive_referenced; /* in use, nothing worthwhile there */
+    return driveno; /* return the index */
+}
+
+/*
+ * Find an empty subdisk in the subdisk table.
+ *
+ * An entry in state sd_unallocated is reused if there is one;
+ * otherwise the table is expanded.  The entry is zeroed, flagged
+ * VF_NEWBORN, and its plex, drive, size and offsets are set to -1
+ * ("none").  Returns the index of the entry.
+ */
+int
+get_empty_sd(void)
+{
+    int sdno = 0;
+    struct sd *sd;
+
+    /* skip over the entries which are in use */
+    while ((sdno < vinum_conf.subdisks_allocated)
+        && (SD[sdno].state != sd_unallocated))
+        sdno++;
+
+    if (sdno >= vinum_conf.subdisks_allocated)
+        /*
+         * No free entry.  sdno is the index we want,
+         * but the table doesn't reach that far yet.
+         * Make it bigger.
+         *
+         * XXX We should check for overflow here.  We
+         * shouldn't allocate more than VINUM_MAXSD
+         * subdisks (currently at least a quarter of a
+         * million).
+         */
+        EXPAND(SD, struct sd, vinum_conf.subdisks_allocated, INITIAL_SUBDISKS);
+
+    /* start with a clean entry */
+    sd = &SD[sdno];
+    bzero(sd, sizeof(*sd));
+    sd->flags |= VF_NEWBORN; /* just created */
+    sd->driveoffset = -1; /* no offsets, */
+    sd->plexoffset = -1;
+    sd->driveno = -1; /* no drive, */
+    sd->sectors = -1; /* no space */
+    sd->plexno = -1; /* and no plex */
+    return sdno;
+}
+
+/*
+ * Return a drive to the free pool.
+ *
+ * Nothing happens unless the drive is beyond state drive_referenced
+ * or is flagged open.  Otherwise, with the drive locked: close it if
+ * it is open, release its free list, and zero the whole entry.
+ */
+void
+free_drive(struct drive *drive)
+{
+    if ((drive->state <= drive_referenced) /* not a real drive */
+        &&((drive->flags & VF_OPEN) == 0)) /* and not open either */
+        return; /* nothing to release */
+
+    LOCKDRIVE(drive);
+    if (drive->flags & VF_OPEN) { /* still open: */
+        close_locked_drive(drive); /* close it */
+        drive->state = drive_down; /* and say so */
+    }
+    if (drive->freelist)
+        Free(drive->freelist);
+    bzero(drive, sizeof(*drive)); /* this also sets drive_unallocated */
+    unlockdrive(drive);
+}
+
+/*
+ * Find the named subdisk in vinum_conf.sd.
+ *
+ * If create != 0, create an entry if it doesn't exist: it is
+ * obtained from get_empty_sd and given the name.
+ *
+ * Return index in vinum_conf.sd, or -1 if the subdisk is not known
+ * and create is 0.
+ */
+int
+find_subdisk(const char *name, int create)
+{
+    int sdno;
+    struct sd *sd;
+
+    for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) {
+        if (strcmp(SD[sdno].name, name) == 0) /* found it */
+            return sdno;
+    }
+
+    /* the subdisk isn't in the list.  Add it if he wants */
+    if (create == 0) /* don't want to create */
+        return -1; /* give up */
+
+    /* Allocate one and insert the name */
+    sdno = get_empty_sd();
+    sd = &SD[sdno];
+    /*
+     * strlcpy always terminates.  Copying strlen(name) bytes with
+     * bcopy left a name which fills the field without a NUL, so
+     * later strcmp calls would run off the end.
+     */
+    strlcpy(sd->name, name, sizeof(sd->name)); /* put in its name */
+    return sdno; /* return the index */
+}
+
+/*
+ * Return space to a drive.
+ *
+ * The area of length sectors starting at offset on drive driveno is
+ * put back into the drive's free list, which is kept sorted by
+ * offset, and merged with the free blocks on either side where they
+ * touch.  sectors_available is increased by length.  Nothing happens
+ * unless the drive is up.
+ *
+ * NOTE(review): nothing here checks that the free list has room when
+ * a new entry has to be inserted.
+ */
+void
+return_drive_space(int driveno, int64_t offset, int length)
+{
+    struct drive *drive;
+    int fe; /* free list entry */
+    u_int64_t sdend; /* end of our subdisk */
+    u_int64_t dend; /* end of our freelist entry */
+
+    drive = &DRIVE[driveno];
+    if (drive->state != drive_up)
+        return;
+
+    sdend = offset + length; /* end of our subdisk */
+
+    if (drive->freelist_entries == 0) {
+        /*
+         * No free space at all, so there is nothing to merge with
+         * and entry 0 holds no valid data to compare against: the
+         * returned area becomes the only entry.
+         * NOTE(review): assumes the free list array is allocated
+         * for a drive which is up.
+         */
+        drive->freelist[0].offset = offset;
+        drive->freelist[0].sectors = length;
+        drive->freelist_entries = 1;
+        drive->sectors_available += length;
+        return;
+    }
+    /* Look for where to return the sd address space */
+    for (fe = 0;
+        (fe < drive->freelist_entries) && (drive->freelist[fe].offset < offset);
+        fe++);
+    /*
+     * Now we are pointing to the last entry, the first
+     * with a higher offset than the subdisk, or both.
+     * Step back to the block before, so that we can see whether
+     * the freed area touches its end.  This must also happen when
+     * the block before is the very first entry.
+     */
+    if ((fe > 0) /* not the first entry */
+        &&((fe == drive->freelist_entries) /* gone past the end */
+            ||(drive->freelist[fe].offset > offset))) /* or past the block we're looking for */
+        fe--; /* point to the block before */
+    dend = drive->freelist[fe].offset + drive->freelist[fe].sectors; /* end of the entry */
+
+    /*
+     * At this point, we are pointing to the correct
+     * place in the free list.  A number of possibilities
+     * exist:
+     *
+     * 1. The block to be freed starts at the end of the
+     *    block to which we are pointing.  This has two
+     *    subcases:
+     *
+     *    a. The block to be freed ends at the beginning
+     *       of the following block.  Merge the three
+     *       areas into a single block.
+     *
+     *    b. The block is shorter than the space between
+     *       the current block and the next one.  Enlarge
+     *       the current block.
+     *
+     * 2. The block to be freed does not start at the end of
+     *    the block.  Again, we have two cases:
+     *
+     *    a. It ends before the start of the following block.
+     *       Create a new free block.
+     *
+     *    b. It ends at the start of the following block.
+     *       Enlarge the following block downwards.
+     */
+    if (offset == dend) { /* Case 1: it starts at the end of this block */
+        if ((fe < drive->freelist_entries - 1) /* we're not the last block in the free list */
+            /* and the subdisk ends at the start of the next block */
+            &&(sdend == drive->freelist[fe + 1].offset)) {
+            /*
+             * 1a: merge all three blocks.  The result covers this
+             * block, the freed area and the next block.
+             */
+            drive->freelist[fe].sectors += length + drive->freelist[fe + 1].sectors;
+            if (fe < drive->freelist_entries - 2) /* still more blocks after next */
+                bcopy(&drive->freelist[fe + 2], /* move down one */
+                    &drive->freelist[fe + 1],
+                    (drive->freelist_entries - 2 - fe)
+                    * sizeof(struct drive_freelist));
+            drive->freelist_entries--; /* one less entry in the free list */
+        } else /* 1b: just enlarge this block */
+            drive->freelist[fe].sectors += length;
+    } else { /* Case 2 */
+        if (offset > dend) /* it starts after this block */
+            fe++; /* so look at the next block */
+        if ((fe < drive->freelist_entries) /* we're not past the last block in the free list */
+            /* and the subdisk ends at the start of this block: 2b */
+            &&(sdend == drive->freelist[fe].offset)) {
+            drive->freelist[fe].offset = offset; /* it starts where the sd was */
+            drive->freelist[fe].sectors += length; /* and it's this much bigger */
+        } else { /* 2a: non-contiguous */
+            if (fe < drive->freelist_entries) /* not after the last block, */
+                bcopy(&drive->freelist[fe], /* move the rest up one entry */
+                    &drive->freelist[fe + 1],
+                    (drive->freelist_entries - fe)
+                    * sizeof(struct drive_freelist));
+            drive->freelist_entries++; /* one more entry */
+            drive->freelist[fe].offset = offset; /* this entry represents the sd */
+            drive->freelist[fe].sectors = length;
+        }
+    }
+    drive->sectors_available += length; /* the sectors are now available */
+}
+
+/*
+ * Free an allocated sd entry.
+ * This performs memory management only.  remove()
+ * is responsible for checking relationships.
+ *
+ * The space of the subdisk is returned to its drive, but only if it
+ * was actually allocated there: callers which fail to find space set
+ * driveoffset to -1 before freeing the subdisk, and that space must
+ * not be added to the drive's free list.  The subdisk count of the
+ * owning plex (if any) and vinum_conf.subdisks_used are decremented,
+ * the device is destroyed, and the entry is cleared and marked
+ * sd_unallocated.
+ */
+void
+free_sd(int sdno)
+{
+    struct sd *sd;
+
+    sd = &SD[sdno];
+    if ((sd->driveno >= 0) /* we have a drive, */
+        &&(sd->driveoffset >= 0) /* a place on it */
+        &&(sd->sectors > 0)) /* and some space there */
+        return_drive_space(sd->driveno, /* return the space */
+            sd->driveoffset,
+            sd->sectors);
+    if (sd->plexno >= 0)
+        PLEX[sd->plexno].subdisks--; /* one less subdisk */
+    destroy_dev(sd->dev);
+    bzero(sd, sizeof(struct sd)); /* and clear it out */
+    sd->state = sd_unallocated;
+    vinum_conf.subdisks_used--; /* one less sd */
+}
+
+/*
+ * Find an empty plex in the plex table.
+ *
+ * An entry in state plex_unallocated is reused if there is one;
+ * otherwise the table is expanded.  The entry is zeroed and given a
+ * zeroed subdisk table with room for INITIAL_SUBDISKS_IN_PLEX
+ * entries.  It is flagged VF_NEWBORN, has organization plex_disorg
+ * and belongs to no volume.  Returns the index of the entry.
+ */
+int
+get_empty_plex(void)
+{
+    int plexno = 0;
+    struct plex *plex;
+
+    /* skip over the entries which are in use */
+    while ((plexno < vinum_conf.plexes_allocated)
+        && (PLEX[plexno].state != plex_unallocated))
+        plexno++;
+
+    if (plexno >= vinum_conf.plexes_allocated) /* nothing free: grow the table */
+        EXPAND(PLEX, struct plex, vinum_conf.plexes_allocated, INITIAL_PLEXES);
+
+    /* start with a clean entry and give it a subdisk table */
+    plex = &PLEX[plexno];
+    bzero(plex, sizeof(*plex));
+    plex->sdnos = (int *) Malloc(sizeof(int) * INITIAL_SUBDISKS_IN_PLEX);
+    CHECKALLOC(plex->sdnos, "vinum: Can't allocate plex subdisk table");
+    bzero(plex->sdnos, (sizeof(int) * INITIAL_SUBDISKS_IN_PLEX)); /* do we need this? */
+    plex->flags |= VF_NEWBORN; /* just created */
+    plex->subdisks = 0; /* table is empty */
+    plex->subdisks_allocated = INITIAL_SUBDISKS_IN_PLEX; /* but has this much room */
+    plex->organization = plex_disorg; /* no organization yet */
+    plex->volno = -1; /* and no volume */
+    return plexno;
+}
+
+/*
+ * Find the named plex in vinum_conf.plex
+ *
+ * If create != 0, create an entry if it doesn't exist: it is
+ * obtained from get_empty_plex and given the name.
+ *
+ * Return index in vinum_conf.plex, or -1 if the plex is not known
+ * and create is 0.
+ */
+int
+find_plex(const char *name, int create)
+{
+    int plexno;
+    struct plex *plex;
+
+    for (plexno = 0; plexno < vinum_conf.plexes_allocated; plexno++) {
+        if (strcmp(PLEX[plexno].name, name) == 0) /* found it */
+            return plexno;
+    }
+
+    /* the plex isn't in the list.  Add it if he wants */
+    if (create == 0) /* don't want to create */
+        return -1; /* give up */
+
+    /* Allocate one and insert the name */
+    plexno = get_empty_plex();
+    plex = &PLEX[plexno]; /* point to it */
+    /*
+     * strlcpy always terminates.  Copying strlen(name) bytes with
+     * bcopy left a name which fills the field without a NUL, so
+     * later strcmp calls would run off the end.
+     */
+    strlcpy(plex->name, name, sizeof(plex->name)); /* put in its name */
+    return plexno; /* return the index */
+}
+
+/*
+ * Release plex entry plexno: free its subdisk table and its lock
+ * area if they exist, destroy its device, then clear the entry and
+ * mark it plex_unallocated.
+ */
+void
+free_plex(int plexno)
+{
+    struct plex *const plex = &PLEX[plexno];
+
+    if (plex->sdnos != NULL) /* subdisk table */
+        Free(plex->sdnos);
+    if (plex->lock != NULL) /* lock area */
+        Free(plex->lock);
+    destroy_dev(plex->dev);
+    bzero(plex, sizeof(*plex)); /* wipe the entry */
+    plex->state = plex_unallocated;
+}
+
+/*
+ * Find an empty volume in the volume table.
+ *
+ * An entry in state volume_unallocated is reused if there is one;
+ * otherwise the table is expanded.  The entry is zeroed, flagged
+ * VF_NEWBORN and VF_CREATED, set to round-robin read policy, and all
+ * of its plex slots are set to -1.  Returns the index of the entry.
+ */
+int
+get_empty_volume(void)
+{
+    int volno = 0;
+    int slot;
+    struct volume *vol;
+
+    /* skip over the entries which are in use */
+    while ((volno < vinum_conf.volumes_allocated)
+        && (VOL[volno].state != volume_unallocated))
+        volno++;
+
+    if (volno >= vinum_conf.volumes_allocated) /* nothing free: grow the table */
+        EXPAND(VOL, struct volume, vinum_conf.volumes_allocated, INITIAL_VOLUMES);
+
+    /* start with a clean entry */
+    vol = &VOL[volno];
+    bzero(vol, sizeof(*vol));
+    vol->flags |= VF_NEWBORN | VF_CREATED; /* just created */
+    vol->preferred_plex = ROUND_ROBIN_READPOL; /* no preferred plex */
+    for (slot = MAXPLEX - 1; slot >= 0; slot--) /* no plexes yet */
+        vol->plex[slot] = -1;
+    return volno;
+}
+
+/*
+ * Find the named volume in vinum_conf.volume.
+ *
+ * If create != 0, create an entry if it doesn't exist: it is
+ * obtained from get_empty_volume, given the name and a block size of
+ * DEV_BSIZE.
+ *
+ * Return the index in vinum_conf.volume, or -1 if the volume is not
+ * known and create is 0.
+ */
+int
+find_volume(const char *name, int create)
+{
+    int volno;
+    struct volume *vol;
+
+    for (volno = 0; volno < vinum_conf.volumes_allocated; volno++) {
+        if (strcmp(VOL[volno].name, name) == 0) /* found it */
+            return volno;
+    }
+
+    /* the volume isn't in the list.  Add it if he wants */
+    if (create == 0) /* don't want to create */
+        return -1; /* give up */
+
+    /* Allocate one and insert the name */
+    volno = get_empty_volume();
+    vol = &VOL[volno];
+    /*
+     * strlcpy always terminates.  Copying strlen(name) bytes with
+     * bcopy left a name which fills the field without a NUL, so
+     * later strcmp calls would run off the end.
+     */
+    strlcpy(vol->name, name, sizeof(vol->name)); /* put in its name */
+    vol->blocksize = DEV_BSIZE; /* block size of this volume */
+    return volno; /* return the index */
+}
+
+/*
+ * Release volume entry volno: destroy its device, then clear the
+ * entry and mark it volume_unallocated.
+ */
+void
+free_volume(int volno)
+{
+    struct volume *const vol = &VOL[volno];
+
+    destroy_dev(vol->dev);
+    bzero(vol, sizeof(*vol)); /* wipe the entry */
+    vol->state = volume_unallocated;
+}
+
+/*
+ * Handle a drive definition. We store the information in the global variable
+ * drive, so we don't need to allocate.
+ *
+ * If we find an error, print a message and return
+ */
+void
+config_drive(int update)
+{
+ enum drive_label_info partition_status; /* info about the partition */
+ int parameter;
+ int driveno; /* index of drive in vinum_conf */
+ struct drive *drive; /* and pointer to it */
+ int otherdriveno; /* index of possible second drive */
+ int sdno;
+
+ if (tokens < 2) /* not enough tokens */
+ throw_rude_remark(EINVAL, "Drive has no name\n");
+ driveno = find_drive(token[1], 1); /* allocate a drive to initialize */
+ drive = &DRIVE[driveno]; /* and get a pointer */
+ if (update && ((drive->flags & VF_NEWBORN) == 0)) /* this drive exists already */
+ return; /* don't do anything */
+ drive->flags &= ~VF_NEWBORN; /* no longer newly born */
+
+ if (drive->state != drive_referenced) { /* we already know this drive */
+ /*
+ * XXX Check which definition is more up-to-date. Give
+ * preference for the definition on its own drive.
+ */
+ return; /* XXX */
+ }
+ for (parameter = 2; parameter < tokens; parameter++) { /* look at the other tokens */
+ switch (get_keyword(token[parameter], &keyword_set)) {
+ case kw_device:
+ parameter++;
+ otherdriveno = find_drive_by_name(token[parameter], 0); /* see if it exists already */
+ if (otherdriveno >= 0) { /* yup, */
+ drive->state = drive_unallocated; /* deallocate the drive */
+ throw_rude_remark(EEXIST, /* and complain */
+ "Drive %s would have same device as drive %s",
+ token[1],
+ DRIVE[otherdriveno].label.name);
+ }
+ if (drive->devicename[0] == '/') { /* we know this drive... */
+ if (strcmp(drive->devicename, token[parameter])) /* different name */
+ close_drive(drive); /* close it if it's open */
+ else /* no change */
+ break;
+ }
+ /* open the device and get the configuration */
+ bcopy(token[parameter], /* insert device information */
+ drive->devicename,
+ min(sizeof(drive->devicename),
+ strlen(token[parameter])));
+ partition_status = read_drive_label(drive, 1);
+ switch (partition_status) {
+ case DL_CANT_OPEN: /* not our kind */
+ close_drive(drive);
+ if (drive->lasterror == EFTYPE) /* wrong kind of partition */
+ throw_rude_remark(drive->lasterror,
+ "Drive %s has invalid partition type",
+ drive->label.name);
+ else /* I/O error of some kind */
+ throw_rude_remark(drive->lasterror,
+ "Can't initialize drive %s",
+ drive->label.name);
+ break;
+
+ case DL_WRONG_DRIVE: /* valid drive, not the name we expected */
+ if (vinum_conf.flags & VF_FORCECONFIG) { /* but we'll accept that */
+ bcopy(token[1], drive->label.name, sizeof(drive->label.name));
+ break;
+ }
+ close_drive(drive);
+ /*
+ * There's a potential race condition here:
+ * the rude remark refers to a field in an
+ * unallocated drive, which potentially could
+ * be reused. This works because we're the only
+ * thread accessing the config at the moment.
+ */
+ drive->state = drive_unallocated; /* throw it away completely */
+ throw_rude_remark(drive->lasterror,
+ "Incorrect drive name %s specified for drive %s",
+ token[1],
+ drive->label.name);
+ break;
+
+ case DL_DELETED_LABEL: /* it was a drive, but we deleted it */
+ case DL_NOT_OURS: /* nothing to do with the rest */
+ case DL_OURS:
+ break;
+ }
+ /*
+ * read_drive_label overwrites the device name.
+ * If we get here, we can have the drive,
+ * so put it back again
+ */
+ bcopy(token[parameter],
+ drive->devicename,
+ min(sizeof(drive->devicename),
+ strlen(token[parameter])));
+ break;
+
+ case kw_state:
+ parameter++; /* skip the keyword */
+ if (vinum_conf.flags & VF_READING_CONFIG)
+ drive->state = DriveState(token[parameter]); /* set the state */
+ break;
+
+ case kw_hotspare: /* this drive is a hot spare */
+ drive->flags |= VF_HOTSPARE;
+ break;
+
+ default:
+ close_drive(drive);
+ throw_rude_remark(EINVAL,
+ "Drive %s, invalid keyword: %s",
+ token[1],
+ token[parameter]);
+ }
+ }
+
+ if (drive->devicename[0] != '/') {
+ drive->state = drive_unallocated; /* deallocate the drive */
+ throw_rude_remark(EINVAL, "No device name for %s", drive->label.name);
+ }
+ vinum_conf.drives_used++; /* passed all hurdles: one more in use */
+ /*
+ * If we're replacing a drive, it could be that
+ * we already have subdisks referencing this
+ * drive. Note where they should be and change
+ * their state to obsolete.
+ */
+ for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) {
+ if ((SD[sdno].state > sd_referenced)
+ && (SD[sdno].driveno == driveno)) {
+ give_sd_to_drive(sdno);
+ if (SD[sdno].state > sd_stale)
+ SD[sdno].state = sd_stale;
+ }
+ }
+}
+
+/*
+ * Handle a subdisk definition.  A fresh entry is obtained from
+ * get_empty_sd() and filled in from the tokens of the current
+ * configuration line (token[1] .. token[tokens - 1]).
+ *
+ * update: if non-zero, a line naming a subdisk which exists
+ * already is silently ignored instead of being rejected as a
+ * duplicate.
+ *
+ * On error throw a message back to the caller with
+ * throw_rude_remark.
+ * NOTE(review): the code relies on throw_rude_remark not
+ * returning to its caller -- confirm against its definition.
+ */
+void
+config_subdisk(int update)
+{
+    int parameter;
+    int sdno;                               /* index of sd in vinum_conf */
+    struct sd *sd;                          /* and pointer to it */
+    u_int64_t size;
+    int detached = 0;                       /* set to 1 if this is a detached subdisk */
+    int sdindex = -1;                       /* index in plexes subdisk table */
+    enum sdstate state = sd_unallocated;    /* state to set, if specified */
+    int autosize = 0;                       /* set if we autosize in give_sd_to_drive */
+    int namedsdno;                          /* index of another with this name */
+    char partition = 0;                     /* partition of external subdisk */
+
+    sdno = get_empty_sd();                  /* allocate an SD to initialize */
+    sd = &SD[sdno];                         /* and get a pointer */
+
+    for (parameter = 1; parameter < tokens; parameter++) { /* look at the other tokens */
+        switch (get_keyword(token[parameter], &keyword_set)) {
+            /*
+             * If we have a 'name' parameter, it must
+             * come first, because we're too lazy to tidy
+             * up dangling refs if it comes later.
+             */
+        case kw_name:
+            namedsdno = find_subdisk(token[++parameter], 0); /* find an existing sd with this name */
+            if (namedsdno >= 0) {           /* got one */
+                if (SD[namedsdno].state == sd_referenced) { /* we've been told about this one */
+                    if (parameter > 2)
+                        throw_rude_remark(EINVAL,
+                            "sd %s: name parameter must come first\n", /* no go */
+                            token[parameter]);
+                    else {
+                        int i;
+                        struct plex *plex;  /* for tidying up dangling references */
+
+                        *sd = SD[namedsdno]; /* copy from the referenced one */
+                        SD[namedsdno].state = sd_unallocated; /* and deallocate the referenced one */
+                        plex = &PLEX[sd->plexno]; /* now take a look at our plex */
+                        for (i = 0; i < plex->subdisks; i++) { /* look for the pointer */
+                            if (plex->sdnos[i] == namedsdno) /* pointing to the old subdisk */
+                                plex->sdnos[i] = sdno; /* bend it to point here */
+                        }
+                    }
+                }
+                /*
+                 * NOTE(review): unlike the corresponding code in
+                 * config_plex, there is no break after taking over
+                 * a referenced entry, so we always get here when the
+                 * name exists -- confirm that this is intended.
+                 */
+                if (update)                 /* are we updating? */
+                    return;                 /* that's OK, nothing more to do */
+                else
+                    throw_rude_remark(EINVAL, "Duplicate subdisk %s", token[parameter]);
+            } else
+                /*
+                 * strlcpy always terminates the name, even if
+                 * the token is longer than the name field.
+                 */
+                strlcpy(sd->name, token[parameter], sizeof(sd->name));
+            break;
+
+        case kw_detached:
+            detached = 1;
+            break;
+
+        case kw_plexoffset:
+            size = sizespec(token[++parameter]);
+            if ((size == -1)                /* unallocated */
+                && (vinum_conf.flags & VF_READING_CONFIG)) /* reading from disk */
+                break;                      /* invalid sd; just ignore it */
+            if ((size % DEV_BSIZE) != 0)
+                throw_rude_remark(EINVAL,
+                    "sd %s, bad plex offset alignment: %lld",
+                    sd->name,
+                    (long long) size);
+            else
+                sd->plexoffset = size / DEV_BSIZE;
+            break;
+
+        case kw_driveoffset:
+            size = sizespec(token[++parameter]);
+            if ((size == -1)                /* unallocated */
+                && (vinum_conf.flags & VF_READING_CONFIG)) /* reading from disk */
+                break;                      /* invalid sd; just ignore it */
+            if ((size % DEV_BSIZE) != 0)
+                throw_rude_remark(EINVAL,
+                    "sd %s, bad drive offset alignment: %lld",
+                    sd->name,
+                    (long long) size);
+            else
+                sd->driveoffset = size / DEV_BSIZE;
+            break;
+
+        case kw_len:
+            if (get_keyword(token[++parameter], &keyword_set) == kw_max) /* select maximum size from drive */
+                size = 0;                   /* this is how we say it :-) */
+            else
+                size = sizespec(token[parameter]);
+            if ((size % DEV_BSIZE) != 0)
+                /* size is 64 bits wide: print it as such */
+                throw_rude_remark(EINVAL,
+                    "sd %s, length %lld not multiple of sector size",
+                    sd->name,
+                    (long long) size);
+            else
+                sd->sectors = size / DEV_BSIZE;
+            /*
+             * We have a problem with autosizing: we need to
+             * give the subdisk to the plex before we give it
+             * to the drive, in order to be clean if we give
+             * up in the middle, but at this time the size hasn't
+             * been set.  Note that we have to fix up after
+             * giving the subdisk to the drive.
+             */
+            if (size == 0)
+                autosize = 1;               /* note that we're autosizing */
+            break;
+
+        case kw_drive:
+            sd->driveno = find_drive(token[++parameter], 1); /* insert drive information */
+            break;
+
+        case kw_plex:
+            sd->plexno = find_plex(token[++parameter], 1); /* insert plex information */
+            break;
+
+            /*
+             * Set the state.  We can't do this directly,
+             * because give_sd_to_plex may change it
+             */
+        case kw_state:
+            parameter++;                    /* skip the keyword */
+            if (vinum_conf.flags & VF_READING_CONFIG)
+                state = SdState(token[parameter]); /* set the state */
+            break;
+
+        case kw_partition:
+            parameter++;                    /* skip the keyword */
+            if ((strlen(token[parameter]) != 1)
+                || (token[parameter][0] < 'a')
+                || (token[parameter][0] > 'h'))
+                throw_rude_remark(EINVAL,
+                    "%s: invalid partition %c",
+                    sd->name,
+                    token[parameter][0]);
+            else
+                partition = token[parameter][0];
+            break;
+
+        case kw_retryerrors:
+            sd->flags |= VF_RETRYERRORS;
+            break;
+
+        default:
+            throw_rude_remark(EINVAL, "%s: invalid keyword: %s", sd->name, token[parameter]);
+        }
+    }
+
+    /* Check we have a drive name */
+    if (sd->driveno < 0) {                  /* didn't specify a drive */
+        sd->driveno = current_drive;        /* set to the current drive */
+        if (sd->driveno < 0)                /* no current drive? */
+            throw_rude_remark(EINVAL, "Subdisk %s is not associated with a drive", sd->name);
+    }
+    if (DRIVE[sd->driveno].state != drive_up)
+        sd->state = sd_crashed;
+
+    /*
+     * This is tacky.  If something goes wrong
+     * with the checks, we may end up losing drive
+     * space.  FIXME.
+     */
+    if (autosize != 0)                      /* need to find a size, */
+        give_sd_to_drive(sdno);             /* do it before the plex */
+
+    /* Check for a plex name */
+    if ((sd->plexno < 0)                    /* didn't specify a plex */
+        && (!detached))                     /* and didn't say not to, */
+        sd->plexno = current_plex;          /* set to the current plex */
+
+    if (sd->plexno >= 0)
+        sdindex = give_sd_to_plex(sd->plexno, sdno); /* now tell the plex that it has this sd */
+
+    sd->sdno = sdno;                        /* point to our entry in the table */
+
+    /* Does the subdisk have a name?  If not, give it one */
+    if (sd->name[0] == '\0') {              /* no name */
+        char sdsuffix[8];                   /* form sd name suffix here */
+
+        /* Do we have a plex name? */
+        if (sdindex >= 0)                   /* we have a plex */
+            strlcpy(sd->name,               /* take it from there */
+                PLEX[sd->plexno].name,
+                sizeof(sd->name));
+        else                                /* no way */
+            throw_rude_remark(EINVAL, "Unnamed sd is not associated with a plex");
+        sprintf(sdsuffix, ".s%d", sdindex); /* form the suffix */
+        strlcat(sd->name, sdsuffix, sizeof(sd->name)); /* and add it to the name */
+    }
+    /* do we have complete info for this subdisk? */
+    if (sd->sectors < 0)
+        throw_rude_remark(EINVAL, "sd %s has no length spec", sd->name);
+
+    if (sd->dev == NULL)
+        /*
+         * sdno can (at least theoretically) overflow
+         * into the low order bit of the type field.
+         * This gives rise to a subdisk with type
+         * VINUM_SD2_TYPE.  This is a feature, not a
+         * bug.
+         */
+        sd->dev = make_dev(&vinum_cdevsw,
+            VINUMMINOR(sdno, VINUM_SD_TYPE),
+            UID_ROOT,
+            GID_OPERATOR,
+            S_IRUSR | S_IWUSR | S_IRGRP,
+            "vinum/sd/%s",
+            sd->name);
+    if (state != sd_unallocated)            /* we had a specific state to set */
+        sd->state = state;                  /* do it now */
+    else if (sd->state == sd_unallocated)   /* no, nothing set yet, */
+        sd->state = sd_empty;               /* must be empty */
+    if (autosize == 0)                      /* no autoconfig, do the drive now */
+        give_sd_to_drive(sdno);
+    vinum_conf.subdisks_used++;             /* one more in use */
+}
+
+/*
+ * Handle a plex definition.  A fresh entry is obtained from
+ * get_empty_plex() and filled in from the tokens of the
+ * current configuration line (token[1] .. token[tokens - 1]).
+ * On success the new plex becomes current_plex.
+ *
+ * update: if non-zero, a line naming a plex which exists
+ * already (and is not merely referenced) is silently ignored
+ * instead of being rejected as a duplicate.
+ *
+ * Errors are reported with throw_rude_remark.
+ */
+void
+config_plex(int update)
+{
+    int parameter;
+    int plexno;                             /* index of plex in vinum_conf */
+    struct plex *plex;                      /* and pointer to it */
+    int pindex = MAXPLEX;                   /* index in volume's plex list */
+    int detached = 0;                       /* don't give it to a volume */
+    int namedplexno;
+    enum plexstate state = plex_init;       /* state to set at end */
+    int preferme;                           /* set if we want to be preferred access */
+
+    current_plex = -1;                      /* forget the previous plex */
+    preferme = 0;                           /* nothing special yet */
+    plexno = get_empty_plex();              /* allocate a plex */
+    plex = &PLEX[plexno];                   /* and point to it */
+    plex->plexno = plexno;                  /* and back to the config */
+
+    for (parameter = 1; parameter < tokens; parameter++) { /* look at the other tokens */
+        switch (get_keyword(token[parameter], &keyword_set)) {
+            /*
+             * If we have a 'name' parameter, it must
+             * come first, because we're too lazy to tidy
+             * up dangling refs if it comes later.
+             */
+        case kw_name:
+            namedplexno = find_plex(token[++parameter], 0); /* find an existing plex with this name */
+            if (namedplexno >= 0) {         /* plex exists already, */
+                if (PLEX[namedplexno].state == plex_referenced) { /* we've been told about this one */
+                    if (parameter > 2)      /* we've done other things first, */
+                        throw_rude_remark(EINVAL,
+                            "plex %s: name parameter must come first\n", /* no go */
+                            token[parameter]);
+                    else {
+                        int i;
+                        struct volume *vol; /* for tidying up dangling references */
+
+                        *plex = PLEX[namedplexno]; /* get the info */
+                        PLEX[namedplexno].state = plex_unallocated; /* and deallocate the other one */
+                        vol = &VOL[plex->volno]; /* point to the volume */
+                        for (i = 0; i < MAXPLEX; i++) { /* for each plex */
+                            if (vol->plex[i] == namedplexno)
+                                vol->plex[i] = plexno; /* bend the pointer */
+                        }
+                    }
+                    break;                  /* use this one */
+                }
+                if (update)                 /* are we updating? */
+                    return;                 /* yes: that's OK, just return */
+                else
+                    throw_rude_remark(EINVAL, "Duplicate plex %s", token[parameter]);
+            } else
+                /*
+                 * strlcpy always terminates the name, even if
+                 * the token is longer than the name field.
+                 */
+                strlcpy(plex->name, token[parameter], sizeof(plex->name));
+            break;
+
+        case kw_detached:
+            detached = 1;
+            break;
+
+        case kw_org:                        /* plex organization */
+            switch (get_keyword(token[++parameter], &keyword_set)) {
+            case kw_concat:
+                plex->organization = plex_concat;
+                break;
+
+            case kw_striped:
+                {
+                    int stripesize = sizespec(token[++parameter]);
+
+                    plex->organization = plex_striped;
+                    if (stripesize % DEV_BSIZE != 0) /* not a multiple of block size, */
+                        throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size",
+                            plex->name,
+                            stripesize);
+                    else
+                        plex->stripesize = stripesize / DEV_BSIZE;
+                    break;
+                }
+
+            case kw_raid4:
+                {
+                    int stripesize = sizespec(token[++parameter]);
+
+                    plex->organization = plex_raid4;
+                    if (stripesize % DEV_BSIZE != 0) /* not a multiple of block size, */
+                        throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size",
+                            plex->name,
+                            stripesize);
+                    else
+                        plex->stripesize = stripesize / DEV_BSIZE;
+                    break;
+                }
+
+            case kw_raid5:
+                {
+                    int stripesize = sizespec(token[++parameter]);
+
+                    plex->organization = plex_raid5;
+                    if (stripesize % DEV_BSIZE != 0) /* not a multiple of block size, */
+                        throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size",
+                            plex->name,
+                            stripesize);
+                    else
+                        plex->stripesize = stripesize / DEV_BSIZE;
+                    break;
+                }
+
+            default:
+                throw_rude_remark(EINVAL, "Invalid plex organization");
+            }
+            if (isstriped(plex)
+                && (plex->stripesize == 0)) /* didn't specify a valid stripe size */
+                throw_rude_remark(EINVAL, "Need a stripe size parameter");
+            break;
+
+            /*
+             * We're the preferred plex of our volume.
+             * Unfortunately, we don't know who our
+             * volume is yet.  Note that we want to be
+             * preferred, and actually do it after we
+             * get a volume.
+             */
+        case kw_preferred:
+            preferme = 1;
+            break;
+
+        case kw_volume:
+            plex->volno = find_volume(token[++parameter], 1); /* insert a pointer to the volume */
+            break;
+
+        case kw_sd:                         /* add a subdisk */
+            {
+                int sdno;
+
+                sdno = find_subdisk(token[++parameter], 1); /* find a subdisk */
+                SD[sdno].plexoffset = sizespec(token[++parameter]); /* get the offset */
+                give_sd_to_plex(plexno, sdno); /* and insert it there */
+                break;
+            }
+
+        case kw_state:
+            parameter++;                    /* skip the keyword */
+            if (vinum_conf.flags & VF_READING_CONFIG)
+                state = PlexState(token[parameter]); /* set the state */
+            break;
+
+        default:
+            throw_rude_remark(EINVAL, "plex %s, invalid keyword: %s",
+                plex->name,
+                token[parameter]);
+        }
+    }
+
+    if (plex->organization == plex_disorg)
+        throw_rude_remark(EINVAL, "No plex organization specified");
+
+    if ((plex->volno < 0)                   /* we don't have a volume */
+        && (!detached))                     /* and we wouldn't object */
+        plex->volno = current_volume;
+
+    if (plex->volno >= 0)
+        pindex = give_plex_to_volume(plex->volno, /* Now tell the volume that it has this plex */
+            plexno,
+            preferme);
+
+    /* Does the plex have a name?  If not, give it one */
+    if (plex->name[0] == '\0') {            /* no name */
+        char plexsuffix[8];                 /* form plex name suffix here */
+
+        /* Do we have a volume name? */
+        if (plex->volno >= 0)               /* we have a volume */
+            strlcpy(plex->name,             /* take it from there */
+                VOL[plex->volno].name,
+                sizeof(plex->name));
+        else                                /* no way */
+            throw_rude_remark(EINVAL, "Unnamed plex is not associated with a volume");
+        sprintf(plexsuffix, ".p%d", pindex); /* form the suffix */
+        strlcat(plex->name, plexsuffix, sizeof(plex->name)); /* and add it to the name */
+    }
+    if (isstriped(plex)) {
+        /* striped plexes get a zeroed table of PLEX_LOCKS range locks */
+        plex->lock = (struct rangelock *)
+            Malloc(PLEX_LOCKS * sizeof(struct rangelock));
+        CHECKALLOC(plex->lock, "vinum: Can't allocate lock table\n");
+        bzero((char *) plex->lock, PLEX_LOCKS * sizeof(struct rangelock));
+        plex->lockmtx = &plexmutex[plexno % PLEXMUTEXES]; /* use this mutex for locking */
+    }
+    /* Note the last plex we configured */
+    current_plex = plexno;
+    plex->state = state;                    /* set whatever state we chose */
+    vinum_conf.plexes_used++;               /* one more in use */
+    if (plex->dev == NULL)
+        plex->dev = make_dev(&vinum_cdevsw,
+            VINUMMINOR(plexno, VINUM_PLEX_TYPE),
+            UID_ROOT,
+            GID_OPERATOR,
+            S_IRUSR | S_IWUSR | S_IRGRP,
+            "vinum/plex/%s",
+            plex->name);
+}
+
+/*
+ * Handle a volume definition.  The volume named by token[1]
+ * is looked up (and created if necessary) with find_volume and
+ * filled in from the remaining tokens of the configuration
+ * line.  On success the volume becomes current_volume.
+ *
+ * update: if non-zero and the volume does not have VF_CREATED
+ * set, it is taken to exist already and is left alone.
+ *
+ * Errors are reported with throw_rude_remark.
+ */
+void
+config_volume(int update)
+{
+    int parameter;
+    int volno;
+    struct volume *vol;                     /* collect volume info here */
+    int i;
+
+    if (tokens < 2)                         /* not enough tokens */
+        throw_rude_remark(EINVAL, "Volume has no name");
+    current_volume = -1;                    /* forget the previous volume */
+    volno = find_volume(token[1], 1);       /* allocate a volume to initialize */
+    vol = &VOL[volno];                      /* and get a pointer */
+    if (update && ((vol->flags & VF_CREATED) == 0)) /* this volume exists already */
+        return;                             /* don't do anything */
+    vol->flags &= ~VF_CREATED;              /* it exists now */
+
+    for (parameter = 2; parameter < tokens; parameter++) { /* look at all tokens */
+        switch (get_keyword(token[parameter], &keyword_set)) {
+        case kw_plex:
+            {
+                int plexno;                 /* index of this plex */
+                int myplexno;               /* and index if it's already ours */
+
+                plexno = find_plex(token[++parameter], 1); /* find a plex */
+                if (plexno < 0)             /* couldn't */
+                    break;                  /* we've already had an error message */
+                /*
+                 * NOTE(review): if my_plex returns a zero-based
+                 * index, a plex in slot 0 is not recognized as
+                 * ours by this test -- confirm against my_plex.
+                 */
+                myplexno = my_plex(volno, plexno); /* does it already belong to us? */
+                if (myplexno > 0)           /* yes, shouldn't get it again */
+                    throw_rude_remark(EINVAL,
+                        "Plex %s already belongs to volume %s",
+                        token[parameter],
+                        vol->name);
+                else if (++vol->plexes > 8) /* another entry */
+                    throw_rude_remark(EINVAL,
+                        "Too many plexes for volume %s",
+                        vol->name);
+                vol->plex[vol->plexes - 1] = plexno;
+                PLEX[plexno].state = plex_referenced; /* we know something about it */
+                PLEX[plexno].volno = volno; /* and this volume references it */
+            }
+            break;
+
+        case kw_readpol:
+            switch (get_keyword(token[++parameter], &keyword_set)) { /* decide what to do */
+            case kw_round:
+                vol->preferred_plex = ROUND_ROBIN_READPOL; /* default */
+                break;
+
+            case kw_prefer:
+                {
+                    int plexno;             /* number of the plex in the config */
+                    int myplexno;           /* its index in this volume */
+
+                    plexno = find_plex(token[++parameter], 1); /* find a plex */
+                    if (plexno < 0) {       /* couldn't */
+                        printf("vinum: couldn't find preferred plex %s for %s\n",
+                            token[parameter],
+                            vol->name);
+                        break;              /* we've already had an error message */
+                    }
+                    myplexno = my_plex(volno, plexno); /* does it already belong to us? */
+                    if (myplexno > 0)       /* yes */
+                        vol->preferred_plex = myplexno; /* just note the index */
+                    else if (++vol->plexes > 8) /* another entry */
+                        throw_rude_remark(EINVAL, "Too many plexes");
+                    else {                  /* space for the new plex */
+                        /*
+                         * Store the plex number itself, not the
+                         * result of the my_plex lookup.
+                         */
+                        vol->plex[vol->plexes - 1] = plexno; /* add it to our list */
+                        vol->preferred_plex = vol->plexes - 1; /* and note the index */
+                    }
+                }
+                break;
+
+            default:
+                throw_rude_remark(EINVAL, "Invalid read policy");
+            }
+            break;                          /* don't fall into setupstate */
+
+        case kw_setupstate:
+            vol->flags |= VF_CONFIG_SETUPSTATE; /* set the volume up later on */
+            break;
+
+        case kw_state:
+            parameter++;                    /* skip the keyword */
+            if (vinum_conf.flags & VF_READING_CONFIG)
+                vol->state = VolState(token[parameter]); /* set the state */
+            break;
+
+            /*
+             * XXX experimental ideas.  These are not
+             * documented, and will not be until I
+             * decide they're worth keeping.
+             */
+        case kw_writethrough:               /* set writethrough mode */
+            vol->flags |= VF_WRITETHROUGH;
+            break;
+
+        case kw_writeback:                  /* set writeback mode */
+            vol->flags &= ~VF_WRITETHROUGH;
+            break;
+
+        default:
+            throw_rude_remark(EINVAL, "volume %s, invalid keyword: %s",
+                vol->name,
+                token[parameter]);
+        }
+    }
+    current_volume = volno;                 /* note last referred volume */
+    vol->volno = volno;                     /* also note in volume */
+
+    /*
+     * Before we can actually use the volume, we need
+     * a volume label.  We could start to fake one here,
+     * but it will be a lot easier when we have some
+     * to copy from the drives, so defer it until we
+     * set up the configuration.  XXX
+     */
+    if (vol->state == volume_unallocated)
+        vol->state = volume_down;           /* now ready to bring up at the end */
+
+    /* Find out how big our volume is */
+    for (i = 0; i < vol->plexes; i++)
+        vol->size = max(vol->size, PLEX[vol->plex[i]].length);
+    vinum_conf.volumes_used++;              /* one more in use */
+    if (vol->dev == NULL)
+        vol->dev = make_dev(&vinum_cdevsw,
+            VINUMMINOR(volno, VINUM_VOLUME_TYPE),
+            UID_ROOT,
+            GID_OPERATOR,
+            S_IRUSR | S_IWUSR | S_IRGRP,
+            "vinum/%s",
+            vol->name);
+}
+
+/*
+ * Parse one configuration entry and hand it to the handler for
+ * its leading keyword (drive, subdisk, plex or volume).
+ *
+ * CARE!  The entry is chopped up in place by tokenize, so the
+ * text at cptr is not usable as a line afterwards.
+ *
+ * If tokenize yields no tokens (or fails), its non-positive
+ * result is returned.  Otherwise the return value is 0; too
+ * many parameters and unknown keywords are reported with
+ * throw_rude_remark.
+ */
+int
+parse_config(char *cptr, struct keywordset *keyset, int update)
+{
+    tokens = tokenize(cptr, token, MAXTOKEN); /* split the entry */
+
+    if (tokens <= 0)                        /* nothing there, or trouble */
+        return tokens;
+    if (tokens == MAXTOKEN)                 /* table is full */
+        throw_rude_remark(E2BIG,
+            "Configuration error for %s: too many parameters",
+            token[1]);
+
+    if (token[0][0] == '#')                 /* only a comment */
+        return 0;
+
+    /* The first token selects the kind of object to configure */
+    switch (get_keyword(token[0], keyset)) {
+    case kw_drive:
+        config_drive(update);
+        break;
+
+    case kw_subdisk:
+        config_subdisk(update);
+        break;
+
+    case kw_plex:
+        config_plex(update);
+        break;
+
+    case kw_volume:
+        config_volume(update);
+        break;
+
+    default:                                /* nothing else makes sense here */
+        throw_rude_remark(EINVAL,
+            "Invalid configuration information: %s",
+            token[0]);
+    }
+    return 0;
+}
+
+/*
+ * Parse a configuration line which came from userland by way
+ * of an ioctl.  The parsing itself is done by parse_config
+ * (with update == 0); the only difference is where errors go.
+ * For the duration of the call the file-wide pointer
+ * ioctl_reply refers to the caller's buffer, so that the error
+ * indication can be returned in the reply to the ioctl.  This
+ * is safe only because configuration is single-threaded.
+ */
+int
+parse_user_config(char *cptr, struct keywordset *keyset)
+{
+    int result;
+
+    ioctl_reply = (struct _ioctl_reply *) cptr; /* replies go here */
+    result = parse_config(cptr, keyset, 0);
+    ioctl_reply = NULL;                     /* and nowhere after that */
+    return result;
+}
+
+/*
+ * Remove the object described by an ioctl message.  The message
+ * buffer doubles as the reply buffer, so the request is copied
+ * before the reply fields are cleared.  After a removal has been
+ * attempted, the configuration is brought up to date with
+ * updateconfig(0).  An unknown object type is answered with
+ * EINVAL and no update takes place.
+ */
+void
+remove(struct vinum_ioctl_msg *msg)
+{
+    struct vinum_ioctl_msg request = *msg;  /* keep the request safe */
+
+    ioctl_reply = (struct _ioctl_reply *) msg; /* the reply goes back the same way */
+    ioctl_reply->error = 0;                 /* nothing wrong yet */
+    ioctl_reply->msg[0] = '\0';             /* and nothing to say */
+
+    if (request.type == drive_object)
+        remove_drive_entry(request.index, request.force);
+    else if (request.type == sd_object)
+        remove_sd_entry(request.index, request.force, request.recurse);
+    else if (request.type == plex_object)
+        remove_plex_entry(request.index, request.force, request.recurse);
+    else if (request.type == volume_object)
+        remove_volume_entry(request.index, request.force, request.recurse);
+    else {                                  /* don't know this kind */
+        ioctl_reply->error = EINVAL;
+        strcpy(ioctl_reply->msg, "Invalid object type");
+        return;
+    }
+    updateconfig(0);
+}
+
+/*
+ * Remove a drive.
+ *
+ * driveno: index of the drive in the drive table.
+ * force:   if set, a drive which is still open (has subdisks)
+ *          is removed along with the subdisks which live on it.
+ *
+ * The result is reported in ioctl_reply: EINVAL if there is no
+ * such drive, EBUSY if the drive is in use and force is not set.
+ */
+void
+remove_drive_entry(int driveno, int force)
+{
+    struct drive *drive;
+    int sdno;
+
+    /*
+     * Valid indices are 0 to drives_allocated - 1.  Check the
+     * index before we look at the entry.
+     */
+    if ((driveno < 0)
+        || (driveno >= vinum_conf.drives_allocated) /* not a valid drive */
+        || (DRIVE[driveno].state == drive_unallocated)) { /* or nothing there */
+        ioctl_reply->error = EINVAL;
+        strcpy(ioctl_reply->msg, "No such drive");
+        return;
+    }
+    drive = &DRIVE[driveno];
+    if (drive->opencount > 0) {             /* we have subdisks */
+        if (!force) {
+            ioctl_reply->error = EBUSY;     /* can't do that */
+            return;
+        }
+        /* do it at any cost: get rid of our subdisks first */
+        for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) {
+            if ((SD[sdno].state != sd_unallocated) /* subdisk is allocated */
+                && (SD[sdno].driveno == driveno)) /* and it belongs to this drive */
+                remove_sd_entry(sdno, force, 0);
+        }
+    }
+    remove_drive(driveno);                  /* now remove it */
+    vinum_conf.drives_used--;               /* one less drive */
+}
+
+/*
+ * Remove a subdisk.
+ *
+ * sdno:    index of the subdisk in the subdisk table.
+ * force:   if set, a subdisk which belongs to a plex is torn
+ *          out of the plex and removed anyway.
+ * recurse: not used here.
+ *
+ * The result is reported in ioctl_reply: EINVAL if there is no
+ * such subdisk, EBUSY if it is open, or if it belongs to a plex
+ * and force is not set.
+ */
+void
+remove_sd_entry(int sdno, int force, int recurse)
+{
+    struct sd *sd;
+
+    /*
+     * Valid indices are 0 to subdisks_allocated - 1.  Check the
+     * index before we look at the entry.
+     */
+    if ((sdno < 0)
+        || (sdno >= vinum_conf.subdisks_allocated) /* not a valid sd */
+        || (SD[sdno].state == sd_unallocated)) { /* or nothing there */
+        ioctl_reply->error = EINVAL;
+        strcpy(ioctl_reply->msg, "No such subdisk");
+        return;
+    }
+    sd = &SD[sdno];
+    if (sd->flags & VF_OPEN) {              /* we're open */
+        ioctl_reply->error = EBUSY;         /* no getting around that */
+        return;
+    }
+    if (sd->plexno >= 0) {                  /* we have a plex */
+        struct plex *plex;
+        int mysdno;
+
+        if (!force) {
+            ioctl_reply->error = EBUSY;     /* can't do that */
+            return;
+        }
+        /* do it at any cost */
+        plex = &PLEX[sd->plexno];           /* point to our plex */
+        for (mysdno = 0;                    /* look for ourselves */
+            mysdno < plex->subdisks && &SD[plex->sdnos[mysdno]] != sd;
+            mysdno++);
+        if (mysdno == plex->subdisks)       /* didn't find it */
+            /*
+             * mysdno is an index into the plex's list, not
+             * into the subdisk table, so name the subdisk
+             * via sd.
+             */
+            log(LOG_ERR,
+                "Error removing subdisk %s: not found in plex %s\n",
+                sd->name,
+                plex->name);
+        else {                              /* remove the subdisk from plex */
+            if (mysdno < (plex->subdisks - 1)) /* not the last subdisk */
+                bcopy(&plex->sdnos[mysdno + 1],
+                    &plex->sdnos[mysdno],
+                    (plex->subdisks - 1 - mysdno) * sizeof(int));
+            plex->subdisks--;
+            sd->plexno = -1;                /* disown the subdisk */
+        }
+
+        /*
+         * Removing a subdisk from a striped or
+         * RAID-4 or RAID-5 plex really tears the
+         * hell out of the structure, and it needs
+         * to be reinitialized.
+         */
+        if (plex->organization != plex_concat) /* not concatenated, */
+            set_plex_state(plex->plexno, plex_faulty, setstate_force); /* need to reinitialize */
+    }
+    log(LOG_INFO, "vinum: removing %s\n", sd->name);
+    free_sd(sdno);
+}
+
+/*
+ * Remove a plex.
+ *
+ * plexno:  index of the plex in the plex table.
+ * force:   if set, remove the plex even if it has subdisks or
+ *          belongs to a volume.
+ * recurse: if set (together with force), free the subdisks of
+ *          the plex as well; otherwise they are just detached.
+ *
+ * The result is reported in ioctl_reply: EINVAL if there is no
+ * such plex, EBUSY if it is open, or if it has subdisks or a
+ * volume and force is not set.
+ */
+void
+remove_plex_entry(int plexno, int force, int recurse)
+{
+    struct plex *plex;
+    int sdno;
+
+    /*
+     * Valid indices are 0 to plexes_allocated - 1.  Check the
+     * index before we look at the entry, and don't go on to
+     * free an entry which isn't there.
+     */
+    if ((plexno < 0)
+        || (plexno >= vinum_conf.plexes_allocated) /* not a valid plex */
+        || (PLEX[plexno].state == plex_unallocated)) { /* or nothing there */
+        ioctl_reply->error = EINVAL;
+        strcpy(ioctl_reply->msg, "No such plex");
+        return;
+    }
+    plex = &PLEX[plexno];
+    if (plex->flags & VF_OPEN) {            /* we're open */
+        ioctl_reply->error = EBUSY;         /* no getting around that */
+        return;
+    }
+    if (plex->subdisks) {
+        if (force) {                        /* do it anyway */
+            if (recurse) {                  /* remove all below */
+                int sds = plex->subdisks;
+                for (sdno = 0; sdno < sds; sdno++)
+                    free_sd(plex->sdnos[sdno]); /* free all subdisks */
+            } else {                        /* just tear them out */
+                int sds = plex->subdisks;
+                for (sdno = 0; sdno < sds; sdno++)
+                    SD[plex->sdnos[sdno]].plexno = -1; /* no plex any more */
+            }
+        } else {                            /* can't do it without force */
+            ioctl_reply->error = EBUSY;     /* can't do that */
+            return;
+        }
+    }
+    if (plex->volno >= 0) {                 /* we are part of a volume */
+        if (force) {                        /* do it at any cost */
+            struct volume *vol = &VOL[plex->volno];
+            int myplexno;
+
+            for (myplexno = 0; myplexno < vol->plexes; myplexno++)
+                if (vol->plex[myplexno] == plexno) /* found it */
+                    break;
+            if (myplexno == vol->plexes)    /* didn't find it.  Huh? */
+                log(LOG_ERR,
+                    "Error removing plex %s: not found in volume %s\n",
+                    plex->name,
+                    vol->name);
+            else {                          /* take it out of the volume's list */
+                if (myplexno < (vol->plexes - 1)) /* not the last plex in the list */
+                    /* bcopy counts bytes, not entries */
+                    bcopy(&vol->plex[myplexno + 1],
+                        &vol->plex[myplexno],
+                        (vol->plexes - 1 - myplexno) * sizeof(vol->plex[0]));
+                vol->plexes--;
+            }
+        } else {
+            ioctl_reply->error = EBUSY;     /* can't do that */
+            return;
+        }
+    }
+    log(LOG_INFO, "vinum: removing %s\n", plex->name);
+    free_plex(plexno);
+    vinum_conf.plexes_used--;               /* one less plex */
+}
+
+/*
+ * Remove a volume.
+ *
+ * volno:   index of the volume in the volume table.
+ * force, recurse: a volume which still has plexes is removed
+ *          only if both are set; its plexes are then removed
+ *          first with remove_plex_entry.
+ *
+ * The result is reported in ioctl_reply: EINVAL if there is no
+ * such volume, EBUSY if it is open, or if it has plexes and
+ * force and recurse are not both set.
+ */
+void
+remove_volume_entry(int volno, int force, int recurse)
+{
+    struct volume *vol;
+    int plexno;
+
+    /*
+     * Valid indices are 0 to volumes_allocated - 1.  Check the
+     * index before we look at the entry.
+     */
+    if ((volno < 0)
+        || (volno >= vinum_conf.volumes_allocated) /* not a valid volume */
+        || (VOL[volno].state == volume_unallocated)) { /* or nothing there */
+        ioctl_reply->error = EINVAL;
+        strcpy(ioctl_reply->msg, "No such volume");
+        return;
+    }
+    vol = &VOL[volno];
+    if (vol->flags & VF_OPEN)               /* we're open */
+        ioctl_reply->error = EBUSY;         /* no getting around that */
+    else if (vol->plexes) {
+        if (recurse && force) {             /* remove all below */
+            /*
+             * remove_plex_entry closes up the volume's plex
+             * list behind the plex it removes, so work from
+             * the end of the list: the entries we have yet to
+             * visit then stay where they are.
+             */
+            for (plexno = vol->plexes - 1; plexno >= 0; plexno--)
+                remove_plex_entry(vol->plex[plexno], force, recurse);
+            log(LOG_INFO, "vinum: removing %s\n", vol->name);
+            free_volume(volno);
+            vinum_conf.volumes_used--;      /* one less volume */
+        } else
+            ioctl_reply->error = EBUSY;     /* can't do that */
+    } else {
+        log(LOG_INFO, "vinum: removing %s\n", vol->name);
+        free_volume(volno);
+        vinum_conf.volumes_used--;          /* one less volume */
+    }
+}
+
+/*
+ * Finish the configuration of a subdisk.  Unless the
+ * configuration came from disk (diskconfig != 0), ask for the
+ * subdisk to be brought up.  In either case it is no longer
+ * newly born.  Currently called only from ioctl.
+ */
+void
+update_sd_config(int sdno, int diskconfig)
+{
+    if (diskconfig == 0)
+        set_sd_state(sdno, sd_up, setstate_configuring);
+
+    SD[sdno].flags &= ~VF_NEWBORN;
+}
+
+/*
+ * Bring the derived information of a plex up to date after a
+ * configuration change: check the number and sizes of its
+ * subdisks, trim a partial stripe from the end of a striped,
+ * RAID-4 or RAID-5 plex, recalculate the length and the sector
+ * size, and finally let update_plex_state set the state.
+ *
+ * diskconfig is non-zero if the configuration was read from
+ * disk.
+ *
+ * NOTE(review): the local "state" is assigned in several places
+ * but never read; the state is set by update_plex_state alone.
+ */
+void
+update_plex_config(int plexno, int diskconfig)
+{
+    struct plex *plex = &PLEX[plexno];
+    struct volume *vol;
+    struct sd *sd;
+    enum plexstate state = plex_up;         /* the state we would like */
+    u_int64_t total;                        /* sum of the subdisk sizes */
+    int excess;                             /* blocks of partial stripe */
+    int added_plex = 0;                     /* set if we joined an existing volume */
+    int required_sds = 0;                   /* minimum number of subdisks */
+    int data_sds = 0;                       /* number of sds carrying data */
+    int i;
+
+    if (plex->state < plex_init)            /* nothing to work on */
+        return;
+
+    if (plex->volno >= 0) {                 /* we belong to a volume */
+        vol = &VOL[plex->volno];
+
+        /*
+         * A newly born plex in a volume which is not newly
+         * born and already has plexes, in a configuration
+         * which did not come from disk, was added later.
+         */
+        if ((plex->flags & VF_NEWBORN)
+            && ((vol->flags & VF_NEWBORN) == 0)
+            && (vol->plexes > 0)
+            && (diskconfig == 0)) {
+            added_plex = 1;
+            state = plex_down;              /* stay down for now */
+        }
+    }
+
+    /*
+     * Striped plexes need at least two subdisks, RAID-4 and
+     * RAID-5 plexes at least three, one of which carries no
+     * data.  Other organizations have no minimum.
+     */
+    if (plex->organization == plex_striped) {
+        data_sds = plex->subdisks;
+        required_sds = 2;
+    } else if (isparity(plex)) {
+        data_sds = plex->subdisks - 1;
+        required_sds = 3;
+    }
+
+    if (required_sds > 0) {                 /* striped, RAID-4 or RAID-5 */
+        if (plex->subdisks < required_sds) {
+            log(LOG_ERR,
+                "vinum: plex %s does not have at least %d subdisks\n",
+                plex->name,
+                required_sds);
+            state = plex_faulty;
+        }
+
+        /*
+         * The length must be a whole number of stripes.  If
+         * it isn't, cut the surplus off the end of each
+         * subdisk and give it back to the drive.
+         */
+        if ((plex->length > 0) && (data_sds > 0)) {
+            if (plex->stripesize > 0) {
+                excess = (int) (plex->length
+                    % ((u_int64_t) plex->stripesize * data_sds));
+                if (excess != 0) {          /* doesn't fit exactly */
+                    log(LOG_INFO, "vinum: removing %d blocks of partial stripe at the end of %s\n",
+                        excess,
+                        plex->name);
+                    plex->length -= excess; /* the plex gets shorter */
+                    excess /= data_sds;     /* this much per subdisk */
+                    for (i = 0; i < plex->subdisks; i++) {
+                        sd = &SD[plex->sdnos[i]];
+                        return_drive_space(sd->driveno,
+                            sd->driveoffset + sd->sectors - excess,
+                            excess);
+                        sd->sectors -= excess; /* and so does the subdisk */
+                    }
+                }
+            } else                          /* no stripe size: */
+                plex->length = 0;           /* forget the length */
+        }
+    }
+
+    /* Add up the subdisks and check that they fit together */
+    total = 0;
+    for (i = 0; i < plex->subdisks; i++) {
+        sd = &SD[plex->sdnos[i]];
+        if (isstriped(plex)
+            && (i > 0)
+            && (sd->sectors != SD[plex->sdnos[i - 1]].sectors)) {
+            log(LOG_ERR, "vinum: %s must have equal sized subdisks\n", plex->name);
+            state = plex_down;
+        }
+        total += sd->sectors;
+        if (added_plex)                     /* latecomers have no valid data */
+            sd->state = sd_stale;
+        if (plex->sectorsize == 0)          /* first one sets the sector size */
+            plex->sectorsize = sd->sectorsize;
+        else if (sd->sectorsize != plex->sectorsize) /* the rest should agree */
+            printf("vinum: incompatible sector sizes. "
+                "%s has %d bytes, %s has %d bytes. Ignored.\n",
+                sd->name,
+                sd->sectorsize,
+                plex->name,
+                plex->sectorsize);
+    }
+
+    if (plex->subdisks == 0) {              /* nothing to add up */
+        plex->length = 0;
+        state = plex_down;
+    } else {
+        /*
+         * XXX It should no longer be necessary to work
+         * out the size here.  Check some time.
+         */
+        if (isparity(plex))                 /* one subdisk's worth holds no data */
+            total = total / plex->subdisks * (plex->subdisks - 1);
+        if (plex->length != total)
+            log(LOG_INFO,
+                "Correcting length of %s: was %lld, is %lld\n",
+                plex->name,
+                (long long) plex->length,
+                (long long) total);
+        plex->length = total;
+    }
+    update_plex_state(plexno);              /* decide on the state */
+    plex->flags &= ~VF_NEWBORN;
+}
+
+/*
+ * Bring the derived information of a volume up to date.  For
+ * an allocated volume, the size becomes the length of its
+ * longest plex, each plex is told its position in the volume,
+ * and the sector size is taken from the first plex which sets
+ * one; plexes which disagree are reported and ignored.  In any
+ * case the volume is no longer newly born.
+ */
+void
+update_volume_config(int volno)
+{
+    struct volume *vol = &VOL[volno];
+    struct plex *plex;
+    int i;
+
+    if (vol->state != volume_unallocated) {
+        /*
+         * Start from scratch: the plex lengths may have
+         * changed if they were not a multiple of the
+         * stripe size.
+         */
+        vol->size = 0;
+        for (i = 0; i < vol->plexes; i++) {
+            plex = &PLEX[vol->plex[i]];
+            vol->size = max(plex->length, vol->size); /* longest so far */
+            plex->volplexno = i;            /* tell the plex where it is */
+            if (vol->sectorsize == 0)       /* first one sets the sector size */
+                vol->sectorsize = plex->sectorsize;
+            else if (plex->sectorsize != vol->sectorsize) /* the rest should agree */
+                printf("vinum: incompatible sector sizes. "
+                    "%s has %d, %s has %d. Ignored.\n",
+                    plex->name,
+                    plex->sectorsize,
+                    vol->name,
+                    vol->sectorsize);
+        }
+    }
+    vol->flags &= ~VF_NEWBORN;              /* we've grown up */
+}
+
+/*
+ * Bring the whole configuration up to date after a change:
+ * first every plex, then every volume beyond volume_uninit,
+ * and finally save the configuration.
+ *
+ * diskconfig is non-zero if the configuration is being read
+ * in from disk.  In that case we don't try to bring the
+ * devices up, though we will bring them down if there's some
+ * error which got missed when writing to disk.
+ */
+void
+updateconfig(int diskconfig)
+{
+    int i;
+
+    for (i = 0; i < vinum_conf.plexes_allocated; i++)
+        update_plex_config(i, diskconfig);
+
+    for (i = 0; i < vinum_conf.volumes_allocated; i++) {
+        if (VOL[i].state <= volume_uninit)  /* nothing to update here */
+            continue;
+        VOL[i].flags &= ~VF_CONFIG_SETUPSTATE; /* setupstate is used up */
+        update_volume_state(i);
+        update_volume_config(i);
+    }
+    save_config();
+}
+
+/*
+ * Start manual changes to the configuration and lock out
+ * others who may wish to do so.  If somebody else is
+ * configuring, sleep until they have finished.
+ *
+ * force: if non-zero, set VF_FORCECONFIG as well.
+ *
+ * Returns 0 when the configuration is ours, or the error from
+ * tsleep if the wait did not end normally.
+ * XXX why do we need this and lock_config too?
+ */
+int
+start_config(int force)
+{
+    int error;
+
+    /* The defaults for objects named on later lines */
+    current_drive = -1;
+    current_plex = -1;
+    current_volume = -1;
+
+    while ((vinum_conf.flags & VF_CONFIGURING) != 0) {
+        vinum_conf.flags |= VF_WILL_CONFIGURE; /* ask to be woken */
+        error = tsleep(&vinum_conf, PRIBIO | PCATCH, "vincfg", 0);
+        if (error != 0)
+            return error;
+    }
+
+    /*
+     * Two flags are needed: VF_CONFIGURING keeps other
+     * processes out of this function, and
+     * VF_CONFIG_INCOMPLETE tells the state change routines
+     * not to propagate incremental state changes.
+     */
+    vinum_conf.flags |= VF_CONFIGURING | VF_CONFIG_INCOMPLETE;
+    if (force)
+        vinum_conf.flags |= VF_FORCECONFIG; /* overwrite differently named drives */
+
+    /* Set the defaults again now that the configuration is ours */
+    current_drive = -1;
+    current_plex = -1;
+    current_volume = -1;
+    return 0;
+}
+
+/*
+ * Finish a configuration session and let the next one in.
+ *
+ * update: if non-zero, do a full update with updateconfig(0);
+ * otherwise only the updates which updateconfig performs for
+ * a configuration read from disk.  We won't do the full update
+ * if we are called in a recursive loop via throw_rude_remark.
+ */
+void
+finish_config(int update)
+{
+    /* The configuration is complete as far as we're concerned */
+    vinum_conf.flags &= ~(VF_CONFIG_INCOMPLETE | VF_READING_CONFIG | VF_FORCECONFIG);
+
+    updateconfig(update ? 0 : 1);
+
+    vinum_conf.flags &= ~VF_CONFIGURING;    /* the next one can have a go */
+    if ((vinum_conf.flags & VF_WILL_CONFIGURE) != 0) { /* and somebody is waiting */
+        vinum_conf.flags &= ~VF_WILL_CONFIGURE;
+        wakeup_one(&vinum_conf);
+    }
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumdaemon.c b/sys/dev/vinum/vinumdaemon.c
new file mode 100644
index 0000000..3ae09c0
--- /dev/null
+++ b/sys/dev/vinum/vinumdaemon.c
@@ -0,0 +1,281 @@
+/* vinumdaemon.c: kernel part of Vinum daemon */
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumdaemon.c,v 1.8 2000/01/03 05:22:03 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+#ifdef VINUMDEBUG
+#include <sys/reboot.h>
+#endif
+
+/* declarations */
+void recover_io(struct request *rq);
+
+int daemon_options = 0; /* daemon option flags (daemon_verbose, daemon_noupdate, daemon_stopped), see vinum_setdaemonopts */
+int daemonpid; /* PID of daemon; vinum_daemon stores its own PID here when it starts */
+struct daemonq *daemonq; /* daemon's work queue (head; NULL when empty) */
+struct daemonq *dqend; /* and the end of the queue */
+
+/*
+ * We normally call Malloc to get a queue element. In interrupt
+ * context, we can't guarantee that we'll get one, since we're not
+ * allowed to wait. If malloc fails, use one of these elements.
+ *
+ * queue_daemon_request hands the spares out in rotation through
+ * intqp and sets privateinuse in each one it takes; the daemon
+ * clears privateinuse again when it has dealt with the request.
+ * NOTE(review): nothing in this file points intqp at intq before
+ * the first use; presumably that happens when the driver is
+ * initialized -- confirm.
+ */
+
+#define INTQSIZE 4
+struct daemonq intq[INTQSIZE]; /* queue elements for interrupt context */
+struct daemonq *intqp; /* and pointer in it: the next spare to hand out */
+
+/*
+ * Give a queue element back to wherever it came from.  The spare
+ * elements in intq (marked with privateinuse) were never
+ * malloc'ed, so they are just marked free again; anything else
+ * goes back via Free.
+ */
+static void
+release_daemon_request(struct daemonq *request)
+{
+    if (request->privateinuse)                              /* one of ours, */
+        request->privateinuse = 0;                          /* no longer in use */
+    else
+        Free(request);                                      /* return it */
+}
+
+/*
+ * Body of the Vinum daemon.
+ *
+ * Mark the calling process as a system process, save the
+ * configuration, record our PID in daemonpid, and then loop:
+ * sleep on &vinum_daemon until somebody kicks us, and work
+ * through the requests on daemonq.
+ *
+ * We return only if another daemon has taken over (daemonpid no
+ * longer matches our PID) or if we receive a daemonrq_return
+ * request.
+ */
+void
+vinum_daemon(void)
+{
+    int s;
+    struct daemonq *request;
+
+    PROC_LOCK(curproc);
+    curproc->p_flag |= P_SYSTEM;                            /* we're a system process */
+    mtx_lock_spin(&sched_lock);
+    curproc->p_sflag |= PS_INMEM;
+    mtx_unlock_spin(&sched_lock);
+    PROC_UNLOCK(curproc);
+    daemon_save_config();                                   /* start by saving the configuration */
+    daemonpid = curproc->p_pid;                             /* mark our territory */
+    while (1) {
+        tsleep(&vinum_daemon, PRIBIO, "vinum", 0);          /* wait for something to happen */
+
+        /*
+         * It's conceivable that, as the result of an
+         * I/O error, we'll be out of action long
+         * enough that another daemon gets started.
+         * That's OK, just give up gracefully.
+         */
+        if (curproc->p_pid != daemonpid) {                  /* we've been ousted in our sleep */
+            if (daemon_options & daemon_verbose)
+                log(LOG_INFO, "vinum: abdicating\n");
+            return;
+        }
+        while (daemonq != NULL) {                           /* we have work to do, */
+            s = splhigh();                                  /* don't get interrupted here */
+            request = daemonq;                              /* get the request */
+            daemonq = daemonq->next;                        /* and detach it */
+            if (daemonq == NULL)                            /* got to the end, */
+                dqend = NULL;                               /* no end any more */
+            splx(s);
+
+            switch (request->type) {
+                /*
+                 * We had an I/O error on a request.  Go through the
+                 * request and try to salvage it
+                 */
+            case daemonrq_ioerror:
+                if (daemon_options & daemon_verbose) {
+                    struct request *rq = request->info.rq;
+
+                    log(LOG_WARNING,
+                        "vinum: recovering I/O request: %p\n%s dev %d.%d, offset 0x%llx, length %ld\n",
+                        rq,
+                        rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
+                        major(rq->bp->b_dev),
+                        minor(rq->bp->b_dev),
+                        (long long)rq->bp->b_blkno,
+                        rq->bp->b_bcount);
+                }
+                recover_io(request->info.rq);               /* the failed request */
+                break;
+
+                /*
+                 * Write the config to disk.  We could end up with
+                 * quite a few of these in a row.  Only honour the
+                 * last one
+                 */
+            case daemonrq_saveconfig:
+                if ((daemonq == NULL)                       /* no more requests */
+                    ||(daemonq->type != daemonrq_saveconfig)) { /* or the next isn't the same */
+                    if (((daemon_options & daemon_noupdate) == 0) /* we're allowed to do it */
+                        &&((vinum_conf.flags & VF_READING_CONFIG) == 0)) { /* and we're not building the config now */
+                        /*
+                         * We obviously don't want to save a
+                         * partial configuration.  Less obviously,
+                         * we don't need to do anything if we're
+                         * asked to write the config when we're
+                         * building it up, because we save it at
+                         * the end.
+                         */
+                        if (daemon_options & daemon_verbose)
+                            log(LOG_INFO, "vinum: saving config\n");
+                        daemon_save_config();               /* save it */
+                    }
+                }
+                break;
+
+            case daemonrq_return:                           /* been told to stop */
+                if (daemon_options & daemon_verbose)
+                    log(LOG_INFO, "vinum: stopping\n");
+                daemon_options |= daemon_stopped;           /* note that we've stopped */
+                /*
+                 * Throw away this request and anything backed up
+                 * behind it.  Some of the elements may be spares
+                 * from intq, which must not be passed to Free.
+                 */
+                release_daemon_request(request);
+                while (daemonq != NULL) {                   /* backed up requests, */
+                    request = daemonq;                      /* get the request */
+                    daemonq = daemonq->next;                /* and detach it */
+                    release_daemon_request(request);        /* then get rid of it */
+                }
+                wakeup(&vinumclose);                        /* and wake any waiting vinum(8)s */
+                return;
+
+            case daemonrq_ping:                             /* tell the caller we're here */
+                if (daemon_options & daemon_verbose)
+                    log(LOG_INFO, "vinum: ping reply\n");
+                wakeup(&vinum_finddaemon);                  /* wake up the caller */
+                break;
+
+            case daemonrq_closedrive:                       /* close a drive */
+                close_drive(request->info.drive);           /* do it */
+                break;
+
+            case daemonrq_init:                             /* initialize a plex */
+                /* XXX */
+            case daemonrq_revive:                           /* revive a subdisk */
+                /* XXX */
+                /* FALLTHROUGH */
+            default:
+                log(LOG_WARNING, "Invalid request\n");
+                break;
+            }
+            release_daemon_request(request);                /* finished with this one */
+        }
+    }
+}
+
+/*
+ * Recover a failed I/O operation.
+ *
+ * The correct way to do this is to examine the request and determine
+ * how to recover each individual failure. In the case of a write,
+ * this could be as simple as doing nothing: the defective drives may
+ * already be down, and there may be nothing else to do. In case of
+ * a read, it will be necessary to retry if there are alternative
+ * copies of the data.
+ *
+ * The easy way (here) is just to reissue the request. This will take
+ * a little longer, but nothing like as long as the failure will have
+ * taken.
+ *
+ * rq is the failed request. Only its buffer header, rq->bp, is used
+ * here: it is handed to DEV_STRATEGY again. Nothing is returned.
+ */
+void
+recover_io(struct request *rq)
+{
+ /*
+ * This should read:
+ *
+ * vinumstrategy(rq->bp);
+ *
+ * Negotiate with phk to get it fixed.
+ */
+ DEV_STRATEGY(rq->bp); /* reissue the command */
+}
+
+/* Functions called to interface with the daemon */
+
+/*
+ * Queue a request for the daemon.
+ *
+ * type says what the daemon should do, and info carries the
+ * object the request refers to.  The element is appended to
+ * daemonq and the daemon is woken.  If we can't get memory and
+ * the next spare element is still in use, the request is
+ * silently dropped.
+ */
+void
+queue_daemon_request(enum daemonrq type, union daemoninfo info)
+{
+    struct daemonq *elt;
+    int ipl;
+
+    elt = (struct daemonq *) Malloc(sizeof(struct daemonq));
+    if (elt != NULL)
+        elt->privateinuse = 0;                              /* came from Malloc */
+    else {                                                  /* malloc failed, we're prepared for that */
+        /*
+         * Fall back to one of our spares.  Give up if it's
+         * still in use; the only message we're likely to get
+         * here is a 'drive failed' message, and that'll come
+         * by again if we miss it.
+         */
+        if (intqp->privateinuse)
+            return;
+        elt = intqp;
+        if (++intqp == &intq[INTQSIZE])                     /* ran off the end, */
+            intqp = intq;                                   /* start again at the beginning */
+        elt->privateinuse = 1;                              /* it's ours, and it's in use */
+    }
+
+    elt->type = type;
+    elt->info = info;
+    elt->next = NULL;                                       /* we go at the end of the chain */
+
+    ipl = splhigh();
+    if (daemonq == NULL)                                    /* queue is empty, */
+        daemonq = elt;                                      /* so we're the head as well */
+    else
+        dqend->next = elt;                                  /* hang us on the old tail */
+    dqend = elt;                                            /* either way we're the new tail */
+    splx(ipl);
+
+    wakeup(&vinum_daemon);                                  /* and give the daemon a kick */
+}
+
+/*
+ * Make sure that a daemon is running.
+ *
+ * If daemonpid says we have one, queue a ping and wait up to
+ * 2 * hz ticks for the daemon to answer; return 0 if it does.
+ * If there is no daemon, or the wait doesn't end with a wakeup,
+ * the calling process becomes the daemon itself: it calls
+ * vinum_daemon, and we return 0 only when that returns.
+ *
+ * This function always returns 0; it does not report ESRCH.
+ */
+int
+vinum_finddaemon(void)
+{
+    int result;
+
+    if (daemonpid != 0) {                                   /* we think we have a daemon, */
+        queue_daemon_request(daemonrq_ping, (union daemoninfo) 0); /* queue a ping */
+        result = tsleep(&vinum_finddaemon, PUSER, "reap", 2 * hz);
+        if (result == 0)                                    /* yup, the daemon's up and running */
+            return 0;
+    }
+    /* no daemon, or we couldn't talk to it: start it */
+    vinum_daemon();                                         /* start the daemon */
+    return 0;
+}
+
+/*
+ * Replace the daemon's option flags with options.  The old value
+ * of daemon_options is overwritten, not merged.  Always returns 0.
+ */
+int
+vinum_setdaemonopts(int options)
+{
+    daemon_options = options;
+
+    return 0;
+}
diff --git a/sys/dev/vinum/vinumext.h b/sys/dev/vinum/vinumext.h
new file mode 100644
index 0000000..807bb5c6
--- /dev/null
+++ b/sys/dev/vinum/vinumext.h
@@ -0,0 +1,263 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumext.h,v 1.33 2003/05/23 00:57:48 grog Exp $
+ * $FreeBSD$
+ */
+
+/* vinumext.h: external definitions */
+
+/* *sigh* We still need this at the moment. */
+#ifdef _KERNEL
+extern struct _vinum_conf vinum_conf; /* configuration information */
+extern struct mtx plexmutex[]; /* mutexes for plexes to use */
+#else
+extern struct __vinum_conf vinum_conf; /* configuration information */
+#endif
+
+#ifdef VINUMDEBUG
+extern int debug; /* debug flags */
+#endif
+
+/*
+ * Physical drive read and write. Both macros hand their four
+ * arguments to driveio unchanged and add the transfer direction
+ * (BIO_READ or BIO_WRITE) as the fifth.
+ */
+#define read_drive(a, b, c, d) driveio (a, b, c, d, BIO_READ)
+#define write_drive(a, b, c, d) driveio (a, b, c, d, BIO_WRITE)
+
+/*
+ * Check the result of an allocation.  If ptr is NULL, print msg
+ * and abandon the current command by jumping to command_fail.
+ *
+ * msg is printed as a plain string, not interpreted as a format,
+ * so a stray '%' in a message can't make printf read arguments
+ * that aren't there.  The expansion is still a bare if block, as
+ * before, so existing uses with or without a trailing semicolon
+ * keep compiling.
+ */
+#define CHECKALLOC(ptr, msg) \
+    if ((ptr) == NULL) \
+    { \
+        printf ("%s", (msg)); \
+        longjmp (command_fail, -1); \
+    }
+#ifndef _KERNEL
+struct vnode;
+struct thread;
+#endif
+
+#ifdef _KERNEL
+int vinum_inactive(int);
+void free_vinum(int);
+int give_sd_to_plex(int plexno, int sdno);
+void give_sd_to_drive(int sdno);
+int give_plex_to_volume(int, int, int);
+struct drive *check_drive(char *);
+enum drive_label_info read_drive_label(struct drive *, int);
+int parse_config(char *, struct keywordset *, int);
+int parse_user_config(char *cptr, struct keywordset *keyset);
+u_int64_t sizespec(char *spec);
+int volume_index(struct volume *volume);
+int plex_index(struct plex *plex);
+int sd_index(struct sd *sd);
+int drive_index(struct drive *drive);
+int my_plex(int volno, int plexno);
+int my_sd(int plexno, int sdno);
+int get_empty_drive(void);
+int find_drive(const char *name, int create);
+int find_drive_by_name(const char *devname, int create);
+int get_empty_sd(void);
+int find_subdisk(const char *name, int create);
+void return_drive_space(int driveno, int64_t offset, int length);
+void free_sd(int sdno);
+void free_volume(int volno);
+int get_empty_plex(void);
+int find_plex(const char *name, int create);
+void free_plex(int plexno);
+int get_empty_volume(void);
+int find_volume(const char *name, int create);
+void config_subdisk(int);
+void config_plex(int);
+void config_volume(int);
+void config_drive(int);
+void updateconfig(int);
+void update_sd_config(int sdno, int kernelstate);
+void update_plex_config(int plexno, int kernelstate);
+void update_volume_config(int volno);
+void update_config(void);
+void drive_io_done(struct buf *);
+void save_config(void);
+void daemon_save_config(void);
+void write_config(char *, int);
+int start_config(int);
+void finish_config(int);
+void remove(struct vinum_ioctl_msg *msg);
+void remove_drive_entry(int driveno, int force);
+void remove_sd_entry(int sdno, int force, int recurse);
+void remove_plex_entry(int plexno, int force, int recurse);
+void remove_volume_entry(int volno, int force, int recurse);
+
+void checkdiskconfig(char *);
+int open_drive(struct drive *, struct thread *, int);
+void close_drive(struct drive *drive);
+void close_locked_drive(struct drive *drive);
+int driveio(struct drive *, char *, size_t, off_t, int);
+int set_drive_parms(struct drive *drive);
+int init_drive(struct drive *, int);
+/* void throw_rude_remark (int, struct _ioctl_reply *, char *, ...); XXX */
+void throw_rude_remark(int, char *,...);
+
+void format_config(char *config, int len);
+void checkkernel(char *op);
+void free_drive(struct drive *drive);
+void down_drive(struct drive *drive);
+void remove_drive(int driveno);
+
+int vinum_scandisk(char *drivename);
+
+/* I/O */
+d_open_t vinumopen;
+d_close_t vinumclose;
+d_strategy_t vinumstrategy;
+d_ioctl_t vinumioctl;
+
+int vinum_super_ioctl(dev_t, u_long, caddr_t);
+int vinumstart(struct buf *bp, int reviveok);
+int launch_requests(struct request *rq, int reviveok);
+void sdio(struct buf *bp);
+
+/* XXX Do we need this? */
+int vinumpart(dev_t);
+
+extern jmp_buf command_fail; /* return here if config fails */
+
+#ifdef VINUMDEBUG
+/* Memory allocation and request tracing */
+void vinum_meminfo(caddr_t data);
+int vinum_mallocinfo(caddr_t data);
+int vinum_rqinfo(caddr_t data);
+void LongJmp(jmp_buf, int);
+char *basename(char *);
+#endif
+
+#ifdef VINUMDEBUG
+void expand_table(void **, int, int, char *, int);
+#else
+void expand_table(void **, int, int);
+#endif
+
+struct disklabel;
+struct request;
+struct rqgroup *allocrqg(struct request *rq, int elements);
+void deallocrqg(struct rqgroup *rqg);
+
+/* Device number decoding */
+int Volno(dev_t x);
+int Plexno(dev_t x);
+int Sdno(dev_t x);
+
+/* State transitions */
+int set_drive_state(int driveno, enum drivestate state, enum setstateflags flags);
+int set_sd_state(int sdno, enum sdstate state, enum setstateflags flags);
+enum requeststatus checksdstate(struct sd *sd, struct request *rq, daddr_t diskaddr, daddr_t diskend);
+int set_plex_state(int plexno, enum plexstate state, enum setstateflags flags);
+int set_volume_state(int volumeno, enum volumestate state, enum setstateflags flags);
+void update_sd_state(int sdno);
+void forceup(int plexno);
+void update_plex_state(int plexno);
+void update_volume_state(int volno);
+void invalidate_subdisks(struct plex *, enum sdstate);
+void get_volume_label(char *name, int plexes, u_int64_t size, struct disklabel *lp);
+int write_volume_label(int);
+void start_object(struct vinum_ioctl_msg *);
+void stop_object(struct vinum_ioctl_msg *);
+void setstate(struct vinum_ioctl_msg *msg);
+void setstate_by_force(struct vinum_ioctl_msg *msg);
+void vinum_label(int);
+int vinum_writedisklabel(struct volume *, struct disklabel *);
+int initsd(int, int);
+struct buf *parityrebuild(struct plex *, u_int64_t, int, enum parityop, struct rangelock **, off_t *);
+enum requeststatus sddownstate(struct request *rq);
+
+int restart_plex(int plexno);
+int revive_read(struct sd *sd);
+int revive_block(int sdno);
+void parityops(struct vinum_ioctl_msg *);
+
+/* Auxiliary functions */
+enum sdstates sdstatemap(struct plex *plex);
+enum volplexstate vpstate(struct plex *plex);
+#endif
+
+struct drive *validdrive(int driveno, struct _ioctl_reply *);
+struct sd *validsd(int sdno, struct _ioctl_reply *);
+struct plex *validplex(int plexno, struct _ioctl_reply *);
+struct volume *validvol(int volno, struct _ioctl_reply *);
+void resetstats(struct vinum_ioctl_msg *msg);
+
+/* Locking */
+#ifdef VINUMDEBUG
+int lockdrive(struct drive *drive, char *, int);
+#else
+int lockdrive(struct drive *drive);
+#endif
+void unlockdrive(struct drive *drive);
+int lockvol(struct volume *vol);
+void unlockvol(struct volume *vol);
+int lockplex(struct plex *plex);
+void unlockplex(struct plex *plex);
+struct rangelock *lockrange(daddr_t stripe, struct buf *bp, struct plex *plex);
+int lock_config(void);
+void unlock_config(void);
+
+/* Dæmon */
+
+void vinum_daemon(void);
+int vinum_finddaemon(void);
+int vinum_setdaemonopts(int);
+extern struct daemonq *daemonq; /* daemon's work queue */
+extern struct daemonq *dqend; /* and the end of the queue */
+extern struct cdevsw vinum_cdevsw;
+
+#undef Free /* defined in some funny net stuff */
+#ifdef _KERNEL
+#ifdef VINUMDEBUG /* debug kernel: every allocation, free and drive lock records its call site */
+#define Malloc(x) MMalloc ((x), __FILE__, __LINE__) /* show where we came from */
+#define Free(x) FFree ((x), __FILE__, __LINE__) /* show where we came from */
+caddr_t MMalloc(int size, char *, int);
+void FFree(void *mem, char *, int);
+#define LOCKDRIVE(d) lockdrive (d, __FILE__, __LINE__) /* and the same for drive locks */
+#else /* kernel without VINUMDEBUG: straight to the kernel allocator */
+#define Malloc(x) malloc((x), M_DEVBUF, \
+ curthread->td_proc->p_intr_nesting_level == 0? M_WAITOK: M_NOWAIT) /* wait for memory only when the interrupt nesting level is 0 */
+#define Free(x) free((x), M_DEVBUF)
+#define LOCKDRIVE(d) lockdrive (d)
+#endif /* VINUMDEBUG */
+#else /* not _KERNEL: use the malloc and free that take no type argument */
+#define Malloc(x) malloc ((x)) /* just the size */
+#define Free(x) free ((x)) /* just the address */
+#endif /* _KERNEL */
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumhdr.h b/sys/dev/vinum/vinumhdr.h
new file mode 100644
index 0000000..e8161e8
--- /dev/null
+++ b/sys/dev/vinum/vinumhdr.h
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ */
+
+/* Header files used by all modules */
+/*
+ * $Id: vinumhdr.h,v 1.19 2001/05/22 04:07:22 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#ifdef _KERNEL
+#include "opt_vinum.h"
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/conf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/sysctl.h>
+#endif
+#include <sys/errno.h>
+#include <sys/time.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/uio.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/disk.h>
+#include <sys/disklabel.h>
+#include <sys/syslog.h>
+#include <sys/fcntl.h>
+#include <sys/queue.h>
+#ifdef _KERNEL
+#include <machine/setjmp.h>
+#include <machine/stdarg.h>
+#else
+#include <setjmp.h>
+#include <stdarg.h>
+#endif
+#include <vm/vm.h>
+#include <dev/vinum/vinumvar.h>
+#include <dev/vinum/vinumio.h>
+#include <dev/vinum/vinumkw.h>
+#include <dev/vinum/vinumext.h>
+#include <dev/vinum/vinumutil.h>
+#include <machine/cpu.h>
diff --git a/sys/dev/vinum/vinuminterrupt.c b/sys/dev/vinum/vinuminterrupt.c
new file mode 100644
index 0000000..8d72579
--- /dev/null
+++ b/sys/dev/vinum/vinuminterrupt.c
@@ -0,0 +1,467 @@
+/* vinuminterrupt.c: bottom half of the driver */
+
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinuminterrupt.c,v 1.14 2001/05/23 23:03:37 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+#include <sys/resourcevar.h>
+
+void complete_raid5_write(struct rqelement *);
+void complete_rqe(struct buf *bp);
+void sdio_done(struct buf *bp);
+
+/*
+ * Take a completed buffer, transfer the data back if
+ * it's a read, and complete the high-level request
+ * if this is the last subrequest.
+ *
+ * The bp parameter is in fact a struct rqelement, which
+ * includes a couple of extras at the end.
+ *
+ * If the transfer failed, the error is recorded in the request
+ * and in the subdisk.  For ENXIO, or if the subdisk doesn't have
+ * VF_RETRYERRORS set, the subdisk is set to crashed (read) or
+ * stale (write); for ENXIO the drive is taken down as well.
+ * Statistics for the drive, subdisk, plex and volume are updated
+ * whether the transfer succeeded or not.
+ */
+void
+complete_rqe(struct buf *bp)
+{
+    struct rqelement *rqe;
+    struct request *rq;
+    struct rqgroup *rqg;
+    struct buf *ubp;                                        /* user buffer */
+    struct drive *drive;
+    struct sd *sd;
+    char *gravity;                                          /* for error messages */
+
+    rqe = (struct rqelement *) bp;                          /* point to the element that completed */
+    rqg = rqe->rqg;                                         /* and the request group */
+    rq = rqg->rq;                                           /* and the complete request */
+    ubp = rq->bp;                                           /* user buffer */
+
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LASTREQS)
+        logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
+#endif
+    drive = &DRIVE[rqe->driveno];
+    drive->active--;                                        /* one less outstanding I/O on this drive */
+    vinum_conf.active--;                                    /* one less outstanding I/O globally */
+    /*
+     * NOTE(review): the drive test compares against the limit
+     * minus one (we have already decremented), but the global
+     * test compares against VINUM_MAXACTIVE itself.  Check
+     * against launch_requests whether the global test should
+     * be VINUM_MAXACTIVE - 1 as well.
+     */
+    if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
+        ||(vinum_conf.active == VINUM_MAXACTIVE))           /* or the global limit */
+        wakeup(&launch_requests);                           /* let another one at it */
+    if ((bp->b_io.bio_flags & BIO_ERROR) != 0) {            /* transfer in error */
+        gravity = "";
+        sd = &SD[rqe->sdno];
+
+        if (bp->b_error != 0)                               /* did it return a number? */
+            rq->error = bp->b_error;                        /* yes, put it in. */
+        else if (rq->error == 0)                            /* no: do we have one already? */
+            rq->error = EIO;                                /* no: catchall "I/O error" */
+        sd->lasterror = rq->error;
+        /*
+         * The messages below read "<subdisk>: fatal read error"
+         * or "<subdisk>: read error": the subdisk name comes
+         * first, and gravity supplies its own leading space.
+         */
+        if (bp->b_iocmd == BIO_READ) {                      /* read operation */
+            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
+                gravity = " fatal";
+                set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
+            }
+            log(LOG_ERR,
+                "%s:%s read error, block %lld for %ld bytes\n",
+                sd->name,
+                gravity,
+                (long long)bp->b_blkno,
+                bp->b_bcount);
+        } else {                                            /* write operation */
+            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
+                gravity = " fatal";
+                set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
+            }
+            log(LOG_ERR,
+                "%s:%s write error, block %lld for %ld bytes\n",
+                sd->name,
+                gravity,
+                (long long)bp->b_blkno,
+                bp->b_bcount);
+        }
+        log(LOG_ERR,
+            "%s: user buffer block %lld for %ld bytes\n",
+            sd->name,
+            (long long)ubp->b_blkno,
+            ubp->b_bcount);
+        if (rq->error == ENXIO) {                           /* the drive's down too */
+            log(LOG_ERR,
+                "%s: fatal drive I/O error, block %lld for %ld bytes\n",
+                DRIVE[rqe->driveno].label.name,
+                (long long)bp->b_blkno,
+                bp->b_bcount);
+            DRIVE[rqe->driveno].lasterror = rq->error;
+            set_drive_state(rqe->driveno,                   /* take the drive down */
+                drive_down,
+                setstate_force);
+        }
+    }
+    /* Now update the statistics */
+    if (bp->b_iocmd == BIO_READ) {                          /* read operation */
+        DRIVE[rqe->driveno].reads++;
+        DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
+        SD[rqe->sdno].reads++;
+        SD[rqe->sdno].bytes_read += bp->b_bcount;
+        PLEX[rqe->rqg->plexno].reads++;
+        PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
+        if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
+            VOL[PLEX[rqe->rqg->plexno].volno].reads++;
+            VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
+        }
+    } else {                                                /* write operation */
+        DRIVE[rqe->driveno].writes++;
+        DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
+        SD[rqe->sdno].writes++;
+        SD[rqe->sdno].bytes_written += bp->b_bcount;
+        PLEX[rqe->rqg->plexno].writes++;
+        PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
+        if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
+            VOL[PLEX[rqe->rqg->plexno].volno].writes++;
+            VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
+        }
+    }
+    if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
+        int *sdata;                                         /* source */
+        int *data;                                          /* and group data */
+        int length;                                         /* and count involved */
+        int count;                                          /* loop counter */
+        struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
+
+        /* XOR destination is the user data */
+        sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
+        data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
+        length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
+
+        for (count = 0; count < length; count++)
+            data[count] ^= sdata[count];
+
+        /*
+         * In a normal read, we will normally read directly
+         * into the user buffer.  This doesn't work if
+         * we're also doing a recovery, so we have to
+         * copy it
+         */
+        if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
+            char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
+            char *dst;
+
+            dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
+            length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
+            bcopy(src, dst, length);                        /* move it */
+        }
+    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */
+        &&(rqg->active == 1))                               /* and this is the last active request */
+        complete_raid5_write(rqe);
+    /*
+     * This is the earliest place where we can be
+     * sure that the request has really finished,
+     * since complete_raid5_write can issue new
+     * requests.
+     */
+    rqg->active--;                                          /* this request now finished */
+    if (rqg->active == 0) {                                 /* request group finished, */
+        rq->active--;                                       /* one less */
+        if (rqg->lock) {                                    /* got a lock? */
+            unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
+            rqg->lock = 0;                                  /* so that freerq doesn't unlock it again */
+        }
+    }
+    if (rq->active == 0) {                                  /* request finished, */
+#ifdef VINUMDEBUG
+        if (debug & DEBUG_RESID) {
+            if (ubp->b_resid != 0)                          /* still something to transfer? */
+                Debugger("resid");
+        }
+#endif
+
+        if (rq->error) {                                    /* did we have an error? */
+            /*
+             * NOTE(review): the plex path sets the error on the
+             * user buffer but calls neither bufdone nor freerq
+             * here -- confirm that the buffer of a failed plex
+             * request gets completed somewhere else.
+             */
+            if (rq->isplex) {                               /* plex operation, */
+                ubp->b_io.bio_flags |= BIO_ERROR;           /* yes, propagate to user */
+                ubp->b_error = rq->error;
+            } else                                          /* try to recover */
+                queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
+        } else {
+            ubp->b_resid = 0;                               /* completed our transfer */
+            if (rq->isplex == 0)                            /* volume request, */
+                VOL[rq->volplex.volno].active--;            /* another request finished */
+            if (rq->flags & XFR_COPYBUF) {
+                Free(ubp->b_data);
+                ubp->b_data = rq->save_data;
+            }
+            bufdone(ubp);                                   /* top level buffer completed */
+            freerq(rq);                                     /* return the request storage */
+        }
+    }
+}
+
+/*
+ * Release a request and everything attached to it.
+ *
+ * For each request group in the chain: give up the range lock if
+ * the group still holds one, free the data buffer of every
+ * element that has XFR_MALLOCED set and a non-null buffer, unlock
+ * every element buffer that has XFR_BUFLOCKED set, and free the
+ * group.  Finally free the request block itself.
+ */
+void
+freerq(struct request *rq)
+{
+    struct rqgroup *grp;
+    struct rqgroup *following;                              /* successor of the group being freed */
+    struct rqelement *elt;
+    int i;
+
+    grp = rq->rqg;
+    while (grp != NULL) {                                   /* walk the whole chain */
+        if (grp->lock)                                      /* still holding a range lock, */
+            unlockrange(grp->plexno, grp->lock);            /* let go of it */
+        for (i = 0; i < grp->count; i++) {
+            elt = &grp->rqe[i];
+            if ((elt->flags & XFR_MALLOCED)                 /* we allocated the data buffer, */
+                &&elt->b.b_data)                            /* and really got one */
+                Free(elt->b.b_data);
+            if (elt->flags & XFR_BUFLOCKED) {               /* we locked the buffer, */
+                BUF_UNLOCK(&elt->b);                        /* so unlock it */
+                BUF_LOCKFREE(&elt->b);
+            }
+        }
+        following = grp->next;                              /* pick this up before grp goes away */
+        Free(grp);
+        grp = following;
+    }
+    Free(rq);                                               /* and finally the request itself */
+}
+
+/*
+ * Completion routine for I/O on a subdisk.
+ *
+ * bp is in fact the struct buf inside a struct sdbuf.  Pass any
+ * error and the residual count up to the caller's buffer
+ * (sbp->bp), count the transfer against the drive and the
+ * subdisk, complete the caller's buffer, then unlock and free our
+ * own.
+ */
+void
+sdio_done(struct buf *bp)
+{
+    struct sdbuf *sdb = (struct sdbuf *) bp;
+    struct buf *caller_bp = sdb->bp;                        /* the buffer we were doing this for */
+    struct drive *drive = &DRIVE[sdb->driveno];
+    struct sd *sd = &SD[sdb->sdno];
+
+    if (sdb->b.b_io.bio_flags & BIO_ERROR) {                /* it went wrong: */
+        caller_bp->b_io.bio_flags |= BIO_ERROR;             /* tell the caller */
+        caller_bp->b_error = sdb->b.b_error;
+    }
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LASTREQS)
+        logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
+#endif
+    caller_bp->b_resid = sdb->b.b_resid;                    /* hand up the residual count */
+
+    /* Statistics for the drive and the subdisk */
+    if (bp->b_iocmd != BIO_READ) {                          /* write operation */
+        drive->writes++;
+        drive->bytes_written += sdb->b.b_bcount;
+        sd->writes++;
+        sd->bytes_written += sdb->b.b_bcount;
+    } else {                                                /* read operation */
+        drive->reads++;
+        drive->bytes_read += sdb->b.b_bcount;
+        sd->reads++;
+        sd->bytes_read += sdb->b.b_bcount;
+    }
+
+    bufdone(caller_bp);                                     /* the caller's I/O is complete */
+    BUF_UNLOCK(&sdb->b);
+    BUF_LOCKFREE(&sdb->b);
+    Free(sdb);
+}
+
+/*
+ * Start the second phase of a RAID-4 or RAID-5 group write operation.
+ *
+ * rqe is an element of the request group being
+ * written; element 0 of the group is the parity
+ * block.  The function works in three steps:
+ *
+ * 1. If rqe is flagged XFR_DEGRADED_WRITE, zero the
+ *    group's range of the parity buffer and
+ *    exclusive-or into it the corresponding range
+ *    of every other element.  Malloced element
+ *    buffers are freed here only if the group has
+ *    no normal write still to do.
+ * 2. If the group is flagged XFR_NORMAL_WRITE, for
+ *    each good data element flagged for normal
+ *    write: exclusive-or the old data out of the
+ *    parity block and the new user data into it,
+ *    free the old data buffer and, if the element
+ *    was a read, reissue it as a write of the user
+ *    data.
+ * 3. Reissue element 0 as a write of the parity
+ *    block.
+ *
+ * Panics if a computed data range lies outside its
+ * buffer, or if a normal write element's buffer
+ * was not malloced.
+ */
+void
+complete_raid5_write(struct rqelement *rqe)
+{
+ int *sdata; /* source */
+ int *pdata; /* and parity block data */
+ int length; /* and count involved */
+ int count; /* loop counter */
+ int rqno; /* request index */
+ int rqoffset; /* offset of request data from parity data */
+ struct buf *ubp; /* user buffer header */
+ struct request *rq; /* pointer to our request */
+ struct rqgroup *rqg; /* and to the request group */
+ struct rqelement *prqe; /* point to the parity block */
+ struct drive *drive; /* drive to access */
+
+ rqg = rqe->rqg; /* and to our request group */
+ rq = rqg->rq; /* point to our request */
+ ubp = rq->bp; /* user's buffer header */
+ prqe = &rqg->rqe[0]; /* point to the parity block */
+
+ /*
+ * If we get to this function, we have normal or
+ * degraded writes, or a combination of both. We do
+ * the same thing in each case: we perform an
+ * exclusive or to the parity block. The only
+ * difference is the origin of the data and the
+ * address range.
+ */
+ if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */
+ pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
+ bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */
+
+ /* Now get what data we need from each block */
+ for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */
+ rqe = &rqg->rqe[rqno]; /* this request; note that the parameter is reused as loop cursor */
+ sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
+ length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */
+
+ /*
+ * Add the data block to the parity block. Before
+ * we started the request, we zeroed the parity
+ * block, so the result of adding all the other
+ * blocks and the block we want to write will be
+ * the correct parity block.
+ */
+ for (count = 0; count < length; count++)
+ pdata[count] ^= sdata[count];
+ if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */
+ &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */
+ Free(rqe->b.b_data); /* free it now */
+ rqe->flags &= ~XFR_MALLOCED; /* so that nobody frees it again */
+ }
+ }
+ }
+ if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */
+ /* Get what data we need from each block */
+ for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */
+ rqe = &rqg->rqe[rqno]; /* this request */
+ if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
+ == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */
+ sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
+ rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
+ pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
+ length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
+
+ /*
+ * "remove" the old data block
+ * from the parity block
+ */
+ if ((pdata < ((int *) prqe->b.b_data))
+ || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
+ || (sdata < ((int *) rqe->b.b_data))
+ || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
+ panic("complete_raid5_write: bounds overflow");
+ for (count = 0; count < length; count++)
+ pdata[count] ^= sdata[count];
+
+ /* "add" the new data block */
+ sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
+ if ((sdata < ((int *) ubp->b_data))
+ || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount))))
+ panic("complete_raid5_write: bounds overflow");
+ for (count = 0; count < length; count++)
+ pdata[count] ^= sdata[count];
+
+ /* Free the malloced buffer */
+ if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */
+ Free(rqe->b.b_data); /* free it */
+ rqe->flags &= ~XFR_MALLOCED;
+ } else
+ panic("complete_raid5_write: malloc conflict");
+
+ if ((rqe->b.b_iocmd == BIO_READ) /* this was a read */
+ &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */
+ rqe->b.b_flags &= ~B_DONE; /* start a new request */
+ rqe->b.b_iocmd = BIO_WRITE; /* we're writing now */
+ rqe->b.b_iodone = complete_rqe; /* call us here when done */
+ rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */
+ rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
+ rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
+ rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */
+ rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */
+ rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */
+ rqg->active++; /* another active request */
+ drive = &DRIVE[rqe->driveno]; /* drive to access */
+
+ /* We can't sleep here, so we just increment the counters. */
+ drive->active++;
+ if (drive->active >= drive->maxactive)
+ drive->maxactive = drive->active;
+ vinum_conf.active++;
+ if (vinum_conf.active >= vinum_conf.maxactive)
+ vinum_conf.maxactive = vinum_conf.active;
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_ADDRESSES)
+ log(LOG_DEBUG,
+ " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
+ rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
+ major(rqe->b.b_dev),
+ minor(rqe->b.b_dev),
+ rqe->sdno,
+ (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
+ (long long)rqe->b.b_blkno,
+ rqe->b.b_bcount);
+ if (debug & DEBUG_LASTREQS)
+ logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp);
+#endif
+ DEV_STRATEGY(&rqe->b); /* issue the data write */
+ }
+ }
+ }
+ }
+ /* Finally, write the parity block */
+ rqe = &rqg->rqe[0];
+ rqe->b.b_flags &= ~B_DONE; /* we're not done */
+ rqe->b.b_iocmd = BIO_WRITE; /* we're writing now */
+ rqe->b.b_iodone = complete_rqe; /* call us here when done */
+ rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought us here (group flags here, element flags for the data writes above) */
+ rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */
+ rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */
+ rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */
+ rqg->active++; /* another active request */
+ drive = &DRIVE[rqe->driveno]; /* drive to access */
+
+ /* We can't sleep here, so we just increment the counters. */
+ drive->active++;
+ if (drive->active >= drive->maxactive)
+ drive->maxactive = drive->active;
+ vinum_conf.active++;
+ if (vinum_conf.active >= vinum_conf.maxactive)
+ vinum_conf.maxactive = vinum_conf.active;
+
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_ADDRESSES)
+ log(LOG_DEBUG,
+ " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
+ rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
+ major(rqe->b.b_dev),
+ minor(rqe->b.b_dev),
+ rqe->sdno,
+ (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
+ (long long)rqe->b.b_blkno,
+ rqe->b.b_bcount);
+ if (debug & DEBUG_LASTREQS)
+ logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp);
+#endif
+ DEV_STRATEGY(&rqe->b); /* issue the parity write */
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumio.c b/sys/dev/vinum/vinumio.c
new file mode 100644
index 0000000..8544f95
--- /dev/null
+++ b/sys/dev/vinum/vinumio.c
@@ -0,0 +1,959 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumio.c,v 1.39 2003/05/23 00:59:53 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+static char *sappend(char *txt, char *s);
+static int drivecmp(const void *va, const void *vb);
+
+/*
+ * Open the device associated with the drive, and
+ * set drive->dev. Return an error number.
+ *
+ * Returns EBUSY if the drive is already flagged
+ * VF_OPEN and ENOENT if getdiskbyname() finds no
+ * device for drive->devicename.  Otherwise the
+ * return value is drive->lasterror, which is set
+ * to ENOENT if the device has no cdevsw entry, to
+ * ENOTBLK if the entry is not flagged D_DISK, and
+ * else to the result of the device's d_open.
+ *
+ * On failure the drive state is forced to
+ * drive_down and, if verbose is set, a warning is
+ * logged.  On success VF_OPEN is set.
+ *
+ * td is not referenced; d_open is called with a
+ * NULL thread pointer.
+ */
+int
+open_drive(struct drive *drive, struct thread *td, int verbose)
+{
+ struct cdevsw *dsw; /* pointer to cdevsw entry */
+
+ if (drive->flags & VF_OPEN) /* open already, */
+ return EBUSY; /* don't do it again */
+
+ drive->dev = getdiskbyname(drive->devicename);
+ if (drive->dev == NODEV) /* didn't find anything */
+ return ENOENT;
+
+ drive->dev->si_iosize_max = DFLTPHYS; /* set the device's maximum I/O size */
+ dsw = devsw(drive->dev);
+ if (dsw == NULL) /* sanity, should not happen */
+ drive->lasterror = ENOENT;
+ else if ((dsw->d_flags & D_DISK) == 0) /* not a disk device */
+ drive->lasterror = ENOTBLK;
+ else
+ drive->lasterror = (dsw->d_open) (drive->dev, FWRITE | FREAD, 0, NULL);
+
+ if (drive->lasterror != 0) { /* failed */
+ drive->state = drive_down; /* just force it down */
+ if (verbose)
+ log(LOG_WARNING,
+ "vinum open_drive %s: failed with error %d\n",
+ drive->devicename, drive->lasterror);
+ } else
+ drive->flags |= VF_OPEN; /* we're open now */
+
+ return drive->lasterror;
+}
+
+/*
+ * Set some variables in the drive struct in more
+ * convenient form. Return error indication.
+ *
+ * drive->sectorsize and drive->mediasize are read
+ * here but not set, so the caller must have filled
+ * them in first; sectorsize is used as a divisor.
+ *
+ * The label gets the host name, the current time
+ * as date of birth and the media size.  The free
+ * list is allocated with a single entry covering
+ * everything from DATASTART to the end of the
+ * drive.  The drive goes up if its label already
+ * has a name, otherwise it is marked as referenced.
+ *
+ * Returns 0 on success.  Returns ENOSPC, and
+ * forces the drive down, if the drive is smaller
+ * than MINVINUMSLICE; also returns ENOSPC if the
+ * free list cannot be allocated.
+ */
+int
+set_drive_parms(struct drive *drive)
+{
+ drive->blocksize = BLKDEV_IOSIZE; /* do we need this? */
+ drive->secsperblock = drive->blocksize /* number of sectors per block */
+ / drive->sectorsize;
+
+ /* Now update the label part */
+ bcopy(hostname, drive->label.sysname, VINUMHOSTNAMELEN); /* put in host name */
+ microtime(&drive->label.date_of_birth); /* and current time */
+ drive->label.drive_size = drive->mediasize; /* size of the drive in bytes */
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_BIGDRIVE) /* pretend we're 100 times as big */
+ drive->label.drive_size *= 100;
+#endif
+
+ /* number of sectors available for subdisks */
+ drive->sectors_available = drive->label.drive_size / DEV_BSIZE - DATASTART;
+
+ /*
+ * Bug in 3.0 as of January 1998: you can open
+ * non-existent slices. They have a length of 0.
+ */
+ if (drive->label.drive_size < MINVINUMSLICE) { /* too small to worry about */
+ set_drive_state(drive->driveno, drive_down, setstate_force);
+ drive->lasterror = ENOSPC;
+ return ENOSPC;
+ }
+ drive->freelist_size = INITIAL_DRIVE_FREELIST; /* initial number of entries */
+ drive->freelist = (struct drive_freelist *)
+ Malloc(INITIAL_DRIVE_FREELIST * sizeof(struct drive_freelist));
+ if (drive->freelist == NULL) /* can't malloc, dammit */
+ return ENOSPC; /* note: lasterror is not set on this path */
+ drive->freelist_entries = 1; /* just (almost) the complete drive */
+ drive->freelist[0].offset = DATASTART; /* starts here */
+ drive->freelist[0].sectors = (drive->label.drive_size >> DEV_BSHIFT) - DATASTART; /* and it's this long */
+ if (drive->label.name[0] != '\0') /* got a name */
+ set_drive_state(drive->driveno, drive_up, setstate_force); /* our drive is accessible */
+ else /* we know about it, but that's all */
+ drive->state = drive_referenced;
+ return 0;
+}
+
+/*
+ * Initialize a drive: open the device and add
+ * device information.
+ *
+ * Opens the drive, then asks the device for its
+ * sector size (DIOCGSECTORSIZE) and media size
+ * (DIOCGMEDIASIZE) and finally calls
+ * set_drive_parms.  Returns 0 on success or the
+ * error number of the step that failed, which is
+ * also left in drive->lasterror for the open and
+ * the ioctls.  If an ioctl fails, the drive is
+ * closed again and, if verbose is set, an error is
+ * logged.
+ */
+int
+init_drive(struct drive *drive, int verbose)
+{
+
+ drive->lasterror = open_drive(drive, curthread, verbose); /* open the drive */
+ if (drive->lasterror)
+ return drive->lasterror;
+
+ drive->lasterror = (*devsw(drive->dev)->d_ioctl) (drive->dev,
+ DIOCGSECTORSIZE,
+ (caddr_t) & drive->sectorsize,
+ FREAD,
+ curthread);
+ if (drive->lasterror == 0) /* got the sector size, now get the media size */
+ drive->lasterror = (*devsw(drive->dev)->d_ioctl) (drive->dev,
+ DIOCGMEDIASIZE,
+ (caddr_t) & drive->mediasize,
+ FREAD,
+ curthread);
+ if (drive->lasterror) { /* one of the ioctls failed */
+ if (verbose)
+ log(LOG_ERR,
+ "vinum: Can't get drive dimensions for %s: error %d\n",
+ drive->devicename,
+ drive->lasterror);
+ close_drive(drive);
+ return drive->lasterror;
+ }
+ return set_drive_parms(drive); /* set various odds and ends */
+}
+
+/*
+ * Close a drive if it's open.
+ *
+ * The drive is locked for the duration.  Whether
+ * or not it was open, a drive whose state is above
+ * drive_down is set to drive_down.
+ */
+void
+close_drive(struct drive *drive)
+{
+ LOCKDRIVE(drive); /* keep the daemon out */
+ if (drive->flags & VF_OPEN)
+ close_locked_drive(drive); /* and close it */
+ if (drive->state > drive_down) /* if it's up */
+ drive->state = drive_down; /* make sure it's down */
+ unlockdrive(drive);
+}
+
+/*
+ * Real drive close code, called with drive already locked.
+ * We have also checked that the drive is open. No errors.
+ *
+ * Calls the device's d_close and clears VF_OPEN.
+ * The result of d_close is stored in
+ * drive->lasterror only if no earlier error is
+ * recorded there; nothing is returned.
+ */
+void
+close_locked_drive(struct drive *drive)
+{
+ int error;
+
+ /*
+ * If we can't access the drive, we can't flush
+ * the queues, which spec_close() will try to
+ * do. Get rid of them here first.
+ *
+ * NOTE(review): no code in this function gets
+ * rid of any queues; it only calls d_close.
+ * Confirm whether this comment is stale.
+ */
+ error = (*devsw(drive->dev)->d_close) (drive->dev, FWRITE | FREAD, 0, NULL);
+ drive->flags &= ~VF_OPEN; /* no longer open */
+ if (drive->lasterror == 0) /* don't overwrite an earlier error */
+ drive->lasterror = error;
+}
+
+/*
+ * Remove drive from the configuration.
+ * Caller must ensure that it isn't active.
+ *
+ * Nothing is done unless the drive state is above
+ * drive_referenced.  If the drive is up, its vinum
+ * header is read, the magic number is replaced by
+ * VINUM_NOMAGIC and the header is written back.
+ * The drive is then freed and a save of the
+ * configuration is requested.
+ *
+ * NOTE(review): the result of write_drive is
+ * ignored, so a failure to obliterate the label
+ * goes unreported.
+ */
+void
+remove_drive(int driveno)
+{
+ struct drive *drive = &vinum_conf.drive[driveno];
+ struct vinum_hdr *vhdr; /* buffer for header */
+ int error;
+
+ if (drive->state > drive_referenced) { /* real drive */
+ if (drive->state == drive_up) {
+ vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN); /* allocate buffer */
+ CHECKALLOC(vhdr, "Can't allocate memory");
+ error = read_drive(drive, (void *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET);
+ if (error) /* can't read the header, so leave it alone */
+ drive->lasterror = error;
+ else {
+ vhdr->magic = VINUM_NOMAGIC; /* obliterate the magic, but leave the rest */
+ write_drive(drive, (void *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET);
+ }
+ Free(vhdr);
+ }
+ free_drive(drive); /* close it and free resources */
+ save_config(); /* and save the updated configuration */
+ }
+}
+
+/*
+ * Transfer drive data. Usually called from one of these defines;
+ * #define read_drive(a, b, c, d) driveio (a, b, c, d, B_READ)
+ * #define write_drive(a, b, c, d) driveio (a, b, c, d, B_WRITE)
+ *
+ * length and offset are in bytes, but must be multiples of sector
+ * size. The function *does not check* for this condition, and
+ * truncates ruthlessly.
+ * Return error number.
+ *
+ * The transfer is split into pieces of at most MAXBSIZE bytes.
+ * Each piece borrows a buffer header from geteblk(), points it at
+ * the caller's memory, waits for the I/O to finish and then
+ * restores and releases the header.  The loop stops at the first
+ * error.  If length is 0, nothing is transferred and 0 is returned.
+ *
+ * NOTE(review): flag is stored in b_iocmd; confirm that the defines
+ * quoted above really pass a value suitable for b_iocmd.
+ */
+int
+driveio(struct drive *drive, char *buf, size_t length, off_t offset, int flag)
+{
+ int error;
+ struct buf *bp;
+
+ error = 0; /* to keep the compiler happy */
+ while (length) { /* divide into small enough blocks */
+ int len = min(length, MAXBSIZE); /* maximum block device transfer is MAXBSIZE */
+
+ bp = geteblk(len); /* get a buffer header */
+ bp->b_flags = 0;
+ bp->b_iocmd = flag; /* direction of the transfer */
+ bp->b_dev = drive->dev; /* device */
+ bp->b_blkno = offset / drive->sectorsize; /* block number */
+ bp->b_saveaddr = bp->b_data; /* remember the buffer's own data area */
+ bp->b_data = buf; /* and transfer to or from the caller's memory instead */
+ bp->b_bcount = len;
+ DEV_STRATEGY(bp); /* initiate the transfer */
+ error = bufwait(bp); /* and wait for it to finish */
+ bp->b_data = bp->b_saveaddr; /* put the buffer's own data area back */
+ bp->b_flags |= B_INVAL | B_AGE;
+ bp->b_ioflags &= ~BIO_ERROR;
+ brelse(bp); /* give the buffer header back */
+ if (error)
+ break;
+ length -= len; /* update pointers */
+ buf += len;
+ offset += len;
+ }
+ return error;
+}
+
+/*
+ * Check a drive for a vinum header.  If found,
+ * update the drive information.  We come here
+ * with a partially populated drive structure
+ * which includes the device name.
+ *
+ * Return information on what we found:
+ *
+ *  DL_CANT_OPEN      init_drive failed.
+ *  DL_OURS           the drive carries a vinum label;
+ *                    drive->label is replaced by it.
+ *  DL_WRONG_DRIVE    the drive carries a vinum label,
+ *                    but its name differs from the
+ *                    name we already had.  lasterror
+ *                    is set to EINVAL and the state
+ *                    to drive_unallocated;
+ *                    drive->label is still replaced.
+ *  DL_DELETED_LABEL  the magic number is VINUM_NOMAGIC.
+ *  DL_NOT_OURS       anything else, including a failed
+ *                    read of the label, in which case
+ *                    lasterror records the error.
+ *
+ * This function is called from two places: check_drive,
+ * which wants to find out whether the drive is a
+ * Vinum drive, and config_drive, which asserts that
+ * it is a vinum drive.  In the first case, we don't
+ * print error messages (verbose==0), in the second
+ * we do (verbose==1).
+ *
+ * NOTE(review): verbose is not referenced at
+ * present; init_drive is always called with 0.
+ */
+enum drive_label_info
+read_drive_label(struct drive *drive, int verbose)
+{
+    int error;
+    int result;						    /* result of our search */
+    struct vinum_hdr *vhdr;				    /* and as header */
+
+    error = init_drive(drive, 0);			    /* find the drive */
+    if (error)						    /* couldn't open it, */
+	return DL_CANT_OPEN;				    /* not ours */
+
+    vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN);	    /* allocate buffers */
+    CHECKALLOC(vhdr, "Can't allocate memory");
+
+    drive->state = drive_up;				    /* be optimistic */
+    error = read_drive(drive, (void *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET);
+    if (error) {
+	/*
+	 * The read failed, so the buffer was never
+	 * filled in.  Don't look at the magic
+	 * number: record the error and treat the
+	 * drive as not carrying a label.
+	 */
+	drive->lasterror = error;
+	result = DL_NOT_OURS;
+    } else if (vhdr->magic == VINUM_MAGIC) {		    /* ours! */
+	if (drive->label.name[0]			    /* we have a name for this drive */
+	    && (strcmp(drive->label.name, vhdr->label.name))) {	/* but it doesn't match the real name */
+	    drive->lasterror = EINVAL;
+	    result = DL_WRONG_DRIVE;			    /* it's the wrong drive */
+	    drive->state = drive_unallocated;		    /* put it back, it's not ours */
+	} else
+	    result = DL_OURS;
+	/*
+	 * We copy the drive anyway so that we have
+	 * the correct name in the drive info.  This
+	 * may not be the name specified
+	 */
+	drive->label = vhdr->label;			    /* put in the label information */
+    } else if (vhdr->magic == VINUM_NOMAGIC)		    /* was ours, but we gave it away */
+	result = DL_DELETED_LABEL;			    /* and return the info */
+    else
+	result = DL_NOT_OURS;				    /* we could have it, but we don't yet */
+    Free(vhdr);						    /* that's all. */
+    return result;
+}
+
+/*
+ * Check a drive for a vinum header. If found,
+ * read configuration information from the drive and
+ * incorporate the data into the configuration.
+ *
+ * Return a pointer to the drive, or NULL if the
+ * drive does not carry one of our labels; in that
+ * case the drive is closed and its entry freed.
+ *
+ * A drive whose state is already drive_down or
+ * above is returned without being read again.
+ *
+ * If another allocated drive entry has the same
+ * label name, there are two cases.  If that entry
+ * has a device name starting with '/', the new
+ * drive gets lasterror EEXIST and is still
+ * returned.  Otherwise the other entry is only a
+ * place holder: its subdisks are pointed at the
+ * new drive and the place holder is zeroed.
+ */
+struct drive *
+check_drive(char *devicename)
+{
+ int driveno;
+ int i;
+ struct drive *drive;
+
+ driveno = find_drive_by_name(devicename, 1); /* if entry doesn't exist, create it */
+ drive = &vinum_conf.drive[driveno]; /* and get a pointer */
+
+ if (drive->state >= drive_down) /* up or down, we know it */
+ return drive;
+ if (read_drive_label(drive, 0) == DL_OURS) { /* one of ours */
+ for (i = 0; i < vinum_conf.drives_allocated; i++) { /* see if the name already exists */
+ if ((i != driveno) /* not this drive */
+ &&(DRIVE[i].state != drive_unallocated) /* and it's allocated */
+ &&(strcmp(DRIVE[i].label.name,
+ DRIVE[driveno].label.name) == 0)) { /* and it has the same name */
+ struct drive *mydrive = &DRIVE[i];
+
+ if (mydrive->devicename[0] == '/') { /* we know a device name for it */
+ /*
+ * set an error, but don't take the
+ * drive down: that would cause unneeded
+ * error messages.
+ */
+ drive->lasterror = EEXIST;
+ break;
+ } else { /* it's just a place holder, */
+ int sdno;
+
+ for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) { /* look at each subdisk */
+ if ((SD[sdno].driveno == i) /* it's pointing to this one, */
+ &&(SD[sdno].state != sd_unallocated)) { /* and it's a real subdisk */
+ SD[sdno].driveno = drive->driveno; /* point to the one we found */
+ update_sd_state(sdno); /* and update its state */
+ }
+ }
+ bzero(mydrive, sizeof(struct drive)); /* don't deallocate it, just remove it */
+ }
+ }
+ }
+ return drive; /* ours, possibly with lasterror set to EEXIST */
+ } else { /* not ours, */
+ close_drive(drive);
+ free_drive(drive); /* get rid of it */
+ return NULL;
+ }
+}
+/*
+ * Copy the string txt, including its terminating
+ * NUL, to s.  Return a pointer to the NUL just
+ * written, so that the next string can be appended
+ * there.  Note the argument order: the source
+ * comes first.  No bounds are checked.
+ */
+static char *
+sappend(char *txt, char *s)
+{
+ while ((*s++ = *txt++) != 0); /* copy up to and including the NUL */
+ return s - 1; /* s is one past the NUL, so step back */
+}
+/*
+ * Format the current configuration as text into
+ * config, a buffer of len bytes.
+ *
+ * The buffer is zeroed first.  Then one line is
+ * written for each named volume whose state is
+ * above volume_uninit, each named plex whose state
+ * is above plex_referenced, and each named subdisk
+ * that is neither referenced nor unallocated.
+ *
+ * The snprintf calls are limited to the space left
+ * before configend; the sappend calls are not.
+ * The function panics at the end if the text
+ * reaches into the last two bytes of the buffer.
+ *
+ * NOTE(review): in the plex loop, s is not
+ * advanced after the " sd %s" entries are
+ * formatted.  Each entry therefore overwrites the
+ * previous one, and the newline that sappend
+ * writes next overwrites the start of the last
+ * one, so no subdisk names appear in the plex
+ * line.  Confirm against the configuration parser
+ * before changing this: it alters the saved text.
+ */
+void
+format_config(char *config, int len)
+{
+ int i;
+ int j;
+ char *s = config;
+ char *configend = &config[len];
+
+ bzero(config, len);
+
+ /* First write the volume configuration */
+ for (i = 0; i < vinum_conf.volumes_allocated; i++) {
+ struct volume *vol;
+
+ vol = &vinum_conf.volume[i];
+ if ((vol->state > volume_uninit)
+ && (vol->name[0] != '\0')) { /* paranoia */
+ snprintf(s,
+ configend - s,
+ "volume %s state %s",
+ vol->name,
+ volume_state(vol->state));
+ while (*s)
+ s++; /* find the end */
+ s = sappend("\n", s);
+ }
+ }
+
+ /* Then the plex configuration */
+ for (i = 0; i < vinum_conf.plexes_allocated; i++) {
+ struct plex *plex;
+ struct volume *vol;
+
+ plex = &vinum_conf.plex[i];
+ if ((plex->state > plex_referenced)
+ && (plex->name[0] != '\0')) { /* paranoia */
+ snprintf(s,
+ configend - s,
+ "plex name %s state %s org %s ",
+ plex->name,
+ plex_state(plex->state),
+ plex_org(plex->organization));
+ while (*s)
+ s++; /* find the end */
+ if (isstriped(plex)) {
+ snprintf(s,
+ configend - s,
+ "%ds ",
+ (int) plex->stripesize);
+ while (*s)
+ s++; /* find the end */
+ }
+ if (plex->volno >= 0) { /* we have a volume */
+ vol = &VOL[plex->volno];
+ snprintf(s,
+ configend - s,
+ "vol %s ",
+ vol->name);
+ while (*s)
+ s++; /* find the end */
+ if ((vol->preferred_plex >= 0) /* has a preferred plex */
+ &&vol->plex[vol->preferred_plex] == i) /* and it's us */
+ snprintf(s, configend - s, "preferred ");
+ while (*s)
+ s++; /* find the end */
+ }
+ for (j = 0; j < plex->subdisks; j++) {
+ snprintf(s,
+ configend - s,
+ " sd %s",
+ vinum_conf.sd[plex->sdnos[j]].name); /* NOTE(review): s is not advanced, so this text is overwritten */
+ }
+ s = sappend("\n", s);
+ }
+ }
+
+ /* And finally the subdisk configuration */
+ for (i = 0; i < vinum_conf.subdisks_allocated; i++) {
+ struct sd *sd;
+ char *drivename;
+
+ sd = &SD[i];
+ if ((sd->state != sd_referenced)
+ && (sd->state != sd_unallocated)
+ && (sd->name[0] != '\0')) { /* paranoia */
+ drivename = vinum_conf.drive[sd->driveno].label.name;
+ /*
+ * XXX We've seen cases of dead subdisks
+ * which don't have a drive. If we let them
+ * through here, the drive name is null, so
+ * they get the drive named 'plex'.
+ *
+ * This is a breakage limiter, not a fix.
+ */
+ if (drivename[0] == '\0')
+ drivename = "*invalid*";
+ snprintf(s,
+ configend - s,
+ "sd name %s drive %s len %llus driveoffset %llus state %s",
+ sd->name,
+ drivename,
+ (unsigned long long) sd->sectors,
+ (unsigned long long) sd->driveoffset,
+ sd_state(sd->state));
+ while (*s)
+ s++; /* find the end */
+ if (sd->plexno >= 0)
+ snprintf(s,
+ configend - s,
+ " plex %s plexoffset %llds",
+ vinum_conf.plex[sd->plexno].name,
+ (long long) sd->plexoffset);
+ else
+ snprintf(s, configend - s, " detached");
+ while (*s)
+ s++; /* find the end */
+ if (sd->flags & VF_RETRYERRORS) {
+ snprintf(s, configend - s, " retryerrors");
+ while (*s)
+ s++; /* find the end */
+ }
+ snprintf(s, configend - s, " \n");
+ while (*s)
+ s++; /* find the end */
+ }
+ }
+ if (s > &config[len - 2]) /* too close to the end of the buffer */
+ panic("vinum: configuration data overflow");
+}
+
+/*
+ * Issue a save config request to the daemon. The actual work
+ * is done in process context by daemon_save_config.
+ *
+ * This function only queues a daemonrq_saveconfig request; it
+ * writes nothing itself and reports no result to the caller.
+ */
+void
+save_config(void)
+{
+ queue_daemon_request(daemonrq_saveconfig, (union daemoninfo) 0); /* the request carries no data */
+}
+
+/*
+ * Write the configuration to all vinum slices.  This
+ * is performed by the daemon only.
+ *
+ * Nothing is done while VF_CONFIGURING is set.
+ * Otherwise the configuration is formatted once
+ * and then, for every drive whose state is above
+ * drive_referenced:
+ *
+ * - a drive without a device name or label name
+ *   is freed and skipped;
+ * - a drive which is not open but claims to be
+ *   above drive_down is forced down and skipped;
+ * - a drive which is down but still open is
+ *   closed;
+ * - a drive above drive_down gets its label
+ *   time stamp updated, followed by the header
+ *   and two copies of the configuration.  A
+ *   write error is logged and forces the drive
+ *   down.
+ *
+ * The drive lock is released on every path before
+ * going on to the next drive.
+ */
+void
+daemon_save_config(void)
+{
+    int error;
+    int written_config;					    /* set when we first write the config to disk */
+    int driveno;
+    struct drive *drive;				    /* point to current drive info */
+    struct vinum_hdr *vhdr;				    /* and as header */
+    char *config;					    /* point to config data */
+
+    /* don't save the configuration while we're still working on it */
+    if (vinum_conf.flags & VF_CONFIGURING)
+	return;
+    written_config = 0;					    /* no config written yet */
+    /* Build a volume header */
+    vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN);	    /* get space for the header */
+    CHECKALLOC(vhdr, "Can't allocate config data");
+    /*
+     * The complete buffer goes to disk, but only
+     * some fields are filled in below.  Clear it
+     * first so that we don't write whatever the
+     * allocator happened to hand us.
+     */
+    bzero(vhdr, VINUMHEADERLEN);
+    vhdr->magic = VINUM_MAGIC;				    /* magic number */
+    vhdr->config_length = MAXCONFIG;			    /* length of following config info */
+
+    config = Malloc(MAXCONFIG);				    /* get space for the config data */
+    CHECKALLOC(config, "Can't allocate config data");
+
+    format_config(config, MAXCONFIG);
+    error = 0;						    /* no errors yet */
+    for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) {
+	drive = &vinum_conf.drive[driveno];		    /* point to drive */
+	if (drive->state > drive_referenced) {
+	    LOCKDRIVE(drive);				    /* don't let it change */
+
+	    /*
+	     * First, do some drive consistency checks.  Some
+	     * of these are kludges, others require a process
+	     * context and couldn't be done before.
+	     */
+	    if ((drive->devicename[0] == '\0')
+		|| (drive->label.name[0] == '\0')) {
+		unlockdrive(drive);
+		free_drive(drive);			    /* get rid of it */
+		continue;				    /* and carry on with the other drives */
+	    }
+	    if (((drive->flags & VF_OPEN) == 0)		    /* drive not open */
+		&& (drive->state > drive_down)) {	    /* and it thinks it's not down */
+		unlockdrive(drive);
+		set_drive_state(driveno, drive_down, setstate_force); /* tell it what's what */
+		continue;
+	    }
+	    if ((drive->state == drive_down)		    /* it's down */
+		&& (drive->flags & VF_OPEN)) {		    /* but open, */
+		unlockdrive(drive);
+		close_drive(drive);			    /* close it */
+	    } else if (drive->state > drive_down) {
+		microtime(&drive->label.last_update);	    /* time of last update is now */
+		bcopy((char *) &drive->label,		    /* and the label info from the drive structure */
+		    (char *) &vhdr->label,
+		    sizeof(vhdr->label));
+		if ((drive->state != drive_unallocated)
+		    && (drive->state != drive_referenced)) { /* and it's a real drive */
+		    error = write_drive(drive,
+			(char *) vhdr,
+			VINUMHEADERLEN,
+			VINUM_LABEL_OFFSET);
+		    if (error == 0)			    /* first config copy */
+			error = write_drive(drive,
+			    config,
+			    MAXCONFIG,
+			    VINUM_CONFIG_OFFSET);
+		    if (error == 0)
+			error = write_drive(drive,	    /* second copy */
+			    config,
+			    MAXCONFIG,
+			    VINUM_CONFIG_OFFSET + MAXCONFIG);
+		    unlockdrive(drive);
+		    if (error) {
+			log(LOG_ERR,
+			    "vinum: Can't write config to %s, error %d\n",
+			    drive->devicename,
+			    error);
+			set_drive_state(drive->driveno, drive_down, setstate_force);
+		    } else
+			written_config = 1;		    /* we've written it on at least one drive */
+		} else					    /* nothing to write, */
+		    unlockdrive(drive);			    /* but don't leave the drive locked */
+	    } else					    /* not worth looking at, */
+		unlockdrive(drive);			    /* just unlock it again */
+	}
+    }
+    Free(vhdr);
+    Free(config);
+}
+
+/*
+ * Disk labels are a mess. The correct way to
+ * access them is with the DIOC[GSW]DINFO ioctls,
+ * but some programs, such as newfs, access the
+ * disk directly, so we have to write things
+ * there. We do this only on request. If a user
+ * request tries to read it directly, we fake up
+ * one on the fly.
+ */
+
+/*
+ * get_volume_label returns a label structure to
+ * lp, which is allocated by the caller.
+ *
+ * name is the volume name, which is copied to the
+ * pack name of the label; plexes is the number of
+ * plexes, which only influences the invented
+ * rotational speed; size is the size of the
+ * volume in sectors of DEV_BSIZE bytes.
+ *
+ * The label describes a disk with a single track
+ * and a single cylinder containing all sectors,
+ * and with partitions up to LABEL_PART.  The
+ * checksum is calculated last, after all other
+ * fields have been set.
+ */
+void
+get_volume_label(char *name, int plexes, u_int64_t size, struct disklabel *lp)
+{
+    bzero(lp, sizeof(struct disklabel));
+
+    strncpy(lp->d_typename, "vinum", sizeof(lp->d_typename));
+    lp->d_type = DTYPE_VINUM;
+    /*
+     * Limit the copy by the size of the destination
+     * field.  name is a pointer, so sizeof(name) is
+     * the size of a pointer and not the length of
+     * the name.  As with d_typename, a name which
+     * fills the field is not NUL terminated.
+     */
+    strncpy(lp->d_packname, name, sizeof(lp->d_packname));
+    lp->d_rpm = 14400 * plexes;				    /* to keep them guessing */
+    lp->d_interleave = 1;
+    lp->d_flags = 0;
+
+    /*
+     * A Vinum volume has a single track with all
+     * its sectors.
+     */
+    lp->d_secsize = DEV_BSIZE;				    /* bytes per sector */
+    lp->d_nsectors = size;				    /* data sectors per track */
+    lp->d_ntracks = 1;					    /* tracks per cylinder */
+    lp->d_ncylinders = 1;				    /* data cylinders per unit */
+    lp->d_secpercyl = size;				    /* data sectors per cylinder */
+    lp->d_secperunit = size;				    /* data sectors per unit */
+
+    lp->d_bbsize = BBSIZE;
+    lp->d_sbsize = 0;					    /* no longer used? */
+    lp->d_magic = DISKMAGIC;
+    lp->d_magic2 = DISKMAGIC;
+
+    /*
+     * Set up partitions a, b and c to be identical
+     * and the size of the volume.  a is UFS, b is
+     * swap, c is nothing.
+     */
+    lp->d_partitions[0].p_size = size;
+    lp->d_partitions[0].p_fstype = FS_BSDFFS;		    /* FreeBSD File System :-) */
+    lp->d_partitions[0].p_fsize = 1024;			    /* FS fragment size */
+    lp->d_partitions[0].p_frag = 8;			    /* and fragments per block */
+    lp->d_partitions[SWAP_PART].p_size = size;
+    lp->d_partitions[SWAP_PART].p_fstype = FS_SWAP;	    /* swap partition */
+    lp->d_partitions[LABEL_PART].p_size = size;
+    lp->d_npartitions = LABEL_PART + 1;
+    lp->d_checksum = dkcksum(lp);			    /* must come last */
+}
+
+/*
+ * Search disks on system for vinum slices and add
+ * them to the configuration if they're not
+ * there already. devicename is a blank-separated
+ * list of device names. If not provided, use
+ * sysctl to get a list of all disks on the
+ * system.
+ *
+ * Return an error indication.
+ */
+int
+vinum_scandisk(char *devicename)
+{
+ struct drive *volatile drive;
+ volatile int driveno;
+ int firstdrive; /* first drive in this list */
+ volatile int gooddrives; /* number of usable drives found */
+ int firsttime; /* set if we have never configured before */
+ int error;
+ char *config_text; /* read the config info from disk into here */
+ char *volatile cptr; /* pointer into config information */
+ char *eptr; /* end pointer into config information */
+ char *config_line; /* copy the config line to */
+ volatile int status;
+ int *drivelist; /* list of drive indices */
+ char *partname; /* for creating partition names */
+ char *cp; /* pointer to start of disk name */
+ char *ep; /* and to first char after name */
+ char *np; /* name pointer in naem we build */
+ size_t alloclen;
+ int malloced;
+ int partnamelen; /* length of partition name */
+ int drives;
+
+ malloced = 0; /* devicename not malloced */
+ if (devicename == NULL) { /* no devices specified, */
+ /* get a list of all disks in the system */
+ /* Get size of disk list */
+ error = kernel_sysctlbyname(&thread0, "kern.disks", NULL,
+ NULL, NULL, 0, &alloclen);
+ if (error) {
+ log(LOG_ERR, "vinum: can't get disk list: %d\n", error);
+ return EINVAL;
+ }
+ devicename = Malloc(alloclen);
+ if (devicename == NULL) {
+ printf("vinum: can't allocate memory for drive list");
+ return ENOMEM;
+ } else
+ malloced = 1;
+ /* Now get the list of disks */
+ kernel_sysctlbyname(&thread0, "kern.disks", devicename,
+ &alloclen, NULL, 0, NULL);
+ }
+ status = 0; /* success indication */
+ vinum_conf.flags |= VF_READING_CONFIG; /* reading config from disk */
+ partname = Malloc(MAXPATHLEN); /* extract name of disk here */
+ if (partname == NULL) {
+ printf("vinum_scandisk: can't allocate memory for drive name");
+ return ENOMEM;
+ }
+ gooddrives = 0; /* number of usable drives found */
+ firstdrive = vinum_conf.drives_used; /* the first drive */
+ firsttime = vinum_conf.drives_used == 0; /* are we a virgin? */
+
+ /* allocate a drive pointer list */
+ drives = 256; /* should be enough for most cases */
+ drivelist = (int *) Malloc(drives * sizeof(int));
+ CHECKALLOC(drivelist, "Can't allocate memory");
+ error = lock_config(); /* make sure we're alone here */
+ if (error)
+ return error;
+ error = setjmp(command_fail); /* come back here on error */
+ if (error) /* longjmped out */
+ return error;
+
+ /* Open all drives and find which was modified most recently */
+ for (cp = devicename; *cp; cp = ep) {
+ char part; /* UNIX partition */
+ int slice;
+
+ while (*cp == ' ')
+ cp++; /* find start of name */
+ if (*cp == '\0') /* done, */
+ break;
+ ep = cp;
+ while (*ep && (*ep != ' ')) /* find end of name */
+ ep++;
+
+ np = partname; /* start building up a name here */
+ if (*cp != '/') { /* name doesn't start with /, */
+ strcpy(np, "/dev/"); /* assume /dev */
+ np += strlen("/dev/");
+ }
+ memcpy(np, cp, ep - cp); /* put in name */
+ np += ep - cp; /* and point past */
+
+ partnamelen = MAXPATHLEN + np - partname; /* remaining length in partition name */
+ /* first try the partition table */
+ for (slice = 1; slice < 5; slice++)
+ for (part = 'a'; part < 'i'; part++) {
+ if (part != 'c') { /* don't do the c partition */
+ snprintf(np,
+ partnamelen,
+ "s%d%c",
+ slice,
+ part);
+ drive = check_drive(partname); /* try to open it */
+ if (drive) { /* got something, */
+ if (drive->flags & VF_CONFIGURED) /* already read this config, */
+ log(LOG_WARNING,
+ "vinum: already read config from %s\n", /* say so */
+ drive->label.name);
+ else {
+ if (gooddrives == drives) /* ran out of entries */
+ EXPAND(drivelist, int, drives, drives); /* double the size */
+ drivelist[gooddrives] = drive->driveno; /* keep the drive index */
+ drive->flags &= ~VF_NEWBORN; /* which is no longer newly born */
+ gooddrives++;
+ }
+ }
+ }
+ }
+ /*
+ * This is a kludge. Probably none of this
+ * should be here.
+ */
+ if (gooddrives == 0) { /* didn't find anything, */
+ for (part = 'a'; part < 'i'; part++) /* try the compatibility partition */
+ if (part != 'c') { /* don't do the c partition */
+ snprintf(np,
+ partnamelen,
+ "%c",
+ part);
+ drive = check_drive(partname); /* try to open it */
+ if (drive) { /* got something, */
+ if (drive->flags & VF_CONFIGURED) /* already read this config, */
+ log(LOG_WARNING,
+ "vinum: already read config from %s\n", /* say so */
+ drive->label.name);
+ else {
+ if (gooddrives == drives) /* ran out of entries */
+ EXPAND(drivelist, int, drives, drives); /* double the size */
+ drivelist[gooddrives] = drive->driveno; /* keep the drive index */
+ drive->flags &= ~VF_NEWBORN; /* which is no longer newly born */
+ gooddrives++;
+ }
+ }
+ }
+ }
+ }
+ Free(partname);
+
+ if (gooddrives == 0) {
+ if (firsttime)
+ log(LOG_WARNING, "vinum: no drives found\n");
+ else
+ log(LOG_INFO, "vinum: no additional drives found\n");
+ if (malloced)
+ Free(devicename);
+ unlock_config();
+ return ENOENT;
+ }
+ /*
+ * We now have at least one drive open. Sort
+ * them in order of config time and merge the
+ * config info with what we have already.
+ */
+ qsort(drivelist, gooddrives, sizeof(int), drivecmp);
+ config_text = (char *) Malloc(MAXCONFIG * 2); /* allocate buffers */
+ CHECKALLOC(config_text, "Can't allocate memory");
+ config_line = (char *) Malloc(MAXCONFIGLINE * 2); /* allocate buffers */
+ CHECKALLOC(config_line, "Can't allocate memory");
+ for (driveno = 0; driveno < gooddrives; driveno++) { /* now include the config */
+ drive = &DRIVE[drivelist[driveno]]; /* point to the drive */
+
+ if (firsttime && (driveno == 0)) /* we've never configured before, */
+ log(LOG_INFO, "vinum: reading configuration from %s\n", drive->devicename);
+ else
+ log(LOG_INFO, "vinum: updating configuration from %s\n", drive->devicename);
+
+ if (drive->state == drive_up)
+ /* Read in both copies of the configuration information */
+ error = read_drive(drive, config_text, MAXCONFIG * 2, VINUM_CONFIG_OFFSET);
+ else {
+ error = EIO;
+ printf("vinum_scandisk: %s is %s\n", drive->devicename, drive_state(drive->state));
+ }
+
+ if (error != 0) {
+ log(LOG_ERR, "vinum: Can't read device %s, error %d\n", drive->devicename, error);
+ free_drive(drive); /* give it back */
+ status = error;
+ }
+ /*
+ * At this point, check that the two copies
+ * are the same, and do something useful if
+ * not. In particular, consider which is
+ * newer, and what this means for the
+ * integrity of the data on the drive.
+ */
+ else {
+ vinum_conf.drives_used++; /* another drive in use */
+ /* Parse the configuration, and add it to the global configuration */
+ for (cptr = config_text; *cptr != '\0';) { /* love this style(9) */
+ volatile int parse_status; /* return value from parse_config */
+
+ for (eptr = config_line; (*cptr != '\n') && (*cptr != '\0');) /* until the end of the line */
+ *eptr++ = *cptr++;
+ *eptr = '\0'; /* and delimit */
+ if (setjmp(command_fail) == 0) { /* come back here on error and continue */
+ parse_status = parse_config(config_line, &keyword_set, 1); /* parse the config line */
+ /*
+ * parse_config recognizes referenced
+ * drives and builds a drive entry for
+ * them. This may expand the drive
+ * table, thus invalidating the pointer.
+ */
+ drive = &DRIVE[drivelist[driveno]]; /* point to the drive */
+
+ if (parse_status < 0) { /* error in config */
+ /*
+ * This config should have been parsed
+ * in user space. If we run into
+ * problems here, something serious is
+ * afoot. Complain and let the user
+ * snarf the config to see what's
+ * wrong.
+ */
+ log(LOG_ERR,
+ "vinum: Config error on %s, aborting integration\n",
+ drive->devicename);
+ free_drive(drive); /* give it back */
+ status = EINVAL;
+ }
+ }
+ while (*cptr == '\n')
+ cptr++; /* skip to next line */
+ }
+ }
+ drive->flags |= VF_CONFIGURED; /* this drive's configuration is complete */
+ }
+
+ Free(config_text);
+ Free(drivelist);
+ vinum_conf.flags &= ~VF_READING_CONFIG; /* no longer reading from disk */
+ if (status != 0)
+ printf("vinum: couldn't read configuration");
+ else
+ updateconfig(VF_READING_CONFIG); /* update from disk config */
+ if (malloced)
+ Free(devicename);
+ unlock_config();
+ return status;
+}
+
+/*
+ * Comparison callback handed to qsort() for an array
+ * of drive indices.  Drives are ordered by the
+ * last_update time stamp in their label, newest
+ * first: return -1 if a was updated after b, 0 if
+ * both time stamps are identical, and 1 if a was
+ * updated before b.
+ */
+int
+drivecmp(const void *va, const void *vb)
+{
+    const struct drive *lhs = &DRIVE[*(const int *) va];
+    const struct drive *rhs = &DRIVE[*(const int *) vb];
+
+    /* the seconds decide unless they are equal */
+    if (lhs->label.last_update.tv_sec != rhs->label.last_update.tv_sec)
+	return (lhs->label.last_update.tv_sec > rhs->label.last_update.tv_sec) ? -1 : 1;
+
+    /* same second: the microseconds break the tie */
+    if (lhs->label.last_update.tv_usec != rhs->label.last_update.tv_usec)
+	return (lhs->label.last_update.tv_usec > rhs->label.last_update.tv_usec) ? -1 : 1;
+
+    return 0;						    /* identical time stamps */
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumio.h b/sys/dev/vinum/vinumio.h
new file mode 100644
index 0000000..bf5134a
--- /dev/null
+++ b/sys/dev/vinum/vinumio.h
@@ -0,0 +1,154 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumio.h,v 1.23 2003/05/04 05:25:46 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#define L 'F' /* ID letter of our ioctls: the group argument of every request defined below */
+
+/*
+ * Size in bytes of the message text in struct _ioctl_reply, and
+ * the parameter length encoded in VINUM_SETSTATE and
+ * VINUM_SETSTATE_FORCE.
+ */
+#define MAX_IOCTL_REPLY 1024
+
+#ifdef VINUMDEBUG
+/* Argument of the VINUM_DEBUG ioctl */
+struct debuginfo {
+ int changeit; /* non-zero: store param as the new debug flags instead of entering the debugger */
+ int param; /* new value for the debug flags, used when changeit is set */
+};
+
+#endif
+
+/*
+ * Kind of object that a request refers to.  Carried in the type
+ * field of struct vinum_ioctl_msg and struct vinum_rename_msg.
+ */
+enum objecttype {
+ drive_object,
+ sd_object,
+ plex_object,
+ volume_object,
+ invalid_object
+};
+
+/*
+ * The state to set with VINUM_SETSTATE, passed in the state field
+ * of struct vinum_ioctl_msg.  Since each object has a different
+ * set of states, we need to translate later.
+ */
+enum objectstate {
+ object_down,
+ object_initializing,
+ object_initialized,
+ object_up
+};
+
+/*
+ * This structure is used for modifying objects
+ * (VINUM_SETSTATE, VINUM_SETSTATE_FORCE, VINUM_REMOVE,
+ * VINUM_RESETSTATS, VINUM_ATTACH, VINUM_DETACH, VINUM_REPLACE,
+ * VINUM_PARITYOP, VINUM_MOVE, VINUM_READPOL).
+ *
+ * The handlers write their struct _ioctl_reply into the same
+ * buffer, so the reply's error field overlays index.
+ */
+struct vinum_ioctl_msg {
+ int index; /* index of the object in the table for its type */
+ enum objecttype type; /* kind of object that index refers to */
+ enum objectstate state; /* state to set (VINUM_SETSTATE) */
+ enum parityop op; /* for parity ops */
+ int force; /* do it even if it doesn't make sense */
+ int recurse; /* recurse (VINUM_REMOVE) */
+ int verify; /* verify (initsd, rebuildparity) */
+ int otherobject; /* superordinate object (attach),
+ * replacement object (replace) */
+ int rename; /* rename object (attach) */
+ int64_t offset; /* offset of subdisk (for attach) */
+ int blocksize; /* size of block to revive (bytes) */
+};
+
+/*
+ * VINUM_CREATE returns a buffer of this kind.  The object
+ * manipulation requests (attach, detach, resetstats, ...) also
+ * report their result in this form, written over the request buffer.
+ */
+struct _ioctl_reply {
+ int error; /* 0 on success, otherwise an errno value */
+ char msg[MAX_IOCTL_REPLY]; /* optional explanatory text; empty string if none */
+};
+
+/* Argument of the VINUM_RENAME request */
+struct vinum_rename_msg {
+ int index; /* index of the object to rename */
+ int recurse; /* rename subordinate objects too */
+ enum objecttype type; /* kind of object that index refers to */
+ char newname[MAXNAME]; /* new name to give to object */
+};
+
+/*
+ * ioctl requests.  The *CONFIG requests that use _IOWR take the
+ * index of the wanted object in the first int of the buffer and
+ * return the object in the same buffer.
+ */
+#define BUFSIZE 1024 /* size of buffer, including continuations */
+#define VINUM_CREATE _IOC(IOC_IN | IOC_OUT, L, 64, BUFSIZE) /* configure vinum */
+#define VINUM_GETCONFIG _IOR(L, 65, struct __vinum_conf) /* get global config */
+#define VINUM_DRIVECONFIG _IOWR(L, 66, struct _drive) /* get drive config */
+#define VINUM_SDCONFIG _IOWR(L, 67, struct _sd) /* get subdisk config */
+#define VINUM_PLEXCONFIG _IOWR(L, 68, struct _plex) /* get plex config */
+#define VINUM_VOLCONFIG _IOWR(L, 69, struct _volume) /* get volume config */
+#define VINUM_PLEXSDCONFIG _IOWR(L, 70, struct _sd) /* get sd config for plex (plex, sdno) */
+#define VINUM_GETFREELIST _IOWR(L, 71, struct drive_freelist) /* get freelist element (drive, fe) */
+#define VINUM_SAVECONFIG _IOW(L, 72, int) /* write config to disk; 0 = finish a configuration first, 1 = plain save */
+#define VINUM_RESETCONFIG _IOC(0, L, 73, 0) /* trash config on disk */
+#define VINUM_INIT _IOC(0, L, 74, 0) /* initialize; the handler in vinum_super_ioctl only reports success */
+#define VINUM_READCONFIG _IOC(IOC_IN | IOC_OUT, L, 75, BUFSIZE) /* read config from disk; empty string = scan all disks */
+#ifdef VINUMDEBUG
+#define VINUM_DEBUG _IOWR(L, 127, struct debuginfo) /* call the debugger from ioctl () */
+#endif
+
+/*
+ * Set the state of an object.  The handler interprets the buffer
+ * as a struct vinum_ioctl_msg, whose first two members are:
+ * index index in vinum_conf.<object>
+ * type type of object (enum objecttype)
+ *
+ * Return ioctl_reply
+ */
+#define VINUM_SETSTATE _IOC(IOC_IN | IOC_OUT, L, 76, MAX_IOCTL_REPLY) /* start an object */
+#define VINUM_RELEASECONFIG _IOC(0, L, 77, 0) /* release locks and write config to disk */
+#define VINUM_STARTCONFIG _IOW(L, 78, int) /* start a configuration operation */
+#define VINUM_MEMINFO _IOR(L, 79, struct meminfo) /* get memory usage summary */
+#define VINUM_MALLOCINFO _IOWR(L, 80, struct mc) /* get specific malloc information [i] */
+#define VINUM_INITSD _IOW(L, 82, int) /* initialize a subdisk */
+#define VINUM_REMOVE _IOWR(L, 83, struct _ioctl_reply) /* remove an object */
+#define VINUM_READPOL _IOWR(L, 84, struct _ioctl_reply) /* set read policy */
+#define VINUM_SETSTATE_FORCE _IOC(IOC_IN | IOC_OUT, L, 85, MAX_IOCTL_REPLY) /* diddle object state */
+#define VINUM_RESETSTATS _IOWR(L, 86, struct _ioctl_reply) /* reset object stats */
+#define VINUM_ATTACH _IOWR(L, 87, struct _ioctl_reply) /* attach an object */
+#define VINUM_DETACH _IOWR(L, 88, struct _ioctl_reply) /* detach an object */
+
+#define VINUM_RENAME _IOWR(L, 89, struct _ioctl_reply) /* rename an object */
+#define VINUM_REPLACE _IOWR(L, 90, struct _ioctl_reply) /* replace an object */
+
+#ifdef VINUMDEBUG
+#define VINUM_RQINFO _IOWR(L, 91, struct rqinfo) /* get request info [i] from trace buffer */
+#endif
+
+#define VINUM_DAEMON _IOC(0, L, 92, 0) /* perform the kernel part of Vinum daemon */
+#define VINUM_FINDDAEMON _IOC(0, L, 93, 0) /* check for presence of Vinum daemon */
+#define VINUM_SETDAEMON _IOW(L, 94, int) /* set daemon flags */
+#define VINUM_GETDAEMON _IOR(L, 95, int) /* get daemon flags */
+#define VINUM_PARITYOP _IOWR(L, 96, struct _ioctl_reply) /* check/rebuild RAID-4/5 parity */
+#define VINUM_MOVE _IOWR(L, 98, struct _ioctl_reply) /* move an object */
diff --git a/sys/dev/vinum/vinumioctl.c b/sys/dev/vinum/vinumioctl.c
new file mode 100644
index 0000000..2f7b876
--- /dev/null
+++ b/sys/dev/vinum/vinumioctl.c
@@ -0,0 +1,958 @@
+/*
+ * XXX replace all the checks on object validity with
+ * calls to valid<object>
+ */
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumioctl.c,v 1.23 2003/05/23 01:02:22 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+#ifdef VINUMDEBUG
+#include <sys/reboot.h>
+#endif
+
+/* Handlers for the object manipulation requests dispatched by vinum_super_ioctl */
+void attachobject(struct vinum_ioctl_msg *);
+void detachobject(struct vinum_ioctl_msg *);
+void renameobject(struct vinum_rename_msg *);
+void replaceobject(struct vinum_ioctl_msg *);
+void moveobject(struct vinum_ioctl_msg *);
+void setreadpol(struct vinum_ioctl_msg *);
+
+/*
+ * setjmp() context armed by VINUM_CREATE and by vinum_scandisk
+ * before they parse configuration text.
+ */
+jmp_buf command_fail; /* return on a failed command */
+
+/*
+ * Answer the disk ioctls that subdisks, plexes and
+ * volumes all handle in the same way.
+ *
+ * cmd		the ioctl request
+ * data		kernel copy of the ioctl argument
+ * sectorsize	sector size of the object, in bytes
+ * mediasize	total size of the object, in bytes
+ *
+ * Return 0 if the request was handled, ENOTTY if it
+ * is not one of ours.
+ */
+static int
+vinum_object_ioctl(u_long cmd,
+    caddr_t data,
+    u_int sectorsize,
+    u_int64_t mediasize)
+{
+    switch (cmd) {
+    case DIOCGSECTORSIZE:
+	/*
+	 * The result is a u_int for every object
+	 * type.  Storing a wider value would write
+	 * past the end of the argument.
+	 */
+	*(u_int *) data = sectorsize;
+	return 0;
+
+    case DIOCGMEDIASIZE:
+	*(u_int64_t *) data = mediasize;
+	return 0;
+
+	/*
+	 * We don't have this stuff on hardware,
+	 * so just pretend to do it so that
+	 * utilities don't get upset.
+	 */
+    case DIOCWDINFO:					    /* write partition info */
+    case DIOCSDINFO:					    /* set partition info */
+	return 0;
+
+    default:
+	return ENOTTY;					    /* not my kind of ioctl */
+    }
+}
+
+/*
+ * ioctl routine.  Requests for the super device and
+ * the daemon device are passed on to
+ * vinum_super_ioctl.  For subdisks, plexes and
+ * volumes only the sector size and media size
+ * queries are answered, and the disk label writes
+ * are accepted and ignored.
+ *
+ * Return 0 on success, ENXIO if the minor number
+ * does not refer to an allocated object, EIO if a
+ * volume is not up, and ENOTTY for a request we
+ * don't know.  flag and td are not used.
+ */
+int
+vinumioctl(dev_t dev,
+    u_long cmd,
+    caddr_t data,
+    int flag,
+    struct thread *td)
+{
+    unsigned int objno;
+    struct sd *sd;
+    struct plex *plex;
+    struct volume *vol;
+
+    /* First, decide what we're looking at */
+    if ((minor(dev) == VINUM_SUPERDEV_MINOR)
+	|| (minor(dev) == VINUM_DAEMON_MINOR))
+	return vinum_super_ioctl(dev, cmd, data);
+
+    switch (DEVTYPE(dev)) {				    /* real device */
+    case VINUM_SD_TYPE:
+    case VINUM_SD2_TYPE:				    /* second half of sd namespace */
+	objno = Sdno(dev);
+	if (objno >= (unsigned) vinum_conf.subdisks_allocated) /* not a valid subdisk */
+	    return ENXIO;
+	sd = &SD[objno];
+	return vinum_object_ioctl(cmd,
+	    data,
+	    sd->sectorsize,
+	    sd->sectors * sd->sectorsize);
+
+    case VINUM_PLEX_TYPE:
+	objno = Plexno(dev);
+	if (objno >= (unsigned) vinum_conf.plexes_allocated) /* not a valid plex */
+	    return ENXIO;
+	plex = &PLEX[objno];
+	return vinum_object_ioctl(cmd,
+	    data,
+	    plex->sectorsize,
+	    plex->length * plex->sectorsize);
+
+    case VINUM_VOLUME_TYPE:
+	objno = Volno(dev);
+	if (objno >= (unsigned) vinum_conf.volumes_allocated) /* not a valid volume */
+	    return ENXIO;
+	vol = &VOL[objno];
+	if (vol->state != volume_up)			    /* not up, */
+	    return EIO;					    /* I/O error */
+	return vinum_object_ioctl(cmd,
+	    data,
+	    vol->sectorsize,
+	    vol->size * vol->sectorsize);
+    }
+    return 0;						    /* XXX unknown device type */
+}
+
+/*
+ * Handle ioctls for the super device and the daemon
+ * device.
+ *
+ * dev	the device the request was issued on (not
+ *	used here)
+ * cmd	the request
+ * data	kernel copy of the request argument.  Many
+ *	requests return a struct _ioctl_reply in
+ *	this same buffer.
+ *
+ * Requests which report their result in the reply
+ * buffer return 0 even on failure: a non-zero
+ * return would stop the reply from reaching the
+ * caller.  The other requests return an errno
+ * value directly.  Unknown requests return 0.
+ */
+int
+vinum_super_ioctl(dev_t dev,
+    u_long cmd,
+    caddr_t data)
+{
+    int error = 0;
+    unsigned int index;					    /* for transferring config info */
+    unsigned int sdno;					    /* for transferring config info */
+    int fe;						    /* free list element number */
+    struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) data; /* struct to return */
+
+    switch (cmd) {
+#ifdef VINUMDEBUG
+    case VINUM_DEBUG:
+	if (((struct debuginfo *) data)->changeit)	    /* change debug settings */
+	    debug = (((struct debuginfo *) data)->param);
+	else {
+	    if (debug & DEBUG_REMOTEGDB)
+		boothowto |= RB_GDB;			    /* serial debug line */
+	    else
+		boothowto &= ~RB_GDB;			    /* local ddb */
+	    Debugger("vinum debug");
+	}
+	ioctl_reply->error = 0;
+	return 0;
+#endif
+
+    case VINUM_CREATE:					    /* create a vinum object */
+	error = lock_config();				    /* get the config for us alone */
+	if (error)					    /* can't do it, */
+	    return error;				    /* give up */
+	error = setjmp(command_fail);			    /* come back here on error */
+	if (error == 0)					    /* first time, */
+	    ioctl_reply->error = parse_user_config((char *) data, /* update the config */
+		&keyword_set);
+	else if (ioctl_reply->error == 0) {		    /* longjmp, but no error status */
+	    ioctl_reply->error = EINVAL;		    /* note that something's up */
+	    ioctl_reply->msg[0] = '\0';			    /* no message? */
+	}
+	unlock_config();
+	return 0;					    /* must be 0 to return the real error info */
+
+    case VINUM_GETCONFIG:				    /* get the configuration information */
+	bcopy(&vinum_conf, data, sizeof(vinum_conf));
+	return 0;
+
+	/* start configuring the subsystem */
+    case VINUM_STARTCONFIG:
+	return start_config(*(int *) data);		    /* just lock it.  Parameter is 'force' */
+
+	/*
+	 * Move the individual parts of the config to user space.
+	 *
+	 * Specify the index of the object in the first word of data,
+	 * and return the object there.  The index is held in an
+	 * unsigned variable, so a negative value fails the range
+	 * check as well.
+	 */
+    case VINUM_DRIVECONFIG:
+	index = *(int *) data;				    /* get the index */
+	if (index >= (unsigned) vinum_conf.drives_allocated) /* can't do it */
+	    return ENXIO;				    /* bang */
+	bcopy(&DRIVE[index], data, sizeof(struct _drive));  /* copy the config item out */
+	return 0;
+
+    case VINUM_SDCONFIG:
+	index = *(int *) data;				    /* get the index */
+	if (index >= (unsigned) vinum_conf.subdisks_allocated) /* can't do it */
+	    return ENXIO;				    /* bang */
+	bcopy(&SD[index], data, sizeof(struct _sd));	    /* copy the config item out */
+	return 0;
+
+    case VINUM_PLEXCONFIG:
+	index = *(int *) data;				    /* get the index */
+	if (index >= (unsigned) vinum_conf.plexes_allocated) /* can't do it */
+	    return ENXIO;				    /* bang */
+	bcopy(&PLEX[index], data, sizeof(struct _plex));    /* copy the config item out */
+	return 0;
+
+    case VINUM_VOLCONFIG:
+	index = *(int *) data;				    /* get the index */
+	if (index >= (unsigned) vinum_conf.volumes_allocated) /* can't do it */
+	    return ENXIO;				    /* bang */
+	bcopy(&VOL[index], data, sizeof(struct _volume));   /* copy the config item out */
+	return 0;
+
+    case VINUM_PLEXSDCONFIG:
+	index = *(int *) data;				    /* get the plex index */
+	sdno = ((int *) data)[1];			    /* and the sd index */
+	if ((index >= (unsigned) vinum_conf.plexes_allocated) /* plex doesn't exist */
+	    ||(sdno >= PLEX[index].subdisks))		    /* or it doesn't have this many subdisks */
+	    return ENXIO;				    /* bang */
+	bcopy(&SD[PLEX[index].sdnos[sdno]],		    /* copy the config item out */
+	    data,
+	    sizeof(struct _sd));
+	return 0;
+
+	/*
+	 * We get called in two places: one from the
+	 * userland config routines, which call us
+	 * to complete the config and save it.  This
+	 * call supplies the value 0 as a parameter.
+	 *
+	 * The other place is from the user "saveconfig"
+	 * routine, which can only work if we're *not*
+	 * configuring.  In this case, supply parameter 1.
+	 */
+    case VINUM_SAVECONFIG:
+	if (VFLAGS & VF_CONFIGURING) {			    /* must be us, the others are asleep */
+	    if (*(int *) data == 0)			    /* finish config */
+		finish_config(1);			    /* finish the configuration and update it */
+	    else
+		return EBUSY;				    /* can't do it now */
+	}
+	save_config();					    /* save configuration to disk */
+	return 0;
+
+    case VINUM_RELEASECONFIG:				    /* release the config */
+	if (VFLAGS & VF_CONFIGURING) {			    /* must be us, the others are asleep */
+	    finish_config(0);				    /* finish the configuration, don't change it */
+	    save_config();				    /* save configuration to disk */
+	} else
+	    error = EINVAL;				    /* release what config? */
+	return error;
+
+    case VINUM_READCONFIG:
+	if (((char *) data)[0] == '\0')
+	    ioctl_reply->error = vinum_scandisk(NULL);	    /* built your own list */
+	else
+	    ioctl_reply->error = vinum_scandisk((char *) data);
+	if (ioctl_reply->error == ENOENT) {
+	    if (vinum_conf.drives_used > 0)
+		strcpy(ioctl_reply->msg, "no additional drives found");
+	    else
+		strcpy(ioctl_reply->msg, "no drives found");
+	} else if (ioctl_reply->error)
+	    strcpy(ioctl_reply->msg, "can't read configuration information, see log file");
+	return 0;					    /* must be 0 to return the real error info */
+
+    case VINUM_INIT:
+	ioctl_reply->error = 0;
+	return 0;
+
+    case VINUM_RESETCONFIG:
+	if (vinum_inactive(0)) {			    /* if the volumes are not active */
+	    /*
+	     * Note the open count.  We may be called from v, so we'll be open.
+	     * Keep the count so we don't underflow
+	     */
+	    free_vinum(1);				    /* clean up everything */
+	    log(LOG_NOTICE, "vinum: CONFIGURATION OBLITERATED\n");
+	    ioctl_reply->error = 0;
+	    return 0;
+	}
+	return EBUSY;
+
+    case VINUM_SETSTATE:
+	setstate((struct vinum_ioctl_msg *) data);	    /* set an object state */
+	return 0;
+
+	/*
+	 * Set state by force, without changing
+	 * anything else.
+	 */
+    case VINUM_SETSTATE_FORCE:
+	setstate_by_force((struct vinum_ioctl_msg *) data); /* set an object state */
+	return 0;
+
+#ifdef VINUMDEBUG
+    case VINUM_MEMINFO:
+	vinum_meminfo(data);
+	return 0;
+
+    case VINUM_MALLOCINFO:
+	return vinum_mallocinfo(data);
+
+    case VINUM_RQINFO:
+	return vinum_rqinfo(data);
+#endif
+
+    case VINUM_REMOVE:
+	remove((struct vinum_ioctl_msg *) data);	    /* remove an object */
+	return 0;
+
+    case VINUM_GETFREELIST:				    /* get a drive free list element */
+	index = *(int *) data;				    /* get the drive index */
+	fe = ((int *) data)[1];				    /* and the free list element */
+	if ((index >= (unsigned) vinum_conf.drives_allocated) /* drive doesn't exist */
+	    ||(DRIVE[index].state == drive_unallocated))
+	    return ENODEV;
+	/*
+	 * fe is signed and comes from the caller:
+	 * reject negative values too, or we would
+	 * copy out memory in front of the free list.
+	 */
+	if ((fe < 0)
+	    || (fe >= DRIVE[index].freelist_entries))	    /* no such entry */
+	    return ENOENT;
+	bcopy(&DRIVE[index].freelist[fe],
+	    data,
+	    sizeof(struct drive_freelist));
+	return 0;
+
+    case VINUM_RESETSTATS:
+	resetstats((struct vinum_ioctl_msg *) data);	    /* reset object stats */
+	return 0;
+
+	/* attach an object to a superordinate object */
+    case VINUM_ATTACH:
+	attachobject((struct vinum_ioctl_msg *) data);
+	return 0;
+
+	/* detach an object from a superordinate object */
+    case VINUM_DETACH:
+	detachobject((struct vinum_ioctl_msg *) data);
+	return 0;
+
+	/* rename an object */
+    case VINUM_RENAME:
+	renameobject((struct vinum_rename_msg *) data);
+	return 0;
+
+	/* replace an object */
+    case VINUM_REPLACE:
+	replaceobject((struct vinum_ioctl_msg *) data);
+	return 0;
+
+    case VINUM_DAEMON:
+	vinum_daemon();					    /* perform the daemon */
+	return 0;
+
+    case VINUM_FINDDAEMON:				    /* check for presence of daemon */
+	return vinum_finddaemon();
+
+    case VINUM_SETDAEMON:				    /* set daemon flags */
+	return vinum_setdaemonopts(*(int *) data);
+
+    case VINUM_GETDAEMON:				    /* get daemon flags */
+	*(int *) data = daemon_options;
+	return 0;
+
+    case VINUM_PARITYOP:				    /* check/rebuild RAID-4/5 parity */
+	parityops((struct vinum_ioctl_msg *) data);
+	return 0;
+
+	/* move an object */
+    case VINUM_MOVE:
+	moveobject((struct vinum_ioctl_msg *) data);
+	return 0;
+
+    case VINUM_READPOL:
+	setreadpol((struct vinum_ioctl_msg *) data);
+	return 0;
+
+    default:						    /* unknown request: silently accepted */
+	break;
+    }
+    return 0;						    /* to keep the compiler happy */
+}
+
+/*
+ * The following four functions check the supplied
+ * object index and return a pointer to the object
+ * if it exists.  Otherwise they store ENOENT and an
+ * explanatory message in the reply and return NULL.
+ */
+
+/*
+ * Return the drive with index driveno, or NULL
+ * (with reply filled in) if the index is out of
+ * range or the drive is at most referenced.  The
+ * index comes from the caller of the ioctl, so
+ * negative values must be rejected too.
+ */
+struct drive *
+validdrive(int driveno, struct _ioctl_reply *reply)
+{
+    if ((driveno >= 0)
+	&& (driveno < vinum_conf.drives_allocated)
+	&& (DRIVE[driveno].state > drive_referenced))
+	return &DRIVE[driveno];
+    strcpy(reply->msg, "No such drive");
+    reply->error = ENOENT;
+    return NULL;
+}
+
+/*
+ * Return the subdisk with index sdno, or NULL (with
+ * ENOENT and a message stored in reply) if the
+ * index is out of range or the subdisk is at most
+ * referenced.  The index comes from the caller of
+ * the ioctl, so negative values must be rejected
+ * too.
+ */
+struct sd *
+validsd(int sdno, struct _ioctl_reply *reply)
+{
+    if ((sdno >= 0)
+	&& (sdno < vinum_conf.subdisks_allocated)
+	&& (SD[sdno].state > sd_referenced))
+	return &SD[sdno];
+    strcpy(reply->msg, "No such subdisk");
+    reply->error = ENOENT;
+    return NULL;
+}
+
+/*
+ * Return the plex with index plexno, or NULL (with
+ * ENOENT and a message stored in reply) if the
+ * index is out of range or the plex is at most
+ * referenced.  The index comes from the caller of
+ * the ioctl, so negative values must be rejected
+ * too.
+ */
+struct plex *
+validplex(int plexno, struct _ioctl_reply *reply)
+{
+    if ((plexno >= 0)
+	&& (plexno < vinum_conf.plexes_allocated)
+	&& (PLEX[plexno].state > plex_referenced))
+	return &PLEX[plexno];
+    strcpy(reply->msg, "No such plex");
+    reply->error = ENOENT;
+    return NULL;
+}
+
+/*
+ * Return the volume with index volno, or NULL (with
+ * ENOENT and a message stored in reply) if the
+ * index is out of range or the volume is
+ * uninitialized.  The index comes from the caller
+ * of the ioctl, so negative values must be rejected
+ * too.
+ */
+struct volume *
+validvol(int volno, struct _ioctl_reply *reply)
+{
+    if ((volno >= 0)
+	&& (volno < vinum_conf.volumes_allocated)
+	&& (VOL[volno].state > volume_uninit))
+	return &VOL[volno];
+    strcpy(reply->msg, "No such volume");
+    reply->error = ENOENT;
+    return NULL;
+}
+
+/*
+ * Reset an object's statistics counters.
+ *
+ * msg->type and msg->index select the object.  The
+ * result is returned in the same buffer, viewed as
+ * a struct _ioctl_reply: error is 0 if the counters
+ * were cleared and EINVAL if the type is unknown,
+ * the index is out of range or the object is not
+ * in a usable state.
+ */
+void
+resetstats(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    /*
+     * The reply overlays the request, so pick up
+     * the parameters before we write anything.
+     */
+    const int index = msg->index;
+    const enum objecttype type = msg->type;
+    int error = EINVAL;					    /* until we find a valid object */
+
+    switch (type) {
+    case drive_object:
+	if ((index >= 0) && (index < vinum_conf.drives_allocated)) {
+	    struct drive *drive = &DRIVE[index];
+	    if (drive->state > drive_referenced) {
+		drive->reads = 0;			    /* number of reads on this drive */
+		drive->writes = 0;			    /* number of writes on this drive */
+		drive->bytes_read = 0;			    /* number of bytes read */
+		drive->bytes_written = 0;		    /* number of bytes written */
+		error = 0;
+	    }
+	}
+	break;						    /* don't fall into the subdisk case */
+
+    case sd_object:
+	if ((index >= 0) && (index < vinum_conf.subdisks_allocated)) {
+	    struct sd *sd = &SD[index];
+	    if (sd->state > sd_referenced) {
+		sd->reads = 0;				    /* number of reads on this subdisk */
+		sd->writes = 0;				    /* number of writes on this subdisk */
+		sd->bytes_read = 0;			    /* number of bytes read */
+		sd->bytes_written = 0;			    /* number of bytes written */
+		error = 0;
+	    }
+	}
+	break;
+
+    case plex_object:
+	if ((index >= 0) && (index < vinum_conf.plexes_allocated)) {
+	    struct plex *plex = &PLEX[index];
+	    if (plex->state > plex_referenced) {
+		plex->reads = 0;			    /* number of reads on this plex */
+		plex->writes = 0;			    /* number of writes on this plex */
+		plex->bytes_read = 0;			    /* number of bytes read */
+		plex->bytes_written = 0;		    /* number of bytes written */
+		plex->recovered_reads = 0;		    /* number of recovered read operations */
+		plex->degraded_writes = 0;		    /* number of degraded writes */
+		plex->parityless_writes = 0;		    /* number of parityless writes */
+		plex->multiblock = 0;			    /* requests that needed more than one block */
+		plex->multistripe = 0;			    /* requests that needed more than one stripe */
+		error = 0;
+	    }
+	}
+	break;
+
+    case volume_object:
+	if ((index >= 0) && (index < vinum_conf.volumes_allocated)) {
+	    struct volume *vol = &VOL[index];
+	    if (vol->state > volume_uninit) {
+		vol->bytes_read = 0;			    /* number of bytes read */
+		vol->bytes_written = 0;			    /* number of bytes written */
+		vol->reads = 0;				    /* number of reads on this volume */
+		vol->writes = 0;			    /* number of writes on this volume */
+		vol->recovered_reads = 0;		    /* reads recovered from another plex */
+		error = 0;
+	    }
+	}
+	break;
+
+    case invalid_object:				    /* can't get this */
+    default:						    /* nor anything else */
+	break;
+    }
+    reply->error = error;
+}
+
+/*
+ * Attach an object to a superior object: a subdisk
+ * to a plex, or a plex to a volume.
+ *
+ * msg->type and msg->index select the object to
+ * attach, msg->otherobject the object to attach it
+ * to.  msg->offset is the plex offset for a
+ * subdisk, and msg->force allows attaching a
+ * subdisk to a plex which is not concatenated.
+ *
+ * The result is returned in the same buffer, viewed
+ * as a struct _ioctl_reply.  error is set to:
+ *
+ * 0		success
+ * EAGAIN	subdisk attached, but it is reviving
+ * EINVAL	this kind of object can't be attached,
+ *		or the plex organization doesn't
+ *		allow it
+ * ENOENT	no such object (set by valid<object>)
+ * EBUSY	the object is already attached
+ * ENOSPC	the volume already has MAXPLEX plexes
+ */
+void
+attachobject(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    int sdno;
+    struct sd *sd;
+    struct plex *plex;
+    struct volume *vol;
+
+    switch (msg->type) {
+    case drive_object:					    /* you can't attach a drive to anything */
+    case volume_object:					    /* nor a volume */
+    case invalid_object:				    /* "this can't happen" */
+    default:						    /* nor should this */
+	reply->error = EINVAL;
+	reply->msg[0] = '\0';				    /* vinum(8) doesn't do this */
+	return;
+
+    case sd_object:
+	sd = validsd(msg->index, reply);
+	if (sd == NULL)					    /* not a valid subdisk */
+	    return;
+	plex = validplex(msg->otherobject, reply);
+	if (plex) {
+	    /*
+	     * We should be more intelligent about this.
+	     * We should be able to reattach a dead
+	     * subdisk, but if we want to increase the total
+	     * number of subdisks, we have a lot of reshuffling
+	     * to do. XXX
+	     */
+	    if ((plex->organization != plex_concat)	    /* can't attach to striped and RAID-4/5 */
+		&&(!msg->force)) {			    /* without using force */
+		reply->error = EINVAL;
+		strcpy(reply->msg, "Can't attach to this plex organization");
+	    } else if (sd->plexno >= 0) {		    /* already belong to a plex */
+		reply->error = EBUSY;
+		/*
+		 * Name the plex which owns the
+		 * subdisk.  sd->plexno is an index
+		 * into the plex table, not an
+		 * offset from this subdisk.
+		 */
+		snprintf(reply->msg,
+		    sizeof(reply->msg),
+		    "%s is already attached to %s",
+		    sd->name,
+		    PLEX[sd->plexno].name);
+	    } else {
+		sd->plexoffset = msg->offset;		    /* this is where we want it */
+		set_sd_state(sd->sdno, sd_stale, setstate_force); /* make sure it's stale */
+		give_sd_to_plex(plex->plexno, sd->sdno);    /* and give it to the plex */
+		update_sd_config(sd->sdno, 0);
+		save_config();
+		if (sd->state == sd_reviving)
+		    reply->error = EAGAIN;		    /* need to revive it */
+		else
+		    reply->error = 0;
+	    }
+	}
+	break;
+
+    case plex_object:
+	plex = validplex(msg->index, reply);		    /* get plex */
+	if (plex == NULL)
+	    return;
+	vol = validvol(msg->otherobject, reply);	    /* and volume information */
+	if (vol) {
+	    if (vol->plexes == MAXPLEX) {		    /* we have too many already */
+		reply->error = ENOSPC;			    /* nowhere to put it */
+		strcpy(reply->msg, "Too many plexes");
+	    } else if (plex->volno >= 0) {		    /* the plex has an owner */
+		reply->error = EBUSY;
+		snprintf(reply->msg,
+		    sizeof(reply->msg),
+		    "%s is already attached to %s",
+		    plex->name,
+		    VOL[plex->volno].name);
+	    } else {
+		for (sdno = 0; sdno < plex->subdisks; sdno++) {
+		    sd = &SD[plex->sdnos[sdno]];
+
+		    if (sd->state > sd_down)		    /* real subdisk, vaguely accessible */
+			set_sd_state(plex->sdnos[sdno], sd_stale, setstate_force); /* make it stale */
+		}
+		set_plex_state(plex->plexno, plex_up, setstate_none); /* update plex state */
+		/*
+		 * The reply overlays the request:
+		 * use the indices before we store
+		 * the result.
+		 */
+		give_plex_to_volume(msg->otherobject, msg->index, 0); /* and give it to the volume */
+		update_plex_config(plex->plexno, 0);
+		save_config();
+		reply->error = 0;			    /* all went well */
+	    }
+	}
+	break;
+    }
+}
+
+/*
+ * Detach an object from a superior object: a subdisk from its plex
+ * (sd_object) or a plex from its volume (plex_object).  Drives,
+ * volumes and invalid objects are rejected with EINVAL.
+ *
+ * msg->index is the number of the object to detach, and msg->force
+ * skips the "still needed" checks.  The reply is written over the
+ * request buffer: reply->error is set to 0 on success or to an errno
+ * value, and some failures also put a text into reply->msg.
+ */
+void
+detachobject(struct vinum_ioctl_msg *msg)
+{
+ struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; /* the reply overlays the request */
+ struct sd *sd;
+ struct plex *plex;
+ struct volume *vol;
+ int sdno;
+ int plexno;
+
+ switch (msg->type) {
+ case drive_object: /* you can't detach a drive from anything */
+ case volume_object: /* nor a volume */
+ case invalid_object: /* "this can't happen" */
+ reply->error = EINVAL;
+ reply->msg[0] = '\0'; /* vinum(8) doesn't do this */
+ return;
+
+ case sd_object:
+ sd = validsd(msg->index, reply);
+ if (sd == NULL) /* presumably validsd has filled in the reply -- confirm */
+ return;
+ if (sd->plexno < 0) { /* doesn't belong to a plex */
+ reply->error = ENOENT;
+ strcpy(reply->msg, "Subdisk is not attached");
+ return;
+ } else { /* valid plex number */
+ plex = &PLEX[sd->plexno];
+ if ((!msg->force) /* don't force things */
+ &&((plex->state == plex_up) /* and the plex is up */
+ ||((plex->state == plex_flaky) && sd->state == sd_up))) { /* or flaky with this sd up */
+ reply->error = EBUSY; /* we need this sd */
+ reply->msg[0] = '\0';
+ return;
+ }
+ sd->plexno = -1; /* anonymous sd */
+ if (plex->subdisks == 1) { /* this was the only subdisk */
+ Free(plex->sdnos); /* free the subdisk array */
+ plex->sdnos = NULL; /* and note the fact */
+ plex->subdisks_allocated = 0; /* no subdisk space */
+ } else {
+ for (sdno = 0; sdno < plex->subdisks; sdno++) { /* look for this sd's slot in the plex */
+ if (plex->sdnos[sdno] == msg->index) /* found our subdisk */
+ break;
+ }
+ if (sdno < (plex->subdisks - 1)) /* not the last one, compact */
+ bcopy(&plex->sdnos[sdno + 1],
+ &plex->sdnos[sdno],
+ (plex->subdisks - 1 - sdno) * sizeof(int));
+ }
+ plex->subdisks--; /* one subdisk fewer; note this also happens if the loop above found no match */
+ if (!bcmp(plex->name, sd->name, strlen(plex->name) + 1))
+ /*
+ * this subdisk is named after the plex: prefix its name with "ex-".
+ * NOTE(review): the compared length includes the plex name's
+ * terminating NUL, so this only matches when the two names are
+ * identical, not when the plex name is a prefix -- confirm intent.
+ */
+ {
+ bcopy(sd->name, /* move the name three bytes to the right (the regions overlap) */
+ &sd->name[3],
+ min(strlen(sd->name) + 1, MAXSDNAME - 3));
+ bcopy("ex-", sd->name, 3); /* and put the prefix in the gap */
+ sd->name[MAXSDNAME - 1] = '\0'; /* terminate in case the move was cut short */
+ }
+ update_plex_config(plex->plexno, 0);
+ if (isstriped(plex)) /* we've just mutilated our plex, */
+ set_plex_state(plex->plexno,
+ plex_down,
+ setstate_force | setstate_configuring);
+ if (plex->volno >= 0) /* plex attached to volume, */
+ update_volume_config(plex->volno);
+ save_config();
+ reply->error = 0;
+ }
+ return;
+
+ case plex_object:
+ plex = validplex(msg->index, reply); /* get plex */
+ if (plex == NULL)
+ return;
+ if (plex->volno >= 0) {
+ int volno = plex->volno; /* remembered: plex->volno is reset below */
+
+ vol = &VOL[volno];
+ if ((!msg->force) /* don't force things */
+ &&((vol->state == volume_up) /* and the volume is up */
+ &&(vol->plexes == 1))) { /* and this is the last plex */
+ /*
+ * XXX As elsewhere, check whether we will lose
+ * mapping by removing this plex
+ */
+ reply->error = EBUSY; /* we need this plex */
+ reply->msg[0] = '\0';
+ return;
+ }
+ plex->volno = -1; /* anonymous plex */
+ for (plexno = 0; plexno < vol->plexes; plexno++) { /* look for this plex's slot in the volume */
+ if (vol->plex[plexno] == msg->index) /* found our plex */
+ break;
+ }
+ if (plexno < (vol->plexes - 1)) /* not the last one, compact */
+ bcopy(&vol->plex[plexno + 1],
+ &vol->plex[plexno],
+ (vol->plexes - 1 - plexno) * sizeof(int));
+ vol->plexes--;
+ vol->last_plex_read = 0; /* don't go beyond the end */
+ if (!bcmp(vol->name, plex->name, strlen(vol->name) + 1))
+ /*
+ * this plex is named after the volume: prefix its name with "ex-".
+ * NOTE(review): as for subdisks, the length includes the NUL, so
+ * only an identical name matches -- confirm intent.
+ */
+ {
+ /* First, with msg->recurse set, rename the subdisks that carry the plex's name */
+ if (msg->recurse) {
+ int sdno; /* shadows the function-level sdno */
+
+ for (sdno = 0; sdno < plex->subdisks; sdno++) {
+ struct sd *sd = &SD[plex->sdnos[sdno]]; /* shadows the function-level sd */
+
+ if (!bcmp(plex->name, sd->name, strlen(plex->name) + 1))
+ /* subdisk is named after the plex */
+ {
+ bcopy(sd->name,
+ &sd->name[3],
+ min(strlen(sd->name) + 1, MAXSDNAME - 3));
+ bcopy("ex-", sd->name, 3);
+ sd->name[MAXSDNAME - 1] = '\0';
+ }
+ }
+ }
+ bcopy(plex->name, /* same "ex-" renaming, now for the plex itself */
+ &plex->name[3],
+ min(strlen(plex->name) + 1, MAXPLEXNAME - 3));
+ bcopy("ex-", plex->name, 3);
+ plex->name[MAXPLEXNAME - 1] = '\0';
+ }
+ update_volume_config(volno);
+ save_config();
+ reply->error = 0;
+ } else { /* the plex does not belong to a volume */
+ reply->error = ENOENT;
+ strcpy(reply->msg, "Plex is not attached");
+ }
+ }
+}
+/*
+ * Rename a drive, subdisk, plex or volume.  msg->type selects the kind
+ * of object, msg->index its number and msg->newname the new name.
+ * If an object of that kind already carries the new name, the request
+ * fails with EEXIST; otherwise the name is copied in and the
+ * configuration is saved.  The reply is written over the request
+ * buffer.
+ *
+ * NOTE(review): each name is copied with a fixed-size bcopy of
+ * MAX...NAME bytes; this assumes msg->newname is at least that large
+ * and NUL-terminated within that length -- confirm.
+ */
+void
+renameobject(struct vinum_rename_msg *msg)
+{
+ struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; /* the reply overlays the request */
+ struct drive *drive;
+ struct sd *sd;
+ struct plex *plex;
+ struct volume *vol;
+
+ switch (msg->type) {
+ case drive_object: /* rename a drive */
+ if (find_drive(msg->newname, 0) >= 0) { /* we have that name already, */
+ reply->error = EEXIST;
+ reply->msg[0] = '\0';
+ return;
+ }
+ drive = validdrive(msg->index, reply); /* on NULL nothing more is set here; presumably validdrive reports the error -- confirm */
+ if (drive) {
+ bcopy(msg->newname, drive->label.name, MAXDRIVENAME);
+ save_config();
+ reply->error = 0;
+ }
+ return;
+
+ case sd_object: /* rename a subdisk */
+ if (find_subdisk(msg->newname, 0) >= 0) { /* we have that name already, */
+ reply->error = EEXIST;
+ reply->msg[0] = '\0';
+ return;
+ }
+ sd = validsd(msg->index, reply);
+ if (sd) {
+ bcopy(msg->newname, sd->name, MAXSDNAME);
+ update_sd_config(sd->sdno, 0);
+ save_config();
+ reply->error = 0;
+ }
+ return;
+
+ case plex_object: /* rename a plex */
+ if (find_plex(msg->newname, 0) >= 0) { /* we have that name already, */
+ reply->error = EEXIST;
+ reply->msg[0] = '\0';
+ return;
+ }
+ plex = validplex(msg->index, reply);
+ if (plex) {
+ bcopy(msg->newname, plex->name, MAXPLEXNAME);
+ update_plex_config(plex->plexno, 0);
+ save_config();
+ reply->error = 0;
+ }
+ return;
+
+ case volume_object: /* rename a volume */
+ if (find_volume(msg->newname, 0) >= 0) { /* we have that name already, */
+ reply->error = EEXIST;
+ reply->msg[0] = '\0';
+ return;
+ }
+ vol = validvol(msg->index, reply);
+ if (vol) {
+ bcopy(msg->newname, vol->name, MAXVOLNAME);
+ update_volume_config(msg->index);
+ save_config();
+ reply->error = 0;
+ }
+ return;
+
+ case invalid_object: /* nothing we could rename */
+ reply->error = EINVAL;
+ reply->msg[0] = '\0';
+ }
+}
+
+/*
+ * Replace one object with another; the request is meant for drives
+ * only.  msg->index is the drive number of the old drive and
+ * msg->otherobject the drive number of the new drive.
+ *
+ * Not implemented yet: every request is refused with ENODEV and an
+ * explanatory text.  The reply is written over the request buffer.
+ */
+void
+replaceobject(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *answer;
+
+    answer = (struct _ioctl_reply *) msg;	/* the reply overlays the request */
+    strcpy(answer->msg, "replace not implemented yet");
+    answer->error = ENODEV;			/* until I know how to do this */
+    /* nothing has changed, so no save_config () here */
+}
+/*
+ * Move a subdisk to another drive.  msg->index is the number of the
+ * destination drive and msg->otherobject the number of the subdisk.
+ * The subdisk is marked stale (or empty), its space on the old drive
+ * is returned, and it is handed to the new drive with no fixed
+ * offset.  The reply is written over the request buffer.
+ */
+void
+moveobject(struct vinum_ioctl_msg *msg)
+{
+ struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; /* the reply overlays the request */
+ struct drive *drive;
+ struct sd *sd;
+
+ /* Check that our objects are valid (i.e. they exist) */
+ drive = validdrive(msg->index, (struct _ioctl_reply *) msg);
+ if (drive == NULL)
+ return;
+ sd = validsd(msg->otherobject, (struct _ioctl_reply *) msg);
+ if (sd == NULL)
+ return;
+ if (sd->driveno == msg->index) /* sd already belongs to drive */
+ return; /* NOTE(review): reply->error is not set on this path -- confirm callers cope */
+
+ if (sd->state > sd_stale)
+ set_sd_state(sd->sdno, sd_stale, setstate_force); /* make the subdisk stale */
+ else
+ sd->state = sd_empty; /* set directly, not via set_sd_state */
+ if (sd->plexno >= 0) /* part of a plex, */
+ update_plex_state(sd->plexno); /* update its state */
+
+ /* Return the space on the old drive */
+ if ((sd->driveno >= 0) /* we have a drive, */
+ &&(sd->sectors > 0)) /* and some space on it */
+ return_drive_space(sd->driveno, /* return the space */
+ sd->driveoffset,
+ sd->sectors);
+
+ /* Reassign the old subdisk */
+ sd->driveno = msg->index;
+ sd->driveoffset = -1; /* let the drive decide where to put us */
+ give_sd_to_drive(sd->sdno);
+ reply->error = 0;
+}
+
+/*
+ * Set the preferred plex of a volume.  msg->index is the volume
+ * number.  msg->otherobject is the number of the plex to prefer; a
+ * negative value means that no plex was specified, in which case
+ * vol->preferred_plex is set to -1.
+ *
+ * The plex must exist and must be attached to the volume, otherwise
+ * the request fails (ENOENT with a text if it is not attached).  The
+ * reply is written over the request buffer.
+ */
+void
+setreadpol(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg;
+    struct volume *vol;
+    struct plex *plex;
+    int myplexno = -1;			/* stays -1 if no plex is specified */
+
+    /* Check that our objects are valid (i.e. they exist) */
+    vol = validvol(msg->index, reply);
+    if (vol == NULL)
+	return;
+
+    /* If a plex was specified, check that it is valid */
+    if (msg->otherobject >= 0) {
+	plex = validplex(msg->otherobject, reply);
+	if (plex == NULL)		/* this used to test vol, which can't be NULL here */
+	    return;
+
+	/* Is it attached to this volume? */
+	myplexno = my_plex(msg->index, msg->otherobject);
+	if (myplexno < 0) {
+	    strcpy(reply->msg, "Plex is not attached to volume");
+	    reply->error = ENOENT;
+	    return;
+	}
+    }
+    lock_config();
+    vol->preferred_plex = myplexno;
+    save_config();
+    unlock_config();
+    reply->error = 0;
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumkw.h b/sys/dev/vinum/vinumkw.h
new file mode 100644
index 0000000..d7bc7a5
--- /dev/null
+++ b/sys/dev/vinum/vinumkw.h
@@ -0,0 +1,152 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumkw.h,v 1.20 2003/05/07 03:32:09 grog Exp grog $
+ * $FreeBSD$
+ */
+
+/*
+ * Command keywords that vinum knows. These include both user-level
+ * and kernel-level stuff
+ */
+
+/*
+ * Our complete vocabulary. The names of the commands are
+ * the same as the identifier without the kw_ at the beginning
+ * (i.e. kw_create defines the "create" keyword). Preprocessor
+ * magic in parser.c does the rest.
+ *
+ * To add a new word: put it in the table below and one of the
+ * lists in vinumparser.c (probably keywords).
+ */
+enum keyword { /* one code per word; entries written "kw_a = kw_b" are synonyms sharing a code */
+ kw_create,
+ kw_modify,
+ kw_list,
+ kw_l = kw_list, /* synonym for list */
+ kw_ld, /* list drive */
+ kw_ls, /* list subdisk */
+ kw_lp, /* list plex */
+ kw_lv, /* list volume */
+ kw_set,
+ kw_rm,
+ kw_mv, /* move object */
+ kw_move, /* synonym for mv */
+ kw_start,
+ kw_stop,
+ kw_makedev, /* make /dev/vinum devices */
+ kw_setdaemon, /* set daemon flags */
+ kw_getdaemon, /* get daemon flags */
+ kw_help,
+ kw_drive,
+ kw_partition,
+ kw_sd,
+ kw_subdisk = kw_sd, /* synonym for sd */
+ kw_plex,
+ kw_volume,
+ kw_vol = kw_volume, /* synonym for volume */
+ kw_read,
+ kw_readpol,
+ kw_org,
+ kw_name,
+ kw_concat,
+ kw_striped,
+ kw_raid4,
+ kw_raid5,
+ kw_driveoffset,
+ kw_plexoffset,
+ kw_len,
+ kw_length = kw_len, /* synonym for len */
+ kw_size = kw_len, /* synonym for len */
+ kw_state,
+ kw_setupstate,
+ kw_d, /* flag names */
+ kw_f,
+ kw_r,
+ kw_s,
+ kw_v,
+ kw_w,
+ kw_round, /* round robin */
+ /*
+ * The first of these is a volume attribute ("prefer plex"), and the
+ * second is a plex attribute ("preferred" means that the volume
+ * prefers this plex).
+ */
+ kw_prefer, /* prefer plex */
+ kw_preferred, /* preferred plex */
+ kw_device,
+ kw_init,
+ kw_resetconfig,
+ kw_writethrough,
+ kw_writeback,
+ kw_replace,
+ kw_resetstats,
+ kw_attach,
+ kw_detach,
+ kw_rename,
+ kw_printconfig,
+ kw_saveconfig,
+ kw_hotspare,
+ kw_detached,
+ kw_debug, /* go into debugger */
+ kw_stripe,
+ kw_mirror,
+ kw_info,
+ kw_quit,
+ kw_max,
+ kw_setstate,
+ kw_checkparity,
+ kw_rebuildparity,
+ kw_dumpconfig,
+ kw_retryerrors,
+ kw_invalid_keyword = -1 /* not a keyword; presumably what get_keyword returns for an unknown word -- TODO confirm */
+};
+
+struct _keywords { /* one vocabulary entry: pairs a word with its keyword code */
+ char *name; /* the word as text */
+ enum keyword keyword; /* the code that stands for it */
+};
+
+struct keywordset { /* a table of vocabulary entries */
+ int size; /* presumably the number of entries in k -- TODO confirm */
+ struct _keywords *k; /* the entries */
+};
+
+extern struct _keywords keywords[]; /* defined outside this header; presumably the main vocabulary -- TODO confirm */
+extern struct _keywords flag_keywords[]; /* presumably the flag names (kw_d, kw_f, ...) -- TODO confirm */
+
+extern struct keywordset keyword_set; /* presumably describes keywords[] -- TODO confirm */
+extern struct keywordset flag_set; /* presumably describes flag_keywords[] -- TODO confirm */
+
+/* Parser functions; only declared here, so their exact contracts are not visible from this header */
+
+enum keyword get_keyword(char *, struct keywordset *); /* looks a word up in a keyword set and yields its code */
+int tokenize(char *, char *[], int);
diff --git a/sys/dev/vinum/vinumlock.c b/sys/dev/vinum/vinumlock.c
new file mode 100644
index 0000000..33d9578
--- /dev/null
+++ b/sys/dev/vinum/vinumlock.c
@@ -0,0 +1,264 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumlock.c,v 1.19 2003/05/23 01:07:18 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+/*
+ * Lock a drive, wait if it's in use.
+ *
+ * Returns 0 once the drive is locked, or the error from tsleep if the
+ * wait is broken off.  If the calling process already holds the lock,
+ * a warning is logged and 0 is returned without any further action.
+ *
+ * With VINUMDEBUG the function takes the caller's file name and line
+ * number, which are recorded in the drive for diagnosis.
+ */
+#ifdef VINUMDEBUG
+int
+lockdrive(struct drive *drive, char *file, int line)
+#else
+int
+lockdrive(struct drive *drive)
+#endif
+{
+    int error;
+
+    /* XXX get rid of     drive->flags |= VF_LOCKING; */
+    if ((drive->flags & VF_LOCKED)			    /* it's locked */
+    &&(drive->pid == curproc->p_pid)) {			    /* by us! */
+#ifdef VINUMDEBUG
+	log(LOG_WARNING,
+	    "vinum lockdrive: already locking %s from %s:%d, called from %s:%d\n",
+	    drive->label.name,
+	    drive->lockfilename,
+	    drive->lockline,
+	    basename(file),
+	    line);
+#else
+	log(LOG_WARNING,
+	    "vinum lockdrive: already locking %s\n",
+	    drive->label.name);
+#endif
+	return 0;
+    }
+    while ((drive->flags & VF_LOCKED) != 0) {
+	/*
+	 * There are problems sleeping on a unique identifier,
+	 * since the drive structure can move, and the unlock
+	 * function can be called after killing the drive.
+	 * Solve this by waiting on this function; the number
+	 * of conflicts is negligible.
+	 */
+	if ((error = tsleep(&lockdrive,
+		    PRIBIO,
+		    "vindrv",
+		    0)) != 0)
+	    return error;
+    }
+    drive->flags |= VF_LOCKED;
+    drive->pid = curproc->p_pid;			    /* it's a panic error if curproc is null */
+#ifdef VINUMDEBUG
+    /*
+     * Record where the lock was taken.  strlcpy stops at the end of
+     * the file name instead of always reading 15 bytes from it, and
+     * it stores at most 15 characters plus the terminating NUL,
+     * truncating if necessary.
+     */
+    strlcpy(drive->lockfilename, basename(file), 16);
+    drive->lockline = line;
+#endif
+    return 0;
+}
+
+/*
+ * Give up the lock on a drive and wake up whoever is sleeping on the
+ * address of lockdrive, so that the next one can try for it.
+ */
+void
+unlockdrive(struct drive *drive)
+{
+    /* drive->pid is left as it is: it's of hysterical interest */
+    drive->flags = drive->flags & ~VF_LOCKED;
+    wakeup(&lockdrive);
+}
+
+/*
+ * Lock a stripe of a plex, wait if it's in use.
+ *
+ * stripe is the stripe address, bp the buffer of the request that
+ * wants the lock, and plex the plex it belongs to.  The function
+ * sleeps while the plex's lock table is full, and while another
+ * request (a different bp) holds a lock on the same stripe.  It
+ * returns a pointer to the table entry that now holds the lock.
+ */
+struct rangelock *
+lockrange(daddr_t stripe, struct buf *bp, struct plex *plex)
+{
+ struct rangelock *lock;
+ struct rangelock *pos; /* position of first free lock */
+ int foundlocks; /* number of locks found */
+
+ /*
+ * We could get by without counting the number
+ * of locks we find, but we have a linear search
+ * through a table which in most cases will be
+ * empty. It's faster to stop when we've found
+ * all the locks that are there. This is also
+ * the reason why we put pos at the beginning
+ * instead of the end, though it requires an
+ * extra test.
+ */
+ pos = NULL;
+ foundlocks = 0;
+
+ /*
+ * we can't use 0 as a valid address, so
+ * increment all addresses by 1.
+ * (A stripe value of 0 in a table entry marks
+ * the entry as unused.)
+ */
+ stripe++;
+ mtx_lock(plex->lockmtx);
+
+ /* Wait here if the table is full */
+ while (plex->usedlocks == PLEX_LOCKS) /* all in use */
+ msleep(&plex->usedlocks, plex->lockmtx, PRIBIO, "vlock", 0);
+
+#ifdef DIAGNOSTIC
+ if (plex->usedlocks >= PLEX_LOCKS)
+ panic("lockrange: Too many locks in use");
+#endif
+
+ lock = plex->lock; /* pointer in lock table */
+ if (plex->usedlocks > 0) /* something locked, */
+ /* Search the lock table for our stripe */
+ for (; lock < &plex->lock[PLEX_LOCKS]
+ && foundlocks < plex->usedlocks;
+ lock++) {
+ if (lock->stripe) { /* in use */
+ foundlocks++; /* found another one in use */
+ if ((lock->stripe == stripe) /* it's our stripe */
+ &&(lock->bp != bp)) { /* but not our request */
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_LOCKREQS) {
+ struct rangelockinfo lockinfo;
+
+ lockinfo.stripe = stripe;
+ lockinfo.bp = bp;
+ lockinfo.plexno = plex->plexno;
+ logrq(loginfo_lockwait, (union rqinfou) &lockinfo, bp);
+ }
+#endif
+ plex->lockwaits++; /* waited one more time */
+ msleep(lock, plex->lockmtx, PRIBIO, "vrlock", 0);
+ lock = &plex->lock[-1]; /* start again; the loop's lock++ brings us back to entry 0 */
+ foundlocks = 0;
+ pos = NULL;
+ }
+ } else if (pos == NULL) /* still looking for somewhere? */
+ pos = lock; /* a place to put this one */
+ }
+ /*
+ * This untidy looking code ensures that we'll
+ * always end up pointing to the first free lock
+ * entry, thus minimizing the number of
+ * iterations necessary.
+ *
+ * NOTE(review): the table-full wait above is not
+ * repeated after the msleep inside the scan. If
+ * the table filled up while we slept, the rescan
+ * finds no free entry and lock ends up one past
+ * the end of the table here -- confirm whether
+ * that can happen.
+ */
+ if (pos == NULL) /* didn't find one on the way, */
+ pos = lock; /* use the one we're pointing to */
+
+ /*
+ * The address range is free, and we're pointing
+ * to the first unused entry. Make it ours.
+ */
+ pos->stripe = stripe;
+ pos->bp = bp;
+ plex->usedlocks++; /* one more lock */
+ mtx_unlock(plex->lockmtx);
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_LOCKREQS) {
+ struct rangelockinfo lockinfo;
+
+ lockinfo.stripe = stripe;
+ lockinfo.bp = bp;
+ lockinfo.plexno = plex->plexno;
+ logrq(loginfo_lock, (union rqinfou) &lockinfo, bp);
+ }
+#endif
+ return pos;
+}
+
+/*
+ * Release a range lock on a plex and let the next one at it.
+ * plexno is the number of the plex, and lock the table entry that
+ * is being given up.  Wakes up anybody sleeping on the entry, and,
+ * if the lock table was full, anybody sleeping on the table.
+ *
+ * NOTE(review): the entry and plex->usedlocks are changed here
+ * without taking plex->lockmtx -- confirm that this is safe in the
+ * contexts this is called from.
+ */
+void
+unlockrange(int plexno, struct rangelock *lock)
+{
+ struct plex *plex;
+
+ plex = &PLEX[plexno];
+#ifdef DIAGNOSTIC
+ if (lock < &plex->lock[0] || lock >= &plex->lock[PLEX_LOCKS])
+ panic("vinum: rangelock %p on plex %d invalid, not between %p and %p",
+ lock,
+ plexno,
+ &plex->lock[0],
+ &plex->lock[PLEX_LOCKS]);
+#endif
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_LOCKREQS) {
+ struct rangelockinfo lockinfo;
+
+ lockinfo.stripe = lock->stripe;
+ lockinfo.bp = lock->bp;
+ lockinfo.plexno = plex->plexno;
+ logrq(loginfo_lockwait, (union rqinfou) &lockinfo, lock->bp); /* NOTE(review): an unlock is logged as loginfo_lockwait, the type used for waits; looks like a copy-and-paste slip -- confirm */
+ }
+#endif
+ lock->stripe = 0; /* no longer used */
+ plex->usedlocks--; /* one less lock */
+ if (plex->usedlocks == PLEX_LOCKS - 1) /* we were full, */
+ wakeup(&plex->usedlocks); /* get a waiter if one's there */
+ wakeup((void *) lock); /* and anybody waiting for this stripe */
+}
+
+/*
+ * Take the lock on the global configuration.  While somebody else
+ * holds it, note in VF_LOCKING that there is a waiter and sleep on
+ * vinum_conf.  Returns 0 with the lock held, or the error from
+ * tsleep without the lock.
+ */
+int
+lock_config(void)
+{
+    int error;
+
+    for (;;) {
+	if ((vinum_conf.flags & VF_LOCKED) == 0) {	    /* it's free, */
+	    vinum_conf.flags |= VF_LOCKED;		    /* take it */
+	    return 0;
+	}
+	vinum_conf.flags |= VF_LOCKING;			    /* tell the holder we're waiting */
+	error = tsleep(&vinum_conf, PRIBIO, "vincfg", 0);
+	if (error != 0)
+	    return error;
+    }
+}
+
+/*
+ * Give up the lock on the global configuration.  If a waiter has
+ * announced itself in VF_LOCKING, clear that flag and wake up
+ * everybody sleeping on vinum_conf.
+ */
+void
+unlock_config(void)
+{
+    vinum_conf.flags &= ~VF_LOCKED;
+    if ((vinum_conf.flags & VF_LOCKING) == 0)		    /* nobody is waiting */
+	return;
+    vinum_conf.flags &= ~VF_LOCKING;
+    wakeup(&vinum_conf);
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinummemory.c b/sys/dev/vinum/vinummemory.c
new file mode 100644
index 0000000..b4e9a43
--- /dev/null
+++ b/sys/dev/vinum/vinummemory.c
@@ -0,0 +1,288 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinummemory.c,v 1.31 2003/05/23 01:08:36 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+
+#ifdef VINUMDEBUG
+#include <dev/vinum/request.h>
+extern struct rqinfo rqinfo[];
+extern struct rqinfo *rqip;
+int rqinfo_size = RQINFO_SIZE; /* for debugger */
+
+#undef longjmp /* this was defined as LongJmp */
+#define strrchr rindex
+#ifdef __i386__ /* check for validity */
+/*
+ * Checked replacement for longjmp on i386: panic if the saved stack
+ * pointer, frame pointer or return address in buf lies below
+ * 0xc0000000, otherwise hand over to the real longjmp.
+ */
+void
+LongJmp(jmp_buf buf, int retval)
+{
+/*
+ * longjmp is not documented, not even jmp_buf.
+ * This is what's in i386/i386/support.s:
+ * ENTRY(longjmp)
+ *    movl    4(%esp),%eax
+ *    movl    (%eax),%ebx	     restore ebx
+ *    movl    4(%eax),%esp	     restore esp
+ *    movl    8(%eax),%ebp	     restore ebp
+ *    movl    12(%eax),%esi	     restore esi
+ *    movl    16(%eax),%edi	     restore edi
+ *    movl    20(%eax),%edx	     get rta
+ *    movl    %edx,(%esp)	     put in return frame
+ *    xorl    %eax,%eax		     return(1);
+ *    incl    %eax
+ *    ret
+ *
+ * from which we deduce the structure of jmp_buf:
+ */
+    struct JmpBuf {
+	int jb_ebx;
+	int jb_esp;
+	int jb_ebp;
+	int jb_esi;
+	int jb_edi;
+	int jb_eip;
+    };
+
+    struct JmpBuf *saved = (struct JmpBuf *) buf;
+    int plausible;
+
+    /* all three saved values must be at or above 0xc0000000 */
+    plausible = (saved->jb_esp >= 0xc0000000)
+	&& (saved->jb_ebp >= 0xc0000000)
+	&& (saved->jb_eip >= 0xc0000000);
+    if (!plausible)
+	panic("Invalid longjmp");
+    longjmp(buf, retval);
+}
+
+#else /* not i386 */
+#define LongJmp longjmp /* just use the kernel function */
+#endif /* i386 */
+#endif /* VINUMDEBUG */
+
+/*
+ * Return the last component of a path name: the text after the final
+ * '/', or the whole string if it contains no '/'.  The result points
+ * into file; nothing is copied.
+ */
+char *
+basename(char *file)
+{
+    char *slash = strrchr(file, '/');			    /* last separator, if any */
+
+    return (slash != NULL) ? slash + 1 : file;
+}
+/*
+ * Enlarge a table.  *table points to the current table (or is NULL),
+ * oldsize and newsize are its current and wanted sizes in bytes.  If
+ * newsize is not larger than oldsize, nothing happens.  Otherwise a
+ * zeroed table of newsize bytes is allocated, the old contents are
+ * copied into it, the old table is freed, and *table is set to the
+ * new one.  With VINUMDEBUG the caller's file and line are passed on
+ * to the allocation tracking.
+ */
+#ifdef VINUMDEBUG
+void
+expand_table(void **table, int oldsize, int newsize, char *file, int line)
+#else
+void
+expand_table(void **table, int oldsize, int newsize)
+#endif
+{
+ if (newsize > oldsize) {
+ int *temp;
+ int s;
+
+ s = splhigh(); /* held until the new table is in place */
+#ifdef VINUMDEBUG
+ temp = (int *) MMalloc(newsize, file, line); /* allocate a new table */
+#else
+ temp = (int *) Malloc(newsize); /* allocate a new table */
+#endif
+ CHECKALLOC(temp, "vinum: Can't expand table\n"); /* macro defined elsewhere; presumably deals with a failed allocation -- TODO confirm */
+ bzero((char *) temp, newsize); /* clean it all out */
+ if (*table != NULL) { /* already something there, */
+ bcopy((char *) *table, (char *) temp, oldsize); /* copy it to the new table */
+#ifdef VINUMDEBUG
+ FFree(*table, file, line);
+#else
+ Free(*table);
+#endif
+ }
+ *table = temp;
+ splx(s);
+ }
+}
+
+#ifdef VINUMDEBUG
+#define MALLOCENTRIES 16384 /* size of the allocation tracking table */
+int malloccount = 0; /* number of entries of malloced[] in use */
+int highwater = 0; /* highest index ever allocated */
+struct mc malloced[MALLOCENTRIES]; /* one entry per allocation that MMalloc has made and FFree has not yet freed */
+
+#define FREECOUNT 64 /* number of recent frees that are remembered */
+int freecount = FREECOUNT; /* for debugger */
+int lastfree = 0; /* next entry of freeinfo[] to fill; wraps around at FREECOUNT */
+struct mc freeinfo[FREECOUNT]; /* recent frees, recorded by FFree when DEBUG_MEMFREE is set */
+
+int total_malloced; /* sum of the sizes of the tracked allocations */
+static int mallocseq = 0; /* sequence number given to the next allocation */
+/*
+ * Allocate size bytes from M_DEVBUF and record the allocation
+ * (address, size, sequence number, time, and the base name of file
+ * and the line of the caller) in malloced[].  Returns the memory, or
+ * 0 if the tracking table is full or malloc returns NULL.
+ */
+caddr_t
+MMalloc(int size, char *file, int line)
+{
+ int s;
+ caddr_t result;
+ int i;
+
+ if (malloccount >= MALLOCENTRIES) { /* too many */
+ log(LOG_ERR, "vinum: can't allocate table space to trace memory allocation");
+ return 0; /* can't continue */
+ }
+ /* Wait for malloc if we can */
+ result = malloc(size,
+ M_DEVBUF,
+ curthread->td_intr_nesting_level == 0 ? M_WAITOK : M_NOWAIT); /* M_WAITOK only when td_intr_nesting_level is 0 */
+ if (result == NULL)
+ log(LOG_ERR, "vinum: can't allocate %d bytes from %s:%d\n", size, file, line);
+ else {
+ s = splhigh();
+ for (i = 0; i < malloccount; i++) { /* check the new block against every tracked one */
+ if (((result + size) > malloced[i].address)
+ && (result < malloced[i].address + malloced[i].size)) /* overlap */
+ Debugger("Malloc overlap");
+ }
+ if (result) { /* always true in this branch */
+ char *f = basename(file);
+
+ i = malloccount++; /* take the next free entry */
+ total_malloced += size;
+ microtime(&malloced[i].time);
+ malloced[i].seq = mallocseq++;
+ malloced[i].size = size;
+ malloced[i].line = line;
+ malloced[i].address = result;
+ strlcpy(malloced[i].file, f, MCFILENAMELEN);
+ }
+ if (malloccount > highwater)
+ highwater = malloccount;
+ splx(s);
+ }
+ return result;
+}
+
+/*
+ * Free memory that was allocated with MMalloc and drop it from
+ * malloced[].  The memory is zeroed before it is freed.  If
+ * DEBUG_MEMFREE is set in debug, the free is also recorded in the
+ * freeinfo[] ring, with the base name of file and the line of the
+ * caller.
+ *
+ * If mem is not in malloced[], nothing is freed: an error is logged
+ * and the debugger is entered.
+ */
+void
+FFree(void *mem, char *file, int line)
+{
+    int s;
+    int i;
+
+    s = splhigh();
+    for (i = 0; i < malloccount; i++) {
+	if ((caddr_t) mem != malloced[i].address)	    /* not this one */
+	    continue;
+
+	bzero(mem, malloced[i].size);			    /* XXX */
+	free(mem, M_DEVBUF);
+	malloccount--;
+	total_malloced -= malloced[i].size;
+	if (debug & DEBUG_MEMFREE) {			    /* keep track of recent frees */
+	    struct mc *rec = &freeinfo[lastfree];
+
+	    microtime(&rec->time);
+	    rec->seq = malloced[i].seq;
+	    rec->size = malloced[i].size;
+	    rec->line = line;
+	    rec->address = mem;
+	    /*
+	     * As in MMalloc: strlcpy stops at the end of the name
+	     * rather than reading MCFILENAMELEN bytes from it, and
+	     * always terminates the copy.
+	     */
+	    strlcpy(rec->file, basename(file), MCFILENAMELEN);
+	    if (++lastfree == FREECOUNT)
+		lastfree = 0;
+	}
+	if (i < malloccount)				    /* more coming after */
+	    bcopy(&malloced[i + 1], &malloced[i], (malloccount - i) * sizeof(struct mc));
+	splx(s);
+	return;
+    }
+    splx(s);
+    log(LOG_ERR,
+	"Freeing unallocated data at 0x%p from %s, line %d\n",
+	mem,
+	file,
+	line);
+    Debugger("Free");
+}
+
+/*
+ * Fill in the struct meminfo at data with the state of the
+ * allocation tracking: number of tracked allocations, their total
+ * size, the high water mark, and the address of the tracking table.
+ */
+void
+vinum_meminfo(caddr_t data)
+{
+    struct meminfo *info;
+
+    info = (struct meminfo *) data;
+    info->malloced = malloced;
+    info->mallocs = malloccount;
+    info->highwater = highwater;
+    info->total_malloced = total_malloced;
+}
+
+/*
+ * Return one entry of the allocation tracking table.  On entry the
+ * seq field of the struct mc at data holds the index of the wanted
+ * entry; on return the address, size, line, seq and file fields hold
+ * that entry's values.  Returns ENOENT if the index is beyond the
+ * entries in use, otherwise 0.
+ */
+int
+vinum_mallocinfo(caddr_t data)
+{
+    struct mc *reply = (struct mc *) data;
+    unsigned int wanted = reply->seq;			    /* index of entry to return */
+    struct mc *entry;
+
+    if (wanted >= malloccount)
+	return ENOENT;
+    entry = &malloced[wanted];
+    reply->address = entry->address;
+    reply->size = entry->size;
+    reply->line = entry->line;
+    reply->seq = entry->seq;
+    strlcpy(reply->file, entry->file, MCFILENAMELEN);
+    return 0;
+}
+
+/*
+ * return the nth request trace buffer entry.  This
+ * is indexed back from the current entry (which
+ * has index 0)
+ *
+ * On entry the first int at data is the index n; on
+ * return data holds a copy of the entry.  Returns
+ * ENOENT if n is negative or not less than
+ * RQINFO_SIZE, otherwise 0.
+ */
+int
+vinum_rqinfo(caddr_t data)
+{
+    struct rqinfo *rq = (struct rqinfo *) data;
+    int ent = *(int *) data;				    /* 1st word is index */
+    int lastent = rqip - rqinfo;			    /* entry number of current entry */
+
+    /*
+     * The index comes from the caller.  A negative value
+     * would move forwards from the current entry and could
+     * end up beyond the end of the table, so refuse it too.
+     */
+    if (ent < 0 || ent >= RQINFO_SIZE)			    /* out of the table */
+	return ENOENT;
+    if ((ent = lastent - ent - 1) < 0)
+	ent += RQINFO_SIZE;				    /* roll over backwards */
+    bcopy(&rqinfo[ent], rq, sizeof(struct rqinfo));
+    return 0;
+}
+#endif
diff --git a/sys/dev/vinum/vinumobj.h b/sys/dev/vinum/vinumobj.h
new file mode 100644
index 0000000..81087f3
--- /dev/null
+++ b/sys/dev/vinum/vinumobj.h
@@ -0,0 +1,320 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumobj.h,v 1.7 2003/05/23 01:08:58 grog Exp $
+ * $FreeBSD$
+ */
+
+/*
+ * Definitions of Vinum objects: drive, subdisk, plex and volume.
+ * This file is included both by userland programs and by kernel code.
+ * The userland structures are a subset of the kernel structures, and
+ * all userland fields are at the beginning, so that a simple copy in
+ * the length of the userland structure will be sufficient. In order
+ * to perform this copy, vinumioctl must know both structures, so it
+ * includes this file again with _KERNEL reset.
+ */
+
+#ifndef _KERNEL
+/*
+ * Flags for all objects. Most of them only apply
+ * to specific objects, but we currently have
+ * space for all in any 32 bit flags word.
+ *
+ * Each value is a distinct single bit, so several
+ * flags can be combined in one flags word.  This
+ * definition is only compiled when _KERNEL is not
+ * defined.
+ */
+enum objflags {
+ VF_LOCKED = 1, /* somebody has locked access to this object */
+ VF_LOCKING = 2, /* we want access to this object */
+ VF_OPEN = 4, /* object has openers */
+ VF_WRITETHROUGH = 8, /* volume: write through */
+ VF_INITED = 0x10, /* unit has been initialized */
+ VF_WLABEL = 0x20, /* label area is writable */
+ VF_LABELLING = 0x40, /* unit is currently being labelled */
+ VF_WANTED = 0x80, /* someone is waiting to obtain a lock */
+ VF_RAW = 0x100, /* raw volume (no file system) */
+ VF_LOADED = 0x200, /* module is loaded */
+ VF_CONFIGURING = 0x400, /* somebody is changing the config */
+ VF_WILL_CONFIGURE = 0x800, /* somebody wants to change the config */
+ VF_CONFIG_INCOMPLETE = 0x1000, /* haven't finished changing the config */
+ VF_CONFIG_SETUPSTATE = 0x2000, /* set a volume up if all plexes are empty */
+ VF_READING_CONFIG = 0x4000, /* we're reading config database from disk */
+ VF_FORCECONFIG = 0x8000, /* configure drives even with different names */
+ VF_NEWBORN = 0x10000, /* for objects: we've just created it */
+ VF_CONFIGURED = 0x20000, /* for drives: we read the config */
+ VF_STOPPING = 0x40000, /* for vinum_conf: stop on last close */
+ VF_DAEMONOPEN = 0x80000, /* the daemon has us open (only superdev) */
+ VF_CREATED = 0x100000, /* for volumes: freshly created, more than new */
+ VF_HOTSPARE = 0x200000, /* for drives: use as hot spare */
+ VF_RETRYERRORS = 0x400000, /* don't down subdisks on I/O errors */
+ VF_HASDEBUG = 0x800000, /* set if we support debug */
+};
+
+#endif
+
+/*
+ * Global configuration information for the vinum subsystem.
+ *
+ * The structure tag depends on _KERNEL: it is _vinum_conf when
+ * _KERNEL is defined and __vinum_conf otherwise.  The object
+ * pointers likewise refer to struct drive, sd, plex and volume with
+ * _KERNEL, and to struct _drive, _sd, _plex and _volume without it.
+ */
+#ifdef _KERNEL
+struct _vinum_conf
+#else
+struct __vinum_conf
+#endif
+{
+ int version; /* version of structures */
+#ifdef _KERNEL
+ /* Pointers to vinum structures */
+ struct drive *drive;
+ struct sd *sd;
+ struct plex *plex;
+ struct volume *volume;
+#else
+ /* Pointers to vinum structures */
+ struct _drive *drive;
+ struct _sd *sd;
+ struct _plex *plex;
+ struct _volume *volume;
+#endif
+
+ /* the number allocated of each object */
+ int drives_allocated;
+ int subdisks_allocated;
+ int plexes_allocated;
+ int volumes_allocated;
+
+ /* and the number currently in use */
+ /*
+ * Note that drives_used is not valid during drive recognition
+ * (vinum_scandisk and friends). Many invalid drives are added and
+ * later removed; the count isn't correct until we leave
+ * vinum_scandisk.
+ */
+ int drives_used;
+ int subdisks_used;
+ int plexes_used;
+ int volumes_used;
+
+ int flags; /* see above */
+
+#define VINUM_MAXACTIVE 30000 /* maximum number of active requests */
+ int active; /* current number of requests outstanding */
+ int maxactive; /* maximum number of requests ever outstanding */
+#ifdef _KERNEL
+#ifdef VINUMDEBUG
+ /* These two members exist only with both _KERNEL and VINUMDEBUG */
+ struct request *lastrq;
+ struct buf *lastbuf;
+#endif
+#endif
+};
+
+/*
+ * Use these defines to simplify code.  Each one expands to a member
+ * of an object called vinum_conf, so they can only be used where
+ * that name is in scope.
+ */
+#define DRIVE vinum_conf.drive
+#define SD vinum_conf.sd
+#define PLEX vinum_conf.plex
+#define VOL vinum_conf.volume
+#define VFLAGS vinum_conf.flags
+
+/*
+ * A drive corresponds to a disk slice. We use a different term to show
+ * the difference in usage: it doesn't have to be a slice, and could
+ * theoretically be a complete, unpartitioned disk
+ *
+ * The structure is called struct drive when _KERNEL is defined and
+ * struct _drive otherwise.  The members that exist only with _KERNEL
+ * come last, after all the members common to both versions.
+ */
+
+#ifdef _KERNEL
+struct drive
+#else
+struct _drive
+#endif
+{
+ char devicename[MAXDRIVENAME]; /* name of the slice it's on */
+ struct vinum_label label; /* and the label information */
+ enum drivestate state; /* current state */
+ int flags; /* flags */
+ int subdisks_allocated; /* number of entries in sd */
+ int subdisks_used; /* and the number used */
+ int blocksize; /* size of fs blocks */
+ int pid; /* of locker */
+ u_int64_t sectors_available; /* number of sectors still available */
+ int secsperblock; /* presumably sectors per block -- TODO confirm */
+ int lasterror; /* last error on drive */
+ int driveno; /* index of drive in vinum_conf */
+ int opencount; /* number of up subdisks */
+ u_int64_t reads; /* number of reads on this drive */
+ u_int64_t writes; /* number of writes on this drive */
+ u_int64_t bytes_read; /* number of bytes read */
+ u_int64_t bytes_written; /* number of bytes written */
+#define DRIVE_MAXACTIVE 30000 /* maximum number of active requests */
+ int active; /* current number of requests outstanding */
+ int maxactive; /* maximum number of requests ever outstanding */
+ int freelist_size; /* number of entries alloced in free list */
+ int freelist_entries; /* number of entries used in free list */
+ struct drive_freelist *freelist; /* sorted list of free space on drive */
+#ifdef _KERNEL
+ /* NOTE(review): units of the next two members are not stated here; presumably bytes -- confirm */
+ u_int sectorsize;
+ off_t mediasize;
+ dev_t dev; /* device information */
+#ifdef VINUMDEBUG
+ char lockfilename[16]; /* name of file from which we were locked */
+ int lockline; /* and the line number */
+#endif
+#endif
+};
+
+/*
+ * A subdisk.  The structure is called struct sd when _KERNEL is
+ * defined and struct _sd otherwise.  The members that exist only
+ * with _KERNEL come last, after all the members common to both
+ * versions.
+ */
+#ifdef _KERNEL
+struct sd
+#else
+struct _sd
+#endif
+{
+ char name[MAXSDNAME]; /* name of subdisk */
+ enum sdstate state; /* state */
+ int flags;
+ int lasterror; /* last error occurred */
+ /*
+ * offsets in blocks
+ *
+ * NOTE(review): this says blocks, but the comment on plexoffset
+ * below says sectors.  Confirm which unit is meant.
+ */
+ int64_t driveoffset; /* offset on drive */
+ /*
+ * plexoffset is the offset from the beginning
+ * of the plex to the very first part of the
+ * subdisk, in sectors. For striped, RAID-4 and
+ * RAID-5 plexes, only the first stripe is
+ * located at this offset
+ */
+ int64_t plexoffset; /* offset in plex */
+ u_int64_t sectors; /* and length in sectors */
+ int sectorsize; /* sector size for DIOCGSECTORSIZE */
+ int plexno; /* index of plex, if it belongs */
+ int driveno; /* index of the drive on which it is located */
+ int sdno; /* our index in vinum_conf */
+ int plexsdno; /* and our number in our plex */
+ /* (undefined if no plex) */
+ u_int64_t reads; /* number of reads on this subdisk */
+ u_int64_t writes; /* number of writes on this subdisk */
+ u_int64_t bytes_read; /* number of bytes read */
+ u_int64_t bytes_written; /* number of bytes written */
+ /* revive parameters */
+ u_int64_t revived; /* block number of current revive request */
+ int revive_blocksize; /* revive block size (bytes) */
+ int revive_interval; /* and time to wait between transfers */
+ pid_t reviver; /* PID of reviving process */
+ /* init parameters */
+ u_int64_t initialized; /* block number of current init request */
+ int init_blocksize; /* init block size (bytes) */
+ int init_interval; /* and time to wait between transfers */
+#ifdef _KERNEL
+ struct request *waitlist; /* list of requests waiting on revive op */
+ dev_t dev; /* associated device */
+#endif
+};
+
+/*
+ * A plex.  The structure is called struct plex when _KERNEL is
+ * defined and struct _plex otherwise.  The members that exist only
+ * with _KERNEL come last, after all the members common to both
+ * versions.
+ */
+#ifdef _KERNEL
+struct plex
+#else
+struct _plex
+#endif
+{
+ enum plexorg organization; /* Plex organization */
+ enum plexstate state; /* and current state */
+ u_int64_t length; /* total length of plex (sectors) */
+ int flags;
+ int stripesize; /* size of stripe or raid band, in sectors */
+ int sectorsize; /* sector size for DIOCGSECTORSIZE */
+ int subdisks; /* number of associated subdisks */
+ int subdisks_allocated; /* number of subdisks allocated space for */
+ int *sdnos; /* list of component subdisks */
+ int plexno; /* index of plex in vinum_conf */
+ int volno; /* index of volume */
+ int volplexno; /* number of plex in volume */
+ /* Statistics */
+ u_int64_t reads; /* number of reads on this plex */
+ u_int64_t writes; /* number of writes on this plex */
+ u_int64_t bytes_read; /* number of bytes read */
+ u_int64_t bytes_written; /* number of bytes written */
+ u_int64_t recovered_reads; /* number of recovered read operations */
+ u_int64_t degraded_writes; /* number of degraded writes */
+ u_int64_t parityless_writes; /* number of parityless writes */
+ u_int64_t multiblock; /* requests that needed more than one block */
+ u_int64_t multistripe; /* requests that needed more than one stripe */
+ int sddowncount; /* number of subdisks down */
+ /* Lock information */
+ int usedlocks; /* number currently in use */
+ int lockwaits; /* and number of waits for locks */
+ off_t checkblock; /* block number for parity op */
+ char name[MAXPLEXNAME]; /* name of plex */
+#ifdef _KERNEL
+ struct rangelock *lock; /* ranges of locked addresses */
+ struct mtx *lockmtx; /* lock mutex, one of plexmutex [] */
+ dev_t dev; /* associated device */
+#endif
+};
+
+/*
+ * A volume.  The structure is called struct volume when _KERNEL is
+ * defined and struct _volume otherwise.  The only member specific
+ * to _KERNEL, dev, comes last.
+ */
+#ifdef _KERNEL
+struct volume
+#else
+struct _volume
+#endif
+{
+ char name[MAXVOLNAME]; /* name of volume */
+ enum volumestate state; /* current state */
+ int plexes; /* number of plexes */
+ int preferred_plex; /* index of plex to read from,
+ * -1 for round-robin */
+ /*
+ * index of plex used for last read, for
+ * round-robin.
+ */
+ int last_plex_read;
+ int volno; /* volume number */
+ int flags; /* status and configuration flags */
+ int openflags; /* flags supplied to last open(2) */
+ u_int64_t size; /* size of volume */
+ int blocksize; /* logical block size */
+ int sectorsize; /* sector size for DIOCGSECTORSIZE */
+ int active; /* number of outstanding requests active */
+ int subops; /* and the number of suboperations */
+ /* Statistics */
+ u_int64_t bytes_read; /* number of bytes read */
+ u_int64_t bytes_written; /* number of bytes written */
+ u_int64_t reads; /* number of reads on this volume */
+ u_int64_t writes; /* number of writes on this volume */
+ u_int64_t recovered_reads; /* reads recovered from another plex */
+ /*
+ * Unlike subdisks in the plex, space for the
+ * plex indices is static: a fixed array of
+ * MAXPLEX entries.
+ */
+ int plex[MAXPLEX]; /* index of plexes */
+#ifdef _KERNEL
+ dev_t dev; /* associated device */
+#endif
+};
diff --git a/sys/dev/vinum/vinumparser.c b/sys/dev/vinum/vinumparser.c
new file mode 100644
index 0000000..2820ffd
--- /dev/null
+++ b/sys/dev/vinum/vinumparser.c
@@ -0,0 +1,234 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumparser.c,v 1.25 2003/05/07 03:33:28 grog Exp grog $
+ * $FreeBSD$
+ */
+
+/*
+ * This file contains the parser for the configuration routines. It's used
+ * both in the kernel and in the user interface program, thus the separate file.
+ */
+
+/*
+ * Go through a text and split up into text tokens. These are either non-blank
+ * sequences, or any sequence (except \0) enclosed in ' or ". Embedded ' or
+ * " characters may be escaped by \, which otherwise has no special meaning.
+ *
+ * Delimit by following with a \0, and return pointers to the starts at token [].
+ * Return the number of tokens found as the return value.
+ *
+ * This method has the restriction that a closing " or ' must be followed by
+ * grey space.
+ *
+ * Error conditions are end of line before end of quote, or no space after
+ * a closing quote. In this case, tokenize() returns -1.
+ */
+
+#include <sys/param.h>
+#include <dev/vinum/vinumkw.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <machine/setjmp.h>
+/* All this mess for a single struct definition */
+#include <sys/uio.h>
+#include <sys/namei.h>
+#include <sys/mount.h>
+
+#include <dev/vinum/vinumvar.h>
+#include <dev/vinum/vinumio.h>
+#include <dev/vinum/vinumext.h>
+#define iswhite(c) ((c == ' ') || (c == '\t')) /* check for white space */
+#else /* userland */
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#define iswhite isspace /* use the ctype macro */
+#endif
+
+/* enum keyword is defined in vinumvar.h */
+
+#define keypair(x) { #x, kw_##x } /* create pair "foo", kw_foo */
+#define flagkeypair(x) { "-"#x, kw_##x } /* create pair "-foo", kw_foo */
+#define KEYWORDSET(x) {sizeof (x) / sizeof (struct _keywords), x}
+
+/*
+ * Normal keywords. These are all the words that vinum knows.
+ *
+ * keypair(x) creates the pair { "x", kw_x }.  Each word appears only
+ * once: get_keyword() returns the first entry whose name matches, so
+ * a second entry for the same word could never be found.
+ */
+struct _keywords keywords[] =
+{keypair(drive),
+    keypair(partition),
+    keypair(sd),
+    keypair(subdisk),
+    keypair(plex),
+    keypair(volume),
+    keypair(vol),
+    keypair(setupstate),
+    keypair(readpol),
+    keypair(org),
+    keypair(name),
+    keypair(writethrough),
+    keypair(writeback),
+    keypair(device),
+    keypair(concat),
+    keypair(raid4),
+    keypair(raid5),
+    keypair(striped),
+    keypair(plexoffset),
+    keypair(driveoffset),
+    keypair(length),
+    keypair(len),
+    keypair(size),
+    keypair(state),
+    keypair(round),
+    keypair(prefer),
+    keypair(preferred),
+    keypair(rename),
+    keypair(detached),
+#ifndef _KERNEL /* for vinum(8) only */
+    keypair(debug),
+    keypair(stripe),
+    keypair(mirror),
+#endif
+    keypair(attach),
+    keypair(detach),
+    keypair(printconfig),
+    keypair(saveconfig),
+    keypair(replace),
+    keypair(create),
+    keypair(read),
+    keypair(modify),
+    keypair(list),
+    keypair(l),
+    keypair(ld),
+    keypair(ls),
+    keypair(lp),
+    keypair(lv),
+    keypair(info),
+    keypair(set),
+    keypair(rm),
+    keypair(mv),
+    keypair(move),
+    keypair(init),
+    keypair(resetconfig),
+    keypair(start),
+    keypair(stop),
+    keypair(makedev),
+    keypair(help),
+    keypair(quit),
+    keypair(setdaemon),
+    keypair(getdaemon),
+    keypair(max),
+    keypair(resetstats),
+    keypair(setstate),
+    keypair(checkparity),
+    keypair(rebuildparity),
+    keypair(dumpconfig),
+    keypair(retryerrors)
+};
+/* The table above together with its number of entries */
+struct keywordset keyword_set = KEYWORDSET(keywords);
+
+#ifndef _KERNEL
+/*
+ * Flag keywords.  flagkeypair(x) creates the pair { "-x", kw_x }, so
+ * these entries match words with a leading "-".  The table is only
+ * compiled when _KERNEL is not defined.
+ */
+struct _keywords flag_keywords[] =
+{flagkeypair(f),
+ flagkeypair(d),
+ flagkeypair(v),
+ flagkeypair(s),
+ flagkeypair(r),
+ flagkeypair(w)
+};
+/* The table above together with its number of entries */
+struct keywordset flag_set = KEYWORDSET(flag_keywords);
+
+#endif
+
+/*
+ * Take a blank separated list of tokens and turn it into a list of
+ * individual nul-delimited strings.  Build a list of pointers at
+ * token, which must have enough space for the tokens.  Return the
+ * number of tokens, or -1 on error (typically a missing string
+ * delimiter).
+ *
+ * cptr is modified in place: the character following each token is
+ * overwritten with \0.  Scanning stops at the end of the string, at
+ * a newline or at a # outside a token.
+ *
+ * A token starting with ' or " extends to the matching quote which
+ * is not preceded by \.  The quotes remain part of the token.  The
+ * closing quote must be followed by white space, otherwise we
+ * return -1.
+ *
+ * At most maxtoken tokens are stored.  If the table fills up, we
+ * return immediately after storing the pointer to the last token,
+ * without delimiting it, so that token runs to the end of the text.
+ */
+int
+tokenize(char *cptr, char *token[], int maxtoken)
+{
+    char delim; /* delimiter for searching for the partner */
+    int tokennr; /* index of this token */
+
+    for (tokennr = 0; tokennr < maxtoken;) {
+	while (iswhite(*cptr))
+	    cptr++; /* skip initial white space */
+	if ((*cptr == '\0') || (*cptr == '\n') || (*cptr == '#')) /* end of line */
+	    return tokennr; /* return number of tokens found */
+	delim = *cptr;
+	token[tokennr] = cptr; /* point to it */
+	tokennr++; /* one more */
+	if (tokennr == maxtoken) /* run off the end? */
+	    return tokennr;
+	if ((delim == '\'') || (delim == '"')) { /* delimitered */
+	    for (;;) {
+		cptr++;
+		if ((*cptr == delim) && (cptr[-1] != '\\')) { /* found the partner */
+		    cptr++; /* move on past */
+		    if (!iswhite(*cptr)) /* error, no space after closing quote */
+			return -1;
+		    *cptr++ = '\0'; /* delimit */
+		    /*
+		     * This token is complete.  Without leaving the
+		     * loop here we would carry on scanning the text
+		     * after the token as if it were still quoted.
+		     */
+		    break;
+		} else if ((*cptr == '\0') || (*cptr == '\n')) /* end of line */
+		    return -1;
+	    }
+	} else { /* not quoted */
+	    while ((*cptr != '\0') && (!iswhite(*cptr)) && (*cptr != '\n'))
+		cptr++;
+	    if (*cptr != '\0') /* not end of the line, */
+		*cptr++ = '\0'; /* delimit and move to the next */
+	}
+    }
+    return maxtoken; /* only get here if maxtoken <= 0 */
+}
+
+/*
+ * Look for name in keywordset and return the keyword value of the
+ * first entry with exactly this name.  Return kw_invalid_keyword if
+ * name is NULL or if there is no such entry.
+ */
+enum keyword
+get_keyword(char *name, struct keywordset *keywordset)
+{
+    struct _keywords *table = keywordset->k; /* the entries to search */
+    int idx = 0;
+
+    if (name == NULL) /* nothing to look for */
+	return kw_invalid_keyword;
+    while (idx < keywordset->size) {
+	if (strcmp(name, table[idx].name) == 0) /* found it */
+	    return (enum keyword) table[idx].keyword;
+	idx++;
+    }
+    return kw_invalid_keyword;
+}
diff --git a/sys/dev/vinum/vinumraid5.c b/sys/dev/vinum/vinumraid5.c
new file mode 100644
index 0000000..73b024f
--- /dev/null
+++ b/sys/dev/vinum/vinumraid5.c
@@ -0,0 +1,698 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Cybernet Corporation and Nan Yang Computer Services Limited.
+ * All rights reserved.
+ *
+ * This software was developed as part of the NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Cybernet Corporation
+ * and Nan Yang Computer Services Limited
+ * 4. Neither the name of the Companies nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumraid5.c,v 1.23 2003/02/08 03:32:45 grog Exp $
+ * $FreeBSD$
+ */
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+#include <sys/resourcevar.h>
+
+/*
+ * Parameters which describe the current transfer.
+ * These are only used for calculation, but they
+ * need to be passed to other functions, so it's
+ * tidier to put them in a struct
+ *
+ * bre5() recalculates these values for each stripe
+ * that the request touches.
+ */
+struct metrics {
+ daddr_t stripebase; /* base address of stripe (1st subdisk) */
+ int stripeoffset; /* offset in stripe */
+ int stripesectors; /* total sectors to transfer in this stripe */
+ daddr_t sdbase; /* offset in subdisk of stripe base */
+ int sdcount; /* number of disks involved in this transfer */
+ daddr_t diskstart; /* remember where this transfer starts */
+ int psdno; /* number of parity subdisk */
+ int badsdno; /* number of down subdisk, if there is one (-1 if none) */
+ int firstsdno; /* first data subdisk number */
+ /* These correspond to the fields in rqelement, sort of */
+ int useroffset; /* offset of the start in the user buffer */
+ /*
+ * Initial offset and length values for the first
+ * data block
+ *
+ * NOTE(review): initlen is a short, but the other
+ * lengths here are ints and bre5() assigns it from
+ * min(..., plex->stripesize - m.initoffset).
+ * Confirm that stripe sizes cannot exceed the range
+ * of a short.
+ */
+ int initoffset; /* start address of block to transfer */
+ short initlen; /* length in sectors of data transfer */
+ /* Define a normal operation */
+ int dataoffset; /* start address of block to transfer */
+ int datalen; /* length in sectors of data transfer */
+ /* Define a group operation */
+ int groupoffset; /* subdisk offset of group operation */
+ int grouplen; /* length in sectors of group operation */
+ /* Define a normal write operation */
+ int writeoffset; /* subdisk offset of normal write */
+ int writelen; /* length in sectors of write operation */
+ enum xferinfo flags; /* to check what we're doing */
+ int rqcount; /* number of elements in request */
+};
+
+/*
+ * Prototypes for the functions used in this file.
+ *
+ * Note that the third parameter of bre5() is called diskstart here,
+ * but diskaddr in the function definition below.
+ */
+enum requeststatus bre5(struct request *rq,
+ int plexno,
+ daddr_t * diskstart,
+ daddr_t diskend);
+void complete_raid5_write(struct rqelement *);
+enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
+void setrqebounds(struct rqelement *rqe, struct metrics *mp);
+
+/*
+ * define the low-level requests needed to perform
+ * a high-level I/O operation for a specific plex
+ * 'plexno'.
+ *
+ * Return 0 if all subdisks involved in the
+ * request are up, 1 if some subdisks are not up,
+ * and -1 if the request is at least partially
+ * outside the bounds of the subdisks.
+ *
+ * Modify the pointer *diskstart to point to the
+ * end address. On read, return on the first bad
+ * subdisk, so that the caller
+ * (build_read_request) can try alternatives.
+ *
+ * On entry to this routine, the prq structures
+ * are not assigned. The assignment is performed
+ * by expandrq(). Strictly speaking, the elements
+ * rqe->sdno of all entries should be set to -1,
+ * since 0 (from bzero) is a valid subdisk number.
+ * We avoid this problem by initializing the ones
+ * we use, and not looking at the others (index >=
+ * prq->requests).
+ */
+enum requeststatus
+bre5(struct request *rq,
+ int plexno,
+ daddr_t * diskaddr,
+ daddr_t diskend)
+{
+ struct metrics m; /* most of the information */
+ struct sd *sd;
+ struct plex *plex;
+ struct buf *bp; /* user's bp */
+ struct rqgroup *rqg; /* the request group that we will create */
+ struct rqelement *rqe; /* point to this request information */
+ int rsectors; /* sectors remaining in this stripe */
+ int mysdno; /* another sd index in loops */
+ int rqno; /* request number */
+
+ rqg = NULL; /* shut up, damn compiler */
+ m.diskstart = *diskaddr; /* start of transfer */
+ bp = rq->bp; /* buffer pointer */
+ plex = &PLEX[plexno]; /* point to the plex */
+
+
+ while (*diskaddr < diskend) { /* until we get it all sorted out */
+ if (*diskaddr >= plex->length) /* beyond the end of the plex */
+ return REQUEST_EOF; /* can't continue */
+
+ m.badsdno = -1; /* no bad subdisk yet */
+
+ /* Part A: Define the request */
+ /*
+ * First, calculate some sizes:
+ * The offset of the start address from
+ * the start of the stripe.
+ */
+ m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));
+
+ /*
+ * The plex-relative address of the
+ * start of the stripe.
+ */
+ m.stripebase = *diskaddr - m.stripeoffset;
+
+ /* subdisk containing the parity stripe */
+ if (plex->organization == plex_raid5)
+ m.psdno = plex->subdisks - 1
+ - (*diskaddr / (plex->stripesize * (plex->subdisks - 1)))
+ % plex->subdisks;
+ else /* RAID-4 */
+ m.psdno = plex->subdisks - 1;
+
+ /*
+ * The number of the subdisk in which
+ * the start is located.
+ */
+ m.firstsdno = m.stripeoffset / plex->stripesize;
+ if (m.firstsdno >= m.psdno) /* at or past parity sd */
+ m.firstsdno++; /* increment it */
+
+ /*
+ * The offset from the beginning of
+ * the stripe on this subdisk.
+ */
+ m.initoffset = m.stripeoffset % plex->stripesize;
+
+ /* The offset of the stripe start relative to this subdisk */
+ m.sdbase = m.stripebase / (plex->subdisks - 1);
+
+ m.useroffset = *diskaddr - m.diskstart; /* The offset of the start in the user buffer */
+
+ /*
+ * The number of sectors to transfer in the
+ * current (first) subdisk.
+ */
+ m.initlen = min(diskend - *diskaddr, /* the amount remaining to transfer */
+ plex->stripesize - m.initoffset); /* and the amount left in this block */
+
+ /*
+ * The number of sectors to transfer in this stripe
+ * is the minumum of the amount remaining to transfer
+ * and the amount left in this stripe.
+ */
+ m.stripesectors = min(diskend - *diskaddr,
+ plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);
+
+ /* The number of data subdisks involved in this request */
+ m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
+
+ /* Part B: decide what kind of transfer this will be.
+
+ * start and end addresses of the transfer in
+ * the current block.
+ *
+ * There are a number of different kinds of
+ * transfer, each of which relates to a
+ * specific subdisk:
+ *
+ * 1. Normal read. All participating subdisks
+ * are up, and the transfer can be made
+ * directly to the user buffer. The bounds
+ * of the transfer are described by
+ * m.dataoffset and m.datalen. We have
+ * already calculated m.initoffset and
+ * m.initlen, which define the parameters
+ * for the first data block.
+ *
+ * 2. Recovery read. One participating
+ * subdisk is down. To recover data, all
+ * the other subdisks, including the parity
+ * subdisk, must be read. The data is
+ * recovered by exclusive-oring all the
+ * other blocks. The bounds of the
+ * transfer are described by m.groupoffset
+ * and m.grouplen.
+ *
+ * 3. A read request may request reading both
+ * available data (normal read) and
+ * non-available data (recovery read).
+ * This can be a problem if the address
+ * ranges of the two reads do not coincide:
+ * in this case, the normal read needs to
+ * be extended to cover the address range
+ * of the recovery read, and must thus be
+ * performed out of malloced memory.
+ *
+ * 4. Normal write. All the participating
+ * subdisks are up. The bounds of the
+ * transfer are described by m.dataoffset
+ * and m.datalen. Since these values
+ * differ for each block, we calculate the
+ * bounds for the parity block
+ * independently as the maximum of the
+ * individual blocks and store these values
+ * in m.writeoffset and m.writelen. This
+ * write proceeds in four phases:
+ *
+ * i. Read the old contents of each block
+ * and the parity block.
+ * ii. ``Remove'' the old contents from
+ * the parity block with exclusive or.
+ * iii. ``Insert'' the new contents of the
+ * block in the parity block, again
+ * with exclusive or.
+ *
+ * iv. Write the new contents of the data
+ * blocks and the parity block. The data
+ * block transfers can be made directly from
+ * the user buffer.
+ *
+ * 5. Degraded write where the data block is
+ * not available. The bounds of the
+ * transfer are described by m.groupoffset
+ * and m.grouplen. This requires the
+ * following steps:
+ *
+ * i. Read in all the other data blocks,
+ * excluding the parity block.
+ *
+ * ii. Recreate the parity block from the
+ * other data blocks and the data to be
+ * written.
+ *
+ * iii. Write the parity block.
+ *
+ * 6. Parityless write, a write where the
+ * parity block is not available. This is
+ * in fact the simplest: just write the
+ * data blocks. This can proceed directly
+ * from the user buffer. The bounds of the
+ * transfer are described by m.dataoffset
+ * and m.datalen.
+ *
+ * 7. Combination of degraded data block write
+ * and normal write. In this case the
+ * address ranges of the reads may also
+ * need to be extended to cover all
+ * participating blocks.
+ *
+ * All requests in a group transfer transfer
+ * the same address range relative to their
+ * subdisk. The individual transfers may
+ * vary, but since our group of requests is
+ * all in a single slice, we can define a
+ * range in which they all fall.
+ *
+ * In the following code section, we determine
+ * which kind of transfer we will perform. If
+ * there is a group transfer, we also decide
+ * its bounds relative to the subdisks. At
+ * the end, we have the following values:
+ *
+ * m.flags indicates the kinds of transfers
+ * we will perform.
+ * m.initoffset indicates the offset of the
+ * beginning of any data operation relative
+ * to the beginning of the stripe base.
+ * m.initlen specifies the length of any data
+ * operation.
+ * m.dataoffset contains the same value as
+ * m.initoffset.
+ * m.datalen contains the same value as
+ * m.initlen. Initially dataoffset and
+ * datalen describe the parameters for the
+ * first data block; while building the data
+ * block requests, they are updated for each
+ * block.
+ * m.groupoffset indicates the offset of any
+ * group operation relative to the beginning
+ * of the stripe base.
+ * m.grouplen specifies the length of any
+ * group operation.
+ * m.writeoffset indicates the offset of a
+ * normal write relative to the beginning of
+ * the stripe base. This value differs from
+ * m.dataoffset in that it applies to the
+ * entire operation, and not just the first
+ * block.
+ * m.writelen specifies the total span of a
+ * normal write operation. writeoffset and
+ * writelen are used to define the parity
+ * block.
+ */
+ m.groupoffset = 0; /* assume no group... */
+ m.grouplen = 0; /* until we know we have one */
+ m.writeoffset = m.initoffset; /* start offset of transfer */
+ m.writelen = 0; /* nothing to write yet */
+ m.flags = 0; /* no flags yet */
+ rsectors = m.stripesectors; /* remaining sectors to examine */
+ m.dataoffset = m.initoffset; /* start at the beginning of the transfer */
+ m.datalen = m.initlen;
+
+ if (m.sdcount > 1) {
+ plex->multiblock++; /* more than one block for the request */
+ /*
+ * If we have two transfers that don't overlap,
+ * (one at the end of the first block, the other
+ * at the beginning of the second block),
+ * it's cheaper to split them.
+ */
+ if (rsectors < plex->stripesize) {
+ m.sdcount = 1; /* just one subdisk */
+ m.stripesectors = m.initlen; /* and just this many sectors */
+ rsectors = m.initlen; /* and in the loop counter */
+ }
+ }
+ if (SD[plex->sdnos[m.psdno]].state < sd_reborn) /* is our parity subdisk down? */
+ m.badsdno = m.psdno; /* note that it's down */
+ if (bp->b_iocmd == BIO_READ) { /* read operation */
+ for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
+ if (mysdno == m.psdno) /* ignore parity on read */
+ mysdno++;
+ if (mysdno == plex->subdisks) /* wraparound */
+ mysdno = 0;
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+
+ if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
+ if (m.badsdno >= 0) /* we had one already, */
+ return REQUEST_DOWN; /* we can't take a second */
+ m.badsdno = mysdno; /* got the first */
+ m.groupoffset = m.dataoffset; /* define the bounds */
+ m.grouplen = m.datalen;
+ m.flags |= XFR_RECOVERY_READ; /* we need recovery */
+ plex->recovered_reads++; /* count another one */
+ } else
+ m.flags |= XFR_NORMAL_READ; /* normal read */
+
+ /* Update the pointers for the next block */
+ m.dataoffset = 0; /* back to the start of the stripe */
+ rsectors -= m.datalen; /* remaining sectors to examine */
+ m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
+ }
+ } else { /* write operation */
+ for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
+ if (mysdno == m.psdno) /* parity stripe, we've dealt with that */
+ mysdno++;
+ if (mysdno == plex->subdisks) /* wraparound */
+ mysdno = 0;
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+
+ sd = &SD[plex->sdnos[mysdno]];
+ if (sd->state != sd_up) {
+ enum requeststatus s;
+
+ s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+ if (s && (m.badsdno >= 0)) { /* second bad disk, */
+ int sdno;
+ /*
+ * If the parity disk is down, there's
+ * no recovery. We make all involved
+ * subdisks stale. Otherwise, we
+ * should be able to recover, but it's
+ * like pulling teeth. Fix it later.
+ */
+ for (sdno = 0; sdno < m.sdcount; sdno++) {
+ struct sd *sd = &SD[plex->sdnos[sdno]];
+ if (sd->state >= sd_reborn) /* sort of up, */
+ set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
+ }
+ return s; /* and crap out */
+ }
+ m.badsdno = mysdno; /* note which one is bad */
+ m.flags |= XFR_DEGRADED_WRITE; /* we need recovery */
+ plex->degraded_writes++; /* count another one */
+ m.groupoffset = m.dataoffset; /* define the bounds */
+ m.grouplen = m.datalen;
+ } else {
+ m.flags |= XFR_NORMAL_WRITE; /* normal write operation */
+ if (m.writeoffset > m.dataoffset) { /* move write operation lower */
+ m.writelen = max(m.writeoffset + m.writelen,
+ m.dataoffset + m.datalen)
+ - m.dataoffset;
+ m.writeoffset = m.dataoffset;
+ } else
+ m.writelen = max(m.writeoffset + m.writelen,
+ m.dataoffset + m.datalen)
+ - m.writeoffset;
+ }
+
+ /* Update the pointers for the next block */
+ m.dataoffset = 0; /* back to the start of the stripe */
+ rsectors -= m.datalen; /* remaining sectors to examine */
+ m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
+ }
+ if (m.badsdno == m.psdno) { /* got a bad parity block, */
+ struct sd *psd = &SD[plex->sdnos[m.psdno]];
+
+ if (psd->state == sd_down)
+ set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
+ else if (psd->state == sd_crashed)
+ set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
+ m.flags &= ~XFR_NORMAL_WRITE; /* this write isn't normal, */
+ m.flags |= XFR_PARITYLESS_WRITE; /* it's parityless */
+ plex->parityless_writes++; /* count another one */
+ }
+ }
+
+ /* reset the initial transfer values */
+ m.dataoffset = m.initoffset; /* start at the beginning of the transfer */
+ m.datalen = m.initlen;
+
+ /* decide how many requests we need */
+ if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))
+ /* doing a recovery read or degraded write, */
+ m.rqcount = plex->subdisks; /* all subdisks */
+ else if (m.flags & XFR_NORMAL_WRITE) /* normal write, */
+ m.rqcount = m.sdcount + 1; /* all data blocks and the parity block */
+ else /* parityless write or normal read */
+ m.rqcount = m.sdcount; /* just the data blocks */
+
+ /* Part C: build the requests */
+ rqg = allocrqg(rq, m.rqcount); /* get a request group */
+ if (rqg == NULL) { /* malloc failed */
+ bp->b_error = ENOMEM;
+ bp->b_ioflags |= BIO_ERROR;
+ return REQUEST_ENOMEM;
+ }
+ rqg->plexno = plexno;
+ rqg->flags = m.flags;
+ rqno = 0; /* index in the request group */
+
+ /* 1: PARITY BLOCK */
+ /*
+ * Are we performing an operation which requires parity? In that case,
+ * work out the parameters and define the parity block.
+ * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
+ */
+ if (m.flags & XFR_PARITYOP) { /* need parity */
+ rqe = &rqg->rqe[rqno]; /* point to element */
+ sd = &SD[plex->sdnos[m.psdno]]; /* the subdisk in question */
+ rqe->rqg = rqg; /* point back to group */
+ rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
+ &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuf */
+ setrqebounds(rqe, &m); /* set up the bounds of the transfer */
+ rqe->sdno = sd->sdno; /* subdisk number */
+ rqe->driveno = sd->driveno;
+ if (build_rq_buffer(rqe, plex)) /* build the buffer */
+ return REQUEST_ENOMEM; /* can't do it */
+ rqe->b.b_iocmd = BIO_READ; /* we must read first */
+ m.sdcount++; /* adjust the subdisk count */
+ rqno++; /* and point to the next request */
+ }
+ /*
+ * 2: DATA BLOCKS
+ * Now build up requests for the blocks required
+ * for individual transfers
+ */
+ for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+ if (mysdno == plex->subdisks) /* got to the end, */
+ mysdno = 0; /* wrap around */
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+
+ rqe = &rqg->rqe[rqno]; /* point to element */
+ sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */
+ rqe->rqg = rqg; /* point to group */
+ if (m.flags & XFR_NEEDS_MALLOC) /* we need a malloced buffer first */
+ rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
+ else
+ rqe->flags = m.flags | XFR_DATA_BLOCK; /* transfer flags */
+ if (mysdno == m.badsdno) { /* this is the bad subdisk */
+ rqg->badsdno = rqno; /* note which one */
+ rqe->flags |= XFR_BAD_SUBDISK; /* note that it's dead */
+ /*
+ * we can't read or write from/to it,
+ * but we don't need to malloc
+ */
+ rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
+ }
+ setrqebounds(rqe, &m); /* set up the bounds of the transfer */
+ rqe->useroffset = m.useroffset; /* offset in user buffer */
+ rqe->sdno = sd->sdno; /* subdisk number */
+ rqe->driveno = sd->driveno;
+ if (build_rq_buffer(rqe, plex)) /* build the buffer */
+ return REQUEST_ENOMEM; /* can't do it */
+ if ((m.flags & XFR_PARITYOP) /* parity operation, */
+ &&((m.flags & XFR_BAD_SUBDISK) == 0)) /* and not the bad subdisk, */
+ rqe->b.b_iocmd = BIO_READ; /* we must read first */
+
+ /* Now update pointers for the next block */
+ *diskaddr += m.datalen; /* skip past what we've done */
+ m.stripesectors -= m.datalen; /* deduct from what's left */
+ m.useroffset += m.datalen; /* and move on in the user buffer */
+ m.datalen = min(m.stripesectors, plex->stripesize); /* and recalculate */
+ m.dataoffset = 0; /* start at the beginning of next block */
+ }
+
+ /*
+ * 3: REMAINING BLOCKS FOR RECOVERY
+ * Finally, if we have a recovery operation, build
+ * up transfers for the other subdisks. Follow the
+ * subdisks around until we get to where we started.
+ * These requests use only the group parameters.
+ */
+ if ((rqno < m.rqcount) /* haven't done them all already */
+ &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
+ for (; rqno < m.rqcount; rqno++, mysdno++) {
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+ if (mysdno == plex->subdisks) /* got to the end, */
+ mysdno = 0; /* wrap around */
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+
+ rqe = &rqg->rqe[rqno]; /* point to element */
+ sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */
+ rqe->rqg = rqg; /* point to group */
+
+ rqe->sdoffset = m.sdbase + m.groupoffset; /* start of transfer */
+ rqe->dataoffset = 0; /* for tidiness' sake */
+ rqe->groupoffset = 0; /* group starts at the beginining */
+ rqe->datalen = 0;
+ rqe->grouplen = m.grouplen;
+ rqe->buflen = m.grouplen;
+ rqe->flags = (m.flags | XFR_MALLOCED) /* transfer flags without data op stuf */
+ &~XFR_DATAOP;
+ rqe->sdno = sd->sdno; /* subdisk number */
+ rqe->driveno = sd->driveno;
+ if (build_rq_buffer(rqe, plex)) /* build the buffer */
+ return REQUEST_ENOMEM; /* can't do it */
+ rqe->b.b_iocmd = BIO_READ; /* we must read first */
+ }
+ }
+ /*
+ * We need to lock the address range before
+ * doing anything. We don't have to be
+ * performing a recovery operation: somebody
+ * else could be doing so, and the results could
+ * influence us. Note the fact here, we'll perform
+ * the lock in launch_requests.
+ */
+ rqg->lockbase = m.stripebase;
+ if (*diskaddr < diskend) /* didn't finish the request on this stripe */
+ plex->multistripe++; /* count another one */
+ }
+ return REQUEST_OK;
+}
+
+/*
+ * Set the transfer bounds of one request element
+ * so that its buffer is no larger than needed.
+ * This is a helper for the RAID-5 request
+ * builder (the earlier comment called it "rqe5";
+ * presumably bre5 is meant).
+ *
+ * rqe->flags must be set before the call; it
+ * selects which of the ranges in *mp apply:
+ *
+ * 1. Data range (mp->dataoffset, mp->datalen):
+ *    used when any XFR_DATAOP flag is set.
+ * 2. Group range (mp->groupoffset, mp->grouplen):
+ *    used when any XFR_GROUPOP flag is set, or
+ *    when no data operation is requested at all.
+ * 3. Write range (mp->writeoffset, mp->writelen):
+ *    replaces the data range when the element
+ *    has both XFR_NORMAL_WRITE and
+ *    XFR_PARITY_BLOCK set, i.e. it is the parity
+ *    block of a normal write.  The group range
+ *    is added if XFR_DEGRADED_WRITE is also set.
+ *
+ * On return the following fields of *rqe are set:
+ * sdoffset (mp->sdbase plus the lower of the
+ * applicable start offsets), dataoffset and
+ * groupoffset (relative to sdoffset), datalen,
+ * grouplen, and buflen (the larger of the two
+ * range ends).
+ */
+void
+setrqebounds(struct rqelement *rqe, struct metrics *mp)
+{
+    const int parity_of_normal_write =
+        (rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK))
+        == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK);
+
+    if (parity_of_normal_write) {                           /* case 3: write range */
+        rqe->datalen = mp->writelen;
+        if ((rqe->flags & XFR_DEGRADED_WRITE) == 0) {       /* write range only */
+            rqe->grouplen = 0;
+            rqe->groupoffset = 0;
+            rqe->dataoffset = 0;
+            rqe->sdoffset = mp->sdbase + mp->writeoffset;
+        } else {
+            /*
+             * Combined normal and degraded write.  The
+             * area of the degraded write is zeroed in
+             * the second phase, so strictly it need not
+             * be read.  The buffer size is currently
+             * conveyed to build_rq_buffer via the read
+             * length, so everything is read anyway.
+             * FIXME XXX
+             */
+            rqe->grouplen = mp->grouplen;
+            if (mp->groupoffset >= mp->writeoffset) {       /* write range starts first */
+                rqe->dataoffset = 0;
+                rqe->groupoffset = mp->groupoffset - mp->writeoffset;
+                rqe->sdoffset = mp->sdbase + mp->writeoffset;
+            } else {                                        /* group range starts first */
+                rqe->groupoffset = 0;
+                rqe->dataoffset = mp->writeoffset - mp->groupoffset;
+                rqe->sdoffset = mp->sdbase + mp->groupoffset;
+            }
+        }
+    } else if ((rqe->flags & XFR_DATAOP) == 0) {            /* case 2: group range only */
+        rqe->datalen = 0;
+        rqe->dataoffset = 0;
+        rqe->grouplen = mp->grouplen;
+        rqe->groupoffset = 0;
+        rqe->sdoffset = mp->sdbase + mp->groupoffset;
+    } else {                                                /* data operation */
+        rqe->datalen = mp->datalen;
+        if ((rqe->flags & XFR_GROUPOP) == 0) {              /* case 1: data range only */
+            rqe->grouplen = 0;
+            rqe->groupoffset = 0;
+            rqe->dataoffset = 0;
+            rqe->sdoffset = mp->sdbase + mp->dataoffset;
+        } else {                                            /* data and group ranges */
+            rqe->grouplen = mp->grouplen;
+            if (mp->groupoffset >= mp->dataoffset) {        /* data range starts first */
+                rqe->dataoffset = 0;
+                rqe->groupoffset = mp->groupoffset - mp->dataoffset;
+                rqe->sdoffset = mp->sdbase + mp->dataoffset;
+            } else {                                        /* group range starts first */
+                rqe->groupoffset = 0;
+                rqe->dataoffset = mp->dataoffset - mp->groupoffset;
+                rqe->sdoffset = mp->sdbase + mp->groupoffset;
+            }
+        }
+    }
+
+    /* The buffer must reach the end of whichever range ends later. */
+    rqe->buflen = max(rqe->dataoffset + rqe->datalen,
+        rqe->groupoffset + rqe->grouplen);
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumrequest.c b/sys/dev/vinum/vinumrequest.c
new file mode 100644
index 0000000..f74fc89
--- /dev/null
+++ b/sys/dev/vinum/vinumrequest.c
@@ -0,0 +1,1112 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumrequest.c,v 1.36 2003/05/08 04:34:55 grog Exp grog $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+#include <sys/resourcevar.h>
+
+enum requeststatus bre(struct request *rq,
+ int plexno,
+ daddr_t * diskstart,
+ daddr_t diskend);
+enum requeststatus bre5(struct request *rq,
+ int plexno,
+ daddr_t * diskstart,
+ daddr_t diskend);
+enum requeststatus build_read_request(struct request *rq, int volplexno);
+enum requeststatus build_write_request(struct request *rq);
+enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
+int find_alternate_sd(struct request *rq);
+int check_range_covered(struct request *);
+void complete_rqe(struct buf *bp);
+void complete_raid5_write(struct rqelement *);
+int abortrequest(struct request *rq, int error);
+void sdio_done(struct buf *bp);
+int vinum_bounds_check(struct buf *bp, struct volume *vol);
+caddr_t allocdatabuf(struct rqelement *rqe);
+void freedatabuf(struct rqelement *rqe);
+
+#ifdef VINUMDEBUG
+struct rqinfo rqinfo[RQINFO_SIZE];
+struct rqinfo *rqip = rqinfo;
+
+/*
+ * Record one event in the circular log rqinfo[].
+ *
+ * type selects which member of info is valid and
+ * how much is copied into the log entry; ubp is
+ * stored as the entry's user buffer pointer.  The
+ * entry is filled in and the global write pointer
+ * rqip advanced (wrapping at RQINFO_SIZE) between
+ * splhigh() and splx().
+ */
+void
+logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)
+{
+    struct rqinfo *entry;
+    int oldspl;
+
+    oldspl = splhigh();
+    entry = rqip;                                           /* slot we are about to fill */
+
+    microtime(&entry->timestamp);                           /* when did this happen? */
+    entry->type = type;
+    entry->bp = ubp;                                        /* user buffer */
+
+    switch (type) {
+    /* events described by a struct buf */
+    case loginfo_user_bp:
+    case loginfo_user_bpl:
+    case loginfo_sdio:                                      /* subdisk I/O */
+    case loginfo_sdiol:                                     /* subdisk I/O launch */
+    case loginfo_sdiodone:                                  /* subdisk I/O complete */
+        bcopy(info.bp, &entry->info.b, sizeof(struct buf));
+        entry->devmajor = major(info.bp->b_dev);
+        entry->devminor = minor(info.bp->b_dev);
+        break;
+
+    /* events described by a request element */
+    case loginfo_iodone:
+    case loginfo_rqe:
+    case loginfo_raid5_data:
+    case loginfo_raid5_parity:
+        bcopy(info.rqe, &entry->info.rqe, sizeof(struct rqelement));
+        entry->devmajor = major(info.rqe->b.b_dev);
+        entry->devminor = minor(info.rqe->b.b_dev);
+        break;
+
+    /* events described by a range lock */
+    case loginfo_lockwait:
+    case loginfo_lock:
+    case loginfo_unlock:
+        bcopy(info.lockinfo, &entry->info.lockinfo, sizeof(struct rangelock));
+        break;
+
+    case loginfo_unused:
+        break;
+    }
+
+    /* Advance the write pointer, wrapping at the end of the array. */
+    if (entry + 1 >= &rqinfo[RQINFO_SIZE])
+        rqip = rqinfo;
+    else
+        rqip = entry + 1;
+    splx(oldspl);
+}
+
+#endif
+
+/*
+ * Strategy entry point: dispatch an I/O request
+ * according to the type of vinum device that
+ * bp->b_dev refers to.
+ *
+ * Subdisk requests go to sdio().  Volume requests
+ * are rejected with EIO unless the volume is up,
+ * and are dropped if vinum_bounds_check() returns
+ * <= 0; otherwise they are started in the same
+ * way as plex requests, via vinumstart().  Any
+ * other device type fails with EIO.
+ */
+void
+vinumstrategy(struct bio *biop)
+{
+    struct buf *bp = (struct buf *) biop;
+    struct volume *volume;
+
+    switch (DEVTYPE(bp->b_dev)) {
+    case VINUM_SD_TYPE:
+    case VINUM_SD2_TYPE:
+        sdio(bp);
+        break;
+
+    case VINUM_VOLUME_TYPE:                                 /* volume I/O */
+        volume = &VOL[Volno(bp->b_dev)];
+        if (volume->state != volume_up) {                   /* can't access this volume */
+            bp->b_error = EIO;                              /* I/O error */
+            bp->b_io.bio_flags |= BIO_ERROR;
+            bufdone(bp);
+            break;
+        }
+        if (vinum_bounds_check(bp, volume) <= 0) {          /* don't like them bounds */
+            bufdone(bp);
+            break;
+        }
+        /* FALLTHROUGH */
+        /*
+         * From here on, volume I/O is started exactly
+         * like plex I/O; vinumstart() looks at the
+         * device type again to tell them apart.
+         */
+    case VINUM_PLEX_TYPE:
+        bp->b_resid = bp->b_bcount;                         /* transfer everything */
+        vinumstart(bp, 0);
+        break;
+
+    default:                                                /* not a device we can do I/O on */
+        bp->b_error = EIO;                                  /* I/O error */
+        bp->b_io.bio_flags |= BIO_ERROR;
+        bufdone(bp);
+        break;
+    }
+}
+
+/*
+ * Start a transfer for the user buffer bp.
+ *
+ * Builds a struct request describing the
+ * low-level transfers needed and hands it to
+ * launch_requests().
+ *
+ * Parameter reviveok is passed through to
+ * launch_requests().  It is set when doing
+ * transfers for revives: it allows transfers to
+ * be started immediately when a revive is in
+ * progress.  During revive, normal transfers are
+ * queued if they share address space with a
+ * currently active revive operation.
+ *
+ * Returns -1 if the request could not be built
+ * (bad length, out of memory, or the request
+ * builder reported a status worse than
+ * REQUEST_RECOVERED); in that case bp has been
+ * completed with bufdone().  Otherwise returns
+ * the value of launch_requests().
+ */
+int
+vinumstart(struct buf *bp, int reviveok)
+{
+    int plexno;
+    struct volume *vol;
+    struct request *rq;                                     /* build up our request here */
+    enum requeststatus status;
+
+#ifdef VINUMDEBUG
+    if (debug & DEBUG_LASTREQS)
+        logrq(loginfo_user_bp, (union rqinfou) bp, bp);
+#endif
+
+    if ((bp->b_bcount % DEV_BSIZE) != 0) {                  /* bad length */
+        bp->b_error = EINVAL;                               /* invalid size */
+        bp->b_io.bio_flags |= BIO_ERROR;
+        bufdone(bp);
+        return -1;
+    }
+    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
+    if (rq == NULL) {                                       /* can't do it */
+        bp->b_error = ENOMEM;                               /* can't get memory */
+        bp->b_io.bio_flags |= BIO_ERROR;
+        bufdone(bp);
+        return -1;
+    }
+    bzero(rq, sizeof(struct request));
+
+    rq->bp = bp;                                            /* note the user buffer struct */
+
+    /*
+     * Note whether this is volume or plex I/O.  For
+     * plex I/O, vol stays NULL, which the code below
+     * uses as the indication for single plex I/O.
+     */
+    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) {          /* it's a volume, */
+        rq->volplex.volno = Volno(bp->b_dev);               /* get the volume number */
+        vol = &VOL[rq->volplex.volno];                      /* and point to it */
+        vol->active++;                                      /* one more active request */
+    } else {
+        vol = NULL;                                         /* no volume */
+        rq->volplex.plexno = Plexno(bp->b_dev);             /* point to the plex */
+        rq->isplex = 1;                                     /* note that it's a plex */
+    }
+
+    if (bp->b_iocmd == BIO_READ) {
+        /*
+         * This is a read request.  Decide
+         * which plex to read from.
+         *
+         * There's a potential race condition here,
+         * since we're not locked, and we could end
+         * up multiply incrementing the round-robin
+         * counter.  This doesn't have any serious
+         * effects, however.
+         */
+        if (vol != NULL) {
+            plexno = vol->preferred_plex;                   /* get the plex to use */
+            if (plexno < 0) {                               /* round robin */
+                plexno = vol->last_plex_read;
+                vol->last_plex_read++;
+                if (vol->last_plex_read >= vol->plexes)     /* got to the end? */
+                    vol->last_plex_read = 0;                /* wrap around */
+            }
+            status = build_read_request(rq, plexno);        /* build a request */
+        } else {
+            daddr_t diskaddr = bp->b_blkno;                 /* start offset of transfer */
+            status = bre(rq,                                /* build a request list */
+                rq->volplex.plexno,
+                &diskaddr,
+                diskaddr + (bp->b_bcount / DEV_BSIZE));
+        }
+
+        if (status > REQUEST_RECOVERED) {                   /* can't satisfy it */
+            if (status == REQUEST_DOWN) {                   /* not enough subdisks */
+                bp->b_error = EIO;                          /* I/O error */
+                bp->b_io.bio_flags |= BIO_ERROR;
+            }
+            bufdone(bp);
+            freerq(rq);
+            return -1;
+        }
+        return launch_requests(rq, reviveok);               /* now start the requests if we can */
+    } else {
+        /*
+         * This is a write operation.  We write to all plexes.  If this is
+         * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
+         */
+        if (vol != NULL) {
+            /*
+             * NOTE(review): the test below is "> 0" although the
+             * comment says "multiple plex"; as written it is true for
+             * any volume with at least one plex.  Left unchanged here;
+             * confirm whether "> 1" was intended.
+             */
+            if ((vol->plexes > 0)                           /* multiple plex */
+                ||(isparity((&PLEX[vol->plex[0]])))) {      /* or RAID-[45], */
+                caddr_t copybuf;
+
+                /*
+                 * Work on a private copy of the data.  Allocate it
+                 * before touching bp->b_data, so that on failure the
+                 * user buffer is still intact and we never bcopy()
+                 * into a NULL pointer.
+                 */
+                copybuf = Malloc(bp->b_bcount);
+                if (copybuf == NULL) {                      /* can't get memory */
+                    bp->b_error = ENOMEM;
+                    bp->b_io.bio_flags |= BIO_ERROR;
+                    bufdone(bp);
+                    freerq(rq);
+                    return -1;
+                }
+                rq->save_data = bp->b_data;                 /* save the data buffer address */
+                bp->b_data = copybuf;
+                bcopy(rq->save_data, bp->b_data, bp->b_bcount); /* make a copy */
+                rq->flags |= XFR_COPYBUF;                   /* and note that we did it */
+            }
+            status = build_write_request(rq);
+        } else {                                            /* plex I/O */
+            daddr_t diskstart;
+
+            diskstart = bp->b_blkno;                        /* start offset of transfer */
+            status = bre(rq,
+                Plexno(bp->b_dev),
+                &diskstart,
+                bp->b_blkno + (bp->b_bcount / DEV_BSIZE));  /* build requests for the plex */
+        }
+        if (status > REQUEST_RECOVERED) {                   /* can't satisfy it */
+            if (status == REQUEST_DOWN) {                   /* not enough subdisks */
+                bp->b_error = EIO;                          /* I/O error */
+                bp->b_io.bio_flags |= BIO_ERROR;
+            }
+            if (rq->flags & XFR_COPYBUF) {                  /* give the user buffer back */
+                Free(bp->b_data);
+                bp->b_data = rq->save_data;
+            }
+            bufdone(bp);
+            freerq(rq);
+            return -1;
+        }
+        return launch_requests(rq, reviveok);               /* now start the requests if we can */
+    }
+}
+
+/*
+ * Call the low-level strategy routines to
+ * perform the requests in a struct request.
+ *
+ * rq is the request whose chain of request
+ * groups (rq->rqg) is to be issued. reviveok,
+ * if non-zero, lets the request start even if
+ * it is flagged XFR_REVIVECONFLICT; otherwise
+ * such a request is queued on the waitlist of
+ * subdisk rq->sdno and not started here.
+ *
+ * Returns 0 when the requests have been issued
+ * or queued. With VINUMDEBUG, returns -1 after
+ * aborting the request if rq->rqg is NULL.
+ */
+int
+launch_requests(struct request *rq, int reviveok)
+{
+ struct rqgroup *rqg;
+ int rqno; /* loop index */
+ struct rqelement *rqe; /* current element */
+ struct drive *drive;
+ int rcount; /* request count */
+
+ /*
+ * First find out whether we're reviving, and
+ * the request contains a conflict. If so, we
+ * append the request to the waitlist of the
+ * subdisk SD[rq->sdno] instead of starting it.
+ */
+
+ if ((rq->flags & XFR_REVIVECONFLICT) /* possible revive conflict */
+ &&(!reviveok)) { /* and we don't want to do it now, */
+ struct sd *sd;
+ struct request *waitlist; /* point to the waitlist */
+
+ sd = &SD[rq->sdno];
+ if (sd->waitlist != NULL) { /* something there already, */
+ waitlist = sd->waitlist;
+ while (waitlist->next != NULL) /* find the end */
+ waitlist = waitlist->next;
+ waitlist->next = rq; /* hook our request there */
+ } else
+ sd->waitlist = rq; /* hook our request at the front */
+
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_REVIVECONFLICT)
+ log(LOG_DEBUG,
+ "Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n",
+ rq->sdno,
+ rq,
+ rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
+ major(rq->bp->b_dev),
+ minor(rq->bp->b_dev),
+ (intmax_t) rq->bp->b_blkno,
+ rq->bp->b_bcount);
+#endif
+ return 0; /* and get out of here */
+ }
+ rq->active = 0; /* nothing yet */
+#ifdef VINUMDEBUG
+ /* XXX This is probably due to a bug */
+ if (rq->rqg == NULL) { /* no request */
+ log(LOG_ERR, "vinum: null rqg\n");
+ abortrequest(rq, EINVAL);
+ return -1;
+ }
+#endif
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_ADDRESSES)
+ log(LOG_DEBUG,
+ "Request: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n",
+ rq,
+ rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
+ major(rq->bp->b_dev),
+ minor(rq->bp->b_dev),
+ (intmax_t) rq->bp->b_blkno,
+ rq->bp->b_bcount);
+ /* remember the most recent request and user buffer (debug builds only) */
+ vinum_conf.lastrq = rq;
+ vinum_conf.lastbuf = rq->bp;
+ if (debug & DEBUG_LASTREQS)
+ logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
+#endif
+
+ /*
+ * We used to have an splbio() here anyway, out
+ * of superstition. With the division of labour
+ * below (first count the requests, then issue
+ * them), it looks as if we don't need this
+ * splbio() protection. In fact, as dillon
+ * points out, there's a race condition
+ * incrementing and decrementing rq->active and
+ * rqg->active. This splbio() didn't help
+ * there, because the device strategy routine
+ * can sleep. Solve this by putting shorter
+ * duration locks on the code.
+ */
+ /*
+ * Pass 1: count. For each group, the active
+ * count is the number of elements not flagged
+ * XFR_BAD_SUBDISK; rq->active counts the groups
+ * with at least one such element.
+ *
+ * This loop happens without any participation
+ * of the bottom half, so it requires no
+ * protection.
+ */
+ for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) { /* through the whole request chain */
+ rqg->active = rqg->count; /* they're all active */
+ for (rqno = 0; rqno < rqg->count; rqno++) {
+ rqe = &rqg->rqe[rqno];
+ if (rqe->flags & XFR_BAD_SUBDISK) /* this subdisk is bad, */
+ rqg->active--; /* one less active request */
+ }
+ if (rqg->active) /* we have at least one active request, */
+ rq->active++; /* one more active request group */
+ }
+
+ /*
+ * Pass 2: issue. Now fire off the requests.
+ * In this loop the bottom half could be
+ * completing requests before we finish. We
+ * avoid splbio() protection by ensuring we
+ * don't tread in the same places that the
+ * bottom half does.
+ *
+ * Note that the outer for has no increment
+ * expression: rqg is advanced inside the inner
+ * loop, just before the last element of the
+ * group is handed to the driver.
+ */
+ for (rqg = rq->rqg; rqg != NULL;) { /* through the whole request chain */
+ if (rqg->lockbase >= 0) /* this rqg needs a lock first */
+ rqg->lock = lockrange(rqg->lockbase, rqg->rq->bp, &PLEX[rqg->plexno]);
+ rcount = rqg->count; /* local copy: rqg itself changes below */
+ for (rqno = 0; rqno < rcount;) {
+ rqe = &rqg->rqe[rqno];
+
+ /*
+ * Point to next rqg before the bottom half
+ * changes the structures.
+ */
+ if (++rqno >= rcount)
+ rqg = rqg->next;
+ if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* this subdisk is good, */
+ drive = &DRIVE[rqe->driveno]; /* look at drive */
+ /* per-drive and global active counts, with high-water marks */
+ drive->active++;
+ if (drive->active >= drive->maxactive)
+ drive->maxactive = drive->active;
+ vinum_conf.active++;
+ if (vinum_conf.active >= vinum_conf.maxactive)
+ vinum_conf.maxactive = vinum_conf.active;
+
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_ADDRESSES)
+ log(LOG_DEBUG,
+ " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%jx, length %ld\n",
+ rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
+ major(rqe->b.b_dev),
+ minor(rqe->b.b_dev),
+ rqe->sdno,
+ (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
+ (intmax_t) rqe->b.b_blkno,
+ rqe->b.b_bcount);
+ if (debug & DEBUG_LASTREQS) {
+ microtime(&rqe->launchtime); /* time we launched this request */
+ logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);
+ }
+#endif
+ /* fire off the request */
+ DEV_STRATEGY(&rqe->b);
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * define the low-level requests needed to perform a
+ * high-level I/O operation for a specific plex 'plexno'.
+ *
+ * Return REQUEST_OK if all subdisks involved in the request are up,
+ * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the
+ * request is at least partially outside the bounds of the subdisks.
+ *
+ * Modify the pointer *diskstart to point to the end address. On
+ * read, return on the first bad subdisk, so that the caller
+ * (build_read_request) can try alternatives.
+ *
+ * On entry to this routine, the rqg structures are not assigned. The
+ * assignment is performed by expandrq(). Strictly speaking, the
+ * elements rqe->sdno of all entries should be set to -1, since 0
+ * (from bzero) is a valid subdisk number. We avoid this problem by
+ * initializing the ones we use, and not looking at the others (index
+ * >= rqg->requests).
+ */
+enum requeststatus
+bre(struct request *rq,
+ int plexno,
+ daddr_t * diskaddr,
+ daddr_t diskend)
+{
+ int sdno;
+ struct sd *sd;
+ struct rqgroup *rqg;
+ struct buf *bp; /* user's bp */
+ struct plex *plex;
+ enum requeststatus status; /* return value */
+ daddr_t plexoffset; /* offset of transfer in plex */
+ daddr_t stripebase; /* base address of stripe (1st subdisk) */
+ daddr_t stripeoffset; /* offset in stripe */
+ daddr_t blockoffset; /* offset in stripe on subdisk */
+ struct rqelement *rqe; /* point to this request information */
+ daddr_t diskstart = *diskaddr; /* remember where this transfer starts */
+ enum requeststatus s; /* temp return value */
+
+ bp = rq->bp; /* buffer pointer */
+ status = REQUEST_OK; /* return value: OK until proven otherwise */
+ plex = &PLEX[plexno]; /* point to the plex */
+
+ switch (plex->organization) {
+ case plex_concat:
+ sd = NULL; /* (keep compiler quiet) */
+ for (sdno = 0; sdno < plex->subdisks; sdno++) {
+ sd = &SD[plex->sdnos[sdno]];
+ if (*diskaddr < sd->plexoffset) /* we must have a hole, */
+ status = REQUEST_DEGRADED; /* note the fact */
+ if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */
+ rqg = allocrqg(rq, 1); /* space for the request */
+ if (rqg == NULL) { /* malloc failed */
+ bp->b_error = ENOMEM;
+ bp->b_io.bio_flags |= BIO_ERROR;
+ return REQUEST_ENOMEM;
+ }
+ rqg->plexno = plexno;
+
+ rqe = &rqg->rqe[0]; /* point to the element */
+ rqe->rqg = rqg; /* group */
+ rqe->sdno = sd->sdno; /* put in the subdisk number */
+ plexoffset = *diskaddr; /* start offset in plex */
+ rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */
+ rqe->useroffset = plexoffset - diskstart; /* start offset in user buffer */
+ rqe->dataoffset = 0;
+ rqe->datalen = min(diskend - *diskaddr, /* number of sectors to transfer in this sd */
+ sd->sectors - rqe->sdoffset);
+ rqe->groupoffset = 0; /* no groups for concatenated plexes */
+ rqe->grouplen = 0;
+ rqe->buflen = rqe->datalen; /* buffer length is data buffer length */
+ rqe->flags = 0;
+ rqe->driveno = sd->driveno;
+ if (sd->state != sd_up) { /* *now* we find the sd is down */
+ s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+ if (s == REQUEST_DOWN) { /* down? */
+ rqe->flags = XFR_BAD_SUBDISK; /* yup */
+ if (rq->bp->b_iocmd == BIO_READ) /* read request, */
+ return REQUEST_DEGRADED; /* give up here */
+ /*
+ * If we're writing, don't give up
+ * because of a bad subdisk. Go
+ * through to the bitter end, but note
+ * which ones we can't access.
+ */
+ status = REQUEST_DEGRADED; /* can't do it all */
+ }
+ }
+ *diskaddr += rqe->datalen; /* bump the address */
+ if (build_rq_buffer(rqe, plex)) { /* build the buffer */
+ deallocrqg(rqg);
+ bp->b_error = ENOMEM;
+ bp->b_io.bio_flags |= BIO_ERROR;
+ return REQUEST_ENOMEM; /* can't do it */
+ }
+ }
+ if (*diskaddr == diskend) /* we're finished, */
+ break; /* get out of here */
+ }
+ /*
+ * We've got to the end of the plex. Have we got to the end of
+ * the transfer? It would seem that having an offset beyond the
+ * end of the subdisk is an error, but in fact it can happen if
+ * the volume has another plex of different size. There's a valid
+ * question as to why you would want to do this, but currently
+ * it's allowed.
+ *
+ * In a previous version, I returned REQUEST_DOWN here. I think
+ * REQUEST_EOF is more appropriate now.
+ */
+ if (diskend > sd->sectors + sd->plexoffset) /* pointing beyond EOF? */
+ status = REQUEST_EOF;
+ break;
+
+ case plex_striped:
+ {
+ while (*diskaddr < diskend) { /* until we get it all sorted out */
+ if (*diskaddr >= plex->length) /* beyond the end of the plex */
+ return REQUEST_EOF; /* can't continue */
+
+ /* The offset of the start address from the start of the stripe. */
+ stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks);
+
+ /* The plex-relative address of the start of the stripe. */
+ stripebase = *diskaddr - stripeoffset;
+
+ /* The number of the subdisk in which the start is located. */
+ sdno = stripeoffset / plex->stripesize;
+
+ /* The offset from the beginning of the stripe on this subdisk. */
+ blockoffset = stripeoffset % plex->stripesize;
+
+ sd = &SD[plex->sdnos[sdno]]; /* the subdisk in question */
+ rqg = allocrqg(rq, 1); /* space for the request */
+ if (rqg == NULL) { /* malloc failed */
+ bp->b_error = ENOMEM;
+ bp->b_io.bio_flags |= BIO_ERROR;
+ return REQUEST_ENOMEM;
+ }
+ rqg->plexno = plexno;
+
+ rqe = &rqg->rqe[0]; /* point to the element */
+ rqe->rqg = rqg;
+ rqe->sdoffset = stripebase / plex->subdisks + blockoffset; /* start offset in this subdisk */
+ rqe->useroffset = *diskaddr - diskstart; /* The offset of the start in the user buffer */
+ rqe->dataoffset = 0;
+ rqe->datalen = min(diskend - *diskaddr, /* the amount remaining to transfer */
+ plex->stripesize - blockoffset); /* and the amount left in this stripe */
+ rqe->groupoffset = 0; /* no groups for striped plexes */
+ rqe->grouplen = 0;
+ rqe->buflen = rqe->datalen; /* buffer length is data buffer length */
+ rqe->flags = 0;
+ rqe->sdno = sd->sdno; /* put in the subdisk number */
+ rqe->driveno = sd->driveno;
+
+ if (sd->state != sd_up) { /* *now* we find the sd is down */
+ s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+ if (s == REQUEST_DOWN) { /* down? */
+ rqe->flags = XFR_BAD_SUBDISK; /* yup */
+ if (rq->bp->b_iocmd == BIO_READ) /* read request, */
+ return REQUEST_DEGRADED; /* give up here */
+ /*
+ * If we're writing, don't give up
+ * because of a bad subdisk. Go through
+ * to the bitter end, but note which
+ * ones we can't access.
+ */
+ status = REQUEST_DEGRADED; /* can't do it all */
+ }
+ }
+ /*
+ * It would seem that having an offset
+ * beyond the end of the subdisk is an
+ * error, but in fact it can happen if the
+ * volume has another plex of different
+ * size. There's a valid question as to why
+ * you would want to do this, but currently
+ * it's allowed.
+ */
+ if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */
+ rqe->datalen = sd->sectors - rqe->sdoffset; /* truncate */
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_EOFINFO) { /* tell on the request */
+ log(LOG_DEBUG,
+ "vinum: EOF on plex %s, sd %s offset %x (user offset 0x%jx)\n",
+ plex->name,
+ sd->name,
+ (u_int) sd->sectors,
+ (intmax_t) bp->b_blkno);
+ log(LOG_DEBUG,
+ "vinum: stripebase %#jx, stripeoffset %#jx, blockoffset %#jx\n",
+ (intmax_t) stripebase,
+ (intmax_t) stripeoffset,
+ (intmax_t) blockoffset);
+ }
+#endif
+ }
+ if (build_rq_buffer(rqe, plex)) { /* build the buffer */
+ deallocrqg(rqg);
+ bp->b_error = ENOMEM;
+ bp->b_io.bio_flags |= BIO_ERROR;
+ return REQUEST_ENOMEM; /* can't do it */
+ }
+ *diskaddr += rqe->datalen; /* look at the remainder */
+ if ((*diskaddr < diskend) /* didn't finish the request on this stripe */
+ &&(*diskaddr < plex->length)) { /* and there's more to come */
+ plex->multiblock++; /* count another one */
+ if (sdno == plex->subdisks - 1) /* last subdisk, */
+ plex->multistripe++; /* another stripe as well */
+ }
+ }
+ }
+ break;
+
+ /*
+ * RAID-4 and RAID-5 are complicated enough to have their own
+ * function.
+ */
+ case plex_raid4:
+ case plex_raid5:
+ status = bre5(rq, plexno, diskaddr, diskend);
+ break;
+
+ default:
+ log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization);
+ status = REQUEST_DOWN; /* can't access it */
+ }
+
+ return status;
+}
+
+/*
+ * Build up a request structure for reading volumes.
+ * This function is not needed for plex reads, since there's
+ * no recovery if a plex read can't be satisfied.
+ *
+ * rq is the request being built and plexindex is the index, in
+ * the volume's plex table, of the plex to read from by
+ * preference.  The transfer described by rq->bp is handed to
+ * bre() piece by piece; when bre() can't satisfy a piece from
+ * the preferred plex, the other plexes of the volume are tried
+ * for that piece.
+ *
+ * Returns REQUEST_ENOMEM as soon as bre() reports it for the
+ * preferred plex, REQUEST_DOWN if there is no other plex to
+ * fall back on, the failing status if no plex could advance
+ * the transfer, and otherwise the status of the last piece
+ * built.
+ */
+enum requeststatus
+build_read_request(struct request *rq,			    /* request */
+    int plexindex)
+{							    /* index in the volume's plex table */
+    struct buf *bp;
+    daddr_t startaddr;					    /* offset of previous part of transfer */
+    daddr_t diskaddr;					    /* offset of current part of transfer */
+    daddr_t diskend;					    /* and end offset of transfer */
+    int plexno;						    /* plex index in vinum_conf */
+    struct volume *vol;					    /* volume in question */
+    int recovered = 0;					    /* set if we recover a read */
+    enum requeststatus status = REQUEST_OK;
+    int plexmask;					    /* bit mask of plexes, for recovery */
+
+    bp = rq->bp;					    /* buffer pointer */
+    diskaddr = bp->b_blkno;				    /* start offset of transfer */
+    diskend = diskaddr + (bp->b_bcount / DEV_BSIZE);	    /* and end offset of transfer */
+    vol = &VOL[rq->volplex.volno];			    /* point to volume */
+
+    while (diskaddr < diskend) {			    /* build up request components */
+        startaddr = diskaddr;
+        status = bre(rq, vol->plex[plexindex], &diskaddr, diskend); /* build up a request */
+        switch (status) {
+        case REQUEST_OK:
+            continue;
+
+        case REQUEST_RECOVERED:
+            /*
+             * XXX FIXME if we have more than one plex, and we can
+             * satisfy the request from another, don't use the
+             * recovered request, since it's more expensive.
+             */
+            recovered = 1;
+            break;
+
+        case REQUEST_ENOMEM:
+            return status;
+            /*
+             * If we get here, our request is not complete.  Try
+             * to fill in the missing parts from another plex.
+             * This can happen multiple times in this function,
+             * and we reinitialize the plex mask each time, since
+             * we could have a hole in our plexes.
+             */
+        case REQUEST_EOF:
+        case REQUEST_DOWN:				    /* can't access the plex */
+        case REQUEST_DEGRADED:				    /* can't access the plex */
+            plexmask = ((1 << vol->plexes) - 1)		    /* all plexes in the volume */
+                &~(1 << plexindex);			    /* except for the one we were looking at */
+            for (plexno = 0; plexno < vol->plexes; plexno++) {
+                if (plexmask == 0)			    /* no plexes left to try */
+                    return REQUEST_DOWN;		    /* failed */
+                diskaddr = startaddr;			    /* start at the beginning again */
+                if (plexmask & (1 << plexno)) {		    /* we haven't tried this plex yet */
+                    bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */
+                    if (diskaddr > startaddr) {		    /* we satisfied another part */
+                        recovered = 1;			    /* we recovered from the problem */
+                        status = REQUEST_OK;		    /* don't complain about it */
+                        break;
+                    }
+                }
+            }
+            if (diskaddr == startaddr)			    /* didn't get any further, */
+                return status;
+        }
+        if (recovered)
+            vol->recovered_reads += recovered;		    /* adjust our recovery count */
+    }
+    return status;
+}
+
+/*
+ * Build up a request structure for writes.
+ *
+ * bre() is called once for every plex of the volume, each time
+ * for the complete transfer described by the user buffer.  The
+ * value returned is the lowest status that bre() reported for
+ * any plex (we're happy if we can write at all), or
+ * REQUEST_DOWN if the volume has no plexes.
+ */
+enum requeststatus
+build_write_request(struct request *rq)
+{							    /* request */
+    struct buf *ubp = rq->bp;				    /* user buffer header */
+    struct volume *vol = &VOL[rq->volplex.volno];	    /* volume being written */
+    daddr_t endaddr = ubp->b_blkno + (ubp->b_bcount / DEV_BSIZE); /* end offset of transfer */
+    enum requeststatus best = REQUEST_DOWN;		    /* assume the worst */
+    enum requeststatus plexstatus;			    /* result for the current plex */
+    daddr_t addr;					    /* running offset, advanced by bre */
+    int i;						    /* index in the volume's plex table */
+
+    for (i = 0; i < vol->plexes; i++) {
+        addr = ubp->b_blkno;				    /* every plex starts at the beginning */
+        plexstatus = bre(rq, vol->plex[i], &addr, endaddr);
+        best = min(best, plexstatus);			    /* keep the best result (min, not max) */
+    }
+    return best;
+}
+
+/*
+ * Fill in the struct buf part of a request element.
+ *
+ * The embedded buffer header rqe->b is set up from the user
+ * buffer header of the owning request: flags and command are
+ * copied, the buffer is locked (XFR_BUFLOCKED is set in
+ * rqe->flags), and the completion routine is complete_rqe.
+ * The data pointer is either freshly malloced (XFR_MALLOCED)
+ * or points into the user buffer at rqe->useroffset sectors.
+ *
+ * Returns REQUEST_ENOMEM, after calling abortrequest(), if the
+ * data buffer can't be allocated; otherwise returns 0.  The
+ * plex parameter is not referenced in the body.
+ */
+enum requeststatus
+build_rq_buffer(struct rqelement *rqe, struct plex *plex)
+{
+ struct sd *sd; /* point to subdisk */
+ struct volume *vol;
+ struct buf *bp;
+ struct buf *ubp; /* user (high level) buffer header */
+
+ vol = &VOL[rqe->rqg->rq->volplex.volno]; /* NOTE(review): vol is not referenced again in this function */
+ sd = &SD[rqe->sdno]; /* point to subdisk */
+ bp = &rqe->b;
+ ubp = rqe->rqg->rq->bp; /* pointer to user buffer header */
+
+ /* Initialize the buf struct */
+ /* copy these flags from user bp */
+ bp->b_flags = ubp->b_flags & (B_NOCACHE | B_ASYNC);
+ bp->b_io.bio_flags = 0;
+ bp->b_iocmd = ubp->b_iocmd;
+#ifdef VINUMDEBUG
+ if (rqe->flags & XFR_BUFLOCKED) /* paranoia */
+ panic("build_rq_buffer: rqe already locked"); /* XXX remove this when we're sure */
+#endif
+ BUF_LOCKINIT(bp); /* get a lock for the buffer */
+ BUF_LOCK(bp, LK_EXCLUSIVE, NULL); /* and lock it */
+ BUF_KERNPROC(bp);
+ rqe->flags |= XFR_BUFLOCKED;
+ bp->b_iodone = complete_rqe;
+ /*
+ * You'd think that we wouldn't need to even
+ * build the request buffer for a dead subdisk,
+ * but in some cases we need information like
+ * the user buffer address. Err on the side of
+ * generosity and supply what we can. That
+ * obviously doesn't include drive information
+ * when the drive is dead.
+ */
+ if ((rqe->flags & XFR_BAD_SUBDISK) == 0) /* subdisk is accessible, */
+ bp->b_dev = DRIVE[rqe->driveno].dev; /* drive device */
+ bp->b_blkno = rqe->sdoffset + sd->driveoffset; /* start address */
+ bp->b_bcount = rqe->buflen << DEV_BSHIFT; /* number of bytes to transfer */
+ bp->b_resid = bp->b_bcount; /* and it's still all waiting */
+ bp->b_bufsize = bp->b_bcount; /* and buffer size */
+ bp->b_rcred = FSCRED; /* we have the file system credentials */
+ bp->b_wcred = FSCRED; /* we have the file system credentials */
+
+ if (rqe->flags & XFR_MALLOCED) { /* this operation requires a malloced buffer */
+ bp->b_data = Malloc(bp->b_bcount); /* get a buffer to put it in */
+ if (bp->b_data == NULL) { /* failed */
+ abortrequest(rqe->rqg->rq, ENOMEM); /* NOTE(review): this calls freerq() on the request while the buffer is still marked XFR_BUFLOCKED; confirm freerq copes */
+ return REQUEST_ENOMEM; /* no memory */
+ }
+ } else
+ /*
+ * Point directly to user buffer data. This means
+ * that we don't need to do anything when we have
+ * finished the transfer
+ */
+ bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;
+ /*
+ * On a recovery read, we perform an XOR of
+ * all blocks to the user buffer. To make
+ * this work, we first clean out the buffer
+ */
+ if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))
+ == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) { /* bad subdisk of a recovery read */
+ int length = rqe->grouplen << DEV_BSHIFT; /* and count involved */
+ char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */
+
+ bzero(data, length); /* clean it out */
+ }
+ return 0; /* success: bre treats any non-zero return as failure */
+}
+
+/*
+ * Abort a request.  The error code is stored in the user
+ * buffer header, the request is handed to freerq(), and the
+ * user buffer is flagged with BIO_ERROR.  The error code is
+ * passed back so that callers can return it directly.
+ */
+int
+abortrequest(struct request *rq, int error)
+{
+    struct buf *ubp;					    /* user buffer header */
+
+    ubp = rq->bp;					    /* pick this up before rq goes away */
+    ubp->b_error = error;
+    freerq(rq);						    /* free everything we're doing */
+    ubp->b_io.bio_flags |= BIO_ERROR;
+    return error;					    /* and give up */
+}
+
+/*
+ * Check that our transfer will cover the
+ * complete address space of the user request.
+ *
+ * Return 1 if it can, otherwise 0
+ *
+ * NOTE: the body is currently a stub.  It returns 1
+ * unconditionally and never examines rq.
+ */
+int
+check_range_covered(struct request *rq)
+{
+ return 1;
+}
+
+/*
+ * Perform I/O on a subdisk
+ *
+ * bp is a buffer header addressed to a subdisk device; the
+ * subdisk is found from Sdno(bp->b_dev).  The request is
+ * completed immediately with bufdone() and EIO if the drive is
+ * not up or the subdisk state is below sd_empty, and with
+ * ENOMEM if no struct sdbuf can be allocated.
+ *
+ * Otherwise a private struct sdbuf is built which describes
+ * the same transfer on the underlying drive (block number
+ * offset by sd->driveoffset), trimmed so that it doesn't run
+ * beyond the end of the subdisk, and passed to DEV_STRATEGY.
+ * Completion is reported to sdio_done.
+ */
+void
+sdio(struct buf *bp)
+{
+ int s; /* spl */
+ struct sd *sd;
+ struct sdbuf *sbp;
+ daddr_t endoffset;
+ struct drive *drive;
+
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_LASTREQS)
+ logrq(loginfo_sdio, (union rqinfou) bp, bp);
+#endif
+ sd = &SD[Sdno(bp->b_dev)]; /* point to the subdisk */
+ drive = &DRIVE[sd->driveno];
+
+ if (drive->state != drive_up) { /* can't reach the drive: fail the request */
+ if (sd->state >= sd_crashed) {
+ if (bp->b_iocmd == BIO_WRITE) /* writing, */
+ set_sd_state(sd->sdno, sd_stale, setstate_force);
+ else
+ set_sd_state(sd->sdno, sd_crashed, setstate_force);
+ }
+ bp->b_error = EIO;
+ bp->b_io.bio_flags |= BIO_ERROR;
+ bufdone(bp);
+ return;
+ }
+ /*
+ * We allow access to any kind of subdisk as long as we can expect
+ * to get the I/O performed.
+ */
+ if (sd->state < sd_empty) { /* nothing to talk to, */
+ bp->b_error = EIO;
+ bp->b_io.bio_flags |= BIO_ERROR;
+ bufdone(bp);
+ return;
+ }
+ /* Get a buffer */
+ sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
+ if (sbp == NULL) {
+ bp->b_error = ENOMEM;
+ bp->b_io.bio_flags |= BIO_ERROR;
+ bufdone(bp);
+ return;
+ }
+ bzero(sbp, sizeof(struct sdbuf)); /* start with nothing */
+ sbp->b.b_flags = bp->b_flags;
+ sbp->b.b_iocmd = bp->b_iocmd;
+ sbp->b.b_bufsize = bp->b_bcount; /* buffer size */
+ sbp->b.b_bcount = bp->b_bcount; /* number of bytes to transfer */
+ sbp->b.b_resid = bp->b_resid; /* and amount waiting */
+ sbp->b.b_dev = DRIVE[sd->driveno].dev; /* device */
+ sbp->b.b_data = bp->b_data; /* data buffer */
+ sbp->b.b_blkno = bp->b_blkno + sd->driveoffset; /* subdisk-relative address converted to drive-relative */
+ sbp->b.b_iodone = sdio_done; /* come here on completion */
+ BUF_LOCKINIT(&sbp->b); /* get a lock for the buffer */
+ BUF_LOCK(&sbp->b, LK_EXCLUSIVE, NULL); /* and lock it */
+ BUF_KERNPROC(&sbp->b);
+ sbp->bp = bp; /* note the address of the original header */
+ sbp->sdno = sd->sdno; /* note for statistics */
+ sbp->driveno = sd->driveno;
+ endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE; /* final sector offset */
+ if (endoffset > sd->sectors) { /* beyond the end */
+ sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
+ if (sbp->b.b_bcount <= 0) { /* nothing to transfer */
+ bp->b_resid = bp->b_bcount; /* nothing transferred */
+ bufdone(bp); /* completed without setting an error */
+ BUF_UNLOCK(&sbp->b);
+ BUF_LOCKFREE(&sbp->b);
+ Free(sbp);
+ return;
+ }
+ }
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_ADDRESSES)
+ log(LOG_DEBUG,
+ " %s dev %d.%d, sd %d, offset 0x%jx, devoffset 0x%jx, length %ld\n",
+ sbp->b.b_iocmd == BIO_READ ? "Read" : "Write",
+ major(sbp->b.b_dev),
+ minor(sbp->b.b_dev),
+ sbp->sdno,
+ (intmax_t) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
+ (intmax_t) sbp->b.b_blkno,
+ sbp->b.b_bcount);
+#endif
+ s = splbio();
+#ifdef VINUMDEBUG
+ if (debug & DEBUG_LASTREQS)
+ logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
+#endif
+ DEV_STRATEGY(&sbp->b); /* hand the private buffer to the drive */
+ splx(s);
+}
+
+/*
+ * Simplified version of bounds_check_with_label
+ * Determine the size of the transfer, and make sure it is
+ * within the boundaries of the partition.  Adjust transfer
+ * if needed, and signal errors or early completion.
+ *
+ * Volumes are simpler than disk slices: they only contain
+ * one component (though we call them a, b and c to make
+ * system utilities happy), and they always take up the
+ * complete space of the "partition".
+ *
+ * I'm still not happy with this: why should the label be
+ * protected?  If it weren't so damned difficult to write
+ * one in the first place (because it's protected), it wouldn't
+ * be a problem.
+ *
+ * Returns 1 if the transfer should go ahead (bp->b_bcount may
+ * have been truncated to fit, and bp->b_pblkno is set), 0 if
+ * there is nothing to transfer (zero length, or a start
+ * exactly at the end of the volume), and -1 with bp->b_error
+ * and BIO_ERROR set if the request is refused (EROFS for a
+ * protected label write, EINVAL for a start beyond the end).
+ */
+int
+vinum_bounds_check(struct buf *bp, struct volume *vol)
+{
+    int maxsize = vol->size;				    /* size of the partition (sectors) */
+    int size = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* size of this request (sectors) */
+
+#ifdef LABELSECTOR
+    /* Would this transfer overwrite the disk label? */
+    if (bp->b_blkno <= LABELSECTOR			    /* starts before or at the label */
+#if LABELSECTOR != 0
+        && bp->b_blkno + size > LABELSECTOR		    /* and finishes after */
+#endif
+        && (bp->b_iocmd == BIO_WRITE)			    /* and it's a write */
+        /*
+         * The negation must apply to the masked flags, not to
+         * vol->flags itself: "!vol->flags & mask" tests the
+         * wrong thing because ! binds tighter than &.
+         */
+        && !(vol->flags & (VF_WLABEL | VF_LABELLING))) {    /* and we're not allowed to write the label */
+        bp->b_error = EROFS;				    /* read-only */
+        bp->b_io.bio_flags |= BIO_ERROR;
+        return -1;
+    }
+#endif
+    if (size == 0)					    /* no transfer specified, */
+        return 0;					    /* treat as EOF */
+    /* beyond partition? */
+    if (bp->b_blkno < 0					    /* negative start */
+        || bp->b_blkno + size > maxsize) {		    /* or goes beyond the end of the partition */
+        /* if exactly at end of disk, return an EOF */
+        if (bp->b_blkno == maxsize) {
+            bp->b_resid = bp->b_bcount;
+            return 0;
+        }
+        /*
+         * or truncate if part of it fits
+         *
+         * NOTE(review): a negative b_blkno also gets here and
+         * produces a positive size, so it is truncated rather
+         * than rejected; confirm whether that is intended.
+         */
+        size = maxsize - bp->b_blkno;
+        if (size <= 0) {				    /* nothing to transfer */
+            bp->b_error = EINVAL;
+            bp->b_io.bio_flags |= BIO_ERROR;
+            return -1;
+        }
+        bp->b_bcount = size << DEV_BSHIFT;
+    }
+    bp->b_pblkno = bp->b_blkno;
+    return 1;
+}
+
+/*
+ * Allocate a request group with room for the given number
+ * of request elements and append it to the chain of groups
+ * belonging to rq.  The new group is zeroed, points back to
+ * rq, records its element count and has no lock base yet.
+ *
+ * Returns the new group, or NULL if the allocation failed, in
+ * which case rq is left untouched.
+ */
+struct rqgroup *
+allocrqg(struct request *rq, int elements)
+{
+    int nbytes = sizeof(struct rqgroup) + elements * sizeof(struct rqelement);
+    struct rqgroup *newrqg = (struct rqgroup *) Malloc(nbytes);
+
+    if (newrqg == NULL)					    /* malloc failed */
+        return NULL;
+
+    /* Start from a clean slate, then fill in the bookkeeping. */
+    bzero(newrqg, nbytes);				    /* no old junk */
+    newrqg->rq = rq;					    /* point back to the parent request */
+    newrqg->count = elements;				    /* number of requests in the group */
+    newrqg->lockbase = -1;				    /* no lock required yet */
+
+    /* Hook it in at the end of the request's chain. */
+    if (rq->rqg == NULL)				    /* first request */
+        rq->rqg = newrqg;				    /* at the start */
+    else						    /* we already have requests */
+        rq->lrqg->next = newrqg;			    /* hang it off the end */
+    rq->lrqg = newrqg;					    /* this one is the last in the list */
+    return newrqg;
+}
+
+/*
+ * Deallocate a request group out of a chain.  We do
+ * this by linear search: the chain is short and this
+ * almost never happens.
+ *
+ * Any range lock held by the group is released first.  The
+ * group is then unlinked from its request's chain and freed.
+ * If the group was the last one in the chain, the request's
+ * tail pointer (lrqg) is moved back as well, since allocrqg
+ * appends through that pointer.  A group which can't be found
+ * in the chain is logged and freed anyway.
+ */
+void
+deallocrqg(struct rqgroup *rqg)
+{
+    struct request *rq = rqg->rq;			    /* the request we belong to */
+    struct rqgroup *prev;				    /* predecessor in the chain */
+
+    if (rqg->lock)					    /* got a lock? */
+        unlockrange(rqg->plexno, rqg->lock);		    /* yes, free it */
+    if (rq->rqg == rqg) {				    /* we're first in line */
+        rq->rqg = rqg->next;				    /* unhook ourselves */
+        if (rq->lrqg == rqg)				    /* we were the only one, */
+            rq->lrqg = NULL;				    /* so there's no tail any more */
+    } else {
+        /* Find the group whose successor we are. */
+        for (prev = rq->rqg;
+            (prev != NULL) && (prev->next != rqg);
+            prev = prev->next);
+        if (prev == NULL)				    /* empty chain, or we're not in it */
+            log(LOG_ERR,
+                "vinum deallocrqg: rqg %p not found in request %p\n",
+                rqg,
+                rq);
+        else {
+            prev->next = rqg->next;			    /* make the chain jump over us */
+            if (rq->lrqg == rqg)			    /* we were the tail, */
+                rq->lrqg = prev;			    /* now our predecessor is */
+        }
+    }
+    Free(rqg);
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumrevive.c b/sys/dev/vinum/vinumrevive.c
new file mode 100644
index 0000000..03e16f9
--- /dev/null
+++ b/sys/dev/vinum/vinumrevive.c
@@ -0,0 +1,622 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumrevive.c,v 1.18 2003/04/28 02:54:43 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+/*
+ * Revive a block of a subdisk. Return an error
+ * indication. EAGAIN means successful copy, but
+ * that more blocks remain to be copied. EINVAL
+ * means that the subdisk isn't associated with a
+ * plex (which means a programming error if we get
+ * here at all; FIXME).
+ *
+ * Other return values: ENOMEM if no buffer could
+ * be obtained, the b_error of the buffer if the
+ * read or the write failed, and 0 when the last
+ * block has been written and the subdisk has
+ * been set to sd_up.
+ *
+ * The block to revive starts at sd->revived
+ * (sectors from the start of the subdisk).  A
+ * data block is read through the volume (or the
+ * plex, if the plex isn't attached to a volume);
+ * a parity block is recalculated by
+ * parityrebuild.  The result is then written to
+ * the subdisk with sdio, and any requests on
+ * sd->waitlist are launched.
+ */
+
+int
+revive_block(int sdno)
+{
+ int s; /* priority level */
+ struct sd *sd;
+ struct plex *plex;
+ struct volume *vol;
+ struct buf *bp;
+ int error = EAGAIN;
+ int size; /* size of revive block, bytes */
+ daddr_t plexblkno; /* lblkno in plex */
+ int psd; /* parity subdisk number */
+ u_int64_t stripe; /* stripe number */
+ int paritysd = 0; /* set if this is the parity stripe */
+ struct rangelock *lock; /* for locking */
+ daddr_t stripeoffset; /* offset in stripe */
+
+ plexblkno = 0; /* to keep the compiler happy */
+ sd = &SD[sdno];
+ lock = NULL;
+ if (sd->plexno < 0) /* no plex? */
+ return EINVAL;
+ plex = &PLEX[sd->plexno]; /* point to plex */
+ if (plex->volno >= 0)
+ vol = &VOL[plex->volno];
+ else
+ vol = NULL;
+
+ if ((sd->revive_blocksize == 0) /* no block size */
+ ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1))) /* or invalid block size */
+ sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
+ else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE)
+ sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE;
+ size = min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT; /* don't go beyond the end of the subdisk */
+ sd->reviver = curproc->p_pid; /* note who last had a bash at it */
+
+ /* Now decide where to read from */
+ switch (plex->organization) {
+ case plex_concat:
+ plexblkno = sd->revived + sd->plexoffset; /* corresponding address in plex */
+ break;
+
+ case plex_striped:
+ stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
+ if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize) /* would run into the next stripe, */
+ size = (plex->stripesize - stripeoffset) << DEV_BSHIFT; /* so stop at the end of this one */
+ plexblkno = sd->plexoffset /* base */
+ + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */
+ + stripeoffset; /* offset from beginning of stripe */
+ break;
+
+ case plex_raid4:
+ case plex_raid5:
+ stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
+ plexblkno = sd->plexoffset /* base */
+ + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
+ +stripeoffset; /* offset from beginning of stripe */
+ stripe = (sd->revived / plex->stripesize); /* stripe number */
+
+ /* Make sure we don't go beyond the end of the band. */
+ size = min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT);
+ if (plex->organization == plex_raid4)
+ psd = plex->subdisks - 1; /* parity subdisk for this stripe */
+ else
+ psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
+ paritysd = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */
+
+ /*
+ * Now adjust for the strangenesses
+ * in RAID-4 and RAID-5 striping.
+ */
+ if (sd->plexsdno > psd) /* beyond the parity stripe, */
+ plexblkno -= plex->stripesize; /* one stripe less */
+ else if (paritysd)
+ plexblkno -= plex->stripesize * sd->plexsdno; /* go back to the beginning of the band */
+ break;
+
+ case plex_disorg: /* to keep the compiler happy */
+ break; /* to keep the pedants happy */
+ }
+
+ if (paritysd) { /* we're reviving a parity block, */
+ bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */
+ if (bp == NULL) /* no buffer space */
+ return ENOMEM; /* chicken out */
+ } else { /* data block */
+ s = splbio();
+ bp = geteblk(size); /* Get a buffer */
+ splx(s);
+ if (bp == NULL)
+ return ENOMEM;
+
+ /*
+ * Amount to transfer: block size, unless it
+ * would overlap the end.
+ */
+ bp->b_bcount = size;
+ bp->b_resid = bp->b_bcount;
+ bp->b_blkno = plexblkno; /* start here */
+ if (isstriped(plex)) /* we need to lock striped plexes */
+ lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */
+ if (vol != NULL) /* it's part of a volume, */
+ /*
+ * First, read the data from the volume. We
+ * don't care which plex, that's bre's job.
+ */
+ bp->b_dev = VINUM_VOL(plex->volno); /* create the device number */
+ else /* it's an unattached plex */
+ bp->b_dev = VINUM_PLEX(sd->plexno); /* create the device number */
+
+ bp->b_iocmd = BIO_READ; /* either way, read it */
+ bp->b_flags = 0;
+ vinumstart(bp, 1);
+ bufwait(bp);
+ }
+
+ if (bp->b_ioflags & BIO_ERROR) { /* couldn't get the data: give up on this block */
+ error = bp->b_error;
+ if (lock) /* we took a lock, */
+ unlockrange(sd->plexno, lock); /* give it back */
+ } else
+ /* Now write to the subdisk */
+ {
+ bp->b_dev = VINUM_SD(sdno); /* create the device number */
+ bp->b_flags &= ~B_DONE; /* no longer done */
+ bp->b_ioflags = 0;
+ bp->b_iocmd = BIO_WRITE;
+ bp->b_resid = bp->b_bcount;
+ bp->b_blkno = sd->revived; /* write it to here */
+ sdio(bp); /* perform the I/O */
+ bufwait(bp);
+ if (bp->b_ioflags & BIO_ERROR)
+ error = bp->b_error;
+ else {
+ sd->revived += bp->b_bcount >> DEV_BSHIFT; /* moved this much further down */
+ if (sd->revived >= sd->sectors) { /* finished */
+ sd->revived = 0;
+ set_sd_state(sdno, sd_up, setstate_force); /* bring the sd up */
+ log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
+ save_config(); /* and save the updated configuration */
+ error = 0; /* we're done */
+ }
+ }
+ if (lock) /* we took a lock, */
+ unlockrange(sd->plexno, lock); /* give it back */
+ while (sd->waitlist) { /* we have waiting requests */
+#ifdef VINUMDEBUG
+ struct request *rq = sd->waitlist;
+
+ if (debug & DEBUG_REVIVECONFLICT)
+ log(LOG_DEBUG,
+ "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n",
+ rq->sdno,
+ rq,
+ rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
+ major(rq->bp->b_dev),
+ minor(rq->bp->b_dev),
+ (intmax_t)rq->bp->b_blkno,
+ rq->bp->b_bcount);
+#endif
+ launch_requests(sd->waitlist, 1); /* do them now */
+ sd->waitlist = sd->waitlist->next; /* and move on to the next. NOTE(review): this reads the request after it has been launched; confirm it can't have been freed by then */
+ }
+ }
+ if (bp->b_qindex == 0) { /* not on a queue, */
+ bp->b_flags |= B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ brelse(bp); /* is this kosher? */
+ }
+ return error;
+}
+
+/*
+ * Check or rebuild the parity blocks of a RAID-4
+ * or RAID-5 plex.
+ *
+ * The variables plex->checkblock and
+ * plex->rebuildblock represent the
+ * subdisk-relative address of the stripe we're
+ * looking at, not the plex-relative address. We
+ * store it in the plex and not as a local
+ * variable because this function could be
+ * stopped, and we don't want to repeat the part
+ * we've already done. This is also the reason
+ * why we don't initialize it here except at the
+ * end. It gets initialized with the plex on
+ * creation.
+ *
+ * Each call to this function processes at most
+ * one stripe. We can't loop in this function,
+ * because we're unstoppable, so we have to be
+ * called repeatedly from userland.
+ *
+ * On entry, data->index is the plex number,
+ * data->op the operation to perform and
+ * data->offset the address to start at.  The
+ * result is returned in the same storage,
+ * interpreted as a struct _ioctl_reply:
+ * reply->error is EAGAIN if the call should be
+ * repeated, 0 when the end of the first subdisk
+ * has been reached, EINVAL if the plex is not
+ * RAID-4 or RAID-5, EIO if the plex state is
+ * below plex_flaky or if checkparity found a
+ * mismatch, ENOMEM if parityrebuild returned no
+ * buffer, or the b_error of the parity buffer.
+ */
+void
+parityops(struct vinum_ioctl_msg *data)
+{
+ int plexno;
+ struct plex *plex;
+ int size; /* I/O transfer size, bytes */
+ int stripe; /* stripe number in plex */
+ int psd; /* parity subdisk number */
+ struct rangelock *lock; /* lock on stripe */
+ struct _ioctl_reply *reply;
+ off_t pstripe; /* pointer to our stripe counter */
+ struct buf *pbp;
+ off_t errorloc; /* offset of parity error */
+ enum parityop op; /* operation to perform */
+
+ plexno = data->index;
+ op = data->op;
+ pbp = NULL;
+ reply = (struct _ioctl_reply *) data; /* the reply overlays the request message */
+ reply->error = EAGAIN; /* expect to repeat this call */
+ plex = &PLEX[plexno];
+ if (!isparity(plex)) { /* not RAID-4 or RAID-5 */
+ reply->error = EINVAL;
+ return;
+ } else if (plex->state < plex_flaky) {
+ reply->error = EIO;
+ strcpy(reply->msg, "Plex is not completely accessible\n");
+ return;
+ }
+ pstripe = data->offset;
+ stripe = pstripe / plex->stripesize; /* stripe number */
+ psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe. NOTE(review): psd is not used again in this function */
+ size = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
+ plex->stripesize << DEV_BSHIFT);
+
+ pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */
+ if (pbp == NULL) { /* no buffer space */
+ reply->error = ENOMEM;
+ return; /* chicken out */
+ }
+ /*
+ * Now we have a result in the data buffer of
+ * the parity buffer header, which we have kept.
+ * Decide what to do with it.
+ */
+ reply->msg[0] = '\0'; /* until shown otherwise */
+ if ((pbp->b_ioflags & BIO_ERROR) == 0) { /* no error */
+ if ((op == rebuildparity)
+ || (op == rebuildandcheckparity)) {
+ pbp->b_iocmd = BIO_WRITE;
+ pbp->b_resid = pbp->b_bcount;
+ sdio(pbp); /* write the parity block */
+ bufwait(pbp); /* NOTE(review): unlike revive_block, B_DONE is not cleared before this write; confirm bufwait really waits */
+ }
+ if (((op == checkparity)
+ || (op == rebuildandcheckparity))
+ && (errorloc != -1)) {
+ if (op == checkparity)
+ reply->error = EIO;
+ sprintf(reply->msg,
+ "Parity incorrect at offset 0x%jx\n",
+ (intmax_t)errorloc);
+ }
+ if (reply->error == EAGAIN) { /* still OK, */
+ plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT); /* moved this much further down */
+ if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */
+ plex->checkblock = 0;
+ reply->error = 0;
+ }
+ }
+ }
+ if (pbp->b_ioflags & BIO_ERROR) /* an I/O error overrides whatever we decided above */
+ reply->error = pbp->b_error;
+ pbp->b_flags |= B_INVAL;
+ pbp->b_ioflags &= ~BIO_ERROR;
+ brelse(pbp);
+ unlockrange(plexno, lock); /* release the lock handed back by parityrebuild */
+}
+
+/*
+ * Rebuild a parity stripe. Return pointer to
+ * parity bp. On return,
+ *
+ * 1. The band is locked. The caller must unlock
+ * the band and release the buffer header.
+ *
+ * 2. All buffer headers except pbp have been
+ * released. The caller must release pbp.
+ *
+ * 3. For checkparity and rebuildandcheckparity,
+ * the parity is compared with the current
+ * parity block. If it's different, the
+ * offset of the error is returned to
+ * errorloc. The caller can set the value of
+ * the pointer to NULL if this is called for
+ * rebuilding parity.
+ *
+ * pstripe is the subdisk-relative base address of
+ * the data to be reconstructed, size is the size
+ * of the transfer in bytes.
+ */
+struct buf *
+parityrebuild(struct plex *plex,
+ u_int64_t pstripe,
+ int size,
+ enum parityop op,
+ struct rangelock **lockp,
+ off_t * errorloc)
+{
+ int error;
+ int s;
+ int sdno;
+ u_int64_t stripe; /* stripe number */
+ int *parity_buf; /* buffer address for current parity block */
+ int *newparity_buf; /* and for new parity block */
+ int mysize; /* I/O transfer size for this transfer */
+ int isize; /* mysize in ints */
+ int i;
+ int psd; /* parity subdisk number */
+ int newpsd; /* and "subdisk number" of new parity */
+ struct buf **bpp; /* pointers to our bps */
+ struct buf *pbp; /* buffer header for parity stripe */
+ int *sbuf;
+ int bufcount; /* number of buffers we need */
+
+ stripe = pstripe / plex->stripesize; /* stripe number */
+ psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
+ parity_buf = NULL; /* to keep the compiler happy */
+ error = 0;
+
+ /*
+ * It's possible that the default transfer size
+ * we chose is not a factor of the stripe size.
+ * We *must* limit this operation to a single
+ * stripe, at least for RAID-5 rebuild, since
+ * the parity subdisk changes between stripes,
+ * so in this case we need to perform a short
+ * transfer. Set variable mysize to reflect
+ * this.
+ */
+ mysize = min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT);
+ isize = mysize / (sizeof(int)); /* number of ints in the buffer */
+ bufcount = plex->subdisks + 1; /* sd buffers plus result buffer */
+ newpsd = plex->subdisks;
+ bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */
+
+ /* First, build requests for all subdisks */
+ for (sdno = 0; sdno < bufcount; sdno++) { /* for each subdisk */
+ if ((sdno != psd) || (op != rebuildparity)) {
+ /* Get a buffer header and initialize it. */
+ s = splbio();
+ bpp[sdno] = geteblk(mysize); /* Get a buffer */
+ if (bpp[sdno] == NULL) {
+ while (sdno-- > 0) { /* release the ones we got */
+ bpp[sdno]->b_flags |= B_INVAL;
+ brelse(bpp[sdno]); /* give back our resources */
+ }
+ splx(s);
+ printf("vinum: can't allocate buffer space for parity op.\n");
+ return NULL; /* no bpps */
+ }
+ splx(s);
+ if (sdno == psd)
+ parity_buf = (int *) bpp[sdno]->b_data;
+ if (sdno == newpsd) /* the new one? */
+ bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[psd]); /* write back to the parity SD */
+ else
+ bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[sdno]); /* device number */
+ bpp[sdno]->b_iocmd = BIO_READ; /* either way, read it */
+ bpp[sdno]->b_flags = 0;
+ bpp[sdno]->b_bcount = mysize;
+ bpp[sdno]->b_resid = bpp[sdno]->b_bcount;
+ bpp[sdno]->b_blkno = pstripe; /* transfer from here */
+ }
+ }
+
+ /* Initialize result buffer */
+ pbp = bpp[newpsd];
+ newparity_buf = (int *) bpp[newpsd]->b_data;
+ bzero(newparity_buf, mysize);
+
+ /*
+ * Now lock the stripe with the first non-parity
+ * bp as locking bp.
+ */
+ *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1),
+ bpp[psd ? 0 : 1],
+ plex);
+
+ /*
+ * Then issue requests for all subdisks in
+ * parallel. Don't transfer the parity stripe
+ * if we're rebuilding parity, unless we also
+ * want to check it.
+ */
+ for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each real subdisk */
+ if ((sdno != psd) || (op != rebuildparity)) {
+ sdio(bpp[sdno]);
+ }
+ }
+
+ /*
+ * Next, wait for the requests to complete.
+ * We wait in the order in which they were
+ * issued, which isn't necessarily the order in
+ * which they complete, but we don't have a
+ * convenient way of doing the latter, and the
+ * delay is minimal.
+ */
+ for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */
+ if ((sdno != psd) || (op != rebuildparity)) {
+ bufwait(bpp[sdno]);
+ if (bpp[sdno]->b_ioflags & BIO_ERROR) /* can't read, */
+ error = bpp[sdno]->b_error;
+ else if (sdno != psd) { /* update parity */
+ sbuf = (int *) bpp[sdno]->b_data;
+ for (i = 0; i < isize; i++)
+ ((int *) newparity_buf)[i] ^= sbuf[i]; /* xor in the buffer */
+ }
+ }
+ if (sdno != psd) { /* release all bps except parity */
+ bpp[sdno]->b_flags |= B_INVAL;
+ brelse(bpp[sdno]); /* give back our resources */
+ }
+ }
+
+ /*
+ * If we're checking, compare the calculated
+ * and the read parity block. If they're
+ * different, return the plex-relative offset;
+ * otherwise return -1.
+ */
+ if ((op == checkparity)
+ || (op == rebuildandcheckparity)) {
+ *errorloc = -1; /* no error yet */
+ for (i = 0; i < isize; i++) {
+ if (parity_buf[i] != newparity_buf[i]) {
+ *errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1)
+ + i * sizeof(int);
+ break;
+ }
+ }
+ bpp[psd]->b_flags |= B_INVAL;
+ brelse(bpp[psd]); /* give back our resources */
+ }
+ /* release our resources */
+ Free(bpp);
+ if (error) {
+ pbp->b_ioflags |= BIO_ERROR;
+ pbp->b_error = error;
+ }
+ return pbp;
+}
+
+/*
+ * Initialize a subdisk by writing zeroes to the
+ * complete address space. If verify is set,
+ * check each transfer for correctness.
+ *
+ * Each call to this function writes (and maybe
+ * checks) a single block, starting at sector
+ * sd->initialized.
+ *
+ * Return values:
+ *
+ * EINVAL: the subdisk is not attached to a plex.
+ * ENOMEM: geteblk returned no buffer.
+ * EAGAIN: the block was written, but more of the
+ * subdisk remains: call again.
+ * 0: the last block was written. The subdisk has
+ * been set to sd_initialized and the
+ * configuration has been saved.
+ * Otherwise the b_error value of a failed transfer.
+ */
+int
+initsd(int sdno, int verify)
+{
+ int s; /* priority level */
+ struct sd *sd;
+ struct plex *plex;
+ struct volume *vol;
+ struct buf *bp;
+ int error;
+ int size; /* size of init block, bytes */
+ daddr_t plexblkno; /* lblkno in plex */
+ int verified; /* set when we're happy with what we wrote */
+
+ error = 0;
+ plexblkno = 0; /* to keep the compiler happy */
+ sd = &SD[sdno];
+ if (sd->plexno < 0) /* no plex? */
+ return EINVAL;
+ plex = &PLEX[sd->plexno]; /* point to plex */
+ if (plex->volno >= 0)
+ vol = &VOL[plex->volno]; /* NOTE(review): vol is assigned here but not used again in this function */
+ else
+ vol = NULL;
+
+ if (sd->init_blocksize == 0) { /* no block size chosen yet, pick a default */
+ if (plex->stripesize != 0) /* we're striped, don't init more than */
+ sd->init_blocksize = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
+ plex->stripesize << DEV_BSHIFT);
+ else
+ sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
+ } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE) /* clamp a preset size to the maximum */
+ sd->init_blocksize = MAX_REVIVE_BLOCKSIZE;
+
+ size = min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT; /* bytes for this call: one init block, or what is left of the subdisk */
+
+ verified = 0;
+ while (!verified) { /* until we're happy with it, */
+ s = splbio();
+ bp = geteblk(size); /* Get a buffer */
+ splx(s);
+ if (bp == NULL)
+ return ENOMEM;
+
+ bp->b_bcount = size;
+ bp->b_resid = bp->b_bcount;
+ bp->b_blkno = sd->initialized; /* write it to here */
+ bzero(bp->b_data, bp->b_bcount);
+ bp->b_dev = VINUM_SD(sdno); /* create the device number */
+ bp->b_iocmd = BIO_WRITE;
+ sdio(bp); /* perform the I/O */
+ bufwait(bp);
+ if (bp->b_ioflags & BIO_ERROR)
+ error = bp->b_error;
+ if (bp->b_qindex == 0) { /* not on a queue, */
+ bp->b_flags |= B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ brelse(bp); /* is this kosher? */
+ }
+ if ((error == 0) && verify) { /* check that it got there */
+ s = splbio();
+ bp = geteblk(size); /* get a buffer */
+ if (bp == NULL) {
+ splx(s);
+ error = ENOMEM; /* the loop runs once more; the pending error then ends it */
+ } else {
+ bp->b_bcount = size;
+ bp->b_resid = bp->b_bcount;
+ bp->b_blkno = sd->initialized; /* read from here */
+ bp->b_dev = VINUM_SD(sdno); /* create the device number */
+ bp->b_iocmd = BIO_READ; /* read it back */
+ splx(s);
+ sdio(bp);
+ bufwait(bp);
+ /*
+ * XXX Bug fix code. This is hopefully no
+ * longer needed (21 February 2000).
+ *
+ * The check below accepts the buffer only if
+ * its first element is zero and the buffer
+ * compares equal to itself shifted by one
+ * element, i.e. every element equals its
+ * successor. Together that means the whole
+ * buffer is zero.
+ */
+ if (bp->b_ioflags & BIO_ERROR)
+ error = bp->b_error;
+ else if ((*bp->b_data != 0) /* first word spammed */
+ ||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */
+ printf("vinum: init error on %s, offset 0x%llx sectors\n",
+ sd->name,
+ (long long) sd->initialized);
+ verified = 0; /* go round the loop and write the block again */
+ } else
+ verified = 1;
+ if (bp->b_qindex == 0) { /* not on a queue, */
+ bp->b_flags |= B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ brelse(bp); /* is this kosher? */
+ }
+ }
+ } else
+ verified = 1; /* not verifying, or an error is pending: stop here */
+ }
+ if (error == 0) { /* did it, */
+ sd->initialized += size >> DEV_BSHIFT; /* moved this much further down */
+ if (sd->initialized >= sd->sectors) { /* finished */
+ sd->initialized = 0;
+ set_sd_state(sdno, sd_initialized, setstate_force); /* bring the sd up */
+ log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
+ save_config(); /* and save the updated configuration */
+ } else /* more to go, */
+ error = EAGAIN; /* ya'll come back, see? */
+ }
+ return error;
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumstate.c b/sys/dev/vinum/vinumstate.c
new file mode 100644
index 0000000..59c9860
--- /dev/null
+++ b/sys/dev/vinum/vinumstate.c
@@ -0,0 +1,1093 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumstate.c,v 2.21 2003/04/28 02:54:43 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+
+/*
+ * Update drive state.
+ *
+ * Return 0 if the drive is unallocated, or if we
+ * are asked to take it down without
+ * setstate_force while it is still open.
+ * Otherwise return 1, also when the drive is
+ * already in the requested state.
+ */
+int
+set_drive_state(int driveno, enum drivestate newstate, enum setstateflags flags)
+{
+    struct drive *drive;
+    int previous;                       /* state on entry */
+    int idx;
+
+    drive = &DRIVE[driveno];
+    previous = drive->state;
+
+    if (previous == drive_unallocated)  /* nothing there to change */
+        return 0;
+    if (previous == newstate)           /* nothing to change, all OK */
+        return 1;
+
+    /* An open drive only goes down by force. */
+    if ((newstate == drive_down)
+        && ((flags & setstate_force) == 0)
+        && (drive->opencount != 0))
+        return 0;
+
+    drive->state = newstate;
+    if (drive->label.name[0] != '\0')   /* only log drives which have a name */
+        log(LOG_INFO,
+            "vinum: drive %s is %s\n",
+            drive->label.name,
+            drive_state(drive->state));
+
+    if (drive->state != previous) {
+        /* Let every subdisk on this drive reconsider its state. */
+        for (idx = 0; idx < vinum_conf.subdisks_allocated; idx++) {
+            if (SD[idx].state < sd_referenced)
+                continue;
+            if (SD[idx].driveno != driveno)
+                continue;
+            update_sd_state(idx);
+        }
+    }
+
+    if (newstate != drive_up)           /* going down or worse: the daemon closes it */
+        queue_daemon_request(daemonrq_closedrive,
+            (union daemoninfo) drive);
+    else if ((drive->flags & VF_OPEN) == 0)     /* coming up, but not open yet */
+        init_drive(drive, 1);           /* which changes the state again */
+
+    if ((flags & setstate_configuring) == 0)    /* not in the middle of configuring, */
+        save_config();                  /* so save the updated configuration now */
+    return 1;
+}
+
+/*
+ * Try to set the subdisk state.  Return 1 if
+ * state changed to what we wanted, -1 if it
+ * changed to something else, and 0 if no change.
+ * When bringing a subdisk up, EAGAIN is returned
+ * if the subdisk has been put into (or, without
+ * force, already is in) the reviving state and
+ * the caller needs to repeat the request.
+ *
+ * This routine is called both from the user (up,
+ * down states only) and internally.
+ *
+ * The setstate_force bit in the flags enables the
+ * state change even if it could be dangerous to
+ * data consistency.  It shouldn't allow nonsense.
+ *
+ * Unless setstate_configuring is set, the
+ * configuration is saved after a change.
+ */
+int
+set_sd_state(int sdno, enum sdstate newstate, enum setstateflags flags)
+{
+    struct sd *sd = &SD[sdno];
+    struct plex *plex;
+    struct volume *vol;
+    int oldstate = sd->state;
+    int status = 1;                     /* status to return */
+
+    if (newstate == oldstate)           /* already there, */
+        return 1;
+    else if (sd->state == sd_unallocated)       /* no subdisk to do anything with, */
+        return 0;                       /* can't do it */
+
+    if (sd->driveoffset < 0) {          /* not allocated space */
+        sd->state = sd_down;
+        if (newstate != sd_down) {
+            if (sd->plexno >= 0)
+                sdstatemap(&PLEX[sd->plexno]);  /* count up subdisks */
+            return -1;
+        }
+    } else {                            /* space allocated */
+        switch (newstate) {
+        case sd_down:                   /* take it down? */
+            /*
+             * If we're attached to a plex, and we're
+             * not reborn, we won't go down without
+             * use of force.
+             *
+             * The force test must be written as
+             * (flags & setstate_force) == 0.  The
+             * form !flags & setstate_force negates
+             * the whole flag word first, so it gives
+             * the wrong answer as soon as any other
+             * flag bit is set.
+             */
+            if (((flags & setstate_force) == 0)
+                && (sd->plexno >= 0)
+                && (sd->state != sd_reborn))
+                return 0;               /* don't do it */
+            break;
+
+        case sd_initialized:
+            if ((sd->state == sd_initializing)  /* we were initializing */
+                || (flags & setstate_force))    /* or we forced it */
+                break;
+            return 0;                   /* can't do it otherwise */
+
+        case sd_up:
+            if (DRIVE[sd->driveno].state != drive_up)   /* can't bring the sd up if the drive isn't, */
+                return 0;               /* not even by force */
+            if (flags & setstate_force) /* forcing it, */
+                break;                  /* just do it, and damn the consequences */
+            switch (sd->state) {
+                /*
+                 * Perform the necessary tests.  To allow
+                 * the state transition, just break out of
+                 * the switch.
+                 */
+            case sd_crashed:
+            case sd_reborn:
+            case sd_down:               /* been down, no data lost */
+                /*
+                 * If we're associated with a plex, and
+                 * the plex isn't up, or we're the only
+                 * subdisk in the plex, we can do it.
+                 *
+                 * NOTE(review): the code tests for
+                 * *more than one* subdisk in the plex,
+                 * which does not match "the only
+                 * subdisk" above.  Left as found;
+                 * confirm which is intended.
+                 */
+                if ((sd->plexno >= 0)
+                    && (((PLEX[sd->plexno].state < plex_firstup)
+                            || (PLEX[sd->plexno].subdisks > 1))))
+                    break;              /* do it */
+                if (oldstate != sd_reborn) {
+                    sd->state = sd_reborn;      /* here it is again */
+                    log(LOG_INFO,
+                        "vinum: %s is %s, not %s\n",
+                        sd->name,
+                        sd_state(sd->state),
+                        sd_state(newstate));
+                }
+                status = -1;
+                break;
+
+            case sd_init:               /* brand new */
+                if (flags & setstate_configuring)       /* we're doing this while configuring */
+                    break;
+                /* otherwise it's like being empty */
+                /* FALLTHROUGH */
+
+            case sd_empty:
+            case sd_initialized:
+                /*
+                 * If we're not part of a plex, or the
+                 * plex is not part of a volume with other
+                 * plexes which are up, we can come up
+                 * without being inconsistent.
+                 *
+                 * If we're part of a parity plex, we'll
+                 * come up if the caller uses force.  This
+                 * is the way we bring them up after
+                 * initialization.
+                 */
+                if ((sd->plexno < 0)
+                    || ((vpstate(&PLEX[sd->plexno]) & volplex_otherup) == 0)
+                    || (isparity((&PLEX[sd->plexno]))
+                        && (flags & setstate_force)))
+                    break;
+
+                /* Otherwise it's just out of date */
+                /* FALLTHROUGH */
+
+            case sd_stale:              /* out of date info, need reviving */
+            case sd_obsolete:
+                /*
+                 * 1.  If the subdisk is not part of a
+                 *     plex, bring it up, don't revive.
+                 *
+                 * 2.  If the subdisk is part of a
+                 *     one-plex volume or an unattached
+                 *     plex, and it's not RAID-4 or
+                 *     RAID-5, we *can't revive*.  The
+                 *     subdisk doesn't change its state.
+                 *
+                 * 3.  If the subdisk is part of a
+                 *     one-plex volume or an unattached
+                 *     plex, and it's RAID-4 or RAID-5,
+                 *     but more than one subdisk is down,
+                 *     we *still can't revive*.  The
+                 *     subdisk doesn't change its state.
+                 *
+                 * 4.  If the subdisk is part of a
+                 *     multi-plex volume, we'll change to
+                 *     reviving and let the revive
+                 *     routines find out whether it will
+                 *     work or not.  If they don't, the
+                 *     revive stops with an error message,
+                 *     but the state doesn't change
+                 *     (FWIW).
+                 */
+                if (sd->plexno < 0)     /* no plex associated, */
+                    break;              /* bring it up */
+                plex = &PLEX[sd->plexno];
+                if (plex->volno >= 0)   /* have a volume */
+                    vol = &VOL[plex->volno];
+                else
+                    vol = NULL;
+                /*
+                 * We can't do it if:
+                 *
+                 * 1: we don't have a volume
+                 * 2: we're the only plex in the volume
+                 * 3: we're a RAID-4 or RAID-5 plex, and
+                 *    more than one subdisk is down.
+                 */
+                if (((vol == NULL)
+                        || (vol->plexes == 1))
+                    && ((!isparity(plex))
+                        || (plex->sddowncount > 1))) {
+                    if (sd->state == sd_initializing)   /* it's finished initializing */
+                        sd->state = sd_initialized;
+                    else
+                        return 0;       /* can't do it */
+                } else {
+                    sd->state = sd_reviving;    /* put in reviving state */
+                    sd->revived = 0;    /* nothing done yet */
+                    status = EAGAIN;    /* need to repeat */
+                }
+                break;
+
+            case sd_reviving:
+                if (flags & setstate_force)     /* insist, */
+                    break;
+                return EAGAIN;          /* no, try again */
+
+            default:                    /* can't do it */
+                /*
+                 * There's no way to bring subdisks up directly from
+                 * other states.  First they need to be initialized
+                 * or revived.
+                 */
+                return 0;
+            }
+            break;
+
+        default:                        /* other ones, only internal with force */
+            if ((flags & setstate_force) == 0)  /* no force?  What's this? */
+                return 0;               /* don't do it */
+        }
+    }
+    if (status == 1) {                  /* we can do it, */
+        sd->state = newstate;
+        if (flags & setstate_force)
+            log(LOG_INFO, "vinum: %s is %s by force\n", sd->name, sd_state(sd->state));
+        else
+            log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
+    } else                              /* we don't get here with status 0 */
+        log(LOG_INFO,
+            "vinum: %s is %s, not %s\n",
+            sd->name,
+            sd_state(sd->state),
+            sd_state(newstate));
+    if (sd->plexno >= 0)                /* we belong to a plex */
+        update_plex_state(sd->plexno);  /* update plex state */
+    if ((flags & setstate_configuring) == 0)    /* save config now */
+        save_config();
+    return status;
+}
+
+/*
+ * Set the state of a plex.  The plex state mostly
+ * just reflects the aggregate state of its
+ * subdisks, so a request for plex_up is simply
+ * handed to update_plex_state, which decides
+ * whether the plex is ready.
+ *
+ * Return 0 if the request is refused (plex not
+ * allocated, force required but not given, or an
+ * unsupported target state), otherwise 1.
+ */
+int
+set_plex_state(int plexno, enum plexstate state, enum setstateflags flags)
+{
+    struct plex *plex = &PLEX[plexno];
+    enum plexstate before = plex->state;        /* state on entry */
+    enum volplexstate relation;         /* how we stand relative to the other plexes */
+    int forced = (flags & setstate_force) != 0;
+
+    if (before == plex_unallocated)     /* nothing there to change */
+        return 0;
+
+    /*
+     * Already where we want to be?  Then we're
+     * done, unless the target is plex_up, which
+     * still needs the housekeeping below.
+     */
+    if ((state == before)
+        && (state != plex_up))
+        return 1;
+
+    relation = vpstate(plex);
+    switch (state) {
+    case plex_up:
+        /*
+         * Not even force brings up a plex which
+         * isn't ready; update_plex_state checks.
+         */
+        update_plex_state(plex->plexno);
+        break;
+
+    case plex_down:
+        /*
+         * The only plex, or the only one which is
+         * up, goes down by force only.
+         */
+        if ((!forced)
+            && ((relation == volplex_onlyus)
+                || (relation == volplex_onlyusup)))
+            return 0;
+        plex->state = state;
+        invalidate_subdisks(plex, sd_down);     /* take the up subdisks down too */
+        break;
+
+    case plex_faulty:                   /* internal request only, trusted */
+        plex->state = state;
+        invalidate_subdisks(plex, sd_crashed);  /* crash the up subdisks */
+        break;
+
+    case plex_initializing:
+        /* XXX consider what safeguards we need here */
+        if (!forced)
+            return 0;
+        plex->state = state;
+        break;
+
+    default:                            /* no other target states are supported */
+        return 0;
+    }
+
+    if (plex->state != before)          /* report a change */
+        log(LOG_INFO,
+            "vinum: %s is %s\n",
+            plex->name,
+            plex_state(plex->state));
+
+    /* The volume may have to follow us. */
+    if (plex->volno >= 0)
+        update_volume_state(plex->volno);
+    if ((flags & setstate_configuring) == 0)    /* not configuring: */
+        save_config();                  /* save the updated configuration */
+    return 1;
+}
+
+/*
+ * Set the state of a volume.
+ *
+ * A request for volume_up just asks
+ * update_volume_state to work out the state from
+ * the plexes, and returns 0.  A request for
+ * volume_down is honoured if the volume is not
+ * open, or if setstate_force is given.
+ *
+ * Return 1 if the volume is already in the
+ * requested state or has been taken down,
+ * otherwise 0.
+ */
+int
+set_volume_state(int volno, enum volumestate state, enum setstateflags flags)
+{
+    struct volume *vol;
+
+    vol = &VOL[volno];
+    if (vol->state == volume_unallocated)       /* nothing there to change */
+        return 0;
+    if (vol->state == state)            /* nothing to do */
+        return 1;
+
+    switch (state) {
+    case volume_up:
+        update_volume_state(volno);
+        break;
+
+    case volume_down:
+        if (((vol->flags & VF_OPEN) != 0)       /* still open, */
+            && ((flags & setstate_force) == 0)) /* and nobody is insisting */
+            break;
+        vol->state = volume_down;
+        log(LOG_INFO,
+            "vinum: volume %s is %s\n",
+            vol->name,
+            volume_state(vol->state));
+        if ((flags & setstate_configuring) == 0)        /* not configuring: */
+            save_config();              /* save the updated configuration */
+        return 1;
+
+    default:
+        break;
+    }
+    return 0;                           /* no change */
+}
+
+/*
+ * Adjust the state of a subdisk to the state of
+ * the drive it lives on, log any change, and let
+ * the owning plex (if any) update itself.
+ */
+void
+update_sd_state(int sdno)
+{
+    struct sd *sd = &SD[sdno];
+    struct drive *drive = &DRIVE[sd->driveno];
+    enum sdstate before = sd->state;
+
+    if (drive->state == drive_up) {
+        /* The drive is back: down or crashed subdisks return with no loss. */
+        if ((sd->state == sd_down)
+            || (sd->state == sd_crashed))
+            sd->state = sd_reborn;
+    } else {
+        /* The drive is down or worse: subdisks which were in use have lost it. */
+        if ((sd->state == sd_up)
+            || (sd->state == sd_reborn)
+            || (sd->state == sd_reviving)
+            || (sd->state == sd_empty))
+            sd->state = sd_crashed;
+    }
+
+    if (sd->state != before)            /* report a change */
+        log(LOG_INFO,
+            "vinum: %s is %s\n",
+            sd->name,
+            sd_state(sd->state));
+    if (sd->plexno >= 0)                /* attached to a plex, */
+        update_plex_state(sd->plexno);  /* which may need to change too */
+}
+
+/*
+ * Helper for update_plex_state: put a plex and
+ * every one of its subdisks into the 'up' state
+ * without any further checks, logging each
+ * subdisk.
+ */
+void
+forceup(int plexno)
+{
+    struct plex *plex = &PLEX[plexno];
+    int idx;
+
+    plex->state = plex_up;
+
+    for (idx = 0; idx < plex->subdisks; idx++) {
+        struct sd *sd = &SD[plex->sdnos[idx]];
+
+        sd->state = sd_up;
+        log(LOG_INFO,
+            "vinum: %s is up\n",
+            sd->name);
+    }
+}
+
+/*
+ * Set the state of a plex based on its environment:
+ * the map of its subdisk states (sdstatemap) and
+ * its relation to the other plexes of the volume
+ * (vpstate). Logs a change of state, and asks the
+ * volume, if there is one, to update its own state.
+ */
+void
+update_plex_state(int plexno)
+{
+ struct plex *plex; /* point to our plex */
+ enum plexstate oldstate;
+ enum sdstates statemap; /* get a map of the subdisk states */
+ enum volplexstate vps; /* how do we compare with the other plexes? */
+
+ plex = &PLEX[plexno]; /* point to our plex */
+ oldstate = plex->state;
+ statemap = sdstatemap(plex); /* get a map of the subdisk states */
+ vps = vpstate(plex); /* how do we compare with the other plexes? */
+
+ if (statemap & sd_initstate) /* something initializing? */
+ plex->state = plex_initializing; /* yup, that makes the plex the same */
+ else if (statemap == sd_upstate)
+ /*
+ * All the subdisks are up. This also means that
+ * they are consistent, so we can just bring
+ * the plex up
+ */
+ plex->state = plex_up;
+ else if (isparity(plex) /* RAID-4 or RAID-5 plex */
+ &&(plex->sddowncount == 1)) /* and exactly one subdisk down */
+ plex->state = plex_degraded; /* limping a bit */
+ else if (((statemap & ~sd_downstate) == sd_emptystate) /* all subdisks empty */
+ ||((statemap & ~sd_downstate)
+ == (statemap & ~sd_downstate & (sd_initializedstate | sd_upstate)))) { /* or, ignoring down ones, only initialized or up ones */
+ if ((vps & volplex_otherup) == 0) { /* no other plex is up */
+ struct volume *vol = &VOL[plex->volno]; /* possible volume: only dereferenced after the volno check below */
+
+ /*
+ * If we're a striped or concat plex
+ * associated with a volume, none of whose
+ * plexes are up, and we're new and untested,
+ * and the volume has the setupstate bit set,
+ * we can pretend to be in a consistent state.
+ *
+ * We need to do this in one swell foop: on
+ * the next call we will no longer be just
+ * empty.
+ *
+ * This code assumes that all the other plexes
+ * are also capable of coming up (i.e. all the
+ * sds are up), but that's OK: we'll come back
+ * to this function for the remaining plexes
+ * in the volume.
+ */
+ if ((plex->state == plex_init)
+ && (plex->volno >= 0)
+ && (vol->flags & VF_CONFIG_SETUPSTATE)) {
+ for (plexno = 0; plexno < vol->plexes; plexno++) /* reuses the plexno parameter as index over the volume's plexes */
+ forceup(VOL[plex->volno].plex[plexno]);
+ } else if ((statemap == sd_initializedstate) /* if it's initialized (not empty) */
+ ||(plex->organization == plex_concat) /* and we're not RAID-4 or RAID-5 */
+ ||(plex->organization == plex_striped))
+ forceup(plexno); /* we'll do it */
+ /*
+ * This leaves a case where things don't get
+ * done: the plex is RAID-4 or RAID-5, and
+ * the subdisks are all empty. They need to
+ * be initialized first.
+ */
+ } else {
+ if (statemap == sd_upstate) /* all subdisks up; NOTE(review): cannot be true here, the same test was handled earlier in this chain */
+ plex->state = plex_up; /* we can come up too */
+ else
+ plex->state = plex_faulty;
+ }
+ } else if ((statemap & (sd_upstate | sd_rebornstate)) == statemap) /* all up or reborn */
+ plex->state = plex_flaky;
+ else if (statemap & (sd_upstate | sd_rebornstate)) /* some up or reborn */
+ plex->state = plex_corrupt; /* corrupt */
+ else if (statemap & (sd_initstate | sd_emptystate)) /* some subdisks empty or initializing */
+ plex->state = plex_initializing;
+ else /* nothing at all up */
+ plex->state = plex_faulty;
+
+ if (plex->state != oldstate) /* state has changed, */
+ log(LOG_INFO, /* tell them about it */
+ "vinum: %s is %s\n",
+ plex->name,
+ plex_state(plex->state));
+ if (plex->volno >= 0) /* we're part of a volume, */
+ update_volume_state(plex->volno); /* update its state */
+}
+
+/*
+ * Derive the volume state from its plexes: the
+ * volume is up as soon as one plex is at least
+ * plex_corrupt, otherwise it is down.  A change
+ * is logged and the configuration saved.
+ */
+void
+update_volume_state(int volno)
+{
+    struct volume *vol = &VOL[volno];
+    enum volumestate before = vol->state;
+    int idx = 0;
+
+    /* Skip plexes until we find one with something accessible. */
+    while ((idx < vol->plexes)
+        && (PLEX[vol->plex[idx]].state < plex_corrupt))
+        idx++;
+
+    if (idx < vol->plexes)              /* found a usable plex */
+        vol->state = volume_up;
+    else if (idx == vol->plexes)        /* went through all of them in vain */
+        vol->state = volume_down;
+
+    if (vol->state != before) {         /* report and record a change */
+        log(LOG_INFO, "vinum: %s is %s\n", vol->name, volume_state(vol->state));
+        save_config();
+    }
+}
+
+/*
+ * Called from the request routines when they come
+ * across a subdisk which is not in perfect shape.
+ * Decide whether the subdisk can serve the request
+ * and whether its state has to change.  Return
+ * REQUEST_OK if the subdisk can be used,
+ * REQUEST_DOWN if it can't.
+ *
+ * Only the subdisk state is considered.  An older
+ * version of this function (up to 2.7 of this file)
+ * looked at the plex state as well; for the moment
+ * plex states are for the user's information
+ * only.  XXX
+ */
+enum requeststatus
+checksdstate(struct sd *sd, struct request *rq, daddr_t diskaddr, daddr_t diskend)
+{
+    struct plex *plex = &PLEX[sd->plexno];
+    int writing = (rq->bp->b_iocmd == BIO_WRITE);       /* is this a write request? */
+
+    switch (sd->state) {
+    case sd_up:                         /* we shouldn't even be called for these */
+        return REQUEST_OK;
+
+    case sd_reviving:
+        /*
+         * What we can do with a reviving subdisk
+         * depends on the plex organization:
+         *
+         * - striped: nothing (the calculations
+         *   would be hairy and unlikely to work).
+         * - RAID-4 or RAID-5: anything, as long
+         *   as no more than one subdisk is down.
+         * - concatenated: access up to the current
+         *   revive point.  A write overlapping the
+         *   block currently being revived gets the
+         *   conflict flag set in the request, which
+         *   asks the caller to put it on the wait
+         *   list for revive_block to attend to
+         *   when it's done.
+         */
+        if (plex->organization == plex_striped)
+            return REQUEST_DOWN;
+        if (isparity(plex))
+            /*
+             * XXX Returning OK here is not ideal:
+             * we should check the other plexes
+             * first and say DOWN if one of them
+             * can do it better.
+             */
+            return (plex->sddowncount > 1) ? REQUEST_DOWN : REQUEST_OK;
+
+        if (diskaddr > (sd->revived
+                + sd->plexoffset
+                + (sd->revive_blocksize >> DEV_BSHIFT)))        /* starts beyond the revive block */
+            return REQUEST_DOWN;
+        if (diskend <= (sd->revived + sd->plexoffset))  /* ends before the revive point */
+            return REQUEST_OK;
+        if (!writing)                   /* a read reaching into unrevived space */
+            return REQUEST_DOWN;
+        rq->flags |= XFR_REVIVECONFLICT;        /* note a potential conflict */
+        rq->sdno = sd->sdno;            /* and which sd last caused it */
+        return REQUEST_OK;
+
+    case sd_reborn:
+        /*
+         * Always write to a reborn subdisk, never
+         * read from it.  XXX We shouldn't reject
+         * the read if this subdisk is all we have;
+         * the mapping still needs handling.
+         */
+        return writing ? REQUEST_OK : REQUEST_DOWN;
+
+    case sd_down:
+    case sd_crashed:
+        /*
+         * Unusable either way, but a write means
+         * that the so far consistent contents are
+         * now out of date: down becomes obsolete,
+         * crashed becomes stale.
+         */
+        if (writing)
+            set_sd_state(sd->sdno,
+                (sd->state == sd_down) ? sd_obsolete : sd_stale,
+                setstate_force);
+        return REQUEST_DOWN;
+
+    default:
+        return REQUEST_DOWN;
+    }
+}
+
+/*
+ * Return a bit map of the states found among the
+ * subdisks of a plex.  As a side effect, recount
+ * plex->sddowncount, the number of subdisks which
+ * are in neither the up nor the reborn state.
+ */
+enum sdstates
+sdstatemap(struct plex *plex)
+{
+    enum sdstates map = 0;              /* states seen so far */
+    int idx;
+
+    plex->sddowncount = 0;              /* start counting from scratch */
+    for (idx = 0; idx < plex->subdisks; idx++) {
+        struct sd *sd = &SD[plex->sdnos[idx]];
+        enum sdstates bit = 0;          /* map bit for this subdisk */
+        int unusable = 1;               /* does it count as down? */
+
+        switch (sd->state) {
+        case sd_empty:
+            bit = sd_emptystate;
+            break;
+
+        case sd_init:
+        case sd_initializing:           /* shares a bit with sd_init */
+            bit = sd_initstate;
+            break;
+
+        case sd_initialized:
+            bit = sd_initializedstate;
+            break;
+
+        case sd_down:
+            bit = sd_downstate;
+            break;
+
+        case sd_crashed:
+            bit = sd_crashedstate;
+            break;
+
+        case sd_obsolete:
+            bit = sd_obsoletestate;
+            break;
+
+        case sd_stale:
+            bit = sd_stalestate;
+            break;
+
+        case sd_reborn:                 /* usable */
+            bit = sd_rebornstate;
+            unusable = 0;
+            break;
+
+        case sd_up:                     /* usable */
+            bit = sd_upstate;
+            unusable = 0;
+            break;
+
+        case sd_unallocated:
+        case sd_uninit:
+        case sd_reviving:
+        case sd_referenced:
+            bit = sd_otherstate;
+            break;
+
+        default:                        /* anything else leaves map and count alone */
+            unusable = 0;
+            break;
+        }
+        map |= bit;
+        if (unusable)
+            (plex->sddowncount)++;      /* another unusable subdisk */
+    }
+    return map;
+}
+
+/*
+ * Work out how this plex stands relative to the
+ * other plexes of its volume.  The result is
+ * built up from volplex_* values: whether we are
+ * up ourselves, and whether other plexes are up
+ * or down.  A plex without a volume is judged on
+ * its own state alone.
+ */
+enum volplexstate
+vpstate(struct plex *plex)
+{
+    struct volume *vol;
+    enum volplexstate result = volplex_onlyusdown;      /* assume the worst to start with */
+    int idx;
+
+    if (plex->volno < 0)                /* no volume: there's just us */
+        return (plex->state > plex_degraded) ? volplex_onlyus : volplex_onlyusdown;
+
+    vol = &VOL[plex->volno];
+    for (idx = 0; idx < vol->plexes; idx++) {
+        struct plex *other = &PLEX[vol->plex[idx]];
+        int usable = (other->state >= plex_degraded);   /* is this one up? */
+
+        if (other == plex) {            /* that's us */
+            if (usable)
+                result |= volplex_onlyus;
+        } else if (usable)              /* somebody else, and up */
+            result |= volplex_otherup;
+        else                            /* somebody else, and down */
+            result |= volplex_alldown;
+    }
+    return result;
+}
+
+/*
+ * Return non-zero if every bit which is set in b
+ * is also set in a, i.e. if no bit of b is
+ * missing from a.
+ */
+int allset(int a, int b);
+
+int
+allset(int a, int b)
+{
+    int missing = ~a & b;               /* bits of b which a lacks */
+
+    return missing == 0;
+}
+
+/*
+ * Invalidate the subdisks belonging to a plex:
+ * force every subdisk which is currently
+ * reviving, reborn or up into the given state.
+ * Subdisks in any other state are left alone.
+ */
+void
+invalidate_subdisks(struct plex *plex, enum sdstate state)
+{
+    int idx;
+
+    for (idx = 0; idx < plex->subdisks; idx++) {
+        struct sd *sd = &SD[plex->sdnos[idx]];
+
+        if ((sd->state == sd_reviving)
+            || (sd->state == sd_reborn)
+            || (sd->state == sd_up))
+            set_sd_state(plex->sdnos[idx], state, setstate_force);
+    }
+}
+
+/*
+ * Handle VINUMSTART (called from vinumioctl): do what we can to bring
+ * the object named in the request up.
+ *
+ * The reply is built in the same buffer as the request, so the object
+ * index is copied out before anything is written back.  On return the
+ * error field of the reply holds:
+ *
+ *   EINVAL  unknown object type
+ *   EIO     subdisk whose drive is not up
+ *   EBUSY   the object did not reach its up state
+ *   EAGAIN  passed through when set_sd_state() returns it
+ *   0       the object is up
+ *
+ * For a subdisk which is stale, reviving or initializing, one call to
+ * revive_block() or initsd() is made instead, and its return value
+ * becomes the error value.
+ */
+void
+start_object(struct vinum_ioctl_msg *data)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) data; /* reply overlays the request */
+    int objindex = data->index;                             /* keep a copy: data gets overwritten */
+    enum setstateflags flags = (data->force != 0) ? setstate_force : setstate_none;
+    int status;
+
+    switch (data->type) {
+    case drive_object:
+        set_drive_state(objindex, drive_up, flags);
+        reply->error = (DRIVE[objindex].state == drive_up) ? 0 : EBUSY;
+        break;
+
+    case sd_object:
+        if (DRIVE[SD[objindex].driveno].state != drive_up) { /* nothing to start on */
+            reply->error = EIO;
+            strcpy(reply->msg, "Drive is down");
+            return;
+        }
+        if (data->blocksize)
+            SD[objindex].revive_blocksize = data->blocksize;
+
+        switch (SD[objindex].state) {
+        case sd_reviving:
+        case sd_stale:                                      /* stale subdisks get revived */
+            SD[objindex].state = sd_reviving;
+            reply->error = revive_block(objindex);          /* revive another block */
+            reply->msg[0] = '\0';
+            return;
+
+        case sd_initializing:
+            if (data->blocksize)
+                SD[objindex].init_blocksize = data->blocksize;
+            reply->error = initsd(objindex, data->verify);  /* initialize another block */
+            reply->msg[0] = '\0';
+            return;
+
+        default:
+            break;
+        }
+
+        status = set_sd_state(objindex, sd_up, flags);
+        if (status == EAGAIN)                               /* first revive or initialize */
+            reply->error = status;
+        else if (SD[objindex].state == sd_up)               /* did we really get there? */
+            reply->error = 0;
+        else
+            reply->error = EBUSY;
+        break;
+
+    case plex_object:
+        set_plex_state(objindex, plex_up, flags);
+        reply->error = (PLEX[objindex].state == plex_up) ? 0 : EBUSY;
+        break;
+
+    case volume_object:
+        set_volume_state(objindex, volume_up, flags);
+        reply->error = (VOL[objindex].state == volume_up) ? 0 : EBUSY;
+        break;
+
+    default:
+        reply->error = EINVAL;
+        strcpy(reply->msg, "Invalid object type");
+        return;
+    }
+
+    /* No message text: the userland program words it better */
+    reply->msg[0] = '\0';
+}
+
+/*
+ * Handle VINUMSTOP (called from vinumioctl): do what we can to take
+ * the object named in the request down.
+ *
+ * The reply overlays the request.  Its error field is set to EINVAL
+ * for an unknown object type, to EBUSY when the set_*_state() call
+ * returns 0, and to 0 otherwise.
+ *
+ * NOTE(review): data->force is handed to set_*_state() unchanged,
+ * whereas start_object() maps it to setstate_force/setstate_none --
+ * confirm that the raw value is what is wanted here.
+ */
+void
+stop_object(struct vinum_ioctl_msg *data)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) data; /* reply overlays the request */
+    int objindex = data->index;                             /* save the number from change */
+    int done = 1;
+
+    switch (data->type) {
+    case drive_object:
+        done = set_drive_state(objindex, drive_down, data->force);
+        break;
+
+    case sd_object:
+        done = set_sd_state(objindex, sd_down, data->force);
+        break;
+
+    case plex_object:
+        done = set_plex_state(objindex, plex_down, data->force);
+        break;
+
+    case volume_object:
+        done = set_volume_state(objindex, volume_down, data->force);
+        break;
+
+    default:
+        reply->error = EINVAL;
+        strcpy(reply->msg, "Invalid object type");
+        return;
+    }
+
+    reply->msg[0] = '\0';
+    reply->error = (done == 0) ? EBUSY : 0;                 /* 0 means we couldn't do it */
+}
+
+/*
+ * VINUM_SETSTATE ioctl: set an object state.
+ * msg is the message passed by the user; the reply (error code and
+ * message text) is written back over the same buffer.
+ *
+ *   object_down          handed to stop_object()
+ *   object_up            handed to start_object()
+ *   object_initializing  subdisks and plexes only; for a plex, each of
+ *                        its subdisks is set to sd_initializing too
+ *   object_initialized   subdisks only
+ *
+ * Any other requested state is ignored and the buffer is left as is.
+ *
+ * Replies for the two initialization states:
+ *   EFAULT  index negative or beyond the allocated table, or the entry
+ *           is in too low a state (subdisk at or below sd_referenced,
+ *           plex at or below plex_unallocated)
+ *   EINVAL  object type not supported for the requested state
+ *   EBUSY   the object did not end up in the requested state
+ *   0       success
+ */
+void
+setstate(struct vinum_ioctl_msg *msg)
+{
+    int sdno;
+    struct sd *sd;
+    struct plex *plex;
+    struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) msg; /* format for returning replies */
+
+    switch (msg->state) {
+    case object_down:
+        stop_object(msg);
+        break;
+
+    case object_initializing:
+        switch (msg->type) {
+        case sd_object:
+            /* Validate the index before touching the table entry */
+            if ((msg->index < 0)
+                || (msg->index >= vinum_conf.subdisks_allocated)
+                || (SD[msg->index].state <= sd_referenced)) {
+                sprintf(ioctl_reply->msg, "Invalid subdisk %d", msg->index);
+                ioctl_reply->error = EFAULT;
+                return;
+            }
+            sd = &SD[msg->index];
+            set_sd_state(msg->index, sd_initializing, msg->force);
+            if (sd->state != sd_initializing) {             /* didn't take */
+                strcpy(ioctl_reply->msg, "Can't set state");
+                ioctl_reply->error = EBUSY;
+            } else
+                ioctl_reply->error = 0;
+            break;
+
+        case plex_object:
+            /* Validate the index before touching the table entry */
+            if ((msg->index < 0)
+                || (msg->index >= vinum_conf.plexes_allocated)
+                || (PLEX[msg->index].state <= plex_unallocated)) {
+                sprintf(ioctl_reply->msg, "Invalid plex %d", msg->index);
+                ioctl_reply->error = EFAULT;
+                return;
+            }
+            plex = &PLEX[msg->index];
+            set_plex_state(msg->index, plex_initializing, msg->force);
+            if (plex->state != plex_initializing) {
+                strcpy(ioctl_reply->msg, "Can't set state");
+                ioctl_reply->error = EBUSY;
+            } else {
+                ioctl_reply->error = 0;
+                /* Take the subdisks along; stop at the first one which refuses */
+                for (sdno = 0; sdno < plex->subdisks; sdno++) {
+                    sd = &SD[plex->sdnos[sdno]];
+                    set_sd_state(plex->sdnos[sdno], sd_initializing, msg->force);
+                    if (sd->state != sd_initializing) {
+                        strcpy(ioctl_reply->msg, "Can't set state");
+                        ioctl_reply->error = EBUSY;
+                        break;
+                    }
+                }
+            }
+            break;
+
+        default:
+            strcpy(ioctl_reply->msg, "Invalid object");
+            ioctl_reply->error = EINVAL;
+        }
+        break;
+
+    case object_initialized:
+        if (msg->type == sd_object) {
+            /* Validate the index before touching the table entry */
+            if ((msg->index < 0)
+                || (msg->index >= vinum_conf.subdisks_allocated)
+                || (SD[msg->index].state <= sd_referenced)) {
+                sprintf(ioctl_reply->msg, "Invalid subdisk %d", msg->index);
+                ioctl_reply->error = EFAULT;
+                return;
+            }
+            sd = &SD[msg->index];
+            set_sd_state(msg->index, sd_initialized, msg->force);
+            /*
+             * Check against the state we asked for.  Comparing with
+             * sd_initializing here would report a successful change
+             * as EBUSY and a refused one as success.
+             */
+            if (sd->state != sd_initialized) {
+                strcpy(ioctl_reply->msg, "Can't set state");
+                ioctl_reply->error = EBUSY;
+            } else
+                ioctl_reply->error = 0;
+        } else {
+            strcpy(ioctl_reply->msg, "Invalid object");
+            ioctl_reply->error = EINVAL;
+        }
+        break;
+
+    case object_up:
+        start_object(msg);
+    }
+}
+
+/*
+ * Brute force set state function: store the requested state straight
+ * into the table entry for the object, without looking at any
+ * dependencies.  This is mainly intended for testing and recovery.
+ *
+ * The index is not range checked here, and an unknown object type is
+ * silently ignored; the reply error is always 0.
+ */
+void
+setstate_by_force(struct vinum_ioctl_msg *msg)
+{
+    struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; /* reply overlays the request */
+    int objno = msg->index;
+    int newstate = msg->state;
+
+    switch (msg->type) {
+    case drive_object:
+        DRIVE[objno].state = newstate;
+        break;
+
+    case sd_object:
+        SD[objno].state = newstate;
+        break;
+
+    case plex_object:
+        PLEX[objno].state = newstate;
+        break;
+
+    case volume_object:
+        VOL[objno].state = newstate;
+        break;
+
+    default:                                                /* nothing we know how to set */
+        break;
+    }
+    reply->error = 0;
+}
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumstate.h b/sys/dev/vinum/vinumstate.h
new file mode 100644
index 0000000..572f317
--- /dev/null
+++ b/sys/dev/vinum/vinumstate.h
@@ -0,0 +1,257 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This file gets read by makestatetext to create text files
+ * with the names of the states, so don't change the file
+ * format
+ */
+
+enum volumestate {
+ volume_unallocated,
+ /* volume_unallocated: present but unused. Must be 0 */
+
+ volume_uninit,
+ /* volume_uninit: mentioned elsewhere but not known to the configuration */
+
+ volume_down,
+
+ /* volume_up: the volume is up and functional, but not all plexes may be available */
+ volume_up,
+ volume_laststate = volume_up /* last value, for table dimensions */
+};
+
+enum plexstate {
+ /* An empty entry, not a plex at all. */
+ plex_unallocated,
+
+ /* The plex has been referenced by a volume */
+ plex_referenced,
+ /*
+ * The plex has been allocated, but its configuration
+ * is not complete
+ */
+ plex_init,
+
+ /*
+ * A plex which has gone completely down because of
+ * I/O errors.
+ */
+ plex_faulty,
+
+ /*
+ * A plex which has been taken down by the
+ * administrator.
+ */
+ plex_down,
+
+ /* A plex which is being initialized */
+ plex_initializing,
+
+ /*
+ * *** The remaining states represent plexes which are
+ * at least partially up. Keep these separate so that
+ * they can be checked more easily.
+ *
+ * The numeric order of the values matters: vpstate()
+ * compares plex states against plex_degraded with
+ * relational operators.
+ */
+
+ /*
+ * A plex entry which is at least partially up. Not
+ * all subdisks are available, and an inconsistency
+ * has occurred. If no other plex is uncorrupted,
+ * the volume is no longer consistent.
+ */
+ plex_corrupt,
+
+ plex_firstup = plex_corrupt, /* first "up" state */
+
+ /*
+ * A RAID-5 plex entry which is accessible, but one
+ * subdisk is down, requiring recovery for many
+ * I/O requests.
+ */
+ plex_degraded,
+
+ /*
+ * A plex which is really up, but which has a reborn
+ * subdisk which we don't completely trust, and
+ * which we don't want to read if we can avoid it
+ */
+ plex_flaky,
+
+ /*
+ * A plex entry which is completely up. All subdisks
+ * are up.
+ */
+ plex_up,
+
+ plex_laststate = plex_up /* last value, for table dimensions */
+};
+
+/*
+ * subdisk states
+ *
+ * The numeric order of the values matters: callers compare
+ * states with relational operators, for example setstate()
+ * treats any subdisk whose state is <= sd_referenced as
+ * invalid.
+ */
+enum sdstate {
+ /* An empty entry, not a subdisk at all. */
+ sd_unallocated,
+
+ /*
+ * A subdisk entry which has not been created
+ * completely. Some fields may be empty.
+ */
+ sd_uninit,
+
+ /* The subdisk has been referenced by a plex */
+ sd_referenced,
+
+ /*
+ * A subdisk entry which has been created completely.
+ * All fields are correct, but the disk hasn't
+ * been updated.
+ */
+ sd_init,
+
+ /*
+ * A subdisk entry which has been created completely.
+ * All fields are correct, and the disk has been
+ * updated, but there is no data on the disk.
+ */
+ sd_empty,
+
+ /*
+ * A subdisk entry which has been created completely and
+ * which is currently being initialized
+ */
+ sd_initializing,
+
+ /*
+ * A subdisk entry which has been initialized,
+ * but which can't come up because it would
+ * cause inconsistencies.
+ */
+ sd_initialized,
+
+ /* *** The following states represent invalid data */
+ /*
+ * A subdisk entry which has been created completely.
+ * All fields are correct, the config on disk has been
+ * updated, and the data was valid, but since then the
+ * drive has been taken down, and as a result updates
+ * have been missed.
+ */
+ sd_obsolete,
+
+ /*
+ * A subdisk entry which has been created completely.
+ * All fields are correct, the disk has been updated,
+ * and the data was valid, but since then the drive
+ * has been crashed and updates have been lost.
+ */
+ sd_stale,
+
+ /* *** The following states represent valid, inaccessible data */
+
+ /*
+ * A subdisk entry which has been created completely.
+ * All fields are correct, the disk has been updated,
+ * and the data was valid, but since then the drive
+ * has gone down. No attempt has been made to write
+ * to the subdisk since the crash, so the data is valid.
+ */
+ sd_crashed,
+
+ /*
+ * A subdisk entry which was up, which contained
+ * valid data, and which was taken down by the
+ * administrator. The data is valid.
+ */
+ sd_down,
+
+ /*
+ * *** This is invalid data (the subdisk previously had
+ * a numerically lower state), but it is currently in the
+ * process of being revived. We can write but not read.
+ */
+ sd_reviving,
+
+ /*
+ * *** The following states represent accessible subdisks
+ * with valid data
+ */
+
+ /*
+ * A subdisk entry which has been created completely.
+ * All fields are correct, the disk has been updated,
+ * and the data was valid, but since then the drive
+ * has gone down and up again. No updates were lost,
+ * but it is possible that the subdisk has been
+ * damaged. We won't read from this subdisk if we
+ * have a choice. If this is the only subdisk which
+ * covers this address space in the plex, we set its
+ * state to sd_up under these circumstances, so this
+ * status implies that there is another subdisk to
+ * fulfil the request.
+ */
+ sd_reborn,
+
+ /*
+ * A subdisk entry which has been created completely.
+ * All fields are correct, the disk has been updated,
+ * and the data is valid.
+ */
+ sd_up,
+
+ sd_laststate = sd_up /* last value, for table dimensions */
+};
+
+enum drivestate {
+ drive_unallocated,
+ /* drive_unallocated: present but unused. Must be 0 */
+
+ drive_referenced,
+ /* drive_referenced: just mentioned in some other config entry */
+
+ drive_down,
+ /* drive_down: not accessible */
+
+ drive_up,
+ /* drive_up: up and running */
+
+ drive_laststate = drive_up /* last value, for table dimensions */
+};
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumutil.c b/sys/dev/vinum/vinumutil.c
new file mode 100644
index 0000000..5d3fe82
--- /dev/null
+++ b/sys/dev/vinum/vinumutil.c
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumutil.c,v 1.17 2003/04/28 02:54:43 grog Exp $
+ * $FreeBSD$
+ */
+
+/* This file contains utility routines used both in kernel and user context */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/statetexts.h>
+#ifndef _KERNEL
+#include <stdio.h>
+#include <string.h>
+extern jmp_buf command_fail; /* return on a failed command */
+#endif
+
+static char numeric_state[32]; /* temporary buffer for ASCII conversions; shared by every routine in this file which formats an "Invalid ..." message, so each such call overwrites the previous text */
+#define STATECOUNT(x) (sizeof (x##statetext) / sizeof (char *)) /* number of entries in the x##statetext array */
+/*
+ * Return the name of a drive state.  A value outside the
+ * drivestatetext table is formatted into numeric_state instead.
+ */
+char *
+drive_state(enum drivestate state)
+{
+    if (((unsigned) state) < STATECOUNT(drive))             /* known state: use the table */
+        return drivestatetext[state];
+
+    sprintf(numeric_state, "Invalid state %d", (int) state);
+    return numeric_state;
+}
+
+/*
+ * Return the name of a volume state.  A value outside the
+ * volstatetext table is formatted into numeric_state instead.
+ */
+char *
+volume_state(enum volumestate state)
+{
+    if (((unsigned) state) < STATECOUNT(vol))               /* known state: use the table */
+        return volstatetext[state];
+
+    sprintf(numeric_state, "Invalid state %d", (int) state);
+    return numeric_state;
+}
+
+/*
+ * Return the name of a plex state.  A value outside the
+ * plexstatetext table is formatted into numeric_state instead.
+ */
+char *
+plex_state(enum plexstate state)
+{
+    if (((unsigned) state) < STATECOUNT(plex))              /* known state: use the table */
+        return plexstatetext[state];
+
+    sprintf(numeric_state, "Invalid state %d", (int) state);
+    return numeric_state;
+}
+
+/*
+ * Return the name of a plex organization.  An unknown value is
+ * formatted into numeric_state as "Invalid org N".
+ */
+char *
+plex_org(enum plexorg org)
+{
+    switch (org) {
+    case plex_disorg:                                       /* disorganized */
+        return "disorg";
+
+    case plex_concat:                                       /* concatenated plex */
+        return "concat";
+
+    case plex_striped:                                      /* striped plex */
+        return "striped";
+
+    case plex_raid4:                                        /* RAID-4 plex */
+        return "raid4";
+
+    case plex_raid5:                                        /* RAID-5 plex */
+        return "raid5";
+
+    default:
+        break;
+    }
+    sprintf(numeric_state, "Invalid org %d", (int) org);
+    return numeric_state;
+}
+
+/*
+ * Return the name of a subdisk state.  A value outside the
+ * sdstatetext table is formatted into numeric_state instead.
+ */
+char *
+sd_state(enum sdstate state)
+{
+    if (((unsigned) state) < STATECOUNT(sd))                /* known state: use the table */
+        return sdstatetext[state];
+
+    sprintf(numeric_state, "Invalid state %d", (int) state);
+    return numeric_state;
+}
+
+/*
+ * The routines from here on convert in the other direction, from a
+ * state name to the state value.  They are currently used only
+ * internally, so they don't do much error checking.
+ */
+
+/* Look a name up in drivestatetext; return its index, or -1 if absent */
+enum drivestate
+DriveState(char *text)
+{
+    int idx = 0;
+
+    while (idx < STATECOUNT(drive)) {
+        if (strcmp(text, drivestatetext[idx]) == 0)         /* found it */
+            return (enum drivestate) idx;
+        idx++;
+    }
+    return -1;
+}
+
+/* Look a name up in sdstatetext; return its index, or -1 if absent */
+enum sdstate
+SdState(char *text)
+{
+    int idx = 0;
+
+    while (idx < STATECOUNT(sd)) {
+        if (strcmp(text, sdstatetext[idx]) == 0)            /* found it */
+            return (enum sdstate) idx;
+        idx++;
+    }
+    return -1;
+}
+
+/* Look a name up in plexstatetext; return its index, or -1 if absent */
+enum plexstate
+PlexState(char *text)
+{
+    int idx = 0;
+
+    while (idx < STATECOUNT(plex)) {
+        if (strcmp(text, plexstatetext[idx]) == 0)          /* found it */
+            return (enum plexstate) idx;
+        idx++;
+    }
+    return -1;
+}
+
+/* Look a name up in volstatetext; return its index, or -1 if absent */
+enum volumestate
+VolState(char *text)
+{
+    int idx = 0;
+
+    while (idx < STATECOUNT(vol)) {
+        if (strcmp(text, volstatetext[idx]) == 0)           /* found it */
+            return (enum volumestate) idx;
+        idx++;
+    }
+    return -1;
+}
+
+/*
+ * Convert a length specification -- an optional minus sign, a decimal
+ * number and an optional scale letter -- to a number of bytes.
+ *
+ * The scale letters, accepted in either case, are:
+ *
+ *   s  sectors (of 512 bytes)
+ *   b  blocks (of 512 bytes).  This unit is deprecated, because
+ *      it's confusing, but maintained to avoid confusing Veritas
+ *      users.
+ *   k  kilobytes (1024 bytes)
+ *   m  megabytes (of 1024 * 1024 bytes)
+ *   g  gigabytes (of 1024 * 1024 * 1024 bytes)
+ *
+ * Only the first character after the digits is looked at.  The result
+ * is unsigned, so a negative specification comes back wrapped modulo
+ * 2^64.
+ *
+ * A malformed or missing (NULL) specification is reported with
+ * throw_rude_remark() in the kernel; in userland a message is written
+ * to stderr and control goes to command_fail via longjmp().
+ */
+u_int64_t
+sizespec(char *spec)
+{
+    if (spec != NULL) {                                     /* we have a parameter */
+        char *cursor = spec;
+        u_int64_t value = 0;
+        u_int64_t scale = 0;                                /* stays 0 unless the suffix is valid */
+        int sign = 1;                                       /* -1 if negative */
+
+        if (*cursor == '-') {
+            sign = -1;
+            cursor++;
+        }
+        if ((*cursor >= '0') && (*cursor <= '9')) {         /* at least one digit */
+            do {
+                value = value * 10 + *cursor - '0';
+                cursor++;
+            } while ((*cursor >= '0') && (*cursor <= '9'));
+
+            switch (*cursor) {
+            case '\0':
+                scale = 1;
+                break;
+
+            case 'B':
+            case 'b':
+            case 'S':
+            case 's':
+                scale = 512;
+                break;
+
+            case 'K':
+            case 'k':
+                scale = 1024;
+                break;
+
+            case 'M':
+            case 'm':
+                scale = 1024 * 1024;
+                break;
+
+            case 'G':
+            case 'g':
+                scale = 1024 * 1024 * 1024;
+                break;
+
+            default:                                        /* unknown suffix: fall out to the error */
+                break;
+            }
+            if (scale != 0)
+                return value * sign * scale;
+        }
+#ifdef _KERNEL
+        throw_rude_remark(EINVAL, "Invalid length specification: %s", spec);
+#else
+        fprintf(stderr, "Invalid length specification: %s", spec);
+        longjmp(command_fail, 1);
+#endif
+    }
+#ifdef _KERNEL
+    throw_rude_remark(EINVAL, "Missing length specification");
+#else
+    fprintf(stderr, "Missing length specification");
+    longjmp(command_fail, 1);
+#endif
+    /* NOTREACHED */
+    return -1;
+}
+
+/*
+ * Extract the volume number from a device number.  The number is
+ * built from bits 16-29 of the minor number, moved down by 8, and
+ * the low 8 bits.  Return -1 if the device is not of volume type or
+ * if the number is one of the superdevs.
+ */
+int
+Volno(dev_t dev)
+{
+    int rawminor = minor(dev);
+    int volno;
+
+    if (OBJTYPE(dev) != VINUM_VOLUME_TYPE)                  /* wrong kind of device */
+        return -1;
+
+    volno = ((rawminor & 0x3fff0000) >> 8) | (rawminor & 0xff);
+    if ((volno == VINUM_SUPERDEV_VOL)
+        || (volno == VINUM_DAEMON_VOL))                     /* superdevs aren't volumes */
+        return -1;
+    return volno;
+}
+
+/*
+ * Extract a plex number from a device number: bits 16-29 of the
+ * minor number, moved down by 8, combined with the low 8 bits.
+ * The major number is not checked, but the type is: return -1 if
+ * the device is not of plex type.
+ */
+int
+Plexno(dev_t dev)
+{
+    int rawminor = minor(dev);
+
+    if (OBJTYPE(dev) != VINUM_PLEX_TYPE)                    /* wrong kind of device */
+        return -1;
+    return ((rawminor & 0x3fff0000) >> 8) | (rawminor & 0xff);
+}
+
+/*
+ * Extract a subdisk number from a device number.  The major number
+ * is not checked, but the type is: return -1 for invalid types.
+ *
+ * Care: VINUM_SD_TYPE is 2 or 3, which is why the type test uses <
+ * instead of !=.  It's not clear that abstracting it to this level
+ * makes any sense.
+ *
+ * The number returned includes the low-order bit of the type field
+ * (the mask covers bits 16-30 of the minor number).  This gives us
+ * twice as many potential subdisks as plexes or volumes.
+ */
+int
+Sdno(dev_t dev)
+{
+    int rawminor = minor(dev);
+
+    if (OBJTYPE(dev) < VINUM_SD_TYPE)                       /* not a subdisk device */
+        return -1;
+    return ((rawminor & 0x7fff0000) >> 8) | (rawminor & 0xff);
+}
diff --git a/sys/dev/vinum/vinumutil.h b/sys/dev/vinum/vinumutil.h
new file mode 100644
index 0000000..2efa42c
--- /dev/null
+++ b/sys/dev/vinum/vinumutil.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumutil.h,v 1.1 2001/05/22 04:07:22 grog Exp grog $
+ * $FreeBSD$
+ */
+
+/*
+ * Functions defined in vinumutil.c, which is used both in userland
+ * and in the kernel.
+ *
+ * drive_state(), volume_state(), plex_state(), sd_state() and
+ * plex_org() return the name of a state or plex organization.  For a
+ * value they do not know, they return a pointer to a single static
+ * buffer holding an "Invalid ..." text, which the next such call
+ * overwrites.
+ *
+ * DriveState(), SdState(), PlexState() and VolState() go the other
+ * way: they return the state whose name matches text exactly, or -1
+ * if no name matches.
+ */
+char *drive_state(enum drivestate);
+char *volume_state(enum volumestate);
+char *plex_state(enum plexstate);
+char *plex_org(enum plexorg);
+char *sd_state(enum sdstate);
+enum drivestate DriveState(char *text);
+enum sdstate SdState(char *text);
+enum plexstate PlexState(char *text);
+enum volumestate VolState(char *text);
diff --git a/sys/dev/vinum/vinumvar.h b/sys/dev/vinum/vinumvar.h
new file mode 100644
index 0000000..8c6a07b
--- /dev/null
+++ b/sys/dev/vinum/vinumvar.h
@@ -0,0 +1,400 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: vinumvar.h,v 1.33 2003/05/23 01:09:23 grog Exp $
+ * $FreeBSD$
+ */
+
+#include <sys/time.h>
+#include <dev/vinum/vinumstate.h>
+#include <sys/mutex.h>
+
+/* Directory for device nodes; VINUM_SUPERDEV_NAME and VINUM_DAEMON_DEV_NAME are built from it. */
+#define VINUM_DIR "/dev/vinum"
+
+/*
+ * Some configuration maxima. They're an enum because
+ * we can't define global constants. Sorry about that.
+ *
+ * These aren't as bad as they look: most of them are soft limits.
+ */
+
+#define VINUMROOT
+enum constants {
+ /*
+ * Current version of the data structures. This
+ * is used to ensure synchronization between
+ * kernel module and userland vinum(8).
+ */
+ VINUMVERSION = 1,
+ VINUM_HEADER = 512, /* size of header on disk */
+ MAXCONFIGLINE = 1024, /* maximum size of a single config line */
+ MINVINUMSLICE = 1048576, /* minimum size of a slice */
+
+ VINUM_CDEV_MAJOR = 91, /* major number for character device */
+
+ ROUND_ROBIN_READPOL = -1, /* round robin read policy */
+
+ /*
+ * Type field in high-order two bits of minor
+ * number. Subdisks are in fact both type 2 and
+ * type 3, giving twice the number of subdisks.
+ * This causes some ugliness in the code.
+ */
+ VINUM_VOLUME_TYPE = 0,
+ VINUM_PLEX_TYPE = 1,
+ VINUM_SD_TYPE = 2,
+ VINUM_SD2_TYPE = 3,
+
+
+ /*
+ * Minor number: bits 0-7 of o stay put, bits 8-21 of o go to
+ * bits 16-29, type t goes to bits 30-31. Called by the macros below.
+ * NOTE(review): t << 30 overflows a 32-bit int for types 2 and 3.
+ */
+#define VINUMMINOR(o,t) (((o) & 0xff) | (((o) & 0x3fff00) << 8) | ((t) << VINUM_TYPE_SHIFT))
+
+ VINUM_TYPE_SHIFT = 30,
+ VINUM_MAXVOL = 0x3ffffd, /* highest numbered volume */
+
+ /*
+ * The super device and the daemon device are
+ * magic: they're the two highest-numbered
+ * volumes.
+ */
+ VINUM_SUPERDEV_VOL = 0x3ffffe,
+ VINUM_DAEMON_VOL = 0x3fffff,
+ VINUM_MAXPLEX = 0x3fffff,
+ VINUM_MAXSD = 0x7fffff,
+
+#define VINUM_SUPERDEV_MINOR VINUMMINOR (VINUM_SUPERDEV_VOL, VINUM_VOLUME_TYPE)
+#define VINUM_DAEMON_MINOR VINUMMINOR (VINUM_DAEMON_VOL, VINUM_VOLUME_TYPE)
+
+ /*
+ * Mask for the number part of each object.
+ * Plexes and volumes are the same, subdisks use
+ * the low-order bit of the type field and thus have twice
+ * the number. NOTE(review): no mask constant is defined below.
+ */
+
+ MAJORDEV_SHIFT = 8,
+
+ MAXPLEX = 8, /* maximum number of plexes in a volume */
+ MAXSD = 256, /* maximum number of subdisks in a plex */
+ MAXDRIVENAME = 32, /* maximum length of a device name */
+ MAXSDNAME = 64, /* maximum length of a subdisk name */
+ MAXPLEXNAME = 64, /* maximum length of a plex name */
+ MAXVOLNAME = 64, /* maximum length of a volume name */
+ MAXNAME = 64, /* maximum length of any name */
+
+
+#define OBJTYPE(x) ((minor(x) >> VINUM_TYPE_SHIFT) & 3)
+
+ /* Create device numbers: vinum major plus a VINUMMINOR minor */
+#define VINUMDEV(o, t) makedev (VINUM_CDEV_MAJOR, VINUMMINOR (o, t))
+
+#define VINUM_VOL(v) makedev (VINUM_CDEV_MAJOR, \
+ VINUMMINOR (v, VINUM_VOLUME_TYPE))
+#define VINUM_PLEX(p) makedev (VINUM_CDEV_MAJOR, \
+ VINUMMINOR (p, VINUM_PLEX_TYPE))
+#define VINUM_SD(s) makedev (VINUM_CDEV_MAJOR, \
+ VINUMMINOR (s, VINUM_SD_TYPE))
+
+ /* extract device type (same expansion as OBJTYPE) */
+#define DEVTYPE(x) ((minor (x) >> VINUM_TYPE_SHIFT) & 3)
+
+#define VINUM_SUPERDEV_NAME VINUM_DIR"/control" /* normal super device */
+#define VINUM_DAEMON_DEV_NAME VINUM_DIR"/controld" /* super device for daemon only */
+
+ /*
+ * the number of object entries to cater for initially, and also the
+ * value by which they are incremented. It doesn't take long
+ * to extend them, so theoretically we could start with 1 of each, but
+ * it's untidy to allocate such small areas. These values are
+ * probably too small.
+ */
+
+ INITIAL_DRIVES = 4,
+ INITIAL_VOLUMES = 4,
+ INITIAL_PLEXES = 8,
+ INITIAL_SUBDISKS = 16,
+ INITIAL_SUBDISKS_IN_PLEX = 4, /* number of subdisks to allocate to a plex */
+ INITIAL_SUBDISKS_IN_DRIVE = 4, /* number of subdisks to allocate to a drive */
+ INITIAL_DRIVE_FREELIST = 16, /* number of entries in drive freelist */
+ PLEX_REGION_TABLE_SIZE = 8, /* number of entries in plex region tables */
+ PLEX_LOCKS = 256, /* number of locks to allocate to a plex */
+ PLEXMUTEXES = 32,
+ MAX_REVIVE_BLOCKSIZE = MAXPHYS, /* maximum revive block size */
+ DEFAULT_REVIVE_BLOCKSIZE = 65536, /* default revive block size */
+ VINUMHOSTNAMELEN = 32, /* host name field in label */
+};
+
+/*
+ * Slice header
+ *
+ * Vinum drives start with this structure:
+ *
+ *\ Sector
+ * |--------------------------------------|
+ * | PDP-11 memorial boot block | 0
+ * |--------------------------------------|
+ * | Disk label, maybe | 1
+ * |--------------------------------------|
+ * | Slice definition (vinum_hdr) | 8
+ * |--------------------------------------|
+ * | |
+ * | Configuration info, first copy | 9
+ * | |
+ * |--------------------------------------|
+ * | |
+ * | Configuration info, second copy | 9 + size of config
+ * | |
+ * |--------------------------------------|
+ */
+
+/* Where our on-disk information lives, and how big it is */
+enum {
+    VINUM_LABEL_OFFSET = 8 * 512, /* the vinum label starts here */
+    VINUMHEADERLEN = 512, /* and is this long */
+    VINUM_CONFIG_OFFSET = VINUM_LABEL_OFFSET + VINUMHEADERLEN, /* first config copy follows the label */
+    MAXCONFIG = 64 * 1024, /* size of one config copy */
+    DATASTART = (VINUM_CONFIG_OFFSET + 2 * MAXCONFIG) / DEV_BSIZE /* data starts here, in DEV_BSIZE units */
+};
+
+/*
+ * Drive label, embedded in struct vinum_hdr. A full hostname
+ * can be 256 bytes long, but we use the name only to identify
+ * this system, so sysname keeps just VINUMHOSTNAMELEN (32)
+ * bytes of it.
+ */
+
+struct vinum_label {
+ char sysname[VINUMHOSTNAMELEN]; /* system name at time of creation */
+ char name[MAXDRIVENAME]; /* our name of the drive */
+ struct timeval date_of_birth; /* the time it was created */
+ struct timeval last_update; /* and the time of last update */
+ /*
+ * Total size in bytes of the drive. This value
+ * includes the headers.
+ */
+ off_t drive_size;
+};
+
+struct vinum_hdr { /* the "slice definition" in the layout diagram above */
+ uint64_t magic; /* 64-bit magic number: one of the two values below */
+#define VINUM_MAGIC 22322600044678729LL /* should be this */
+#define VINUM_NOMAGIC 22322600044678990LL /* becomes this after obliteration */
+ /*
+ * Size in bytes of each copy of the
+ * configuration info. This must be a multiple
+ * of the sector size.
+ */
+ int config_length;
+ struct vinum_label label; /* unique label */
+};
+
+/* Result codes handed back by read_drive_label */
+enum drive_label_info {
+    DL_CANT_OPEN = 0, /* the partition is invalid */
+    DL_NOT_OURS = 1, /* the partition is valid but has no vinum label */
+    DL_DELETED_LABEL = 2, /* the partition is valid and has a deleted label */
+    DL_WRONG_DRIVE = 3, /* the drive name doesn't match */
+    DL_OURS = 4 /* the partition is valid and a label was found */
+};
+
+/* Plex organizations; isstriped() and isparity() rely on this ordering */
+enum plexorg {
+    plex_disorg = 0, /* disorganized */
+    plex_concat = 1, /* concatenated plex */
+    plex_striped = 2, /* striped plex */
+    plex_raid4 = 3, /* RAID4 plex */
+    plex_raid5 = 4 /* RAID5 plex */
+};
+
+/* Recognize plex organizations; p points to an object with an organization member */
+#define isstriped(p) ((p)->organization >= plex_striped) /* striped (RAID-0), RAID-4 or RAID-5 */
+#define isparity(p) ((p)->organization >= plex_raid4) /* RAID-4 or RAID-5 */
+
+/* Address range definition, used for locking volumes */
+struct rangelock {
+ daddr_t stripe; /* address + 1 of the range being locked (presumably so that 0 can mean "unused" -- TODO confirm) */
+ struct buf *bp; /* user's buffer pointer */
+};
+
+struct drive_freelist { /* one entry in the sorted list of free space on a drive */
+    uint64_t offset; /* offset of this entry */
+    uint64_t sectors; /* its length, in sectors */
+};
+
+/*
+ * Include the structure definitions shared
+ * between userland and kernel.
+ */
+
+#ifdef _KERNEL
+#include <dev/vinum/vinumobj.h> /* first pass: _KERNEL defined */
+#undef _KERNEL
+#include <dev/vinum/vinumobj.h> /* second pass: _KERNEL undefined */
+#define _KERNEL /* put back, but defined empty: any earlier value is lost */
+#else
+#include <dev/vinum/vinumobj.h>
+#endif
+
+/*
+ * Table expansion. Grow table, which currently holds oldcount
+ * entries of type element, by increment entries, and add increment
+ * to oldcount. Expands to a bare { } block, not do { } while (0).
+ */
+#ifdef VINUMDEBUG
+#define EXPAND(table, element, oldcount, increment) \
+{ \
+ expand_table ((void **) &(table), \
+ (oldcount) * sizeof (element), \
+ ((oldcount) + (increment)) * sizeof (element), \
+ __FILE__, \
+ __LINE__ ); \
+ (oldcount) += (increment); \
+ }
+#else
+#define EXPAND(table, element, oldcount, increment) \
+{ \
+ expand_table ((void **) &(table), \
+ (oldcount) * sizeof (element), \
+ ((oldcount) + (increment)) * sizeof (element)); \
+ (oldcount) += (increment); \
+ }
+#endif
+
+/* Information on vinum's memory usage */
+struct meminfo {
+ int mallocs; /* number of malloced blocks */
+ int total_malloced; /* total amount malloced (presumably bytes -- TODO confirm) */
+ int highwater; /* maximum number of mallocs */
+ struct mc *malloced; /* pointer to kernel table of struct mc entries */
+};
+
+#define MCFILENAMELEN 16 /* size of the file field in struct mc */
+struct mc { /* entry type of the table that meminfo.malloced points to */
+ struct timeval time; /* presumably when the block was allocated -- TODO confirm */
+ int seq; /* presumably an allocation sequence number -- TODO confirm */
+ int size; /* presumably the size of the block -- TODO confirm */
+ short line; /* presumably the caller's source line (cf. __LINE__ in EXPAND) -- TODO confirm */
+ caddr_t address; /* presumably the address of the block -- TODO confirm */
+ char file[MCFILENAMELEN]; /* presumably the caller's source file (cf. __FILE__ in EXPAND) -- TODO confirm */
+};
+
+/*
+ * State of a plex relative to the other plexes of its
+ * volume, used by the state transition routines. Bit map:
+ *
+ * Bit 0 (1): other plexes in the volume are down
+ * Bit 1 (2): other plexes in the volume are up
+ * Bit 2 (4): the current plex is up
+ *
+ * Maybe this should be local to state.c
+ */
+enum volplexstate {
+    volplex_onlyusdown = 0, /* we're the only plex, and we're down */
+    volplex_alldown = 1, /* another plex is down, and so are we */
+    volplex_otherup = 2, /* another plex is up */
+    volplex_otherupdown = 3, /* other plexes are up and down */
+    volplex_onlyus = 4, /* we're up and alone */
+    volplex_onlyusup = 5, /* only we are up, others are down */
+    volplex_allup = 6, /* all plexes are up */
+    volplex_someup = 7 /* some plexes are up, including us */
+};
+
+/* State map for plex: one bit per subdisk state */
+enum sdstates {
+    sd_emptystate = 1 << 0, /* SD is empty */
+    sd_downstate = 1 << 1, /* SD is down */
+    sd_crashedstate = 1 << 2, /* SD is crashed */
+    sd_obsoletestate = 1 << 3, /* SD is obsolete */
+    sd_stalestate = 1 << 4, /* SD is stale */
+    sd_rebornstate = 1 << 5, /* SD is reborn */
+    sd_upstate = 1 << 6, /* SD is up */
+    sd_initstate = 1 << 7, /* SD is initializing */
+    sd_initializedstate = 1 << 8, /* SD is initialized */
+    sd_otherstate = 1 << 9, /* SD is in some other state */
+};
+
+/*
+ * Flags parameter for the set_<foo>_state functions.
+ * It is only a parameter, but the external
+ * definitions need to know it, so it has to be
+ * defined here.
+ */
+enum setstateflags {
+    setstate_none = 0x0, /* no flags */
+    setstate_force = 0x1, /* force the state change */
+    setstate_configuring = 0x2, /* we're currently configuring, don't save */
+};
+
+/* The operations that parityops can perform */
+enum parityop {
+    checkparity = 0,
+    rebuildparity = 1,
+    rebuildandcheckparity = 2, /* rebuildparity with the -v option */
+};
+
+#ifdef VINUMDEBUG
+/* Debug flag bits, one bit per facility */
+enum debugflags {
+    DEBUG_ADDRESSES = 1 << 0, /* show buffer information during requests */
+    DEBUG_NUMOUTPUT = 1 << 1, /* show the value of vp->v_numoutput */
+    DEBUG_RESID = 1 << 2, /* go into debugger in complete_rqe */
+    DEBUG_LASTREQS = 1 << 3, /* keep a circular buffer of last requests */
+    DEBUG_REVIVECONFLICT = 1 << 4, /* print info about revive conflicts */
+    DEBUG_EOFINFO = 1 << 5, /* print info about EOF detection */
+    DEBUG_MEMFREE = 1 << 6, /* keep info about Frees */
+    DEBUG_BIGDRIVE = 1 << 7, /* pretend our drives are 100 times the size */
+    DEBUG_REMOTEGDB = 1 << 8, /* go into remote gdb */
+    DEBUG_WARNINGS = 1 << 9, /* log various relatively harmless warnings */
+    DEBUG_LOCKREQS = 1 << 10, /* log locking requests */
+};
+
+#ifdef _KERNEL
+#ifdef __i386__
+#define longjmp LongJmp /* test our longjmps */
+#endif /* __i386__ */
+#endif /* _KERNEL */
+#endif /* VINUMDEBUG */
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
OpenPOWER on IntegriCloud