diff options
Diffstat (limited to 'sys/dev/vinum')
26 files changed, 12118 insertions, 0 deletions
diff --git a/sys/dev/vinum/COPYRIGHT b/sys/dev/vinum/COPYRIGHT new file mode 100644 index 0000000..f0295e6 --- /dev/null +++ b/sys/dev/vinum/COPYRIGHT @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $FreeBSD$ + */ diff --git a/sys/dev/vinum/makestatetext b/sys/dev/vinum/makestatetext new file mode 100755 index 0000000..c5a7da2 --- /dev/null +++ b/sys/dev/vinum/makestatetext @@ -0,0 +1,78 @@ +#!/bin/sh +# Make statetexts.h from vinumstate.h +# $FreeBSD$ +# $Id: makestatetext,v 1.7 1999/12/29 07:24:54 grog Exp grog $ +infile=vinumstate.h +ofile=statetexts.h +echo >$ofile "/* Created by $0 on" `date`. "Do not edit */" +echo >>$ofile +cat >> $ofile <<FOO +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called \`\`Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. 
Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided \`\`as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + */ + +FOO + +echo >>$ofile "/* Drive state texts */" +echo >>$ofile "char *drivestatetext [] = + { " +egrep -e 'drive_[A-z0-9]*,' <$infile | grep -v = | sed 's: *drive_\([^,]*\).*: \"\1\",:' >>$ofile +cat <<FOO >> $ofile + }; + +/* Subdisk state texts */ +char *sdstatetext [] = + { +FOO +egrep -e 'sd_[A-z0-9]*,' $infile | grep -v = | sed 's: *sd_\([^,]*\).*: \"\1\",:' >>$ofile +cat <<FOO >> $ofile + }; + +/* Plex state texts */ +char *plexstatetext [] = + { +FOO +egrep -e 'plex_[A-z0-9]*,' $infile | grep -v = | sed 's: *plex_\([^,]*\).*: \"\1\",:' >>$ofile +cat <<FOO >> $ofile + }; + +/* Volume state texts */ +char *volstatetext [] = + { +FOO +egrep -e 'volume_[A-z0-9]*,' $infile | grep -v = | sed 's: *volume_\([^,]*\).*: \"\1\",:' >>$ofile +cat <<FOO >> $ofile + }; +FOO diff --git a/sys/dev/vinum/request.h b/sys/dev/vinum/request.h new file mode 100644 index 0000000..600130f --- /dev/null +++ b/sys/dev/vinum/request.h @@ -0,0 +1,273 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. 
+ * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. 
+ * + * $Id: request.h,v 1.22 2003/04/24 04:37:08 grog Exp $ + * $FreeBSD$ + */ + +/* + * Information needed to set up a transfer. The + * single-bit values are flags; the composite values + * at the end of the list are masks that OR several + * of them together. + */ + +enum xferinfo { + XFR_NORMAL_READ = 1, + XFR_NORMAL_WRITE = 2, /* write request in normal mode */ + XFR_RECOVERY_READ = 4, + XFR_DEGRADED_WRITE = 8, + XFR_PARITYLESS_WRITE = 0x10, + XFR_NO_PARITY_STRIPE = 0x20, /* parity stripe is not available */ + XFR_DATA_BLOCK = 0x40, /* data block in request */ + XFR_PARITY_BLOCK = 0x80, /* parity block in request */ + XFR_BAD_SUBDISK = 0x100, /* this subdisk is dead */ + XFR_MALLOCED = 0x200, /* this buffer is malloced */ +#ifdef VINUMDEBUG + XFR_PHASE2 = 0x800, /* documentation only: 2nd phase write */ +#endif + XFR_REVIVECONFLICT = 0x1000, /* possible conflict with a revive operation */ + XFR_BUFLOCKED = 0x2000, /* BUF_LOCK performed on this buffer */ + XFR_COPYBUF = 0x4000, /* data buffer was copied */ + /* operations that need a parity block */ + XFR_PARITYOP = (XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE), + /* operations that use the group parameters */ + XFR_GROUPOP = (XFR_DEGRADED_WRITE | XFR_RECOVERY_READ), + /* operations that use the data parameters */ + XFR_DATAOP = (XFR_NORMAL_READ | XFR_NORMAL_WRITE | XFR_PARITYLESS_WRITE), + /* operations requiring read before write */ + XFR_RBW = (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE), + /* operations that need a malloced buffer */ + XFR_NEEDS_MALLOC = (XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE) +}; + +/* + * Describe one low-level request, part of a + * high-level request. This is an extended + * struct buf buffer, and the first element + * *must* be a struct buf. We pass this + * structure to the I/O routines instead of a + * struct buf in order to be able to locate the + * high-level request when it completes. + * + * All offsets and lengths are in sectors.
+ */ + +struct rqelement { + struct buf b; /* buf structure */ + struct rqgroup *rqg; /* pointer to our group */ + /* Information about the transfer */ + daddr_t sdoffset; /* offset in subdisk */ + int useroffset; /* offset in user buffer of normal data */ + /* + * dataoffset and datalen refer to "individual" data + * transfers which involve only this drive (normal read, + * parityless write) and also degraded write. + * + * groupoffset and grouplen refer to the other "group" + * operations (normal write, recovery read) which involve + * more than one drive. Both the offsets are relative to + * the start of the local buffer. + * + * NOTE(review): this split disagrees with enum + * xferinfo, where XFR_DATAOP includes normal write + * and XFR_GROUPOP includes degraded write. Confirm + * which description is intended. + */ + int dataoffset; /* offset in buffer of the normal data */ + int groupoffset; /* offset in buffer of group data */ + short datalen; /* length of normal data (sectors) */ + short grouplen; /* length of group data (sectors) */ + short buflen; /* total buffer length to allocate */ + short flags; /* really enum xferinfo (see above) */ + /* Ways to find other components */ + short sdno; /* subdisk number */ + short driveno; /* drive number */ + struct timeval launchtime; /* time of launch, for info function */ +}; + +/* + * A group of requests built to satisfy an I/O + * transfer on a single plex. + */ +struct rqgroup { + struct rqgroup *next; /* pointer to next group */ + struct request *rq; /* pointer to the request */ + short count; /* number of requests in this group */ + short active; /* and number active */ + short plexno; /* index of plex */ + int badsdno; /* index of bad subdisk or -1 */ + enum xferinfo flags; /* description of transfer */ + struct rangelock *lock; /* lock for this transfer */ + daddr_t lockbase; /* and lock address */ + struct rqelement rqe[0]; /* and the elements of this request */ +}; + +/* + * Describe one high-level request and the + * work we have to do to satisfy it.
+ */ +struct request { + struct buf *bp; /* pointer to the high-level request */ + caddr_t save_data; /* for copied write buffers */ + enum xferinfo flags; + union { + int volno; /* volume index */ + int plexno; /* or plex index */ + } volplex; + int error; /* current error indication */ + int sdno; /* reviving subdisk (XFR_REVIVECONFLICT) */ + short isplex; /* set if this is a plex request */ + short active; /* number of subrequests still active */ + struct rqgroup *rqg; /* pointer to the first group of requests */ + struct rqgroup *lrqg; /* and to the last group of requests */ + struct request *next; /* link of waiting requests */ +}; + +/* + * Extended buffer header for subdisk I/O. Includes + * a pointer to the user I/O request. + */ +struct sdbuf { + struct buf b; /* our buffer */ + struct buf *bp; /* and pointer to parent */ + short driveno; /* drive index */ + short sdno; /* and subdisk index */ +}; + +/* + * Values returned by rqe and friends. Be careful + * with these: they are in order of increasing + * seriousness. Some routines check for + * > REQUEST_RECOVERED to indicate a failed request. 
XXX + */ +enum requeststatus { + REQUEST_OK, /* request built OK */ + REQUEST_RECOVERED, /* request OK, but involves RAID5 recovery */ + REQUEST_DEGRADED, /* parts of request failed */ + REQUEST_EOF, /* parts of request failed: outside plex */ + REQUEST_DOWN, /* all of request failed: subdisk(s) down */ + REQUEST_ENOMEM /* all of request failed: ran out of memory */ +}; + +#ifdef VINUMDEBUG +/* Trace entry for request info (DEBUG_LASTREQS) */ +enum rqinfo_type { + loginfo_unused, /* never been used */ + loginfo_user_bp, /* this is the bp when strategy is called */ + loginfo_user_bpl, /* and this is the bp at launch time */ + loginfo_rqe, /* user RQE */ + loginfo_iodone, /* iodone */ + loginfo_raid5_data, /* write RAID-5 data block */ + loginfo_raid5_parity, /* write RAID-5 parity block */ + loginfo_sdio, /* subdisk I/O */ + loginfo_sdiol, /* subdisk I/O launch */ + loginfo_sdiodone, /* subdisk iodone */ + loginfo_lockwait, /* wait for range lock */ + loginfo_lock, /* lock range */ + loginfo_unlock, /* unlock range */ +}; + +/* + * This is the rangelock structure with an added + * buffer pointer and plex number. We don't need + * the plex number for the locking protocol, but + * it does help a lot when logging. 
+ */ +struct rangelockinfo { + daddr_t stripe; /* address + 1 of the range being locked */ + struct buf *bp; /* user's buffer pointer */ + int plexno; +}; + +union rqinfou { /* info to pass to logrq */ + struct buf *bp; + struct rqelement *rqe; /* address of request, for correlation */ + struct rangelockinfo *lockinfo; +}; + +struct rqinfo { + enum rqinfo_type type; /* kind of event */ + struct timeval timestamp; /* time it happened */ + struct buf *bp; /* point to user buffer */ + int devmajor; /* major and minor device info */ + int devminor; + union { + struct buf b; /* yup, the *whole* buffer header */ + struct rqelement rqe; /* and the whole rqe */ + struct rangelock lockinfo; + } info; +}; + +#define RQINFO_SIZE 128 /* number of info slots in buffer */ + +void logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp); +#endif + +/* Structures for the daemon */ + +/* types of request to the daemon */ +enum daemonrq { + daemonrq_none, /* dummy to catch bugs */ + daemonrq_ioerror, /* error occurred on I/O */ + daemonrq_saveconfig, /* save configuration */ + daemonrq_return, /* return to userland */ + daemonrq_ping, /* show sign of life */ + daemonrq_init, /* initialize a plex */ + daemonrq_revive, /* revive a subdisk */ + daemonrq_closedrive, /* close a drive */ +}; + +/* info field for daemon requests */ +union daemoninfo { /* and the request information */ + struct request *rq; /* for daemonrq_ioerror */ + struct sd *sd; /* for daemonrq_revive */ + struct plex *plex; /* for daemonrq_init */ + struct drive *drive; /* for daemonrq_closedrive */ + int nothing; /* for passing NULL */ +}; + +struct daemonq { + struct daemonq *next; /* pointer to next element in queue */ + enum daemonrq type; /* type of request */ + int privateinuse; /* private element, being used */ + union daemoninfo info; /* and the request information */ +}; + +void queue_daemon_request(enum daemonrq type, union daemoninfo info); + +extern int daemon_options; + +enum daemon_option { + 
daemon_verbose = 1, /* talk about what we're doing */ + daemon_stopped = 2, + daemon_noupdate = 4, /* don't update the disk config, for recovery */ +}; + +void freerq(struct request *rq); +void unlockrange(int plexno, struct rangelock *); +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/statetexts.h b/sys/dev/vinum/statetexts.h new file mode 100644 index 0000000..88cfc17 --- /dev/null +++ b/sys/dev/vinum/statetexts.h @@ -0,0 +1,91 @@ +/* Created by ./makestatetext on Wed Jan 5 10:05:30 CST 2000. Do not edit */ + +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $FreeBSD$ + */ + +/* Drive state texts */ +char *drivestatetext[] = +{ + "unallocated", + "referenced", + "down", + "up", +}; + +/* Subdisk state texts */ +char *sdstatetext[] = +{ + "unallocated", + "uninit", + "referenced", + "init", + "empty", + "initializing", + "initialized", + "obsolete", + "stale", + "crashed", + "down", + "reviving", + "reborn", + "up", +}; + +/* Plex state texts */ +char *plexstatetext[] = +{ + "unallocated", + "referenced", + "init", + "faulty", + "down", + "initializing", + "corrupt", + "degraded", + "flaky", + "up", +}; + +/* Volume state texts */ +char *volstatetext[] = +{ + "unallocated", + "uninit", + "down", + "up", +}; diff --git a/sys/dev/vinum/vinum.c b/sys/dev/vinum/vinum.c new file mode 100644 index 0000000..36dfa98 --- /dev/null +++ b/sys/dev/vinum/vinum.c @@ -0,0 +1,531 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. 
+ * + * $Id: vinum.c,v 1.44 2003/05/23 00:50:55 grog Exp grog $ + * $FreeBSD$ + */ + +#define STATIC static /* nothing while we're testing */ + +#include <dev/vinum/vinumhdr.h> +#include <sys/sysproto.h> /* for sync(2) */ +#ifdef VINUMDEBUG +#include <sys/reboot.h> +int debug = 0; /* debug flags */ +extern int total_malloced; +extern int malloccount; +extern struct mc malloced[]; +#endif +#include <dev/vinum/request.h> + +struct cdevsw vinum_cdevsw = +{ + .d_open = vinumopen, + .d_close = vinumclose, + .d_read = physread, + .d_write = physwrite, + .d_ioctl = vinumioctl, + .d_strategy = vinumstrategy, + .d_name = "vinum", + .d_maj = VINUM_CDEV_MAJOR, + .d_flags = D_DISK +}; + +/* Called by main() during pseudo-device attachment. */ +void vinumattach(void *); +STATIC int vinum_modevent(module_t mod, modeventtype_t type, void *unused); +STATIC void vinum_clone(void *arg, char *name, int namelen, dev_t * dev); + +struct _vinum_conf vinum_conf; /* configuration information */ + +dev_t vinum_daemon_dev; +dev_t vinum_super_dev; + +static eventhandler_tag dev_clone_tag; + +/* + * Mutexes for plex synchronization. Ideally each plex + * should have its own mutex, but the fact that the plex + * struct can move makes that very complicated. Instead, + * have plexes share these mutexes based on modulo plex + * number. + */ +struct mtx plexmutex[PLEXMUTEXES]; + +/* + * Called by main() during pseudo-device attachment. All we need + * to do is allocate enough space for devices to be configured later, and + * add devsw entries.
+ */ +void +vinumattach(void *dummy) +{ + char *envp; + int i; +#define MUTEXNAMELEN 16 + char mutexname[MUTEXNAMELEN]; +#if PLEXMUTEXES > 10000 +#error Increase size of MUTEXNAMELEN +#endif +/* modload should prevent multiple loads, so this is worth a panic */ + if ((vinum_conf.flags & VF_LOADED) != 0) + panic("vinum: already loaded"); + + log(LOG_INFO, "vinum: loaded\n"); +#ifdef VINUMDEBUG + vinum_conf.flags |= VF_LOADED | VF_HASDEBUG; /* we're loaded now, and we support debug */ +#else + vinum_conf.flags |= VF_LOADED; /* we're loaded now */ +#endif + + daemonq = NULL; /* initialize daemon's work queue */ + dqend = NULL; + + vinum_daemon_dev = make_dev(&vinum_cdevsw, + VINUM_DAEMON_MINOR, + UID_ROOT, + GID_WHEEL, + S_IRUSR | S_IWUSR, + "vinum/controld"); + vinum_super_dev = make_dev(&vinum_cdevsw, + VINUM_SUPERDEV_MINOR, + UID_ROOT, + GID_WHEEL, + S_IRUSR | S_IWUSR, + "vinum/control"); + + vinum_conf.version = VINUMVERSION; /* note what version we are */ + + /* allocate space: drives... */ + DRIVE = (struct drive *) Malloc(sizeof(struct drive) * INITIAL_DRIVES); + CHECKALLOC(DRIVE, "vinum: no memory\n"); + bzero(DRIVE, sizeof(struct drive) * INITIAL_DRIVES); + vinum_conf.drives_allocated = INITIAL_DRIVES; /* number of drive slots allocated */ + vinum_conf.drives_used = 0; /* and number in use */ + + /* volumes, ... */ + VOL = (struct volume *) Malloc(sizeof(struct volume) * INITIAL_VOLUMES); + CHECKALLOC(VOL, "vinum: no memory\n"); + bzero(VOL, sizeof(struct volume) * INITIAL_VOLUMES); + vinum_conf.volumes_allocated = INITIAL_VOLUMES; /* number of volume slots allocated */ + vinum_conf.volumes_used = 0; /* and number in use */ + + /* plexes, ... 
*/ + PLEX = (struct plex *) Malloc(sizeof(struct plex) * INITIAL_PLEXES); + CHECKALLOC(PLEX, "vinum: no memory\n"); + bzero(PLEX, sizeof(struct plex) * INITIAL_PLEXES); + vinum_conf.plexes_allocated = INITIAL_PLEXES; /* number of plex slots allocated */ + vinum_conf.plexes_used = 0; /* and number in use */ + + for (i = 0; i < PLEXMUTEXES; i++) { + snprintf(mutexname, MUTEXNAMELEN, "vinumplex%d", i); + mtx_init(&plexmutex[i], mutexname, "plex", MTX_DEF); + } + + /* and subdisks */ + SD = (struct sd *) Malloc(sizeof(struct sd) * INITIAL_SUBDISKS); + CHECKALLOC(SD, "vinum: no memory\n"); + bzero(SD, sizeof(struct sd) * INITIAL_SUBDISKS); + vinum_conf.subdisks_allocated = INITIAL_SUBDISKS; /* number of sd slots allocated */ + vinum_conf.subdisks_used = 0; /* and number in use */ + dev_clone_tag = EVENTHANDLER_REGISTER(dev_clone, vinum_clone, 0, 1000); + + /* + * See if the loader has passed us any of the autostart + * options. + */ + envp = NULL; + if ((envp = getenv("vinum.autostart")) != NULL) { /* start all drives now */ + vinum_scandisk(NULL); + freeenv(envp); + } else if ((envp = getenv("vinum.drives")) != NULL) { + vinum_scandisk(envp); + freeenv(envp); + } +} + +/* + * Check whether anything is open. If confopen is != 0, + * an open super device also counts; otherwise only + * volumes are checked. Only volumes in a state above + * volume_down are considered. + * + * Return 0 if something is open (vinum is active), + * 1 if nothing is open (vinum is inactive). + */ +int +vinum_inactive(int confopen) +{ + int i; + int can_do = 1; /* assume we can do it */ + + if (confopen && (vinum_conf.flags & VF_OPEN)) /* open by vinum(8)? */ + return 0; /* can't do it while we're open */ + lock_config(); + for (i = 0; i < vinum_conf.volumes_allocated; i++) { + if ((VOL[i].state > volume_down) + && (VOL[i].flags & VF_OPEN)) { /* volume is open */ + can_do = 0; + break; + } + } + unlock_config(); + return can_do; +} + +/* + * Free all structures. + * If cleardrive is 0, save the configuration; otherwise + * remove the configuration from the drive.
+ * + * Before coming here, ensure that no volumes are open. + */ +void +free_vinum(int cleardrive) +{ + int i; + int drives_allocated = vinum_conf.drives_allocated; + + while ((vinum_conf.flags & (VF_STOPPING | VF_DAEMONOPEN)) + == (VF_STOPPING | VF_DAEMONOPEN)) { /* at least one daemon open, we're stopping */ + queue_daemon_request(daemonrq_return, (union daemoninfo) 0); /* stop the daemon */ + tsleep(&vinumclose, PUSER, "vstop", 1); /* and wait for it */ + } + if (DRIVE != NULL) { + if (cleardrive) { /* remove the vinum config */ + for (i = 0; i < drives_allocated; i++) + remove_drive(i); /* remove the drive */ + } else { /* keep the config */ + for (i = 0; i < drives_allocated; i++) + free_drive(&DRIVE[i]); /* close files and things */ + } + Free(DRIVE); + } + if (SD != NULL) { + for (i = 0; i < vinum_conf.subdisks_allocated; i++) { + struct sd *sd = &SD[i]; + + if (sd->state != sd_unallocated) + free_sd(i); + } + Free(SD); + } + if (PLEX != NULL) { + for (i = 0; i < vinum_conf.plexes_allocated; i++) { + struct plex *plex = &PLEX[i]; + + if (plex->state != plex_unallocated) /* we have real data there */ + free_plex(i); + } + Free(PLEX); + } + if (VOL != NULL) { + for (i = 0; i < vinum_conf.volumes_allocated; i++) { + struct volume *volume = &VOL[i]; + + if (volume->state != volume_unallocated) + free_volume(i); + } + Free(VOL); + } + bzero(&vinum_conf, sizeof(vinum_conf)); + vinum_conf.version = VINUMVERSION; /* reinstate version number */ +} + +STATIC int +vinum_modevent(module_t mod, modeventtype_t type, void *unused) +{ + struct sync_args dummyarg = + {0}; + int i; + + switch (type) { + case MOD_LOAD: + vinumattach(NULL); + return 0; /* OK */ + case MOD_UNLOAD: + if (!vinum_inactive(1)) /* is anything open? 
*/ + return EBUSY; /* yes, we can't do it */ + vinum_conf.flags |= VF_STOPPING; /* note that we want to stop */ + sync(curthread, &dummyarg); /* write out buffers */ + free_vinum(0); /* clean up */ +#ifdef VINUMDEBUG + if (total_malloced) { + int i; +#ifdef INVARIANTS + int *poke; +#endif + + for (i = 0; i < malloccount; i++) { + if (debug & DEBUG_WARNINGS) /* want to hear about them */ + log(LOG_WARNING, + "vinum: exiting with %d bytes malloced from %s:%d\n", + malloced[i].size, + malloced[i].file, + malloced[i].line); +#ifdef INVARIANTS + poke = &((int *) malloced[i].address) + [malloced[i].size / (2 * sizeof(int))]; /* middle of the area */ + if (*poke == 0xdeadc0de) /* already freed */ + log(LOG_ERR, + "vinum: exiting with malloc table inconsistency at %p from %s:%d\n", + malloced[i].address, + malloced[i].file, + malloced[i].line); +#endif + Free(malloced[i].address); + } + } +#endif + destroy_dev(vinum_daemon_dev); /* daemon device */ + destroy_dev(vinum_super_dev); + for (i = 0; i < PLEXMUTEXES; i++) + mtx_destroy(&plexmutex[i]); + log(LOG_INFO, "vinum: unloaded\n"); /* tell the world */ + EVENTHANDLER_DEREGISTER(dev_clone, dev_clone_tag); + return 0; + default: + break; + } + return 0; +} + +static moduledata_t vinum_mod = +{ + "vinum", + (modeventhand_t) vinum_modevent, + 0 +}; +DECLARE_MODULE(vinum, vinum_mod, SI_SUB_RAID, SI_ORDER_MIDDLE); + +/* ARGSUSED */ +/* Open a vinum object */ +int +vinumopen(dev_t dev, + int flags, + int fmt, + struct thread *td) +{ + int error; + unsigned int index; + struct volume *vol; + struct plex *plex; + struct sd *sd; + int devminor; /* minor number */ + + devminor = minor(dev); + error = 0; + /* First, decide what we're looking at */ + switch (DEVTYPE(dev)) { + case VINUM_VOLUME_TYPE: + /* + * The super device and daemon device are the last two + * volume numbers, so check for them first. 
+ */ + if ((devminor == VINUM_DAEMON_MINOR) /* daemon device */ + ||(devminor == VINUM_SUPERDEV_MINOR)) { /* or normal super device */ + error = suser(td); /* are we root? */ + + if (error == 0) { /* yes, can do */ + if (devminor == VINUM_DAEMON_MINOR) /* daemon device */ + vinum_conf.flags |= VF_DAEMONOPEN; /* we're open */ + else /* superdev */ + vinum_conf.flags |= VF_OPEN; /* we're open */ + } + return error; + } + /* Must be a real volume. Check. */ + index = Volno(dev); + if (index >= vinum_conf.volumes_allocated) + return ENXIO; /* no such device */ + vol = &VOL[index]; + + switch (vol->state) { + case volume_unallocated: + case volume_uninit: + return ENXIO; + + case volume_up: + vol->flags |= VF_OPEN; /* note we're open */ + return 0; + + case volume_down: + return EIO; + + default: + return EINVAL; + } + + case VINUM_PLEX_TYPE: + index = Plexno(dev); /* get plex index in vinum_conf */ + if (index >= vinum_conf.plexes_allocated) + return ENXIO; /* no such device */ + plex = &PLEX[index]; + + switch (plex->state) { + case plex_unallocated: + return ENXIO; + + case plex_referenced: + return EINVAL; + + default: + plex->flags |= VF_OPEN; /* note we're open */ + return 0; + } + + case VINUM_SD_TYPE: + case VINUM_SD2_TYPE: + index = Sdno(dev); /* get the subdisk number */ + if (index >= vinum_conf.subdisks_allocated) /* not a valid SD entry */ + return ENXIO; /* no such device */ + sd = &SD[index]; + + /* + * Opening a subdisk is always a special operation, so + * we ignore the state as long as it represents a real + * subdisk. 
+ */ + switch (sd->state) { + case sd_unallocated: + return ENXIO; + + case sd_uninit: + case sd_referenced: + return EINVAL; + + default: + sd->flags |= VF_OPEN; /* note we're open */ + return 0; + } + } + return 0; /* to keep the compiler happy */ +} + +/* ARGSUSED */ +int +vinumclose(dev_t dev, + int flags, + int fmt, + struct thread *td) +{ + unsigned int index; + struct volume *vol; + int devminor; + + devminor = minor(dev); + /* First, decide what we're looking at */ + switch (DEVTYPE(dev)) { + case VINUM_VOLUME_TYPE: + /* + * The super device and daemon device are the last two + * volume numbers, so check for them first. + */ + if ((devminor == VINUM_DAEMON_MINOR) /* daemon device */ + ||(devminor == VINUM_SUPERDEV_MINOR)) { /* or normal super device */ + /* + * don't worry about whether we're root: + * nobody else would get this far. + */ + if (devminor == VINUM_SUPERDEV_MINOR) /* normal superdev */ + vinum_conf.flags &= ~VF_OPEN; /* no longer open */ + else { /* the daemon device */ + vinum_conf.flags &= ~VF_DAEMONOPEN; /* no longer open */ + if (vinum_conf.flags & VF_STOPPING) /* we're trying to stop, */ + wakeup(&vinumclose); /* we can continue now */ + } + return 0; + } + /* Real volume */ + index = Volno(dev); + if (index >= vinum_conf.volumes_allocated) + return ENXIO; /* no such device */ + vol = &VOL[index]; + + switch (vol->state) { + case volume_unallocated: + case volume_uninit: + return ENXIO; + + case volume_up: + vol->flags &= ~VF_OPEN; /* reset our flags */ + return 0; + + case volume_down: + return EIO; + + default: + return EINVAL; + } + + case VINUM_PLEX_TYPE: + if (Volno(dev) >= vinum_conf.volumes_allocated) + return ENXIO; + /* FALLTHROUGH */ + + case VINUM_SD_TYPE: + if ((Volno(dev) >= vinum_conf.volumes_allocated) || /* no such volume */ + (Plexno(dev) >= vinum_conf.plexes_allocated)) /* or no such plex */ + return ENXIO; /* no such device */ + /* FALLTHROUGH */ + + default: + return ENODEV; /* don't know what to do with these */ + } 
+} + +void +vinum_clone(void *arg, char *name, int namelen, dev_t * dev) +{ + struct volume *vol; + int i; + + if (*dev != NODEV) + return; + if (strncmp(name, "vinum/", sizeof("vinum/") - 1) != 0) + return; + + name += sizeof("vinum/") - 1; + if ((i = find_volume(name, 0)) == -1) + return; + + vol = &VOL[i]; + *dev = vol->dev; +} + + +/* Local Variables: */ +/* fill-column: 60 */ +/* End: */ diff --git a/sys/dev/vinum/vinumconfig.c b/sys/dev/vinum/vinumconfig.c new file mode 100644 index 0000000..2c00921 --- /dev/null +++ b/sys/dev/vinum/vinumconfig.c @@ -0,0 +1,2148 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumconfig.c,v 1.41 2003/05/23 00:57:34 grog Exp grog $ + * $FreeBSD$ + */ + +#define STATIC static + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> + +#define MAXTOKEN 64 /* maximum number of tokens in a line */ + +/* + * We can afford the luxury of global variables here, + * since start_config ensures that these functions + * are single-threaded. + */ + +/* These are indices in vinum_conf of the last-mentioned of each kind of object */ +static int current_drive; /* note the last drive we mention, for + * some defaults */ +static int current_plex; /* and the same for the last plex */ +static int current_volume; /* and the last volme */ +static struct _ioctl_reply *ioctl_reply; /* struct to return via ioctl */ + + +/* These values are used by most of these routines, so set them as globals */ +static char *token[MAXTOKEN]; /* pointers to individual tokens */ +static int tokens; /* number of tokens */ + +#define TOCONS 0x01 +#define TOTTY 0x02 +#define TOLOG 0x04 + +struct putchar_arg { + int flags; + struct tty *tty; +}; + +#define MSG_MAX 1024 /* maximum length of a formatted message */ +/* + * Format an error message and return to the user + * in the reply. CARE: This routine is designed + * to be called only from the configuration + * routines, so it assumes it's the owner of the + * configuration lock, and unlocks it on exit. + */ +void +throw_rude_remark(int error, char *msg,...) 
+{ + int retval; + va_list ap; + char *text; + static int finishing; /* don't recurse */ + int was_finishing; + + if ((vinum_conf.flags & VF_LOCKED) == 0) /* bug catcher */ + panic("throw_rude_remark: called without config lock"); + va_start(ap, msg); + if ((ioctl_reply != NULL) /* we're called from the user */ + &&(!(vinum_conf.flags & VF_READING_CONFIG))) { /* and not reading from disk: return msg */ + /* + * We can't just format to ioctl_reply, since it + * may contain our input parameters + */ + text = Malloc(MSG_MAX); + if (text == NULL) { + log(LOG_ERR, "vinum: can't allocate error message buffer\n"); + printf("vinum: "); + vprintf(msg, ap); /* print to the console */ + printf("\n"); + } else { + retval = kvprintf(msg, NULL, (void *) text, 10, ap); + text[retval] = '\0'; /* delimit */ + strlcpy(ioctl_reply->msg, text, sizeof(ioctl_reply->msg)); + ioctl_reply->error = error; /* first byte is the error number */ + Free(text); + } + } else { + printf("vinum: "); + vprintf(msg, ap); /* print to the console */ + printf("\n"); + } + va_end(ap); + + if (vinum_conf.flags & VF_READING_CONFIG) { /* go through to the bitter end, */ + if ((vinum_conf.flags & VF_READING_CONFIG) /* we're reading from disk, */ + &&((daemon_options & daemon_noupdate) == 0)) { + log(LOG_NOTICE, "Disabling configuration updates\n"); + daemon_options |= daemon_noupdate; + } + return; + } + /* + * We have a problem here: we want to unlock the + * configuration, which implies tidying up, but + * if we find an error while tidying up, we + * could recurse for ever. Use this kludge to + * only try once. + */ + was_finishing = finishing; + finishing = 1; + finish_config(was_finishing); /* unlock anything we may be holding */ + finishing = was_finishing; + longjmp(command_fail, error); +} + +/* + * Check a volume to see if the plex is already assigned to it. 
+ * Return index in volume->plex, or -1 if not assigned + */ +int +my_plex(int volno, int plexno) +{ + int i; + struct volume *vol; + + vol = &VOL[volno]; /* point to volno */ + for (i = 0; i < vol->plexes; i++) + if (vol->plex[i] == plexno) + return i; + return -1; /* not found */ +} + +/* + * Check a plex to see if the subdisk is already assigned to it. + * Return index in plex->sd, or -1 if not assigned + */ +int +my_sd(int plexno, int sdno) +{ + int i; + struct plex *plex; + + plex = &PLEX[plexno]; + for (i = 0; i < plex->subdisks; i++) + if (plex->sdnos[i] == sdno) + return i; + return -1; /* not found */ +} + +/* Add plex to the volume if possible */ +int +give_plex_to_volume(int volno, int plexno, int preferme) +{ + struct volume *vol; + int i; + int volplexno; + + /* + * It's not an error for the plex to already + * belong to the volume, but we need to check a + * number of things to make sure it's done right. + * Some day. + */ + volplexno = my_plex(volno, plexno); + vol = &VOL[volno]; /* point to volume */ + if (volplexno < 0) { + if (vol->plexes == MAXPLEX) /* all plexes allocated */ + throw_rude_remark(ENOSPC, + "Too many plexes for volume %s", + vol->name); + else if ((vol->plexes > 0) /* we have other plexes */ + &&((vol->flags & VF_CONFIG_SETUPSTATE) == 0)) /* and we're not setting up state */ + invalidate_subdisks(&PLEX[plexno], sd_stale); /* make our subdisks invalid */ + vol->plex[vol->plexes] = plexno; /* this one */ + vol->plexes++; /* add another plex */ + PLEX[plexno].volno = volno; /* note the number of our volume */ + + /* Find out how big our volume is */ + for (i = 0; i < vol->plexes; i++) + vol->size = max(vol->size, PLEX[vol->plex[i]].length); + volplexno = vol->plexes - 1; /* number of plex in volume */ + } + if (preferme) { + if (vol->preferred_plex >= 0) /* already had a facourite, */ + printf("vinum: changing preferred plex for %s from %s to %s\n", + vol->name, + PLEX[vol->plex[vol->preferred_plex]].name, + PLEX[plexno].name); + 
vol->preferred_plex = volplexno; + } + return volplexno; +} + +/* + * Add subdisk to a plex if possible + */ +int +give_sd_to_plex(int plexno, int sdno) +{ + int i; + struct plex *plex; + struct sd *sd; + + /* + * It's not an error for the sd to already + * belong to the plex, but we need to check a + * number of things to make sure it's done right. + * Some day. + */ + i = my_sd(plexno, sdno); + if (i >= 0) /* does it already belong to us? */ + return i; /* that's it */ + + plex = &PLEX[plexno]; /* point to the plex */ + sd = &SD[sdno]; /* and the subdisk */ + + /* Do we have an offset? Otherwise put it after the last one */ + if (sd->plexoffset < 0) { /* no offset specified */ + if (plex->subdisks > 0) { + struct sd *lastsd = &SD[plex->sdnos[plex->subdisks - 1]]; /* last subdisk */ + + if (plex->organization == plex_concat) /* concat, */ + sd->plexoffset = lastsd->sectors + lastsd->plexoffset; /* starts here */ + else /* striped, RAID-4 or RAID-5 */ + sd->plexoffset = plex->stripesize * plex->subdisks; /* starts here */ + } else /* first subdisk */ + sd->plexoffset = 0; /* start at the beginning */ + } + if (plex->subdisks == MAXSD) /* we already have our maximum */ + throw_rude_remark(ENOSPC, /* crap out */ + "Can't add %s to %s: plex full", + sd->name, + plex->name); + + plex->subdisks++; /* another entry */ + if (plex->subdisks >= plex->subdisks_allocated) /* need more space */ + EXPAND(plex->sdnos, int, plex->subdisks_allocated, INITIAL_SUBDISKS_IN_PLEX); + + /* Adjust size of plex and volume. 
*/ + if (isparity(plex)) /* RAID-4 or RAID-5 */ + plex->length = (plex->subdisks - 1) * sd->sectors; /* size is one disk short */ + else + plex->length += sd->sectors; /* plex gets this much bigger */ + if (plex->volno >= 0) /* we have a volume */ + VOL[plex->volno].size = max(VOL[plex->volno].size, plex->length); /* adjust its size */ + + /* + * We need to check that the subdisks don't overlap, + * but we can't do that until a point where we *must* + * know the size of all the subdisks. That's not + * here. But we need to sort them by offset + */ + for (i = 0; i < plex->subdisks - 1; i++) { + if (sd->plexoffset < SD[plex->sdnos[i]].plexoffset) { /* it fits before this one */ + /* First move any remaining subdisks by one */ + int j; + + for (j = plex->subdisks - 1; j > i; j--) /* move up one at a time */ + plex->sdnos[j] = plex->sdnos[j - 1]; + plex->sdnos[i] = sdno; + sd->plexsdno = i; /* note where we are in the subdisk */ + return i; + } + } + + /* + * The plex doesn't have any subdisk with a + * larger offset. Insert it here. + */ + plex->sdnos[i] = sdno; + sd->plexsdno = i; /* note where we are in the subdisk */ + sd->plexno = plex->plexno; /* and who we belong to */ + return i; +} + +/* + * Add a subdisk to drive if possible. The + * pointer to the drive must already be stored in + * the sd structure, but the drive doesn't know + * about the subdisk yet. 
+ */ +void +give_sd_to_drive(int sdno) +{ + struct sd *sd; /* pointer to subdisk */ + struct drive *drive; /* and drive */ + int fe; /* index in free list */ + int sfe; /* and index of subdisk when assigning max */ + + sd = &SD[sdno]; /* point to sd */ + drive = &DRIVE[sd->driveno]; /* and drive */ + + if (drive->state != drive_up) { + update_sd_state(sdno); /* that crashes the subdisk */ + return; + } + sd->sectorsize = drive->sectorsize; /* get sector size from drive */ + if (drive->flags & VF_HOTSPARE) /* the drive is a hot spare, */ + throw_rude_remark(ENOSPC, + "Can't place %s on hot spare drive %s", + sd->name, + drive->label.name); + if ((drive->sectors_available == 0) /* no space left */ + ||(sd->sectors > drive->sectors_available)) { /* or too big, */ + sd->driveoffset = -1; /* don't be confusing */ + free_sd(sd->sdno); + throw_rude_remark(ENOSPC, "No space for %s on %s", sd->name, drive->label.name); + return; /* in case we come back here */ + } + drive->subdisks_used++; /* one more subdisk */ + + if (sd->sectors == 0) { /* take the largest chunk */ + sfe = 0; /* to keep the compiler happy */ + for (fe = 0; fe < drive->freelist_entries; fe++) { + if (drive->freelist[fe].sectors >= sd->sectors) { /* more space here */ + sd->sectors = drive->freelist[fe].sectors; /* take it */ + sd->driveoffset = drive->freelist[fe].offset; + sfe = fe; /* and note the index for later */ + } + } + if (sd->sectors == 0) { /* no luck, */ + sd->driveoffset = -1; /* don't be confusing */ + free_sd(sd->sdno); + throw_rude_remark(ENOSPC, /* give up */ + "No space for %s on %s", + sd->name, + drive->label.name); + } + if (sfe < (drive->freelist_entries - 1)) /* not the last one, */ + bcopy(&drive->freelist[sfe + 1], + &drive->freelist[sfe], + (drive->freelist_entries - sfe) * sizeof(struct drive_freelist)); + drive->freelist_entries--; /* one less entry */ + drive->sectors_available -= sd->sectors; /* and note how much less space we have */ + } else if (sd->driveoffset < 0) { /* no 
offset specified, find one */ + for (fe = 0; fe < drive->freelist_entries; fe++) { + if (drive->freelist[fe].sectors >= sd->sectors) { /* it'll fit here */ + sd->driveoffset = drive->freelist[fe].offset; + if (sd->sectors == drive->freelist[fe].sectors) { /* used up the entire entry */ + if (fe < (drive->freelist_entries - 1)) /* not the last one, */ + bcopy(&drive->freelist[fe + 1], + &drive->freelist[fe], + (drive->freelist_entries - fe) * sizeof(struct drive_freelist)); + drive->freelist_entries--; /* one less entry */ + } else { + drive->freelist[fe].sectors -= sd->sectors; /* this much less space */ + drive->freelist[fe].offset += sd->sectors; /* this much further on */ + } + drive->sectors_available -= sd->sectors; /* and note how much less space we have */ + break; + } + } + if (sd->driveoffset < 0) + /* + * Didn't find anything. Although the drive has + * enough space, it's too fragmented + */ + { + free_sd(sd->sdno); + throw_rude_remark(ENOSPC, "No space for %s on %s", sd->name, drive->label.name); + } + } else { /* specific offset */ + /* + * For a specific offset to work, the space must be + * entirely in a single freelist entry. Look for it. + */ + u_int64_t sdend = sd->driveoffset + sd->sectors; /* end of our subdisk */ + for (fe = 0; fe < drive->freelist_entries; fe++) { + u_int64_t dend = drive->freelist[fe].offset + drive->freelist[fe].sectors; /* end of entry */ + if (dend >= sdend) { /* fits before here */ + if (drive->freelist[fe].offset > sd->driveoffset) { /* starts after the beginning of sd area */ + sd->driveoffset = -1; /* don't be confusing */ + set_sd_state(sd->sdno, sd_down, setstate_force); + throw_rude_remark(ENOSPC, + "No space for %s on drive %s at offset %lld", + sd->name, + drive->label.name, + sd->driveoffset); + return; + } + /* + * We've found the space, and we can allocate it. + * We don't need to say that to the subdisk, which + * already knows about it. We need to tell it to + * the free list, though. 
We have four possibilities: + * + * 1. The subdisk exactly eats up the entry. That's the + * same as above. + * 2. The subdisk starts at the beginning and leaves space + * at the end. + * 3. The subdisk starts after the beginning and leaves + * space at the end as well: we end up with another + * fragment. + * 4. The subdisk leaves space at the beginning and finishes + * at the end. + */ + drive->sectors_available -= sd->sectors; /* note how much less space we have */ + if (sd->driveoffset == drive->freelist[fe].offset) { /* 1 or 2 */ + if (sd->sectors == drive->freelist[fe].sectors) { /* 1: used up the entire entry */ + if (fe < (drive->freelist_entries - 1)) /* not the last one, */ + bcopy(&drive->freelist[fe + 1], + &drive->freelist[fe], + (drive->freelist_entries - fe) * sizeof(struct drive_freelist)); + drive->freelist_entries--; /* one less entry */ + } else { /* 2: space at the end */ + drive->freelist[fe].sectors -= sd->sectors; /* this much less space */ + drive->freelist[fe].offset += sd->sectors; /* this much further on */ + } + } else { /* 3 or 4 */ + drive->freelist[fe].sectors = sd->driveoffset - drive->freelist[fe].offset; + if (dend > sdend) { /* 3: space at the end as well */ + if (fe < (drive->freelist_entries - 1)) /* not the last one */ + bcopy(&drive->freelist[fe], /* move the rest down */ + &drive->freelist[fe + 1], + (drive->freelist_entries - fe) * sizeof(struct drive_freelist)); + drive->freelist_entries++; /* one less entry */ + drive->freelist[fe + 1].offset = sdend; /* second entry starts after sd */ + drive->freelist[fe + 1].sectors = dend - sdend; /* and is this long */ + } + } + break; + } + } + } + drive->opencount++; /* one more subdisk attached */ +} + +/* Get an empty drive entry from the drive table */ +int +get_empty_drive(void) +{ + int driveno; + struct drive *drive; + + /* first see if we have one which has been deallocated */ + for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) { + if (DRIVE[driveno].state 
== drive_unallocated) /* bingo */ + break; + } + + if (driveno >= vinum_conf.drives_allocated) /* we've used all our allocation */ + EXPAND(DRIVE, struct drive, vinum_conf.drives_allocated, INITIAL_DRIVES); + + /* got a drive entry. Make it pretty */ + drive = &DRIVE[driveno]; + bzero(drive, sizeof(struct drive)); + drive->driveno = driveno; /* put number in structure */ + drive->flags |= VF_NEWBORN; /* newly born drive */ + strcpy(drive->devicename, "unknown"); /* and make the name ``unknown'' */ + return driveno; /* return the index */ +} + +/* + * Find the named drive in vinum_conf.drive, + * return the index in vinum_conf.drive. + * Don't mark the drive as allocated (XXX SMP) + * If create != 0, create an entry if it doesn't exist + */ +/* XXX check if we have it open from attach */ +int +find_drive(const char *name, int create) +{ + int driveno; + struct drive *drive; + + if (name != NULL) { + for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) { + drive = &DRIVE[driveno]; /* point to drive */ + if ((drive->label.name[0] != '\0') /* it has a name */ + &&(strcmp(drive->label.name, name) == 0) /* and it's this one */ + &&(drive->state > drive_unallocated)) /* and it's a real one: found */ + return driveno; + } + } + /* the drive isn't in the list. Add it if he wants */ + if (create == 0) /* don't want to create */ + return -1; /* give up */ + + driveno = get_empty_drive(); + drive = &DRIVE[driveno]; + if (name != NULL) + strlcpy(drive->label.name, /* put in its name */ + name, + sizeof(drive->label.name)); + drive->state = drive_referenced; /* in use, nothing worthwhile there */ + return driveno; /* return the index */ +} + +/* + * Find a drive given its device name. + * devname must be valid. + * Otherwise the same as find_drive above. 
+ */ +int +find_drive_by_name(const char *devname, int create) +{ + int driveno; + struct drive *drive; + + for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) { + drive = &DRIVE[driveno]; /* point to drive */ + if ((strcmp(drive->devicename, devname) == 0) /* it's this device */ + &&(drive->state > drive_unallocated)) /* and it's a real one: found */ + return driveno; + } + + /* the drive isn't in the list. Add it if he wants */ + if (create == 0) /* don't want to create */ + return -1; /* give up */ + + driveno = get_empty_drive(); + drive = &DRIVE[driveno]; + bcopy(devname, /* put in its name */ + drive->devicename, + min(sizeof(drive->devicename), + strlen(devname))); + drive->state = drive_referenced; /* in use, nothing worthwhile there */ + return driveno; /* return the index */ +} + +/* Find an empty subdisk in the subdisk table */ +int +get_empty_sd(void) +{ + int sdno; + struct sd *sd; + + /* first see if we have one which has been deallocated */ + for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) { + if (SD[sdno].state == sd_unallocated) /* bingo */ + break; + } + if (sdno >= vinum_conf.subdisks_allocated) + /* + * We've run out of space. sdno is pointing + * where we want it, but at the moment we + * don't have the space. Get it. + * + * XXX We should check for overflow here. We + * shouldn't allocate more than VINUM_MAXSD + * subdisks (currently at least a quarter of a + * million). 
+ */ + EXPAND(SD, struct sd, vinum_conf.subdisks_allocated, INITIAL_SUBDISKS); + + /* initialize some things */ + sd = &SD[sdno]; /* point to it */ + bzero(sd, sizeof(struct sd)); /* initialize */ + sd->flags |= VF_NEWBORN; /* newly born subdisk */ + sd->plexno = -1; /* no plex */ + sd->sectors = -1; /* no space */ + sd->driveno = -1; /* no drive */ + sd->plexoffset = -1; /* and no offsets */ + sd->driveoffset = -1; + return sdno; /* return the index */ +} + +/* return a drive to the free pool */ +void +free_drive(struct drive *drive) +{ + if ((drive->state > drive_referenced) /* real drive */ + ||(drive->flags & VF_OPEN)) { /* how can it be open without a state? */ + LOCKDRIVE(drive); + if (drive->flags & VF_OPEN) { /* it's open, */ + close_locked_drive(drive); /* close it */ + drive->state = drive_down; /* and note the fact */ + } + if (drive->freelist) + Free(drive->freelist); + bzero(drive, sizeof(struct drive)); /* this also sets drive_unallocated */ + unlockdrive(drive); + } +} + +/* + * Find the named subdisk in vinum_conf.sd. + * + * If create != 0, create an entry if it doesn't exist + * + * Return index in vinum_conf.sd + */ +int +find_subdisk(const char *name, int create) +{ + int sdno; + struct sd *sd; + + for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) { + if (strcmp(SD[sdno].name, name) == 0) /* found it */ + return sdno; + } + + /* the subdisk isn't in the list. 
Add it if he wants */ + if (create == 0) /* don't want to create */ + return -1; /* give up */ + + /* Allocate one and insert the name */ + sdno = get_empty_sd(); + sd = &SD[sdno]; + bcopy(name, sd->name, min(sizeof(sd->name), strlen(name))); /* put in its name */ + return sdno; /* return the pointer */ +} + +/* Return space to a drive */ +void +return_drive_space(int driveno, int64_t offset, int length) +{ + struct drive *drive; + int fe; /* free list entry */ + u_int64_t sdend; /* end of our subdisk */ + u_int64_t dend; /* end of our freelist entry */ + + drive = &DRIVE[driveno]; + if (drive->state == drive_up) { + sdend = offset + length; /* end of our subdisk */ + + /* Look for where to return the sd address space */ + for (fe = 0; + (fe < drive->freelist_entries) && (drive->freelist[fe].offset < offset); + fe++); + /* + * Now we are pointing to the last entry, the first + * with a higher offset than the subdisk, or both. + */ + if ((fe > 1) /* not the first entry */ + &&((fe == drive->freelist_entries) /* gone past the end */ + ||(drive->freelist[fe].offset > offset))) /* or past the block were looking for */ + fe--; /* point to the block before */ + dend = drive->freelist[fe].offset + drive->freelist[fe].sectors; /* end of the entry */ + + /* + * At this point, we are pointing to the correct + * place in the free list. A number of possibilities + * exist: + * + * 1. The block to be freed starts at the end of the + * block to which we are pointing. This has two + * subcases: + * + * a. The block to be freed ends at the beginning + * of the following block. Merge the three + * areas into a single block. + * + * b. The block is shorter than the space between + * the current block and the next one. Enlarge + * the current block. + * + * 2. The block to be freed starts after the end + * of the block. Again, we have two cases: + * + * a. It ends before the start of the following block. + * Create a new free block. + * + * b. 
It ends at the start of the following block. + * Enlarge the following block downwards. + * + * When there is only one free space block, and the + * space to be returned is before it, the pointer is + * to a non-existent zeroth block. XXX check this + */ + if (offset == dend) { /* Case 1: it starts at the end of this block */ + if ((fe < drive->freelist_entries - 1) /* we're not the last block in the free list */ + /* and the subdisk ends at the start of the next block */ + &&(sdend == drive->freelist[fe + 1].offset)) { + drive->freelist[fe].sectors /* 1a: merge all three blocks */ + = drive->freelist[fe + 1].sectors; + if (fe < drive->freelist_entries - 2) /* still more blocks after next */ + bcopy(&drive->freelist[fe + 2], /* move down one */ + &drive->freelist[fe + 1], + (drive->freelist_entries - 2 - fe) + * sizeof(struct drive_freelist)); + drive->freelist_entries--; /* one less entry in the free list */ + } else /* 1b: just enlarge this block */ + drive->freelist[fe].sectors += length; + } else { /* Case 2 */ + if (offset > dend) /* it starts after this block */ + fe++; /* so look at the next block */ + if ((fe < drive->freelist_entries) /* we're not the last block in the free list */ + /* and the subdisk ends at the start of this block: case 4 */ + &&(sdend == drive->freelist[fe].offset)) { + drive->freelist[fe].offset = offset; /* it starts where the sd was */ + drive->freelist[fe].sectors += length; /* and it's this much bigger */ + } else { /* case 3: non-contiguous */ + if (fe < drive->freelist_entries) /* not after the last block, */ + bcopy(&drive->freelist[fe], /* move the rest up one entry */ + &drive->freelist[fe + 1], + (drive->freelist_entries - fe) + * sizeof(struct drive_freelist)); + drive->freelist_entries++; /* one less entry */ + drive->freelist[fe].offset = offset; /* this entry represents the sd */ + drive->freelist[fe].sectors = length; + } + } + drive->sectors_available += length; /* the sectors are now available */ + } +} + +/* + * Free 
an allocated sd entry. + * This performs memory management only. remove() + * is responsible for checking relationships. + */ +void +free_sd(int sdno) +{ + struct sd *sd; + + sd = &SD[sdno]; + if ((sd->driveno >= 0) /* we have a drive, */ + &&(sd->sectors > 0)) /* and some space on it */ + return_drive_space(sd->driveno, /* return the space */ + sd->driveoffset, + sd->sectors); + if (sd->plexno >= 0) + PLEX[sd->plexno].subdisks--; /* one less subdisk */ + destroy_dev(sd->dev); + bzero(sd, sizeof(struct sd)); /* and clear it out */ + sd->state = sd_unallocated; + vinum_conf.subdisks_used--; /* one less sd */ +} + +/* Find an empty plex in the plex table */ +int +get_empty_plex(void) +{ + int plexno; + struct plex *plex; /* if we allocate one */ + + /* first see if we have one which has been deallocated */ + for (plexno = 0; plexno < vinum_conf.plexes_allocated; plexno++) { + if (PLEX[plexno].state == plex_unallocated) /* bingo */ + break; /* and get out of here */ + } + + if (plexno >= vinum_conf.plexes_allocated) + EXPAND(PLEX, struct plex, vinum_conf.plexes_allocated, INITIAL_PLEXES); + + /* Found a plex. Give it an sd structure */ + plex = &PLEX[plexno]; /* this one is ours */ + bzero(plex, sizeof(struct plex)); /* polish it up */ + plex->sdnos = (int *) Malloc(sizeof(int) * INITIAL_SUBDISKS_IN_PLEX); /* allocate sd table */ + CHECKALLOC(plex->sdnos, "vinum: Can't allocate plex subdisk table"); + bzero(plex->sdnos, (sizeof(int) * INITIAL_SUBDISKS_IN_PLEX)); /* do we need this? 
*/ + plex->flags |= VF_NEWBORN; /* newly born plex */ + plex->subdisks = 0; /* no subdisks in use */ + plex->subdisks_allocated = INITIAL_SUBDISKS_IN_PLEX; /* and we have space for this many */ + plex->organization = plex_disorg; /* and it's not organized */ + plex->volno = -1; /* no volume yet */ + return plexno; /* return the index */ +} + +/* + * Find the named plex in vinum_conf.plex + * + * If create != 0, create an entry if it doesn't exist + * return index in vinum_conf.plex + */ +int +find_plex(const char *name, int create) +{ + int plexno; + struct plex *plex; + + for (plexno = 0; plexno < vinum_conf.plexes_allocated; plexno++) { + if (strcmp(PLEX[plexno].name, name) == 0) /* found it */ + return plexno; + } + + /* the plex isn't in the list. Add it if he wants */ + if (create == 0) /* don't want to create */ + return -1; /* give up */ + + /* Allocate one and insert the name */ + plexno = get_empty_plex(); + plex = &PLEX[plexno]; /* point to it */ + bcopy(name, plex->name, min(sizeof(plex->name), strlen(name))); /* put in its name */ + return plexno; /* return the pointer */ +} + +/* + * Free an allocated plex entry + * and its associated memory areas + */ +void +free_plex(int plexno) +{ + struct plex *plex; + + plex = &PLEX[plexno]; + if (plex->sdnos) + Free(plex->sdnos); + if (plex->lock) + Free(plex->lock); + destroy_dev(plex->dev); + bzero(plex, sizeof(struct plex)); /* and clear it out */ + plex->state = plex_unallocated; +} + +/* Find an empty volume in the volume table */ +int +get_empty_volume(void) +{ + int volno; + struct volume *vol; + int i; + + /* first see if we have one which has been deallocated */ + for (volno = 0; volno < vinum_conf.volumes_allocated; volno++) { + if (VOL[volno].state == volume_unallocated) /* bingo */ + break; + } + + if (volno >= vinum_conf.volumes_allocated) + EXPAND(VOL, struct volume, vinum_conf.volumes_allocated, INITIAL_VOLUMES); + + /* Now initialize fields */ + vol = &VOL[volno]; + bzero(vol, sizeof(struct 
volume)); + vol->flags |= VF_NEWBORN | VF_CREATED; /* newly born volume */ + vol->preferred_plex = ROUND_ROBIN_READPOL; /* round robin */ + for (i = 0; i < MAXPLEX; i++) /* mark the plexes missing */ + vol->plex[i] = -1; + return volno; /* return the index */ +} + +/* + * Find the named volume in vinum_conf.volume. + * + * If create != 0, create an entry if it doesn't exist + * return the index in vinum_conf + */ +int +find_volume(const char *name, int create) +{ + int volno; + struct volume *vol; + + for (volno = 0; volno < vinum_conf.volumes_allocated; volno++) { + if (strcmp(VOL[volno].name, name) == 0) /* found it */ + return volno; + } + + /* the volume isn't in the list. Add it if he wants */ + if (create == 0) /* don't want to create */ + return -1; /* give up */ + + /* Allocate one and insert the name */ + volno = get_empty_volume(); + vol = &VOL[volno]; + bcopy(name, vol->name, min(sizeof(vol->name), strlen(name))); /* put in its name */ + vol->blocksize = DEV_BSIZE; /* block size of this volume */ + return volno; /* return the pointer */ +} + +/* + * Free an allocated volume entry + * and its associated memory areas + */ +void +free_volume(int volno) +{ + struct volume *vol; + + vol = &VOL[volno]; + destroy_dev(vol->dev); + bzero(vol, sizeof(struct volume)); /* and clear it out */ + vol->state = volume_unallocated; +} + +/* + * Handle a drive definition. We store the information in the global variable + * drive, so we don't need to allocate. 
+ * + * If we find an error, print a message and return + */ +void +config_drive(int update) +{ + enum drive_label_info partition_status; /* info about the partition */ + int parameter; + int driveno; /* index of drive in vinum_conf */ + struct drive *drive; /* and pointer to it */ + int otherdriveno; /* index of possible second drive */ + int sdno; + + if (tokens < 2) /* not enough tokens */ + throw_rude_remark(EINVAL, "Drive has no name\n"); + driveno = find_drive(token[1], 1); /* allocate a drive to initialize */ + drive = &DRIVE[driveno]; /* and get a pointer */ + if (update && ((drive->flags & VF_NEWBORN) == 0)) /* this drive exists already */ + return; /* don't do anything */ + drive->flags &= ~VF_NEWBORN; /* no longer newly born */ + + if (drive->state != drive_referenced) { /* we already know this drive */ + /* + * XXX Check which definition is more up-to-date. Give + * preference for the definition on its own drive. + */ + return; /* XXX */ + } + for (parameter = 2; parameter < tokens; parameter++) { /* look at the other tokens */ + switch (get_keyword(token[parameter], &keyword_set)) { + case kw_device: + parameter++; + otherdriveno = find_drive_by_name(token[parameter], 0); /* see if it exists already */ + if (otherdriveno >= 0) { /* yup, */ + drive->state = drive_unallocated; /* deallocate the drive */ + throw_rude_remark(EEXIST, /* and complain */ + "Drive %s would have same device as drive %s", + token[1], + DRIVE[otherdriveno].label.name); + } + if (drive->devicename[0] == '/') { /* we know this drive... 
*/ + if (strcmp(drive->devicename, token[parameter])) /* different name */ + close_drive(drive); /* close it if it's open */ + else /* no change */ + break; + } + /* open the device and get the configuration */ + bcopy(token[parameter], /* insert device information */ + drive->devicename, + min(sizeof(drive->devicename), + strlen(token[parameter]))); + partition_status = read_drive_label(drive, 1); + switch (partition_status) { + case DL_CANT_OPEN: /* not our kind */ + close_drive(drive); + if (drive->lasterror == EFTYPE) /* wrong kind of partition */ + throw_rude_remark(drive->lasterror, + "Drive %s has invalid partition type", + drive->label.name); + else /* I/O error of some kind */ + throw_rude_remark(drive->lasterror, + "Can't initialize drive %s", + drive->label.name); + break; + + case DL_WRONG_DRIVE: /* valid drive, not the name we expected */ + if (vinum_conf.flags & VF_FORCECONFIG) { /* but we'll accept that */ + bcopy(token[1], drive->label.name, sizeof(drive->label.name)); + break; + } + close_drive(drive); + /* + * There's a potential race condition here: + * the rude remark refers to a field in an + * unallocated drive, which potentially could + * be reused. This works because we're the only + * thread accessing the config at the moment. + */ + drive->state = drive_unallocated; /* throw it away completely */ + throw_rude_remark(drive->lasterror, + "Incorrect drive name %s specified for drive %s", + token[1], + drive->label.name); + break; + + case DL_DELETED_LABEL: /* it was a drive, but we deleted it */ + case DL_NOT_OURS: /* nothing to do with the rest */ + case DL_OURS: + break; + } + /* + * read_drive_label overwrites the device name. 
+ * If we get here, we can have the drive, + * so put it back again + */ + bcopy(token[parameter], + drive->devicename, + min(sizeof(drive->devicename), + strlen(token[parameter]))); + break; + + case kw_state: + parameter++; /* skip the keyword */ + if (vinum_conf.flags & VF_READING_CONFIG) + drive->state = DriveState(token[parameter]); /* set the state */ + break; + + case kw_hotspare: /* this drive is a hot spare */ + drive->flags |= VF_HOTSPARE; + break; + + default: + close_drive(drive); + throw_rude_remark(EINVAL, + "Drive %s, invalid keyword: %s", + token[1], + token[parameter]); + } + } + + if (drive->devicename[0] != '/') { + drive->state = drive_unallocated; /* deallocate the drive */ + throw_rude_remark(EINVAL, "No device name for %s", drive->label.name); + } + vinum_conf.drives_used++; /* passed all hurdles: one more in use */ + /* + * If we're replacing a drive, it could be that + * we already have subdisks referencing this + * drive. Note where they should be and change + * their state to obsolete. + */ + for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) { + if ((SD[sdno].state > sd_referenced) + && (SD[sdno].driveno == driveno)) { + give_sd_to_drive(sdno); + if (SD[sdno].state > sd_stale) + SD[sdno].state = sd_stale; + } + } +} + +/* + * Handle a subdisk definition. We store the + * information in the global variable sd, so we + * don't need to allocate. + * + * On error throw a message back to the caller. 
+ */ +void +config_subdisk(int update) +{ + int parameter; + int sdno; /* index of sd in vinum_conf */ + struct sd *sd; /* and pointer to it */ + u_int64_t size; + int detached = 0; /* set to 1 if this is a detached subdisk */ + int sdindex = -1; /* index in plexes subdisk table */ + enum sdstate state = sd_unallocated; /* state to set, if specified */ + int autosize = 0; /* set if we autosize in give_sd_to_drive */ + int namedsdno; /* index of another with this name */ + char partition = 0; /* partition of external subdisk */ + + sdno = get_empty_sd(); /* allocate an SD to initialize */ + sd = &SD[sdno]; /* and get a pointer */ + + for (parameter = 1; parameter < tokens; parameter++) { /* look at the other tokens */ + switch (get_keyword(token[parameter], &keyword_set)) { + /* + * If we have a 'name' parameter, it must + * come first, because we're too lazy to tidy + * up dangling refs if it comes later. + */ + case kw_name: + namedsdno = find_subdisk(token[++parameter], 0); /* find an existing sd with this name */ + if (namedsdno >= 0) { /* got one */ + if (SD[namedsdno].state == sd_referenced) { /* we've been told about this one */ + if (parameter > 2) + throw_rude_remark(EINVAL, + "sd %s: name parameter must come first\n", /* no go */ + token[parameter]); + else { + int i; + struct plex *plex; /* for tidying up dangling references */ + + *sd = SD[namedsdno]; /* copy from the referenced one */ + SD[namedsdno].state = sd_unallocated; /* and deallocate the referenced one */ + plex = &PLEX[sd->plexno]; /* now take a look at our plex */ + for (i = 0; i < plex->subdisks; i++) { /* look for the pointer */ + if (plex->sdnos[i] == namedsdno) /* pointing to the old subdisk */ + plex->sdnos[i] = sdno; /* bend it to point here */ + } + } + } + if (update) /* are we updating? 
*/ + return; /* that's OK, nothing more to do */ + else + throw_rude_remark(EINVAL, "Duplicate subdisk %s", token[parameter]); + } else + bcopy(token[parameter], + sd->name, + min(sizeof(sd->name), strlen(token[parameter]))); + break; + + case kw_detached: + detached = 1; + break; + + case kw_plexoffset: + size = sizespec(token[++parameter]); + if ((size == -1) /* unallocated */ + &&(vinum_conf.flags & VF_READING_CONFIG)) /* reading from disk */ + break; /* invalid sd; just ignore it */ + if ((size % DEV_BSIZE) != 0) + throw_rude_remark(EINVAL, + "sd %s, bad plex offset alignment: %lld", + sd->name, + (long long) size); + else + sd->plexoffset = size / DEV_BSIZE; + break; + + case kw_driveoffset: + size = sizespec(token[++parameter]); + if ((size == -1) /* unallocated */ + &&(vinum_conf.flags & VF_READING_CONFIG)) /* reading from disk */ + break; /* invalid sd; just ignore it */ + if ((size % DEV_BSIZE) != 0) + throw_rude_remark(EINVAL, + "sd %s, bad drive offset alignment: %lld", + sd->name, + (long long) size); + else + sd->driveoffset = size / DEV_BSIZE; + break; + + case kw_len: + if (get_keyword(token[++parameter], &keyword_set) == kw_max) /* select maximum size from drive */ + size = 0; /* this is how we say it :-) */ + else + size = sizespec(token[parameter]); + if ((size % DEV_BSIZE) != 0) + throw_rude_remark(EINVAL, "sd %s, length %d not multiple of sector size", sd->name, size); + else + sd->sectors = size / DEV_BSIZE; + /* + * We have a problem with autosizing: we need to + * give the drive to the plex before we give it + * to the drive, in order to be clean if we give + * up in the middle, but at this time the size hasn't + * been set. Note that we have to fix up after + * giving the subdisk to the drive. 
+ */ + if (size == 0) + autosize = 1; /* note that we're autosizing */ + break; + + case kw_drive: + sd->driveno = find_drive(token[++parameter], 1); /* insert drive information */ + break; + + case kw_plex: + sd->plexno = find_plex(token[++parameter], 1); /* insert plex information */ + break; + + /* + * Set the state. We can't do this directly, + * because give_sd_to_plex may change it + */ + case kw_state: + parameter++; /* skip the keyword */ + if (vinum_conf.flags & VF_READING_CONFIG) + state = SdState(token[parameter]); /* set the state */ + break; + + case kw_partition: + parameter++; /* skip the keyword */ + if ((strlen(token[parameter]) != 1) + || (token[parameter][0] < 'a') + || (token[parameter][0] > 'h')) + throw_rude_remark(EINVAL, + "%s: invalid partition %c", + sd->name, + token[parameter][0]); + else + partition = token[parameter][0]; + break; + + case kw_retryerrors: + sd->flags |= VF_RETRYERRORS; + break; + + default: + throw_rude_remark(EINVAL, "%s: invalid keyword: %s", sd->name, token[parameter]); + } + } + + /* Check we have a drive name */ + if (sd->driveno < 0) { /* didn't specify a drive */ + sd->driveno = current_drive; /* set to the current drive */ + if (sd->driveno < 0) /* no current drive? */ + throw_rude_remark(EINVAL, "Subdisk %s is not associated with a drive", sd->name); + } + if (DRIVE[sd->driveno].state != drive_up) + sd->state = sd_crashed; + + /* + * This is tacky. If something goes wrong + * with the checks, we may end up losing drive + * space. FIXME. 
+ */ + if (autosize != 0) /* need to find a size, */ + give_sd_to_drive(sdno); /* do it before the plex */ + + /* Check for a plex name */ + if ((sd->plexno < 0) /* didn't specify a plex */ + &&(!detached)) /* and didn't say not to, */ + sd->plexno = current_plex; /* set to the current plex */ + + if (sd->plexno >= 0) + sdindex = give_sd_to_plex(sd->plexno, sdno); /* now tell the plex that it has this sd */ + + sd->sdno = sdno; /* point to our entry in the table */ + + /* Does the subdisk have a name? If not, give it one */ + if (sd->name[0] == '\0') { /* no name */ + char sdsuffix[8]; /* form sd name suffix here */ + + /* Do we have a plex name? */ + if (sdindex >= 0) /* we have a plex */ + strlcpy(sd->name, /* take it from there */ + PLEX[sd->plexno].name, + sizeof(sd->name)); + else /* no way */ + throw_rude_remark(EINVAL, "Unnamed sd is not associated with a plex"); + sprintf(sdsuffix, ".s%d", sdindex); /* form the suffix */ + strlcat(sd->name, sdsuffix, sizeof(sd->name)); /* and add it to the name */ + } + /* do we have complete info for this subdisk? */ + if (sd->sectors < 0) + throw_rude_remark(EINVAL, "sd %s has no length spec", sd->name); + + if (sd->dev == NULL) + /* + * sdno can (at least theoretically) overflow + * into the low order bit of the type field. + * This gives rise to a subdisk with type + * VINUM_SD2_TYPE. This is a feature, not a + * bug. + */ + sd->dev = make_dev(&vinum_cdevsw, + VINUMMINOR(sdno, VINUM_SD_TYPE), + UID_ROOT, + GID_OPERATOR, + S_IRUSR | S_IWUSR | S_IRGRP, + "vinum/sd/%s", + sd->name); + if (state != sd_unallocated) /* we had a specific state to set */ + sd->state = state; /* do it now */ + else if (sd->state == sd_unallocated) /* no, nothing set yet, */ + sd->state = sd_empty; /* must be empty */ + if (autosize == 0) /* no autoconfig, do the drive now */ + give_sd_to_drive(sdno); + vinum_conf.subdisks_used++; /* one more in use */ +} + +/* + * Handle a plex definition. 
+ */ +void +config_plex(int update) +{ + int parameter; + int plexno; /* index of plex in vinum_conf */ + struct plex *plex; /* and pointer to it */ + int pindex = MAXPLEX; /* index in volume's plex list */ + int detached = 0; /* don't give it to a volume */ + int namedplexno; + enum plexstate state = plex_init; /* state to set at end */ + int preferme; /* set if we want to be preferred access */ + + current_plex = -1; /* forget the previous plex */ + preferme = 0; /* nothing special yet */ + plexno = get_empty_plex(); /* allocate a plex */ + plex = &PLEX[plexno]; /* and point to it */ + plex->plexno = plexno; /* and back to the config */ + + for (parameter = 1; parameter < tokens; parameter++) { /* look at the other tokens */ + switch (get_keyword(token[parameter], &keyword_set)) { + /* + * If we have a 'name' parameter, it must + * come first, because we're too lazy to tidy + * up dangling refs if it comes later. + */ + case kw_name: + namedplexno = find_plex(token[++parameter], 0); /* find an existing plex with this name */ + if (namedplexno >= 0) { /* plex exists already, */ + if (PLEX[namedplexno].state == plex_referenced) { /* we've been told about this one */ + if (parameter > 2) /* we've done other things first, */ + throw_rude_remark(EINVAL, + "plex %s: name parameter must come first\n", /* no go */ + token[parameter]); + else { + int i; + struct volume *vol; /* for tidying up dangling references */ + + *plex = PLEX[namedplexno]; /* get the info */ + PLEX[namedplexno].state = plex_unallocated; /* and deallocate the other one */ + vol = &VOL[plex->volno]; /* point to the volume */ + for (i = 0; i < MAXPLEX; i++) { /* for each plex */ + if (vol->plex[i] == namedplexno) + vol->plex[i] = plexno; /* bend the pointer */ + } + } + break; /* use this one */ + } + if (update) /* are we updating? 
*/ + return; /* yes: that's OK, just return */ + else + throw_rude_remark(EINVAL, "Duplicate plex %s", token[parameter]); + } else + bcopy(token[parameter], /* put in the name */ + plex->name, + min(MAXPLEXNAME, strlen(token[parameter]))); + break; + + case kw_detached: + detached = 1; + break; + + case kw_org: /* plex organization */ + switch (get_keyword(token[++parameter], &keyword_set)) { + case kw_concat: + plex->organization = plex_concat; + break; + + case kw_striped: + { + int stripesize = sizespec(token[++parameter]); + + plex->organization = plex_striped; + if (stripesize % DEV_BSIZE != 0) /* not a multiple of block size, */ + throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size", + plex->name, + stripesize); + else + plex->stripesize = stripesize / DEV_BSIZE; + break; + } + + case kw_raid4: + { + int stripesize = sizespec(token[++parameter]); + + plex->organization = plex_raid4; + if (stripesize % DEV_BSIZE != 0) /* not a multiple of block size, */ + throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size", + plex->name, + stripesize); + else + plex->stripesize = stripesize / DEV_BSIZE; + break; + } + + case kw_raid5: + { + int stripesize = sizespec(token[++parameter]); + + plex->organization = plex_raid5; + if (stripesize % DEV_BSIZE != 0) /* not a multiple of block size, */ + throw_rude_remark(EINVAL, "plex %s: stripe size %d not a multiple of sector size", + plex->name, + stripesize); + else + plex->stripesize = stripesize / DEV_BSIZE; + break; + } + + default: + throw_rude_remark(EINVAL, "Invalid plex organization"); + } + if (isstriped(plex) + && (plex->stripesize == 0)) /* didn't specify a valid stripe size */ + throw_rude_remark(EINVAL, "Need a stripe size parameter"); + break; + + /* + * We're the preferred plex of our volume. + * Unfortunately, we don't know who our + * volume is yet. Note that we want to be + * preferred, and actually do it after we + * get a volume. 
+ */ + case kw_preferred: + preferme = 1; + break; + + case kw_volume: + plex->volno = find_volume(token[++parameter], 1); /* insert a pointer to the volume */ + break; + + case kw_sd: /* add a subdisk */ + { + int sdno; + + sdno = find_subdisk(token[++parameter], 1); /* find a subdisk */ + SD[sdno].plexoffset = sizespec(token[++parameter]); /* get the offset */ + give_sd_to_plex(plexno, sdno); /* and insert it there */ + break; + } + + case kw_state: + parameter++; /* skip the keyword */ + if (vinum_conf.flags & VF_READING_CONFIG) + state = PlexState(token[parameter]); /* set the state */ + break; + + default: + throw_rude_remark(EINVAL, "plex %s, invalid keyword: %s", + plex->name, + token[parameter]); + } + } + + if (plex->organization == plex_disorg) + throw_rude_remark(EINVAL, "No plex organization specified"); + + if ((plex->volno < 0) /* we don't have a volume */ + &&(!detached)) /* and we wouldn't object */ + plex->volno = current_volume; + + if (plex->volno >= 0) + pindex = give_plex_to_volume(plex->volno, /* Now tell the volume that it has this plex */ + plexno, + preferme); + + /* Does the plex have a name? If not, give it one */ + if (plex->name[0] == '\0') { /* no name */ + char plexsuffix[8]; /* form plex name suffix here */ + /* Do we have a volume name? 
*/ + if (plex->volno >= 0) /* we have a volume */ + strlcpy(plex->name, /* take it from there */ + VOL[plex->volno].name, + sizeof(plex->name)); + else /* no way */ + throw_rude_remark(EINVAL, "Unnamed plex is not associated with a volume"); + sprintf(plexsuffix, ".p%d", pindex); /* form the suffix */ + strlcat(plex->name, plexsuffix, sizeof(plex->name)); /* and add it to the name */ + } + if (isstriped(plex)) { + plex->lock = (struct rangelock *) + Malloc(PLEX_LOCKS * sizeof(struct rangelock)); + CHECKALLOC(plex->lock, "vinum: Can't allocate lock table\n"); + bzero((char *) plex->lock, PLEX_LOCKS * sizeof(struct rangelock)); + plex->lockmtx = &plexmutex[plexno % PLEXMUTEXES]; /* use this mutex for locking */ + } + /* Note the last plex we configured */ + current_plex = plexno; + plex->state = state; /* set whatever state we chose */ + vinum_conf.plexes_used++; /* one more in use */ + if (plex->dev == NULL) + plex->dev = make_dev(&vinum_cdevsw, + VINUMMINOR(plexno, VINUM_PLEX_TYPE), + UID_ROOT, + GID_OPERATOR, + S_IRUSR | S_IWUSR | S_IRGRP, + "vinum/plex/%s", + plex->name); +} + +/* + * Handle a volume definition. 
+ * If we find an error, print a message, deallocate the nascent volume, and return + */ +void +config_volume(int update) +{ + int parameter; + int volno; + struct volume *vol; /* collect volume info here */ + int i; + + if (tokens < 2) /* not enough tokens */ + throw_rude_remark(EINVAL, "Volume has no name"); + current_volume = -1; /* forget the previous volume */ + volno = find_volume(token[1], 1); /* allocate a volume to initialize */ + vol = &VOL[volno]; /* and get a pointer */ + if (update && ((vol->flags & VF_CREATED) == 0)) /* this volume exists already */ + return; /* don't do anything */ + vol->flags &= ~VF_CREATED; /* it exists now */ + + for (parameter = 2; parameter < tokens; parameter++) { /* look at all tokens */ + switch (get_keyword(token[parameter], &keyword_set)) { + case kw_plex: + { + int plexno; /* index of this plex */ + int myplexno; /* and index if it's already ours */ + + plexno = find_plex(token[++parameter], 1); /* find a plex */ + if (plexno < 0) /* couldn't */ + break; /* we've already had an error message */ + myplexno = my_plex(volno, plexno); /* does it already belong to us? 
*/ + if (myplexno > 0) /* yes, shouldn't get it again */ + throw_rude_remark(EINVAL, + "Plex %s already belongs to volume %s", + token[parameter], + vol->name); + else if (++vol->plexes > 8) /* another entry */ + throw_rude_remark(EINVAL, + "Too many plexes for volume %s", + vol->name); + vol->plex[vol->plexes - 1] = plexno; + PLEX[plexno].state = plex_referenced; /* we know something about it */ + PLEX[plexno].volno = volno; /* and this volume references it */ + } + break; + + case kw_readpol: + switch (get_keyword(token[++parameter], &keyword_set)) { /* decide what to do */ + case kw_round: + vol->preferred_plex = ROUND_ROBIN_READPOL; /* default */ + break; + + case kw_prefer: + { + int myplexno; /* index of this plex */ + + myplexno = find_plex(token[++parameter], 1); /* find a plex */ + if (myplexno < 0) { /* couldn't */ + printf("vinum: couldn't find preferred plex %s for %s\n", + token[parameter], + vol->name); + break; /* we've already had an error message */ + } + myplexno = my_plex(volno, myplexno); /* does it already belong to us? */ + if (myplexno > 0) /* yes */ + vol->preferred_plex = myplexno; /* just note the index */ + else if (++vol->plexes > 8) /* another entry */ + throw_rude_remark(EINVAL, "Too many plexes"); + else { /* space for the new plex */ + vol->plex[vol->plexes - 1] = myplexno; /* add it to our list */ + vol->preferred_plex = vol->plexes - 1; /* and note the index */ + } + } + break; + + default: + throw_rude_remark(EINVAL, "Invalid read policy"); + } + + case kw_setupstate: + vol->flags |= VF_CONFIG_SETUPSTATE; /* set the volume up later on */ + break; + + case kw_state: + parameter++; /* skip the keyword */ + if (vinum_conf.flags & VF_READING_CONFIG) + vol->state = VolState(token[parameter]); /* set the state */ + break; + + /* + * XXX experimental ideas. These are not + * documented, and will not be until I + * decide they're worth keeping. 
+ */ + case kw_writethrough: /* set writethrough mode */ + vol->flags |= VF_WRITETHROUGH; + break; + + case kw_writeback: /* set writeback mode */ + vol->flags &= ~VF_WRITETHROUGH; + break; + + default: + throw_rude_remark(EINVAL, "volume %s, invalid keyword: %s", + vol->name, + token[parameter]); + } + } + current_volume = volno; /* note last referred volume */ + vol->volno = volno; /* also note in volume */ + + /* + * Before we can actually use the volume, we need + * a volume label. We could start to fake one here, + * but it will be a lot easier when we have some + * to copy from the drives, so defer it until we + * set up the configuration. XXX + */ + if (vol->state == volume_unallocated) + vol->state = volume_down; /* now ready to bring up at the end */ + + /* Find out how big our volume is */ + for (i = 0; i < vol->plexes; i++) + vol->size = max(vol->size, PLEX[vol->plex[i]].length); + vinum_conf.volumes_used++; /* one more in use */ + if (vol->dev == NULL) + vol->dev = make_dev(&vinum_cdevsw, + VINUMMINOR(volno, VINUM_VOLUME_TYPE), + UID_ROOT, + GID_OPERATOR, + S_IRUSR | S_IWUSR | S_IRGRP, + "vinum/%s", + vol->name); +} + +/* + * Parse a config entry. CARE! This destroys the original contents of the + * config entry, which we don't really need after this. More specifically, it + * places \0 characters at the end of each token. + * + * Return 0 if all is well, otherwise EINVAL for invalid keyword, + * or ENOENT if 'read' command doesn't find any drives. 
+ */ +int +parse_config(char *cptr, struct keywordset *keyset, int update) +{ + int status; + + status = 0; /* until proven otherwise */ + tokens = tokenize(cptr, token, MAXTOKEN); /* chop up into tokens */ + + if (tokens <= 0) /* screwed up or empty line */ + return tokens; /* give up */ + else if (tokens == MAXTOKEN) /* too many */ + throw_rude_remark(E2BIG, + "Configuration error for %s: too many parameters", + token[1]); + + if (token[0][0] == '#') /* comment line */ + return 0; + + switch (get_keyword(token[0], keyset)) { /* decide what to do */ + case kw_drive: + config_drive(update); + break; + + case kw_subdisk: + config_subdisk(update); + break; + + case kw_plex: + config_plex(update); + break; + + case kw_volume: + config_volume(update); + break; + + /* Anything else is invalid in this context */ + default: + throw_rude_remark(EINVAL, /* should we die? */ + "Invalid configuration information: %s", + token[0]); + } + return status; +} + +/* + * parse a line handed in from userland via ioctl. + * This differs only by the error reporting mechanism: + * we return the error indication in the reply to the + * ioctl, so we need to set a global static pointer in + * this file. 
This technique works because we have + * ensured that configuration is performed in a single- + * threaded manner + */ +int +parse_user_config(char *cptr, struct keywordset *keyset) +{ + int status; + + ioctl_reply = (struct _ioctl_reply *) cptr; + status = parse_config(cptr, keyset, 0); + ioctl_reply = NULL; /* don't do this again */ + return status; +} + +/* Remove an object */ +void +remove(struct vinum_ioctl_msg *msg) +{ + struct vinum_ioctl_msg message = *msg; /* make a copy to hand on */ + + ioctl_reply = (struct _ioctl_reply *) msg; /* reinstate the address to reply to */ + ioctl_reply->error = 0; /* no error, */ + ioctl_reply->msg[0] = '\0'; /* no message */ + + switch (message.type) { + case drive_object: + remove_drive_entry(message.index, message.force); + updateconfig(0); + return; + + case sd_object: + remove_sd_entry(message.index, message.force, message.recurse); + updateconfig(0); + return; + + case plex_object: + remove_plex_entry(message.index, message.force, message.recurse); + updateconfig(0); + return; + + case volume_object: + remove_volume_entry(message.index, message.force, message.recurse); + updateconfig(0); + return; + + default: + ioctl_reply->error = EINVAL; + strcpy(ioctl_reply->msg, "Invalid object type"); + } +} + +/* Remove a drive. 
*/ +void +remove_drive_entry(int driveno, int force) +{ + struct drive *drive = &DRIVE[driveno]; + int sdno; + + if ((driveno > vinum_conf.drives_allocated) /* not a valid drive */ + ||(drive->state == drive_unallocated)) { /* or nothing there */ + ioctl_reply->error = EINVAL; + strcpy(ioctl_reply->msg, "No such drive"); + } else if (drive->opencount > 0) { /* we have subdisks */ + if (force) { /* do it at any cost */ + for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) { + if ((SD[sdno].state != sd_unallocated) /* subdisk is allocated */ + &&(SD[sdno].driveno == driveno)) /* and it belongs to this drive */ + remove_sd_entry(sdno, force, 0); + } + remove_drive(driveno); /* now remove it */ + vinum_conf.drives_used--; /* one less drive */ + } else + ioctl_reply->error = EBUSY; /* can't do that */ + } else { + remove_drive(driveno); /* just remove it */ + vinum_conf.drives_used--; /* one less drive */ + } +} + +/* remove a subdisk */ +void +remove_sd_entry(int sdno, int force, int recurse) +{ + struct sd *sd = &SD[sdno]; + + if ((sdno > vinum_conf.subdisks_allocated) /* not a valid sd */ + ||(sd->state == sd_unallocated)) { /* or nothing there */ + ioctl_reply->error = EINVAL; + strcpy(ioctl_reply->msg, "No such subdisk"); + } else if (sd->flags & VF_OPEN) /* we're open */ + ioctl_reply->error = EBUSY; /* no getting around that */ + else if (sd->plexno >= 0) { /* we have a plex */ + if (force) { /* do it at any cost */ + struct plex *plex = &PLEX[sd->plexno]; /* point to our plex */ + int mysdno; + + for (mysdno = 0; /* look for ourselves */ + mysdno < plex->subdisks && &SD[plex->sdnos[mysdno]] != sd; + mysdno++); + if (mysdno == plex->subdisks) /* didn't find it */ + log(LOG_ERR, + "Error removing subdisk %s: not found in plex %s\n", + SD[mysdno].name, + plex->name); + else { /* remove the subdisk from plex */ + if (mysdno < (plex->subdisks - 1)) /* not the last subdisk */ + bcopy(&plex->sdnos[mysdno + 1], + &plex->sdnos[mysdno], + (plex->subdisks - 1 - 
mysdno) * sizeof(int)); + plex->subdisks--; + sd->plexno = -1; /* disown the subdisk */ + } + + /* + * Removing a subdisk from a striped or + * RAID-4 or RAID-5 plex really tears the + * hell out of the structure, and it needs + * to be reinitialized. + */ + if (plex->organization != plex_concat) /* not concatenated, */ + set_plex_state(plex->plexno, plex_faulty, setstate_force); /* need to reinitialize */ + log(LOG_INFO, "vinum: removing %s\n", sd->name); + free_sd(sdno); + } else + ioctl_reply->error = EBUSY; /* can't do that */ + } else { + log(LOG_INFO, "vinum: removing %s\n", sd->name); + free_sd(sdno); + } +} + +/* remove a plex */ +void +remove_plex_entry(int plexno, int force, int recurse) +{ + struct plex *plex = &PLEX[plexno]; + int sdno; + + if ((plexno > vinum_conf.plexes_allocated) /* not a valid plex */ + ||(plex->state == plex_unallocated)) { /* or nothing there */ + ioctl_reply->error = EINVAL; + strcpy(ioctl_reply->msg, "No such plex"); + } else if (plex->flags & VF_OPEN) { /* we're open */ + ioctl_reply->error = EBUSY; /* no getting around that */ + return; + } + if (plex->subdisks) { + if (force) { /* do it anyway */ + if (recurse) { /* remove all below */ + int sds = plex->subdisks; + for (sdno = 0; sdno < sds; sdno++) + free_sd(plex->sdnos[sdno]); /* free all subdisks */ + } else { /* just tear them out */ + int sds = plex->subdisks; + for (sdno = 0; sdno < sds; sdno++) + SD[plex->sdnos[sdno]].plexno = -1; /* no plex any more */ + } + } else { /* can't do it without force */ + ioctl_reply->error = EBUSY; /* can't do that */ + return; + } + } + if (plex->volno >= 0) { /* we are part of a volume */ + if (force) { /* do it at any cost */ + struct volume *vol = &VOL[plex->volno]; + int myplexno; + + for (myplexno = 0; myplexno < vol->plexes; myplexno++) + if (vol->plex[myplexno] == plexno) /* found it */ + break; + if (myplexno == vol->plexes) /* didn't find it. Huh? 
*/ + log(LOG_ERR, + "Error removing plex %s: not found in volume %s\n", + plex->name, + vol->name); + if (myplexno < (vol->plexes - 1)) /* not the last plex in the list */ + bcopy(&vol->plex[myplexno + 1], + &vol->plex[myplexno], + vol->plexes - 1 - myplexno); + vol->plexes--; + } else { + ioctl_reply->error = EBUSY; /* can't do that */ + return; + } + } + log(LOG_INFO, "vinum: removing %s\n", plex->name); + free_plex(plexno); + vinum_conf.plexes_used--; /* one less plex */ +} + +/* remove a volume */ +void +remove_volume_entry(int volno, int force, int recurse) +{ + struct volume *vol = &VOL[volno]; + int plexno; + + if ((volno > vinum_conf.volumes_allocated) /* not a valid volume */ + ||(vol->state == volume_unallocated)) { /* or nothing there */ + ioctl_reply->error = EINVAL; + strcpy(ioctl_reply->msg, "No such volume"); + } else if (vol->flags & VF_OPEN) /* we're open */ + ioctl_reply->error = EBUSY; /* no getting around that */ + else if (vol->plexes) { + if (recurse && force) { /* remove all below */ + int plexes = vol->plexes; + +/* for (plexno = plexes - 1; plexno >= 0; plexno--) */ + for (plexno = 0; plexno < plexes; plexno++) + remove_plex_entry(vol->plex[plexno], force, recurse); + log(LOG_INFO, "vinum: removing %s\n", vol->name); + free_volume(volno); + vinum_conf.volumes_used--; /* one less volume */ + } else + ioctl_reply->error = EBUSY; /* can't do that */ + } else { + log(LOG_INFO, "vinum: removing %s\n", vol->name); + free_volume(volno); + vinum_conf.volumes_used--; /* one less volume */ + } +} + +/* Currently called only from ioctl */ +void +update_sd_config(int sdno, int diskconfig) +{ + if (!diskconfig) + set_sd_state(sdno, sd_up, setstate_configuring); + SD[sdno].flags &= ~VF_NEWBORN; +} + +void +update_plex_config(int plexno, int diskconfig) +{ + u_int64_t size; + int sdno; + struct plex *plex = &PLEX[plexno]; + enum plexstate state = plex_up; /* state we want the plex in */ + int remainder; /* size of fractional stripe at end */ + int 
added_plex; /* set if we add a plex to a volume */ + int required_sds; /* number of subdisks we need */ + struct sd *sd; + struct volume *vol; + int data_sds = 0; /* number of sds carrying data */ + + if (plex->state < plex_init) /* not a real plex, */ + return; + added_plex = 0; + if (plex->volno >= 0) { /* we have a volume */ + vol = &VOL[plex->volno]; + + /* + * If we're newly born, + * and the volume isn't, + * and it has other plexes, + * and we didn't read this mess from disk, + * we were added later. + */ + if ((plex->flags & VF_NEWBORN) + && ((vol->flags & VF_NEWBORN) == 0) + && (vol->plexes > 0) + && (diskconfig == 0)) { + added_plex = 1; + state = plex_down; /* so take ourselves down */ + } + } + /* + * Check that our subdisks make sense. For + * striped plexes, we need at least two + * subdisks, and for RAID-4 and RAID-5 plexes we + * need at least three subdisks. In each case + * they must all be the same size. + */ + if (plex->organization == plex_striped) { + data_sds = plex->subdisks; + required_sds = 2; + } else if (isparity(plex)) { /* RAID 4 or 5 */ + data_sds = plex->subdisks - 1; + required_sds = 3; + } else + required_sds = 0; + if (required_sds > 0) { /* striped, RAID-4 or RAID-5 */ + if (plex->subdisks < required_sds) { + log(LOG_ERR, + "vinum: plex %s does not have at least %d subdisks\n", + plex->name, + required_sds); + state = plex_faulty; + } + /* + * Now see if the plex size is a multiple of + * the stripe size. If not, trim off the end + * of each subdisk and return it to the drive. + */ + if (plex->length > 0) { + if (data_sds > 0) { + if (plex->stripesize > 0) { + remainder = (int) (plex->length /* are we exact? 
*/ + % ((u_int64_t) plex->stripesize * data_sds)); + if (remainder) { /* no */ + log(LOG_INFO, "vinum: removing %d blocks of partial stripe at the end of %s\n", + remainder, + plex->name); + plex->length -= remainder; /* shorten the plex */ + remainder /= data_sds; /* spread the remainder amongst the sds */ + for (sdno = 0; sdno < plex->subdisks; sdno++) { + sd = &SD[plex->sdnos[sdno]]; /* point to the subdisk */ + return_drive_space(sd->driveno, /* return the space */ + sd->driveoffset + sd->sectors - remainder, + remainder); + sd->sectors -= remainder; /* and shorten it */ + } + } + } else /* no data sds, */ + plex->length = 0; /* reset length */ + } + } + } + size = 0; + for (sdno = 0; sdno < plex->subdisks; sdno++) { + sd = &SD[plex->sdnos[sdno]]; + if (isstriped(plex) + && (sdno > 0) + && (sd->sectors != SD[plex->sdnos[sdno - 1]].sectors)) { + log(LOG_ERR, "vinum: %s must have equal sized subdisks\n", plex->name); + state = plex_down; + } + size += sd->sectors; + if (added_plex) /* we were added later */ + sd->state = sd_stale; /* stale until proven otherwise */ + if (plex->sectorsize != 0) { + if (sd->sectorsize != plex->sectorsize) /* incompatible sector sizes? */ + printf("vinum: incompatible sector sizes. " + "%s has %d bytes, %s has %d bytes. Ignored.\n", + sd->name, + sd->sectorsize, + plex->name, + plex->sectorsize); + } else /* not set yet, */ + plex->sectorsize = sd->sectorsize; + } + + if (plex->subdisks) { /* plex has subdisks, calculate size */ + /* + * XXX We shouldn't need to calculate the size any + * more. 
Check this some time + */ + if (isparity(plex)) + size = size / plex->subdisks * (plex->subdisks - 1); /* less space for RAID-4 and RAID-5 */ + if (plex->length != size) + log(LOG_INFO, + "Correcting length of %s: was %lld, is %lld\n", + plex->name, + (long long) plex->length, + (long long) size); + plex->length = size; + } else { /* no subdisks, */ + plex->length = 0; /* no size */ + state = plex_down; /* take it down */ + } + update_plex_state(plexno); /* set the state */ + plex->flags &= ~VF_NEWBORN; +} + +void +update_volume_config(int volno) +{ + struct volume *vol = &VOL[volno]; + struct plex *plex; + int plexno; + + if (vol->state != volume_unallocated) + /* + * Recalculate the size of the volume, + * which might change if the original + * plexes were not a multiple of the + * stripe size. + */ + { + vol->size = 0; + for (plexno = 0; plexno < vol->plexes; plexno++) { + plex = &PLEX[vol->plex[plexno]]; + vol->size = max(plex->length, vol->size); /* maximum size */ + plex->volplexno = plexno; /* note it in the plex */ + if (vol->sectorsize != 0) { + if (plex->sectorsize != vol->sectorsize) /* incompatible sector sizes? */ + printf("vinum: incompatible sector sizes. " + "%s has %d, %s has %d. Ignored.\n", + plex->name, + plex->sectorsize, + vol->name, + vol->sectorsize); + } else /* not set yet, */ + vol->sectorsize = plex->sectorsize; + } + } + vol->flags &= ~VF_NEWBORN; /* no longer newly born */ +} + +/* + * Update the global configuration. This is + * called after configuration changes. + * + * diskconfig is != 0 if we're reading in a config + * from disk. In this case, we don't try to bring + * the devices up, though we will bring them down + * if there's some error which got missed when + * writing to disk. 
+ */ +void +updateconfig(int diskconfig) +{ + int plexno; + int volno; + + for (plexno = 0; plexno < vinum_conf.plexes_allocated; plexno++) + update_plex_config(plexno, diskconfig); + + for (volno = 0; volno < vinum_conf.volumes_allocated; volno++) { + if (VOL[volno].state > volume_uninit) { + VOL[volno].flags &= ~VF_CONFIG_SETUPSTATE; /* no more setupstate */ + update_volume_state(volno); + update_volume_config(volno); + } + } + save_config(); +} + +/* + * Start manual changes to the configuration and lock out + * others who may wish to do so. + * XXX why do we need this and lock_config too? + */ +int +start_config(int force) +{ + int error; + + current_drive = -1; /* note the last drive we mention, for + * some defaults */ + current_plex = -1; /* and the same for the last plex */ + current_volume = -1; /* and the last volume */ + while ((vinum_conf.flags & VF_CONFIGURING) != 0) { + vinum_conf.flags |= VF_WILL_CONFIGURE; + if ((error = tsleep(&vinum_conf, PRIBIO | PCATCH, "vincfg", 0)) != 0) + return error; + } + /* + * We need two flags here: VF_CONFIGURING + * tells other processes to hold off (this + * function), and VF_CONFIG_INCOMPLETE + * tells the state change routines not to + * propagate incrememntal state changes + */ + vinum_conf.flags |= VF_CONFIGURING | VF_CONFIG_INCOMPLETE; + if (force) + vinum_conf.flags |= VF_FORCECONFIG; /* overwrite differently named drives */ + current_drive = -1; /* reset the defaults */ + current_plex = -1; /* and the same for the last plex */ + current_volume = -1; /* and the last volme */ + return 0; +} + +/* + * Update the config if update is 1, and unlock + * it. We won't update the configuration if we + * are called in a recursive loop via throw_rude_remark. 
+ */ +void +finish_config(int update) +{ + /* we've finished our config */ + vinum_conf.flags &= ~(VF_CONFIG_INCOMPLETE | VF_READING_CONFIG | VF_FORCECONFIG); + if (update) + updateconfig(0); /* so update things */ + else + updateconfig(1); /* do some updates only */ + vinum_conf.flags &= ~VF_CONFIGURING; /* and now other people can take a turn */ + if ((vinum_conf.flags & VF_WILL_CONFIGURE) != 0) { + vinum_conf.flags &= ~VF_WILL_CONFIGURE; + wakeup_one(&vinum_conf); + } +} +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumdaemon.c b/sys/dev/vinum/vinumdaemon.c new file mode 100644 index 0000000..3ae09c0 --- /dev/null +++ b/sys/dev/vinum/vinumdaemon.c @@ -0,0 +1,281 @@ +/* daemon.c: kernel part of Vinum daemon */ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumdaemon.c,v 1.8 2000/01/03 05:22:03 grog Exp grog $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> + +#ifdef VINUMDEBUG +#include <sys/reboot.h> +#endif + +/* declarations */ +void recover_io(struct request *rq); + +int daemon_options = 0; /* options */ +int daemonpid; /* PID of daemon */ +struct daemonq *daemonq; /* daemon's work queue */ +struct daemonq *dqend; /* and the end of the queue */ + +/* + * We normally call Malloc to get a queue element. In interrupt + * context, we can't guarantee that we'll get one, since we're not + * allowed to wait. If malloc fails, use one of these elements. 
+ */ + +#define INTQSIZE 4 +struct daemonq intq[INTQSIZE]; /* queue elements for interrupt context */ +struct daemonq *intqp; /* and pointer in it */ + +void +vinum_daemon(void) +{ + int s; + struct daemonq *request; + + PROC_LOCK(curproc); + curproc->p_flag |= P_SYSTEM; /* we're a system process */ + mtx_lock_spin(&sched_lock); + curproc->p_sflag |= PS_INMEM; + mtx_unlock_spin(&sched_lock); + PROC_UNLOCK(curproc); + daemon_save_config(); /* start by saving the configuration */ + daemonpid = curproc->p_pid; /* mark our territory */ + while (1) { + tsleep(&vinum_daemon, PRIBIO, "vinum", 0); /* wait for something to happen */ + + /* + * It's conceivable that, as the result of an + * I/O error, we'll be out of action long + * enough that another daemon gets started. + * That's OK, just give up gracefully. + */ + if (curproc->p_pid != daemonpid) { /* we've been ousted in our sleep */ + if (daemon_options & daemon_verbose) + log(LOG_INFO, "vinum: abdicating\n"); + return; + } + while (daemonq != NULL) { /* we have work to do, */ + s = splhigh(); /* don't get interrupted here */ + request = daemonq; /* get the request */ + daemonq = daemonq->next; /* and detach it */ + if (daemonq == NULL) /* got to the end, */ + dqend = NULL; /* no end any more */ + splx(s); + + switch (request->type) { + /* + * We had an I/O error on a request. Go through the + * request and try to salvage it + */ + case daemonrq_ioerror: + if (daemon_options & daemon_verbose) { + struct request *rq = request->info.rq; + + log(LOG_WARNING, + "vinum: recovering I/O request: %p\n%s dev %d.%d, offset 0x%llx, length %ld\n", + rq, + rq->bp->b_iocmd == BIO_READ ? "Read" : "Write", + major(rq->bp->b_dev), + minor(rq->bp->b_dev), + (long long)rq->bp->b_blkno, + rq->bp->b_bcount); + } + recover_io(request->info.rq); /* the failed request */ + break; + + /* + * Write the config to disk. We could end up with + * quite a few of these in a row. 
Only honour the + * last one + */ + case daemonrq_saveconfig: + if ((daemonq == NULL) /* no more requests */ + ||(daemonq->type != daemonrq_saveconfig)) { /* or the next isn't the same */ + if (((daemon_options & daemon_noupdate) == 0) /* we're allowed to do it */ + &&((vinum_conf.flags & VF_READING_CONFIG) == 0)) { /* and we're not building the config now */ + /* + * We obviously don't want to save a + * partial configuration. Less obviously, + * we don't need to do anything if we're + * asked to write the config when we're + * building it up, because we save it at + * the end. + */ + if (daemon_options & daemon_verbose) + log(LOG_INFO, "vinum: saving config\n"); + daemon_save_config(); /* save it */ + } + } + break; + + case daemonrq_return: /* been told to stop */ + if (daemon_options & daemon_verbose) + log(LOG_INFO, "vinum: stopping\n"); + daemon_options |= daemon_stopped; /* note that we've stopped */ + Free(request); + while (daemonq != NULL) { /* backed up requests, */ + request = daemonq; /* get the request */ + daemonq = daemonq->next; /* and detach it */ + Free(request); /* then free it */ + } + wakeup(&vinumclose); /* and wake any waiting vinum(8)s */ + return; + + case daemonrq_ping: /* tell the caller we're here */ + if (daemon_options & daemon_verbose) + log(LOG_INFO, "vinum: ping reply\n"); + wakeup(&vinum_finddaemon); /* wake up the caller */ + break; + + case daemonrq_closedrive: /* close a drive */ + close_drive(request->info.drive); /* do it */ + break; + + case daemonrq_init: /* initialize a plex */ + /* XXX */ + case daemonrq_revive: /* revive a subdisk */ + /* XXX */ + /* FALLTHROUGH */ + default: + log(LOG_WARNING, "Invalid request\n"); + break; + } + if (request->privateinuse) /* one of ours, */ + request->privateinuse = 0; /* no longer in use */ + else + Free(request); /* return it */ + } + } +} + +/* + * Recover a failed I/O operation. 
+ * + * The correct way to do this is to examine the request and determine + * how to recover each individual failure. In the case of a write, + * this could be as simple as doing nothing: the defective drives may + * already be down, and there may be nothing else to do. In case of + * a read, it will be necessary to retry if there are alternative + * copies of the data. + * + * The easy way (here) is just to reissue the request. This will take + * a little longer, but nothing like as long as the failure will have + * taken. + * + */ +void +recover_io(struct request *rq) +{ + /* + * This should read: + * + * vinumstrategy(rq->bp); + * + * Negotiate with phk to get it fixed. + */ + DEV_STRATEGY(rq->bp); /* reissue the command */ +} + +/* Functions called to interface with the daemon */ + +/* queue a request for the daemon */ +void +queue_daemon_request(enum daemonrq type, union daemoninfo info) +{ + int s; + + struct daemonq *qelt = (struct daemonq *) Malloc(sizeof(struct daemonq)); + + if (qelt == NULL) { /* malloc failed, we're prepared for that */ + /* + * Take one of our spares. Give up if it's still in use; the only + * message we're likely to get here is a 'drive failed' message, + * and that'll come by again if we miss it. + */ + if (intqp->privateinuse) /* still in use? */ + return; /* yes, give up */ + qelt = intqp++; + if (intqp == &intq[INTQSIZE]) /* got to the end, */ + intqp = intq; /* wrap around */ + qelt->privateinuse = 1; /* it's ours, and it's in use */ + } else + qelt->privateinuse = 0; + + qelt->next = NULL; /* end of the chain */ + qelt->type = type; + qelt->info = info; + s = splhigh(); + if (daemonq) { /* something queued already */ + dqend->next = qelt; + dqend = qelt; + } else { /* queue is empty, */ + daemonq = qelt; /* this is the whole queue */ + dqend = qelt; + } + splx(s); + wakeup(&vinum_daemon); /* and give the dæmon a kick */ +} + +/* + * see if the daemon is running. 
Return 0 (no error) + * if it is, ESRCH otherwise + */ +int +vinum_finddaemon() +{ + int result; + + if (daemonpid != 0) { /* we think we have a daemon, */ + queue_daemon_request(daemonrq_ping, (union daemoninfo) 0); /* queue a ping */ + result = tsleep(&vinum_finddaemon, PUSER, "reap", 2 * hz); + if (result == 0) /* yup, the daemon's up and running */ + return 0; + } + /* no daemon, or we couldn't talk to it: start it */ + vinum_daemon(); /* start the daemon */ + return 0; +} + +int +vinum_setdaemonopts(int options) +{ + daemon_options = options; + return 0; +} diff --git a/sys/dev/vinum/vinumext.h b/sys/dev/vinum/vinumext.h new file mode 100644 index 0000000..807bb5c6 --- /dev/null +++ b/sys/dev/vinum/vinumext.h @@ -0,0 +1,263 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumext.h,v 1.33 2003/05/23 00:57:48 grog Exp $ + * $FreeBSD$ + */ + +/* vinumext.h: external definitions */ + +/* *sigh* We still need this at the moment. */ +#ifdef _KERNEL +extern struct _vinum_conf vinum_conf; /* configuration information */ +extern struct mtx plexmutex[]; /* mutexes for plexes to use */ +#else +extern struct __vinum_conf vinum_conf; /* configuration information */ +#endif + +#ifdef VINUMDEBUG +extern int debug; /* debug flags */ +#endif + +/* Physical read and write drive */ +#define read_drive(a, b, c, d) driveio (a, b, c, d, BIO_READ) +#define write_drive(a, b, c, d) driveio (a, b, c, d, BIO_WRITE) + +#define CHECKALLOC(ptr, msg) \ + if (ptr == NULL) \ + { \ + printf (msg); \ + longjmp (command_fail, -1); \ + } +#ifndef _KERNEL +struct vnode; +struct thread; +#endif + +#ifdef _KERNEL +int vinum_inactive(int); +void free_vinum(int); +int give_sd_to_plex(int plexno, int sdno); +void give_sd_to_drive(int sdno); +int give_plex_to_volume(int, int, int); +struct drive *check_drive(char *); +enum drive_label_info read_drive_label(struct drive *, int); +int parse_config(char *, struct keywordset *, int); +int parse_user_config(char *cptr, struct keywordset *keyset); +u_int64_t sizespec(char 
*spec); +int volume_index(struct volume *volume); +int plex_index(struct plex *plex); +int sd_index(struct sd *sd); +int drive_index(struct drive *drive); +int my_plex(int volno, int plexno); +int my_sd(int plexno, int sdno); +int get_empty_drive(void); +int find_drive(const char *name, int create); +int find_drive_by_name(const char *devname, int create); +int get_empty_sd(void); +int find_subdisk(const char *name, int create); +void return_drive_space(int driveno, int64_t offset, int length); +void free_sd(int sdno); +void free_volume(int volno); +int get_empty_plex(void); +int find_plex(const char *name, int create); +void free_plex(int plexno); +int get_empty_volume(void); +int find_volume(const char *name, int create); +void config_subdisk(int); +void config_plex(int); +void config_volume(int); +void config_drive(int); +void updateconfig(int); +void update_sd_config(int sdno, int kernelstate); +void update_plex_config(int plexno, int kernelstate); +void update_volume_config(int volno); +void update_config(void); +void drive_io_done(struct buf *); +void save_config(void); +void daemon_save_config(void); +void write_config(char *, int); +int start_config(int); +void finish_config(int); +void remove(struct vinum_ioctl_msg *msg); +void remove_drive_entry(int driveno, int force); +void remove_sd_entry(int sdno, int force, int recurse); +void remove_plex_entry(int plexno, int force, int recurse); +void remove_volume_entry(int volno, int force, int recurse); + +void checkdiskconfig(char *); +int open_drive(struct drive *, struct thread *, int); +void close_drive(struct drive *drive); +void close_locked_drive(struct drive *drive); +int driveio(struct drive *, char *, size_t, off_t, int); +int set_drive_parms(struct drive *drive); +int init_drive(struct drive *, int); +/* void throw_rude_remark (int, struct _ioctl_reply *, char *, ...); XXX */ +void throw_rude_remark(int, char *,...); + +void format_config(char *config, int len); +void checkkernel(char *op); +void 
free_drive(struct drive *drive); +void down_drive(struct drive *drive); +void remove_drive(int driveno); + +int vinum_scandisk(char *drivename); + +/* I/O */ +d_open_t vinumopen; +d_close_t vinumclose; +d_strategy_t vinumstrategy; +d_ioctl_t vinumioctl; + +int vinum_super_ioctl(dev_t, u_long, caddr_t); +int vinumstart(struct buf *bp, int reviveok); +int launch_requests(struct request *rq, int reviveok); +void sdio(struct buf *bp); + +/* XXX Do we need this? */ +int vinumpart(dev_t); + +extern jmp_buf command_fail; /* return here if config fails */ + +#ifdef VINUMDEBUG +/* Memory allocation and request tracing */ +void vinum_meminfo(caddr_t data); +int vinum_mallocinfo(caddr_t data); +int vinum_rqinfo(caddr_t data); +void LongJmp(jmp_buf, int); +char *basename(char *); +#endif + +#ifdef VINUMDEBUG +void expand_table(void **, int, int, char *, int); +#else +void expand_table(void **, int, int); +#endif + +struct disklabel; +struct request; +struct rqgroup *allocrqg(struct request *rq, int elements); +void deallocrqg(struct rqgroup *rqg); + +/* Device number decoding */ +int Volno(dev_t x); +int Plexno(dev_t x); +int Sdno(dev_t x); + +/* State transitions */ +int set_drive_state(int driveno, enum drivestate state, enum setstateflags flags); +int set_sd_state(int sdno, enum sdstate state, enum setstateflags flags); +enum requeststatus checksdstate(struct sd *sd, struct request *rq, daddr_t diskaddr, daddr_t diskend); +int set_plex_state(int plexno, enum plexstate state, enum setstateflags flags); +int set_volume_state(int volumeno, enum volumestate state, enum setstateflags flags); +void update_sd_state(int sdno); +void forceup(int plexno); +void update_plex_state(int plexno); +void update_volume_state(int volno); +void invalidate_subdisks(struct plex *, enum sdstate); +void get_volume_label(char *name, int plexes, u_int64_t size, struct disklabel *lp); +int write_volume_label(int); +void start_object(struct vinum_ioctl_msg *); +void stop_object(struct vinum_ioctl_msg 
*); +void setstate(struct vinum_ioctl_msg *msg); +void setstate_by_force(struct vinum_ioctl_msg *msg); +void vinum_label(int); +int vinum_writedisklabel(struct volume *, struct disklabel *); +int initsd(int, int); +struct buf *parityrebuild(struct plex *, u_int64_t, int, enum parityop, struct rangelock **, off_t *); +enum requeststatus sddownstate(struct request *rq); + +int restart_plex(int plexno); +int revive_read(struct sd *sd); +int revive_block(int sdno); +void parityops(struct vinum_ioctl_msg *); + +/* Auxiliary functions */ +enum sdstates sdstatemap(struct plex *plex); +enum volplexstate vpstate(struct plex *plex); +#endif + +struct drive *validdrive(int driveno, struct _ioctl_reply *); +struct sd *validsd(int sdno, struct _ioctl_reply *); +struct plex *validplex(int plexno, struct _ioctl_reply *); +struct volume *validvol(int volno, struct _ioctl_reply *); +void resetstats(struct vinum_ioctl_msg *msg); + +/* Locking */ +#ifdef VINUMDEBUG +int lockdrive(struct drive *drive, char *, int); +#else +int lockdrive(struct drive *drive); +#endif +void unlockdrive(struct drive *drive); +int lockvol(struct volume *vol); +void unlockvol(struct volume *vol); +int lockplex(struct plex *plex); +void unlockplex(struct plex *plex); +struct rangelock *lockrange(daddr_t stripe, struct buf *bp, struct plex *plex); +int lock_config(void); +void unlock_config(void); + +/* Dæmon */ + +void vinum_daemon(void); +int vinum_finddaemon(void); +int vinum_setdaemonopts(int); +extern struct daemonq *daemonq; /* daemon's work queue */ +extern struct daemonq *dqend; /* and the end of the queue */ +extern struct cdevsw vinum_cdevsw; + +#undef Free /* defined in some funny net stuff */ +#ifdef _KERNEL +#ifdef VINUMDEBUG +#define Malloc(x) MMalloc ((x), __FILE__, __LINE__) /* show where we came from */ +#define Free(x) FFree ((x), __FILE__, __LINE__) /* show where we came from */ +caddr_t MMalloc(int size, char *, int); +void FFree(void *mem, char *, int); +#define LOCKDRIVE(d) lockdrive 
(d, __FILE__, __LINE__) +#else +#define Malloc(x) malloc((x), M_DEVBUF, \ + curthread->td_proc->p_intr_nesting_level == 0? M_WAITOK: M_NOWAIT) +#define Free(x) free((x), M_DEVBUF) +#define LOCKDRIVE(d) lockdrive (d) +#endif +#else +#define Malloc(x) malloc ((x)) /* just the size */ +#define Free(x) free ((x)) /* just the address */ +#endif + +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumhdr.h b/sys/dev/vinum/vinumhdr.h new file mode 100644 index 0000000..e8161e8 --- /dev/null +++ b/sys/dev/vinum/vinumhdr.h @@ -0,0 +1,80 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + */ + +/* Header files used by all modules */ +/* + * $Id: vinumhdr.h,v 1.19 2001/05/22 04:07:22 grog Exp grog $ + * $FreeBSD$ + */ + +#include <sys/param.h> +#ifdef _KERNEL +#include "opt_vinum.h" +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/conf.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/sysctl.h> +#endif +#include <sys/errno.h> +#include <sys/time.h> +#include <sys/bio.h> +#include <sys/buf.h> +#include <sys/malloc.h> +#include <sys/uio.h> +#include <sys/namei.h> +#include <sys/stat.h> +#include <sys/disk.h> +#include <sys/disklabel.h> +#include <sys/syslog.h> +#include <sys/fcntl.h> +#include <sys/queue.h> +#ifdef _KERNEL +#include <machine/setjmp.h> +#include <machine/stdarg.h> +#else +#include <setjmp.h> +#include <stdarg.h> +#endif +#include <vm/vm.h> +#include <dev/vinum/vinumvar.h> +#include <dev/vinum/vinumio.h> +#include <dev/vinum/vinumkw.h> +#include <dev/vinum/vinumext.h> +#include <dev/vinum/vinumutil.h> +#include <machine/cpu.h> diff --git a/sys/dev/vinum/vinuminterrupt.c b/sys/dev/vinum/vinuminterrupt.c new file mode 100644 index 0000000..8d72579 --- /dev/null +++ b/sys/dev/vinum/vinuminterrupt.c @@ -0,0 +1,467 @@ +/* vinuminterrupt.c: bottom half of the driver */ + +/*- + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. 
+ * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. 
+ * + * $Id: vinuminterrupt.c,v 1.14 2001/05/23 23:03:37 grog Exp grog $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> +#include <sys/resourcevar.h> + +void complete_raid5_write(struct rqelement *); +void complete_rqe(struct buf *bp); +void sdio_done(struct buf *bp); + +/* + * Take a completed buffer, transfer the data back if + * it's a read, and complete the high-level request + * if this is the last subrequest. + * + * The bp parameter is in fact a struct rqelement, which + * includes a couple of extras at the end. + */ +void +complete_rqe(struct buf *bp) +{ + struct rqelement *rqe; + struct request *rq; + struct rqgroup *rqg; + struct buf *ubp; /* user buffer */ + struct drive *drive; + struct sd *sd; + char *gravity; /* for error messages */ + + rqe = (struct rqelement *) bp; /* point to the element element that completed */ + rqg = rqe->rqg; /* and the request group */ + rq = rqg->rq; /* and the complete request */ + ubp = rq->bp; /* user buffer */ + +#ifdef VINUMDEBUG + if (debug & DEBUG_LASTREQS) + logrq(loginfo_iodone, (union rqinfou) rqe, ubp); +#endif + drive = &DRIVE[rqe->driveno]; + drive->active--; /* one less outstanding I/O on this drive */ + vinum_conf.active--; /* one less outstanding I/O globally */ + if ((drive->active == (DRIVE_MAXACTIVE - 1)) /* we were at the drive limit */ + ||(vinum_conf.active == VINUM_MAXACTIVE)) /* or the global limit */ + wakeup(&launch_requests); /* let another one at it */ + if ((bp->b_io.bio_flags & BIO_ERROR) != 0) { /* transfer in error */ + gravity = ""; + sd = &SD[rqe->sdno]; + + if (bp->b_error != 0) /* did it return a number? */ + rq->error = bp->b_error; /* yes, put it in. */ + else if (rq->error == 0) /* no: do we have one already? 
*/ + rq->error = EIO; /* no: catchall "I/O error" */ + sd->lasterror = rq->error; + if (bp->b_iocmd == BIO_READ) { /* read operation */ + if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) { + gravity = " fatal"; + set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */ + } + log(LOG_ERR, + "%s:%s read error, block %lld for %ld bytes\n", + gravity, + sd->name, + (long long)bp->b_blkno, + bp->b_bcount); + } else { /* write operation */ + if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) { + gravity = "fatal "; + set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */ + } + log(LOG_ERR, + "%s:%s write error, block %lld for %ld bytes\n", + gravity, + sd->name, + (long long)bp->b_blkno, + bp->b_bcount); + } + log(LOG_ERR, + "%s: user buffer block %lld for %ld bytes\n", + sd->name, + (long long)ubp->b_blkno, + ubp->b_bcount); + if (rq->error == ENXIO) { /* the drive's down too */ + log(LOG_ERR, + "%s: fatal drive I/O error, block %lld for %ld bytes\n", + DRIVE[rqe->driveno].label.name, + (long long)bp->b_blkno, + bp->b_bcount); + DRIVE[rqe->driveno].lasterror = rq->error; + set_drive_state(rqe->driveno, /* take the drive down */ + drive_down, + setstate_force); + } + } + /* Now update the statistics */ + if (bp->b_iocmd == BIO_READ) { /* read operation */ + DRIVE[rqe->driveno].reads++; + DRIVE[rqe->driveno].bytes_read += bp->b_bcount; + SD[rqe->sdno].reads++; + SD[rqe->sdno].bytes_read += bp->b_bcount; + PLEX[rqe->rqg->plexno].reads++; + PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount; + if (PLEX[rqe->rqg->plexno].volno >= 0) { /* volume I/O, not plex */ + VOL[PLEX[rqe->rqg->plexno].volno].reads++; + VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount; + } + } else { /* write operation */ + DRIVE[rqe->driveno].writes++; + DRIVE[rqe->driveno].bytes_written += bp->b_bcount; + SD[rqe->sdno].writes++; + SD[rqe->sdno].bytes_written += bp->b_bcount; + PLEX[rqe->rqg->plexno].writes++; + 
PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount; + if (PLEX[rqe->rqg->plexno].volno >= 0) { /* volume I/O, not plex */ + VOL[PLEX[rqe->rqg->plexno].volno].writes++; + VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount; + } + } + if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */ + int *sdata; /* source */ + int *data; /* and group data */ + int length; /* and count involved */ + int count; /* loop counter */ + struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */ + + /* XOR destination is the user data */ + sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */ + data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */ + length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */ + + for (count = 0; count < length; count++) + data[count] ^= sdata[count]; + + /* + * In a normal read, we will normally read directly + * into the user buffer. This doesn't work if + * we're also doing a recovery, so we have to + * copy it + */ + if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */ + char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */ + char *dst; + + dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */ + length = rqe->datalen << DEV_BSHIFT; /* and count involved */ + bcopy(src, dst, length); /* move it */ + } + } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */ + &&(rqg->active == 1)) /* and this is the last active request */ + complete_raid5_write(rqe); + /* + * This is the earliest place where we can be + * sure that the request has really finished, + * since complete_raid5_write can issue new + * requests. + */ + rqg->active--; /* this request now finished */ + if (rqg->active == 0) { /* request group finished, */ + rq->active--; /* one less */ + if (rqg->lock) { /* got a lock? 
*/ + unlockrange(rqg->plexno, rqg->lock); /* yes, free it */ + rqg->lock = 0; + } + } + if (rq->active == 0) { /* request finished, */ +#ifdef VINUMDEBUG + if (debug & DEBUG_RESID) { + if (ubp->b_resid != 0) /* still something to transfer? */ + Debugger("resid"); + } +#endif + + if (rq->error) { /* did we have an error? */ + if (rq->isplex) { /* plex operation, */ + ubp->b_io.bio_flags |= BIO_ERROR; /* yes, propagate to user */ + ubp->b_error = rq->error; + } else /* try to recover */ + queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */ + } else { + ubp->b_resid = 0; /* completed our transfer */ + if (rq->isplex == 0) /* volume request, */ + VOL[rq->volplex.volno].active--; /* another request finished */ + if (rq->flags & XFR_COPYBUF) { + Free(ubp->b_data); + ubp->b_data = rq->save_data; + } + bufdone(ubp); /* top level buffer completed */ + freerq(rq); /* return the request storage */ + } + } +} + +/* Free a request block and anything hanging off it */ +void +freerq(struct request *rq) +{ + struct rqgroup *rqg; + struct rqgroup *nrqg; /* next in chain */ + int rqno; + + for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) { /* through the whole request chain */ + if (rqg->lock) /* got a lock? 
*/ + unlockrange(rqg->plexno, rqg->lock); /* yes, free it */ + for (rqno = 0; rqno < rqg->count; rqno++) { + if ((rqg->rqe[rqno].flags & XFR_MALLOCED) /* data buffer was malloced, */ + &&rqg->rqe[rqno].b.b_data) /* and the allocation succeeded */ + Free(rqg->rqe[rqno].b.b_data); /* free it */ + if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) { /* locked this buffer, */ + BUF_UNLOCK(&rqg->rqe[rqno].b); /* unlock it again */ + BUF_LOCKFREE(&rqg->rqe[rqno].b); + } + } + nrqg = rqg->next; /* note the next one */ + Free(rqg); /* and free this one */ + } + Free(rq); /* free the request itself */ +} + +/* I/O on subdisk completed */ +void +sdio_done(struct buf *bp) +{ + struct sdbuf *sbp; + + sbp = (struct sdbuf *) bp; + if (sbp->b.b_io.bio_flags & BIO_ERROR) { /* had an error */ + sbp->bp->b_io.bio_flags |= BIO_ERROR; /* propagate upwards */ + sbp->bp->b_error = sbp->b.b_error; + } +#ifdef VINUMDEBUG + if (debug & DEBUG_LASTREQS) + logrq(loginfo_sdiodone, (union rqinfou) bp, bp); +#endif + sbp->bp->b_resid = sbp->b.b_resid; /* copy the resid field */ + /* Now update the statistics */ + if (bp->b_iocmd == BIO_READ) { /* read operation */ + DRIVE[sbp->driveno].reads++; + DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount; + SD[sbp->sdno].reads++; + SD[sbp->sdno].bytes_read += sbp->b.b_bcount; + } else { /* write operation */ + DRIVE[sbp->driveno].writes++; + DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount; + SD[sbp->sdno].writes++; + SD[sbp->sdno].bytes_written += sbp->b.b_bcount; + } + bufdone(sbp->bp); /* complete the caller's I/O */ + BUF_UNLOCK(&sbp->b); + BUF_LOCKFREE(&sbp->b); + Free(sbp); +} + +/* Start the second phase of a RAID-4 or RAID-5 group write operation. 
*/ +void +complete_raid5_write(struct rqelement *rqe) +{ + int *sdata; /* source */ + int *pdata; /* and parity block data */ + int length; /* and count involved */ + int count; /* loop counter */ + int rqno; /* request index */ + int rqoffset; /* offset of request data from parity data */ + struct buf *ubp; /* user buffer header */ + struct request *rq; /* pointer to our request */ + struct rqgroup *rqg; /* and to the request group */ + struct rqelement *prqe; /* point to the parity block */ + struct drive *drive; /* drive to access */ + + rqg = rqe->rqg; /* and to our request group */ + rq = rqg->rq; /* point to our request */ + ubp = rq->bp; /* user's buffer header */ + prqe = &rqg->rqe[0]; /* point to the parity block */ + + /* + * If we get to this function, we have normal or + * degraded writes, or a combination of both. We do + * the same thing in each case: we perform an + * exclusive or to the parity block. The only + * difference is the origin of the data and the + * address range. + */ + if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */ + pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */ + bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */ + + /* Now get what data we need from each block */ + for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ + rqe = &rqg->rqe[rqno]; /* this request */ + sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */ + length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ + + /* + * Add the data block to the parity block. Before + * we started the request, we zeroed the parity + * block, so the result of adding all the other + * blocks and the block we want to write will be + * the correct parity block. 
+ */ + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */ + &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */ + Free(rqe->b.b_data); /* free it now */ + rqe->flags &= ~XFR_MALLOCED; + } + } + } + if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */ + /* Get what data we need from each block */ + for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ + rqe = &rqg->rqe[rqno]; /* this request */ + if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE)) + == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */ + sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */ + rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */ + pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */ + length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */ + + /* + * "remove" the old data block + * from the parity block + */ + if ((pdata < ((int *) prqe->b.b_data)) + || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount))) + || (sdata < ((int *) rqe->b.b_data)) + || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount)))) + panic("complete_raid5_write: bounds overflow"); + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + + /* "add" the new data block */ + sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */ + if ((sdata < ((int *) ubp->b_data)) + || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount)))) + panic("complete_raid5_write: bounds overflow"); + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + + /* Free the malloced buffer */ + if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */ + Free(rqe->b.b_data); /* free it */ + rqe->flags &= ~XFR_MALLOCED; + } else + 
panic("complete_raid5_write: malloc conflict"); + + if ((rqe->b.b_iocmd == BIO_READ) /* this was a read */ + &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */ + rqe->b.b_flags &= ~B_DONE; /* start a new request */ + rqe->b.b_iocmd = BIO_WRITE; /* we're writing now */ + rqe->b.b_iodone = complete_rqe; /* call us here when done */ + rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */ + rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */ + rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */ + rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */ + rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ + rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */ + rqg->active++; /* another active request */ + drive = &DRIVE[rqe->driveno]; /* drive to access */ + + /* We can't sleep here, so we just increment the counters. */ + drive->active++; + if (drive->active >= drive->maxactive) + drive->maxactive = drive->active; + vinum_conf.active++; + if (vinum_conf.active >= vinum_conf.maxactive) + vinum_conf.maxactive = vinum_conf.active; +#ifdef VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n", + rqe->b.b_iocmd == BIO_READ ? 
"Read" : "Write", + major(rqe->b.b_dev), + minor(rqe->b.b_dev), + rqe->sdno, + (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), + (long long)rqe->b.b_blkno, + rqe->b.b_bcount); + if (debug & DEBUG_LASTREQS) + logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp); +#endif + DEV_STRATEGY(&rqe->b); + } + } + } + } + /* Finally, write the parity block */ + rqe = &rqg->rqe[0]; + rqe->b.b_flags &= ~B_DONE; /* we're not done */ + rqe->b.b_iocmd = BIO_WRITE; /* we're writing now */ + rqe->b.b_iodone = complete_rqe; /* call us here when done */ + rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */ + rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */ + rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */ + rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ + rqg->active++; /* another active request */ + drive = &DRIVE[rqe->driveno]; /* drive to access */ + + /* We can't sleep here, so we just increment the counters. */ + drive->active++; + if (drive->active >= drive->maxactive) + drive->maxactive = drive->active; + vinum_conf.active++; + if (vinum_conf.active >= vinum_conf.maxactive) + vinum_conf.maxactive = vinum_conf.active; + +#ifdef VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n", + rqe->b.b_iocmd == BIO_READ ? "Read" : "Write", + major(rqe->b.b_dev), + minor(rqe->b.b_dev), + rqe->sdno, + (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), + (long long)rqe->b.b_blkno, + rqe->b.b_bcount); + if (debug & DEBUG_LASTREQS) + logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp); +#endif + DEV_STRATEGY(&rqe->b); +} + +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumio.c b/sys/dev/vinum/vinumio.c new file mode 100644 index 0000000..8544f95 --- /dev/null +++ b/sys/dev/vinum/vinumio.c @@ -0,0 +1,959 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. 
+ * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. 
+ * + * $Id: vinumio.c,v 1.39 2003/05/23 00:59:53 grog Exp grog $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> + +static char *sappend(char *txt, char *s); +static int drivecmp(const void *va, const void *vb); + +/* + * Open the device associated with the drive, and + * set drive's vp. Return an error number. + */ +int +open_drive(struct drive *drive, struct thread *td, int verbose) +{ + struct cdevsw *dsw; /* pointer to cdevsw entry */ + + if (drive->flags & VF_OPEN) /* open already, */ + return EBUSY; /* don't do it again */ + + drive->dev = getdiskbyname(drive->devicename); + if (drive->dev == NODEV) /* didn't find anything */ + return ENOENT; + + drive->dev->si_iosize_max = DFLTPHYS; + dsw = devsw(drive->dev); + if (dsw == NULL) /* sanity, should not happen */ + drive->lasterror = ENOENT; + else if ((dsw->d_flags & D_DISK) == 0) + drive->lasterror = ENOTBLK; + else + drive->lasterror = (dsw->d_open) (drive->dev, FWRITE | FREAD, 0, NULL); + + if (drive->lasterror != 0) { /* failed */ + drive->state = drive_down; /* just force it down */ + if (verbose) + log(LOG_WARNING, + "vinum open_drive %s: failed with error %d\n", + drive->devicename, drive->lasterror); + } else + drive->flags |= VF_OPEN; /* we're open now */ + + return drive->lasterror; +} + +/* + * Set some variables in the drive struct in more + * convenient form. Return error indication. + */ +int +set_drive_parms(struct drive *drive) +{ + drive->blocksize = BLKDEV_IOSIZE; /* do we need this? 
*/ + drive->secsperblock = drive->blocksize /* number of sectors per block */ + / drive->sectorsize; + + /* Now update the label part */ + bcopy(hostname, drive->label.sysname, VINUMHOSTNAMELEN); /* put in host name */ + microtime(&drive->label.date_of_birth); /* and current time */ + drive->label.drive_size = drive->mediasize; /* size of the drive in bytes */ +#ifdef VINUMDEBUG + if (debug & DEBUG_BIGDRIVE) /* pretend we're 100 times as big */ + drive->label.drive_size *= 100; +#endif + + /* number of sectors available for subdisks */ + drive->sectors_available = drive->label.drive_size / DEV_BSIZE - DATASTART; + + /* + * Bug in 3.0 as of January 1998: you can open + * non-existent slices. They have a length of 0. + */ + if (drive->label.drive_size < MINVINUMSLICE) { /* too small to worry about */ + set_drive_state(drive->driveno, drive_down, setstate_force); + drive->lasterror = ENOSPC; + return ENOSPC; + } + drive->freelist_size = INITIAL_DRIVE_FREELIST; /* initial number of entries */ + drive->freelist = (struct drive_freelist *) + Malloc(INITIAL_DRIVE_FREELIST * sizeof(struct drive_freelist)); + if (drive->freelist == NULL) /* can't malloc, dammit */ + return ENOSPC; + drive->freelist_entries = 1; /* just (almost) the complete drive */ + drive->freelist[0].offset = DATASTART; /* starts here */ + drive->freelist[0].sectors = (drive->label.drive_size >> DEV_BSHIFT) - DATASTART; /* and it's this long */ + if (drive->label.name[0] != '\0') /* got a name */ + set_drive_state(drive->driveno, drive_up, setstate_force); /* our drive is accessible */ + else /* we know about it, but that's all */ + drive->state = drive_referenced; + return 0; +} + +/* + * Initialize a drive: open the device and add + * device information. 
+ */ +int +init_drive(struct drive *drive, int verbose) +{ + + drive->lasterror = open_drive(drive, curthread, verbose); /* open the drive */ + if (drive->lasterror) + return drive->lasterror; + + drive->lasterror = (*devsw(drive->dev)->d_ioctl) (drive->dev, + DIOCGSECTORSIZE, + (caddr_t) & drive->sectorsize, + FREAD, + curthread); + if (drive->lasterror == 0) + drive->lasterror = (*devsw(drive->dev)->d_ioctl) (drive->dev, + DIOCGMEDIASIZE, + (caddr_t) & drive->mediasize, + FREAD, + curthread); + if (drive->lasterror) { + if (verbose) + log(LOG_ERR, + "vinum: Can't get drive dimensions for %s: error %d\n", + drive->devicename, + drive->lasterror); + close_drive(drive); + return drive->lasterror; + } + return set_drive_parms(drive); /* set various odds and ends */ +} + +/* Close a drive if it's open. */ +void +close_drive(struct drive *drive) +{ + LOCKDRIVE(drive); /* keep the daemon out */ + if (drive->flags & VF_OPEN) + close_locked_drive(drive); /* and close it */ + if (drive->state > drive_down) /* if it's up */ + drive->state = drive_down; /* make sure it's down */ + unlockdrive(drive); +} + +/* + * Real drive close code, called with drive already locked. + * We have also checked that the drive is open. No errors. + */ +void +close_locked_drive(struct drive *drive) +{ + int error; + + /* + * If we can't access the drive, we can't flush + * the queues, which spec_close() will try to + * do. Get rid of them here first. + */ + error = (*devsw(drive->dev)->d_close) (drive->dev, FWRITE | FREAD, 0, NULL); + drive->flags &= ~VF_OPEN; /* no longer open */ + if (drive->lasterror == 0) + drive->lasterror = error; +} + +/* + * Remove drive from the configuration. + * Caller must ensure that it isn't active. 
+ */ +void +remove_drive(int driveno) +{ + struct drive *drive = &vinum_conf.drive[driveno]; + struct vinum_hdr *vhdr; /* buffer for header */ + int error; + + if (drive->state > drive_referenced) { /* real drive */ + if (drive->state == drive_up) { + vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN); /* allocate buffer */ + CHECKALLOC(vhdr, "Can't allocate memory"); + error = read_drive(drive, (void *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET); + if (error) + drive->lasterror = error; + else { + vhdr->magic = VINUM_NOMAGIC; /* obliterate the magic, but leave the rest */ + write_drive(drive, (void *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET); + } + Free(vhdr); + } + free_drive(drive); /* close it and free resources */ + save_config(); /* and save the updated configuration */ + } +} + +/* + * Transfer drive data. Usually called from one of these defines; + * #define read_drive(a, b, c, d) driveio (a, b, c, d, B_READ) + * #define write_drive(a, b, c, d) driveio (a, b, c, d, B_WRITE) + * + * length and offset are in bytes, but must be multiples of sector + * size. The function *does not check* for this condition, and + * truncates ruthlessly. + * Return error number. 
+ */ +int +driveio(struct drive *drive, char *buf, size_t length, off_t offset, int flag) +{ + int error; + struct buf *bp; + + error = 0; /* to keep the compiler happy */ + while (length) { /* divide into small enough blocks */ + int len = min(length, MAXBSIZE); /* maximum block device transfer is MAXBSIZE */ + + bp = geteblk(len); /* get a buffer header */ + bp->b_flags = 0; + bp->b_iocmd = flag; + bp->b_dev = drive->dev; /* device */ + bp->b_blkno = offset / drive->sectorsize; /* block number */ + bp->b_saveaddr = bp->b_data; + bp->b_data = buf; + bp->b_bcount = len; + DEV_STRATEGY(bp); /* initiate the transfer */ + error = bufwait(bp); + bp->b_data = bp->b_saveaddr; + bp->b_flags |= B_INVAL | B_AGE; + bp->b_ioflags &= ~BIO_ERROR; + brelse(bp); + if (error) + break; + length -= len; /* update pointers */ + buf += len; + offset += len; + } + return error; +} + +/* + * Check a drive for a vinum header. If found, + * update the drive information. We come here + * with a partially populated drive structure + * which includes the device name. + * + * Return information on what we found. + * + * This function is called from two places: check_drive, + * which wants to find out whether the drive is a + * Vinum drive, and config_drive, which asserts that + * it is a vinum drive. In the first case, we don't + * print error messages (verbose==0), in the second + * we do (verbose==1). 
+ */ +enum drive_label_info +read_drive_label(struct drive *drive, int verbose) +{ + int error; + int result; /* result of our search */ + struct vinum_hdr *vhdr; /* and as header */ + + error = init_drive(drive, 0); /* find the drive */ + if (error) /* find the drive */ + return DL_CANT_OPEN; /* not ours */ + + vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN); /* allocate buffers */ + CHECKALLOC(vhdr, "Can't allocate memory"); + + drive->state = drive_up; /* be optimistic */ + error = read_drive(drive, (void *) vhdr, VINUMHEADERLEN, VINUM_LABEL_OFFSET); + if (vhdr->magic == VINUM_MAGIC) { /* ours! */ + if (drive->label.name[0] /* we have a name for this drive */ + &&(strcmp(drive->label.name, vhdr->label.name))) { /* but it doesn't match the real name */ + drive->lasterror = EINVAL; + result = DL_WRONG_DRIVE; /* it's the wrong drive */ + drive->state = drive_unallocated; /* put it back, it's not ours */ + } else + result = DL_OURS; + /* + * We copy the drive anyway so that we have + * the correct name in the drive info. This + * may not be the name specified + */ + drive->label = vhdr->label; /* put in the label information */ + } else if (vhdr->magic == VINUM_NOMAGIC) /* was ours, but we gave it away */ + result = DL_DELETED_LABEL; /* and return the info */ + else + result = DL_NOT_OURS; /* we could have it, but we don't yet */ + Free(vhdr); /* that's all. */ + return result; +} + +/* + * Check a drive for a vinum header. If found, + * read configuration information from the drive and + * incorporate the data into the configuration. + * + * Return drive number. 
+ */ +struct drive * +check_drive(char *devicename) +{ + int driveno; + int i; + struct drive *drive; + + driveno = find_drive_by_name(devicename, 1); /* if entry doesn't exist, create it */ + drive = &vinum_conf.drive[driveno]; /* and get a pointer */ + + if (drive->state >= drive_down) /* up or down, we know it */ + return drive; + if (read_drive_label(drive, 0) == DL_OURS) { /* one of ours */ + for (i = 0; i < vinum_conf.drives_allocated; i++) { /* see if the name already exists */ + if ((i != driveno) /* not this drive */ + &&(DRIVE[i].state != drive_unallocated) /* and it's allocated */ + &&(strcmp(DRIVE[i].label.name, + DRIVE[driveno].label.name) == 0)) { /* and it has the same name */ + struct drive *mydrive = &DRIVE[i]; + + if (mydrive->devicename[0] == '/') { /* we know a device name for it */ + /* + * set an error, but don't take the + * drive down: that would cause unneeded + * error messages. + */ + drive->lasterror = EEXIST; + break; + } else { /* it's just a place holder, */ + int sdno; + + for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) { /* look at each subdisk */ + if ((SD[sdno].driveno == i) /* it's pointing to this one, */ + &&(SD[sdno].state != sd_unallocated)) { /* and it's a real subdisk */ + SD[sdno].driveno = drive->driveno; /* point to the one we found */ + update_sd_state(sdno); /* and update its state */ + } + } + bzero(mydrive, sizeof(struct drive)); /* don't deallocate it, just remove it */ + } + } + } + return drive; + } else { /* not ours, */ + close_drive(drive); + free_drive(drive); /* get rid of it */ + return NULL; + } +} + +static char * +sappend(char *txt, char *s) +{ + while ((*s++ = *txt++) != 0); + return s - 1; +} + +void +format_config(char *config, int len) +{ + int i; + int j; + char *s = config; + char *configend = &config[len]; + + bzero(config, len); + + /* First write the volume configuration */ + for (i = 0; i < vinum_conf.volumes_allocated; i++) { + struct volume *vol; + + vol = &vinum_conf.volume[i]; + 
if ((vol->state > volume_uninit) + && (vol->name[0] != '\0')) { /* paranoia */ + snprintf(s, + configend - s, + "volume %s state %s", + vol->name, + volume_state(vol->state)); + while (*s) + s++; /* find the end */ + s = sappend("\n", s); + } + } + + /* Then the plex configuration */ + for (i = 0; i < vinum_conf.plexes_allocated; i++) { + struct plex *plex; + struct volume *vol; + + plex = &vinum_conf.plex[i]; + if ((plex->state > plex_referenced) + && (plex->name[0] != '\0')) { /* paranoia */ + snprintf(s, + configend - s, + "plex name %s state %s org %s ", + plex->name, + plex_state(plex->state), + plex_org(plex->organization)); + while (*s) + s++; /* find the end */ + if (isstriped(plex)) { + snprintf(s, + configend - s, + "%ds ", + (int) plex->stripesize); + while (*s) + s++; /* find the end */ + } + if (plex->volno >= 0) { /* we have a volume */ + vol = &VOL[plex->volno]; + snprintf(s, + configend - s, + "vol %s ", + vol->name); + while (*s) + s++; /* find the end */ + if ((vol->preferred_plex >= 0) /* has a preferred plex */ + &&vol->plex[vol->preferred_plex] == i) /* and it's us */ + snprintf(s, configend - s, "preferred "); + while (*s) + s++; /* find the end */ + } + for (j = 0; j < plex->subdisks; j++) { + snprintf(s, + configend - s, + " sd %s", + vinum_conf.sd[plex->sdnos[j]].name); + } + s = sappend("\n", s); + } + } + + /* And finally the subdisk configuration */ + for (i = 0; i < vinum_conf.subdisks_allocated; i++) { + struct sd *sd; + char *drivename; + + sd = &SD[i]; + if ((sd->state != sd_referenced) + && (sd->state != sd_unallocated) + && (sd->name[0] != '\0')) { /* paranoia */ + drivename = vinum_conf.drive[sd->driveno].label.name; + /* + * XXX We've seen cases of dead subdisks + * which don't have a drive. If we let them + * through here, the drive name is null, so + * they get the drive named 'plex'. + * + * This is a breakage limiter, not a fix. 
+ */ + if (drivename[0] == '\0') + drivename = "*invalid*"; + snprintf(s, + configend - s, + "sd name %s drive %s len %llus driveoffset %llus state %s", + sd->name, + drivename, + (unsigned long long) sd->sectors, + (unsigned long long) sd->driveoffset, + sd_state(sd->state)); + while (*s) + s++; /* find the end */ + if (sd->plexno >= 0) + snprintf(s, + configend - s, + " plex %s plexoffset %llds", + vinum_conf.plex[sd->plexno].name, + (long long) sd->plexoffset); + else + snprintf(s, configend - s, " detached"); + while (*s) + s++; /* find the end */ + if (sd->flags & VF_RETRYERRORS) { + snprintf(s, configend - s, " retryerrors"); + while (*s) + s++; /* find the end */ + } + snprintf(s, configend - s, " \n"); + while (*s) + s++; /* find the end */ + } + } + if (s > &config[len - 2]) + panic("vinum: configuration data overflow"); +} + +/* + * issue a save config request to the dæmon. The actual work + * is done in process context by daemon_save_config. + */ +void +save_config(void) +{ + queue_daemon_request(daemonrq_saveconfig, (union daemoninfo) 0); +} + +/* + * Write the configuration to all vinum slices. This + * is performed by the daemon only. 
+ */ +void +daemon_save_config(void) +{ + int error; + int written_config; /* set when we first write the config to disk */ + int driveno; + struct drive *drive; /* point to current drive info */ + struct vinum_hdr *vhdr; /* and as header */ + char *config; /* point to config data */ + + /* don't save the configuration while we're still working on it */ + if (vinum_conf.flags & VF_CONFIGURING) + return; + written_config = 0; /* no config written yet */ + /* Build a volume header */ + vhdr = (struct vinum_hdr *) Malloc(VINUMHEADERLEN); /* get space for the config data */ + CHECKALLOC(vhdr, "Can't allocate config data"); + vhdr->magic = VINUM_MAGIC; /* magic number */ + vhdr->config_length = MAXCONFIG; /* length of following config info */ + + config = Malloc(MAXCONFIG); /* get space for the config data */ + CHECKALLOC(config, "Can't allocate config data"); + + format_config(config, MAXCONFIG); + error = 0; /* no errors yet */ + for (driveno = 0; driveno < vinum_conf.drives_allocated; driveno++) { + drive = &vinum_conf.drive[driveno]; /* point to drive */ + if (drive->state > drive_referenced) { + LOCKDRIVE(drive); /* don't let it change */ + + /* + * First, do some drive consistency checks. Some + * of these are kludges, others require a process + * context and couldn't be done before. 
+ */ + if ((drive->devicename[0] == '\0') + || (drive->label.name[0] == '\0')) { + unlockdrive(drive); + free_drive(drive); /* get rid of it */ + break; + } + if (((drive->flags & VF_OPEN) == 0) /* drive not open */ + &&(drive->state > drive_down)) { /* and it thinks it's not down */ + unlockdrive(drive); + set_drive_state(driveno, drive_down, setstate_force); /* tell it what's what */ + continue; + } + if ((drive->state == drive_down) /* it's down */ + &&(drive->flags & VF_OPEN)) { /* but open, */ + unlockdrive(drive); + close_drive(drive); /* close it */ + } else if (drive->state > drive_down) { + microtime(&drive->label.last_update); /* time of last update is now */ + bcopy((char *) &drive->label, /* and the label info from the drive structure */ + (char *) &vhdr->label, + sizeof(vhdr->label)); + if ((drive->state != drive_unallocated) + && (drive->state != drive_referenced)) { /* and it's a real drive */ + error = write_drive(drive, + (char *) vhdr, + VINUMHEADERLEN, + VINUM_LABEL_OFFSET); + if (error == 0) /* first config copy */ + error = write_drive(drive, + config, + MAXCONFIG, + VINUM_CONFIG_OFFSET); + if (error == 0) + error = write_drive(drive, /* second copy */ + config, + MAXCONFIG, + VINUM_CONFIG_OFFSET + MAXCONFIG); + unlockdrive(drive); + if (error) { + log(LOG_ERR, + "vinum: Can't write config to %s, error %d\n", + drive->devicename, + error); + set_drive_state(drive->driveno, drive_down, setstate_force); + } else + written_config = 1; /* we've written it on at least one drive */ + } + } else /* not worth looking at, */ + unlockdrive(drive); /* just unlock it again */ + } + } + Free(vhdr); + Free(config); +} + +/* + * Disk labels are a mess. The correct way to + * access them is with the DIOC[GSW]DINFO ioctls, + * but some programs, such as newfs, access the + * disk directly, so we have to write things + * there. We do this only on request. If a user + * request tries to read it directly, we fake up + * one on the fly. 
+ */ + +/* + * get_volume_label returns a label structure to + * lp, which is allocated by the caller. + */ +void +get_volume_label(char *name, int plexes, u_int64_t size, struct disklabel *lp) +{ + bzero(lp, sizeof(struct disklabel)); + + strncpy(lp->d_typename, "vinum", sizeof(lp->d_typename)); + lp->d_type = DTYPE_VINUM; + strncpy(lp->d_packname, name, min(sizeof(lp->d_packname), sizeof(name))); + lp->d_rpm = 14400 * plexes; /* to keep them guessing */ + lp->d_interleave = 1; + lp->d_flags = 0; + + /* + * A Vinum volume has a single track with all + * its sectors. + */ + lp->d_secsize = DEV_BSIZE; /* bytes per sector */ + lp->d_nsectors = size; /* data sectors per track */ + lp->d_ntracks = 1; /* tracks per cylinder */ + lp->d_ncylinders = 1; /* data cylinders per unit */ + lp->d_secpercyl = size; /* data sectors per cylinder */ + lp->d_secperunit = size; /* data sectors per unit */ + + lp->d_bbsize = BBSIZE; + lp->d_sbsize = 0; /* no longer used? */ + lp->d_magic = DISKMAGIC; + lp->d_magic2 = DISKMAGIC; + + /* + * Set up partitions a, b and c to be identical + * and the size of the volume. a is UFS, b is + * swap, c is nothing. + */ + lp->d_partitions[0].p_size = size; + lp->d_partitions[0].p_fsize = 1024; + lp->d_partitions[0].p_fstype = FS_BSDFFS; /* FreeBSD File System :-) */ + lp->d_partitions[0].p_fsize = 1024; /* FS fragment size */ + lp->d_partitions[0].p_frag = 8; /* and fragments per block */ + lp->d_partitions[SWAP_PART].p_size = size; + lp->d_partitions[SWAP_PART].p_fstype = FS_SWAP; /* swap partition */ + lp->d_partitions[LABEL_PART].p_size = size; + lp->d_npartitions = LABEL_PART + 1; + strncpy(lp->d_packname, name, min(sizeof(lp->d_packname), sizeof(name))); + lp->d_checksum = dkcksum(lp); +} + +/* + * Seach disks on system for vinum slices and add + * them to the configuuration if they're not + * there already. devicename is a blank-separate + * list of device names. If not provided, use + * sysctl to get a list of all disks on the + * system. 
+ * + * Return an error indication. + */ +int +vinum_scandisk(char *devicename) +{ + struct drive *volatile drive; + volatile int driveno; + int firstdrive; /* first drive in this list */ + volatile int gooddrives; /* number of usable drives found */ + int firsttime; /* set if we have never configured before */ + int error; + char *config_text; /* read the config info from disk into here */ + char *volatile cptr; /* pointer into config information */ + char *eptr; /* end pointer into config information */ + char *config_line; /* copy the config line to */ + volatile int status; + int *drivelist; /* list of drive indices */ + char *partname; /* for creating partition names */ + char *cp; /* pointer to start of disk name */ + char *ep; /* and to first char after name */ + char *np; /* name pointer in naem we build */ + size_t alloclen; + int malloced; + int partnamelen; /* length of partition name */ + int drives; + + malloced = 0; /* devicename not malloced */ + if (devicename == NULL) { /* no devices specified, */ + /* get a list of all disks in the system */ + /* Get size of disk list */ + error = kernel_sysctlbyname(&thread0, "kern.disks", NULL, + NULL, NULL, 0, &alloclen); + if (error) { + log(LOG_ERR, "vinum: can't get disk list: %d\n", error); + return EINVAL; + } + devicename = Malloc(alloclen); + if (devicename == NULL) { + printf("vinum: can't allocate memory for drive list"); + return ENOMEM; + } else + malloced = 1; + /* Now get the list of disks */ + kernel_sysctlbyname(&thread0, "kern.disks", devicename, + &alloclen, NULL, 0, NULL); + } + status = 0; /* success indication */ + vinum_conf.flags |= VF_READING_CONFIG; /* reading config from disk */ + partname = Malloc(MAXPATHLEN); /* extract name of disk here */ + if (partname == NULL) { + printf("vinum_scandisk: can't allocate memory for drive name"); + return ENOMEM; + } + gooddrives = 0; /* number of usable drives found */ + firstdrive = vinum_conf.drives_used; /* the first drive */ + firsttime = 
vinum_conf.drives_used == 0; /* are we a virgin? */ + + /* allocate a drive pointer list */ + drives = 256; /* should be enough for most cases */ + drivelist = (int *) Malloc(drives * sizeof(int)); + CHECKALLOC(drivelist, "Can't allocate memory"); + error = lock_config(); /* make sure we're alone here */ + if (error) + return error; + error = setjmp(command_fail); /* come back here on error */ + if (error) /* longjmped out */ + return error; + + /* Open all drives and find which was modified most recently */ + for (cp = devicename; *cp; cp = ep) { + char part; /* UNIX partition */ + int slice; + + while (*cp == ' ') + cp++; /* find start of name */ + if (*cp == '\0') /* done, */ + break; + ep = cp; + while (*ep && (*ep != ' ')) /* find end of name */ + ep++; + + np = partname; /* start building up a name here */ + if (*cp != '/') { /* name doesn't start with /, */ + strcpy(np, "/dev/"); /* assume /dev */ + np += strlen("/dev/"); + } + memcpy(np, cp, ep - cp); /* put in name */ + np += ep - cp; /* and point past */ + + partnamelen = MAXPATHLEN + np - partname; /* remaining length in partition name */ + /* first try the partition table */ + for (slice = 1; slice < 5; slice++) + for (part = 'a'; part < 'i'; part++) { + if (part != 'c') { /* don't do the c partition */ + snprintf(np, + partnamelen, + "s%d%c", + slice, + part); + drive = check_drive(partname); /* try to open it */ + if (drive) { /* got something, */ + if (drive->flags & VF_CONFIGURED) /* already read this config, */ + log(LOG_WARNING, + "vinum: already read config from %s\n", /* say so */ + drive->label.name); + else { + if (gooddrives == drives) /* ran out of entries */ + EXPAND(drivelist, int, drives, drives); /* double the size */ + drivelist[gooddrives] = drive->driveno; /* keep the drive index */ + drive->flags &= ~VF_NEWBORN; /* which is no longer newly born */ + gooddrives++; + } + } + } + } + /* + * This is a kludge. Probably none of this + * should be here. 
+ */ + if (gooddrives == 0) { /* didn't find anything, */ + for (part = 'a'; part < 'i'; part++) /* try the compatibility partition */ + if (part != 'c') { /* don't do the c partition */ + snprintf(np, + partnamelen, + "%c", + part); + drive = check_drive(partname); /* try to open it */ + if (drive) { /* got something, */ + if (drive->flags & VF_CONFIGURED) /* already read this config, */ + log(LOG_WARNING, + "vinum: already read config from %s\n", /* say so */ + drive->label.name); + else { + if (gooddrives == drives) /* ran out of entries */ + EXPAND(drivelist, int, drives, drives); /* double the size */ + drivelist[gooddrives] = drive->driveno; /* keep the drive index */ + drive->flags &= ~VF_NEWBORN; /* which is no longer newly born */ + gooddrives++; + } + } + } + } + } + Free(partname); + + if (gooddrives == 0) { + if (firsttime) + log(LOG_WARNING, "vinum: no drives found\n"); + else + log(LOG_INFO, "vinum: no additional drives found\n"); + if (malloced) + Free(devicename); + unlock_config(); + return ENOENT; + } + /* + * We now have at least one drive open. Sort + * them in order of config time and merge the + * config info with what we have already. 
+ */ + qsort(drivelist, gooddrives, sizeof(int), drivecmp); + config_text = (char *) Malloc(MAXCONFIG * 2); /* allocate buffers */ + CHECKALLOC(config_text, "Can't allocate memory"); + config_line = (char *) Malloc(MAXCONFIGLINE * 2); /* allocate buffers */ + CHECKALLOC(config_line, "Can't allocate memory"); + for (driveno = 0; driveno < gooddrives; driveno++) { /* now include the config */ + drive = &DRIVE[drivelist[driveno]]; /* point to the drive */ + + if (firsttime && (driveno == 0)) /* we've never configured before, */ + log(LOG_INFO, "vinum: reading configuration from %s\n", drive->devicename); + else + log(LOG_INFO, "vinum: updating configuration from %s\n", drive->devicename); + + if (drive->state == drive_up) + /* Read in both copies of the configuration information */ + error = read_drive(drive, config_text, MAXCONFIG * 2, VINUM_CONFIG_OFFSET); + else { + error = EIO; + printf("vinum_scandisk: %s is %s\n", drive->devicename, drive_state(drive->state)); + } + + if (error != 0) { + log(LOG_ERR, "vinum: Can't read device %s, error %d\n", drive->devicename, error); + free_drive(drive); /* give it back */ + status = error; + } + /* + * At this point, check that the two copies + * are the same, and do something useful if + * not. In particular, consider which is + * newer, and what this means for the + * integrity of the data on the drive. 
+ */ + else { + vinum_conf.drives_used++; /* another drive in use */ + /* Parse the configuration, and add it to the global configuration */ + for (cptr = config_text; *cptr != '\0';) { /* love this style(9) */ + volatile int parse_status; /* return value from parse_config */ + + for (eptr = config_line; (*cptr != '\n') && (*cptr != '\0');) /* until the end of the line */ + *eptr++ = *cptr++; + *eptr = '\0'; /* and delimit */ + if (setjmp(command_fail) == 0) { /* come back here on error and continue */ + parse_status = parse_config(config_line, &keyword_set, 1); /* parse the config line */ + /* + * parse_config recognizes referenced + * drives and builds a drive entry for + * them. This may expand the drive + * table, thus invalidating the pointer. + */ + drive = &DRIVE[drivelist[driveno]]; /* point to the drive */ + + if (parse_status < 0) { /* error in config */ + /* + * This config should have been parsed + * in user space. If we run into + * problems here, something serious is + * afoot. Complain and let the user + * snarf the config to see what's + * wrong. + */ + log(LOG_ERR, + "vinum: Config error on %s, aborting integration\n", + drive->devicename); + free_drive(drive); /* give it back */ + status = EINVAL; + } + } + while (*cptr == '\n') + cptr++; /* skip to next line */ + } + } + drive->flags |= VF_CONFIGURED; /* this drive's configuration is complete */ + } + + Free(config_text); + Free(drivelist); + vinum_conf.flags &= ~VF_READING_CONFIG; /* no longer reading from disk */ + if (status != 0) + printf("vinum: couldn't read configuration"); + else + updateconfig(VF_READING_CONFIG); /* update from disk config */ + if (malloced) + Free(devicename); + unlock_config(); + return status; +} + +/* + * Compare the modification dates of the drives, for qsort. + * Return 1 if a < b, 0 if a == b, 01 if a > b: in other + * words, sort backwards. 
+ */ +int +drivecmp(const void *va, const void *vb) +{ + const struct drive *a = &DRIVE[*(const int *) va]; + const struct drive *b = &DRIVE[*(const int *) vb]; + + if ((a->label.last_update.tv_sec == b->label.last_update.tv_sec) + && (a->label.last_update.tv_usec == b->label.last_update.tv_usec)) + return 0; + else if ((a->label.last_update.tv_sec > b->label.last_update.tv_sec) + || ((a->label.last_update.tv_sec == b->label.last_update.tv_sec) + && (a->label.last_update.tv_usec > b->label.last_update.tv_usec))) + return -1; + else + return 1; +} +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumio.h b/sys/dev/vinum/vinumio.h new file mode 100644 index 0000000..bf5134a --- /dev/null +++ b/sys/dev/vinum/vinumio.h @@ -0,0 +1,154 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumio.h,v 1.23 2003/05/04 05:25:46 grog Exp grog $ + * $FreeBSD$ + */ + +#define L 'F' /* ID letter of our ioctls */ + +#define MAX_IOCTL_REPLY 1024 + +#ifdef VINUMDEBUG +struct debuginfo { + int changeit; + int param; +}; + +#endif + +enum objecttype { + drive_object, + sd_object, + plex_object, + volume_object, + invalid_object +}; + +/* + * The state to set with VINUM_SETSTATE. Since each object has a + * different set of states, we need to translate later. 
+ */ +enum objectstate { + object_down, + object_initializing, + object_initialized, + object_up +}; + +/* + * This structure is used for modifying objects + * (VINUM_SETSTATE, VINUM_REMOVE, VINUM_RESETSTATS, VINUM_ATTACH, + * VINUM_DETACH, VINUM_REPLACE). + */ +struct vinum_ioctl_msg { + int index; /* index of the object in the table selected by type */ + enum objecttype type; /* kind of object that index refers to */ + enum objectstate state; /* state to set (VINUM_SETSTATE) */ + enum parityop op; /* for parity ops */ + int force; /* do it even if it doesn't make sense */ + int recurse; /* recurse (VINUM_REMOVE) */ + int verify; /* verify (initsd, rebuildparity) */ + int otherobject; /* superordinate object (attach), + * replacement object (replace) */ + int rename; /* rename object (attach) */ + int64_t offset; /* offset of subdisk (for attach) */ + int blocksize; /* size of block to revive (bytes) */ +}; + +/* + * VINUM_CREATE returns a buffer of this kind. The handlers that + * take a struct vinum_ioctl_msg (attach, resetstats) also write + * their result in this form over the start of the request buffer. + */ +struct _ioctl_reply { + int error; /* 0 on success, otherwise an errno value */ + char msg[MAX_IOCTL_REPLY]; /* explanatory text, may be empty */ +}; + +struct vinum_rename_msg { + int index; + int recurse; /* rename subordinate objects too */ + enum objecttype type; + char newname[MAXNAME]; /* new name to give to object */ +}; + +/* ioctl requests */ +#define BUFSIZE 1024 /* size of buffer, including continuations */ +#define VINUM_CREATE _IOC(IOC_IN | IOC_OUT, L, 64, BUFSIZE) /* configure vinum */ +#define VINUM_GETCONFIG _IOR(L, 65, struct __vinum_conf) /* get global config */ +#define VINUM_DRIVECONFIG _IOWR(L, 66, struct _drive) /* get drive config */ +#define VINUM_SDCONFIG _IOWR(L, 67, struct _sd) /* get subdisk config */ +#define VINUM_PLEXCONFIG _IOWR(L, 68, struct _plex) /* get plex config */ +#define VINUM_VOLCONFIG _IOWR(L, 69, struct _volume) /* get volume config */ +#define VINUM_PLEXSDCONFIG _IOWR(L, 70, struct _sd) /* get sd config for plex (plex, sdno) */ +#define VINUM_GETFREELIST _IOWR(L, 71, struct drive_freelist) /* get freelist element (drive, fe) */ +#define VINUM_SAVECONFIG _IOW(L, 72, int) /* write config to disk */ +#define VINUM_RESETCONFIG _IOC(0, L, 73, 0) /* 
trash config on disk */ +#define VINUM_INIT _IOC(0, L, 74, 0) /* acknowledged only: the handler just reports success */ +#define VINUM_READCONFIG _IOC(IOC_IN | IOC_OUT, L, 75, BUFSIZE) /* read config from disk */ +#ifdef VINUMDEBUG +#define VINUM_DEBUG _IOWR(L, 127, struct debuginfo) /* call the debugger from ioctl () */ +#endif + +/* + * Start an object. Pass two integers: + * msg [0] index in vinum_conf.<object> + * msg [1] type of object (see below) + * + * Return ioctl_reply + */ +#define VINUM_SETSTATE _IOC(IOC_IN | IOC_OUT, L, 76, MAX_IOCTL_REPLY) /* start an object */ +#define VINUM_RELEASECONFIG _IOC(0, L, 77, 0) /* release locks and write config to disk */ +#define VINUM_STARTCONFIG _IOW(L, 78, int) /* start a configuration operation */ +#define VINUM_MEMINFO _IOR(L, 79, struct meminfo) /* get memory usage summary */ +#define VINUM_MALLOCINFO _IOWR(L, 80, struct mc) /* get specific malloc information [i] */ +#define VINUM_INITSD _IOW(L, 82, int) /* initialize a subdisk */ +#define VINUM_REMOVE _IOWR(L, 83, struct _ioctl_reply) /* remove an object */ +#define VINUM_READPOL _IOWR(L, 84, struct _ioctl_reply) /* set read policy */ +#define VINUM_SETSTATE_FORCE _IOC(IOC_IN | IOC_OUT, L, 85, MAX_IOCTL_REPLY) /* diddle object state */ +#define VINUM_RESETSTATS _IOWR(L, 86, struct _ioctl_reply) /* reset object stats */ +#define VINUM_ATTACH _IOWR(L, 87, struct _ioctl_reply) /* attach an object */ +#define VINUM_DETACH _IOWR(L, 88, struct _ioctl_reply) /* detach an object */ + +#define VINUM_RENAME _IOWR(L, 89, struct _ioctl_reply) /* rename an object */ +#define VINUM_REPLACE _IOWR(L, 90, struct _ioctl_reply) /* replace an object */ + +#ifdef VINUMDEBUG +#define VINUM_RQINFO _IOWR(L, 91, struct rqinfo) /* get request info [i] from trace buffer */ +#endif + +#define VINUM_DAEMON _IOC(0, L, 92, 0) /* perform the kernel part of Vinum daemon */ +#define VINUM_FINDDAEMON _IOC(0, L, 93, 0) /* check for presence of Vinum daemon */ +#define VINUM_SETDAEMON _IOW(L, 94, int) /* set daemon flags */ 
+#define VINUM_GETDAEMON _IOR(L, 95, int) /* get daemon flags */ +#define VINUM_PARITYOP _IOWR(L, 96, struct _ioctl_reply) /* check/rebuild RAID-4/5 parity */ +#define VINUM_MOVE _IOWR(L, 98, struct _ioctl_reply) /* move an object */ diff --git a/sys/dev/vinum/vinumioctl.c b/sys/dev/vinum/vinumioctl.c new file mode 100644 index 0000000..2f7b876 --- /dev/null +++ b/sys/dev/vinum/vinumioctl.c @@ -0,0 +1,958 @@ +/* + * XXX replace all the checks on object validity with + * calls to valid<object> + */ +/*- + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumioctl.c,v 1.23 2003/05/23 01:02:22 grog Exp grog $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> + +#ifdef VINUMDEBUG +#include <sys/reboot.h> +#endif + +void attachobject(struct vinum_ioctl_msg *); +void detachobject(struct vinum_ioctl_msg *); +void renameobject(struct vinum_rename_msg *); +void replaceobject(struct vinum_ioctl_msg *); +void moveobject(struct vinum_ioctl_msg *); +void setreadpol(struct vinum_ioctl_msg *); + +jmp_buf command_fail; /* return on a failed command */ + +/* ioctl routine */ +int +vinumioctl(dev_t dev, + u_long cmd, + caddr_t data, + int flag, + struct thread *td) +{ + unsigned int objno; + struct sd *sd; + struct plex *plex; + struct volume *vol; + + /* First, decide what we're looking at */ + if ((minor(dev) == VINUM_SUPERDEV_MINOR) + || (minor(dev) == VINUM_DAEMON_MINOR)) + return vinum_super_ioctl(dev, cmd, data); + else /* real device */ + switch (DEVTYPE(dev)) { + case VINUM_SD_TYPE: + case VINUM_SD2_TYPE: /* second half of sd namespace */ + objno = Sdno(dev); + + sd = &SD[objno]; + + switch (cmd) { + case DIOCGSECTORSIZE: + *(u_int *) data = sd->sectorsize; + return 0; + + case DIOCGMEDIASIZE: + *(u_int64_t *) data = sd->sectors * sd->sectorsize; + return 0; + + /* + * We don't have this stuff on hardware, + * so just pretend to do it so that + * utilities don't get upset. 
+ */ + case DIOCWDINFO: /* write partition info */ + case DIOCSDINFO: /* set partition info */ + return 0; /* not a titty */ + + default: + return ENOTTY; /* not my kind of ioctl */ + } + + return 0; /* pretend we did it */ + + case VINUM_PLEX_TYPE: + objno = Plexno(dev); + + plex = &PLEX[objno]; + + switch (cmd) { + case DIOCGSECTORSIZE: + *(u_int64_t *) data = plex->sectorsize; + return 0; + + case DIOCGMEDIASIZE: + *(u_int64_t *) data = plex->length * plex->sectorsize; + return 0; + + /* + * We don't have this stuff on hardware, + * so just pretend to do it so that + * utilities don't get upset. + */ + case DIOCWDINFO: /* write partition info */ + case DIOCSDINFO: /* set partition info */ + return 0; /* not a titty */ + + default: + return ENOTTY; /* not my kind of ioctl */ + } + + return 0; /* pretend we did it */ + + case VINUM_VOLUME_TYPE: + objno = Volno(dev); + + if ((unsigned) objno >= (unsigned) vinum_conf.volumes_allocated) /* not a valid volume */ + return ENXIO; + vol = &VOL[objno]; + if (vol->state != volume_up) /* not up, */ + return EIO; /* I/O error */ + + switch (cmd) { + case DIOCGSECTORSIZE: + *(u_int *) data = vol->sectorsize; + return 0; + + case DIOCGMEDIASIZE: + *(u_int64_t *) data = vol->size * vol->sectorsize; + return 0; + + /* + * We don't have this stuff on hardware, + * so just pretend to do it so that + * utilities don't get upset. 
+ */ + case DIOCWDINFO: /* write partition info */ + case DIOCSDINFO: /* set partition info */ + return 0; /* not a titty */ + + default: + return ENOTTY; /* not my kind of ioctl */ + } + break; + } + return 0; /* XXX */ +} + +/* Handle ioctls for the super device */ +int +vinum_super_ioctl(dev_t dev, + u_long cmd, + caddr_t data) +{ + int error = 0; + unsigned int index; /* for transferring config info */ + unsigned int sdno; /* for transferring config info */ + int fe; /* free list element number */ + struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) data; /* struct to return */ + + ioctl_reply = (struct _ioctl_reply *) data; /* save the address to reply to */ + if (error) /* bombed out */ + return 0; /* the reply will contain meaningful info */ + switch (cmd) { +#ifdef VINUMDEBUG + case VINUM_DEBUG: + if (((struct debuginfo *) data)->changeit) /* change debug settings */ + debug = (((struct debuginfo *) data)->param); + else { + if (debug & DEBUG_REMOTEGDB) + boothowto |= RB_GDB; /* serial debug line */ + else + boothowto &= ~RB_GDB; /* local ddb */ + Debugger("vinum debug"); + } + ioctl_reply = (struct _ioctl_reply *) data; /* reinstate the address to reply to */ + ioctl_reply->error = 0; + return 0; +#endif + + case VINUM_CREATE: /* create a vinum object */ + error = lock_config(); /* get the config for us alone */ + if (error) /* can't do it, */ + return error; /* give up */ + error = setjmp(command_fail); /* come back here on error */ + if (error == 0) /* first time, */ + ioctl_reply->error = parse_user_config((char *) data, /* update the config */ + &keyword_set); + else if (ioctl_reply->error == 0) { /* longjmp, but no error status */ + ioctl_reply->error = EINVAL; /* note that something's up */ + ioctl_reply->msg[0] = '\0'; /* no message? 
*/ + } + unlock_config(); + return 0; /* must be 0 to return the real error info */ + + case VINUM_GETCONFIG: /* get the configuration information */ + bcopy(&vinum_conf, data, sizeof(vinum_conf)); + return 0; + + /* start configuring the subsystem */ + case VINUM_STARTCONFIG: + return start_config(*(int *) data); /* just lock it. Parameter is 'force' */ + + /* + * Move the individual parts of the config to user space. + * + * Specify the index of the object in the first word of data, + * and return the object there + */ + case VINUM_DRIVECONFIG: + index = *(int *) data; /* get the index */ + if (index >= (unsigned) vinum_conf.drives_allocated) /* can't do it */ + return ENXIO; /* bang */ + bcopy(&DRIVE[index], data, sizeof(struct _drive)); /* copy the config item out */ + return 0; + + case VINUM_SDCONFIG: + index = *(int *) data; /* get the index */ + if (index >= (unsigned) vinum_conf.subdisks_allocated) /* can't do it */ + return ENXIO; /* bang */ + bcopy(&SD[index], data, sizeof(struct _sd)); /* copy the config item out */ + return 0; + + case VINUM_PLEXCONFIG: + index = *(int *) data; /* get the index */ + if (index >= (unsigned) vinum_conf.plexes_allocated) /* can't do it */ + return ENXIO; /* bang */ + bcopy(&PLEX[index], data, sizeof(struct _plex)); /* copy the config item out */ + return 0; + + case VINUM_VOLCONFIG: + index = *(int *) data; /* get the index */ + if (index >= (unsigned) vinum_conf.volumes_allocated) /* can't do it */ + return ENXIO; /* bang */ + bcopy(&VOL[index], data, sizeof(struct _volume)); /* copy the config item out */ + return 0; + + case VINUM_PLEXSDCONFIG: + index = *(int *) data; /* get the plex index */ + sdno = ((int *) data)[1]; /* and the sd index */ + if ((index >= (unsigned) vinum_conf.plexes_allocated) /* plex doesn't exist */ + ||(sdno >= PLEX[index].subdisks)) /* or it doesn't have this many subdisks */ + return ENXIO; /* bang */ + bcopy(&SD[PLEX[index].sdnos[sdno]], /* copy the config item out */ + data, + sizeof(struct 
_sd)); + return 0; + + /* + * We get called in two places: one from the + * userland config routines, which call us + * to complete the config and save it. This + * call supplies the value 0 as a parameter. + * + * The other place is from the user "saveconfig" + * routine, which can only work if we're *not* + * configuring. In this case, supply parameter 1. + */ + case VINUM_SAVECONFIG: + if (VFLAGS & VF_CONFIGURING) { /* must be us, the others are asleep */ + if (*(int *) data == 0) /* finish config */ + finish_config(1); /* finish the configuration and update it */ + else + return EBUSY; /* can't do it now */ + } + save_config(); /* save configuration to disk */ + return 0; + + case VINUM_RELEASECONFIG: /* release the config */ + if (VFLAGS & VF_CONFIGURING) { /* must be us, the others are asleep */ + finish_config(0); /* finish the configuration, don't change it */ + save_config(); /* save configuration to disk */ + } else + error = EINVAL; /* release what config? */ + return error; + + case VINUM_READCONFIG: + if (((char *) data)[0] == '\0') + ioctl_reply->error = vinum_scandisk(NULL); /* built your own list */ + else + ioctl_reply->error = vinum_scandisk((char *) data); + if (ioctl_reply->error == ENOENT) { + if (vinum_conf.drives_used > 0) + strcpy(ioctl_reply->msg, "no additional drives found"); + else + strcpy(ioctl_reply->msg, "no drives found"); + } else if (ioctl_reply->error) + strcpy(ioctl_reply->msg, "can't read configuration information, see log file"); + return 0; /* must be 0 to return the real error info */ + + case VINUM_INIT: + ioctl_reply = (struct _ioctl_reply *) data; /* reinstate the address to reply to */ + ioctl_reply->error = 0; + return 0; + + case VINUM_RESETCONFIG: + if (vinum_inactive(0)) { /* if the volumes are not active */ + /* + * Note the open count. We may be called from v, so we'll be open. 
+ * Keep the count so we don't underflow + */ + free_vinum(1); /* clean up everything */ + log(LOG_NOTICE, "vinum: CONFIGURATION OBLITERATED\n"); + ioctl_reply = (struct _ioctl_reply *) data; /* reinstate the address to reply to */ + ioctl_reply->error = 0; + return 0; + } + return EBUSY; + + case VINUM_SETSTATE: + setstate((struct vinum_ioctl_msg *) data); /* set an object state */ + return 0; + + /* + * Set state by force, without changing + * anything else. + */ + case VINUM_SETSTATE_FORCE: + setstate_by_force((struct vinum_ioctl_msg *) data); /* set an object state */ + return 0; + +#ifdef VINUMDEBUG + case VINUM_MEMINFO: + vinum_meminfo(data); + return 0; + + case VINUM_MALLOCINFO: + return vinum_mallocinfo(data); + + case VINUM_RQINFO: + return vinum_rqinfo(data); +#endif + + case VINUM_REMOVE: + remove((struct vinum_ioctl_msg *) data); /* remove an object */ + return 0; + + case VINUM_GETFREELIST: /* get a drive free list element */ + index = *(int *) data; /* get the drive index */ + fe = ((int *) data)[1]; /* and the free list element */ + if ((index >= (unsigned) vinum_conf.drives_allocated) /* plex doesn't exist */ + ||(DRIVE[index].state == drive_unallocated)) + return ENODEV; + if (fe >= DRIVE[index].freelist_entries) /* no such entry */ + return ENOENT; + bcopy(&DRIVE[index].freelist[fe], + data, + sizeof(struct drive_freelist)); + return 0; + + case VINUM_RESETSTATS: + resetstats((struct vinum_ioctl_msg *) data); /* reset object stats */ + return 0; + + /* attach an object to a superordinate object */ + case VINUM_ATTACH: + attachobject((struct vinum_ioctl_msg *) data); + return 0; + + /* detach an object from a superordinate object */ + case VINUM_DETACH: + detachobject((struct vinum_ioctl_msg *) data); + return 0; + + /* rename an object */ + case VINUM_RENAME: + renameobject((struct vinum_rename_msg *) data); + return 0; + + /* replace an object */ + case VINUM_REPLACE: + replaceobject((struct vinum_ioctl_msg *) data); + return 0; + + case 
VINUM_DAEMON: + vinum_daemon(); /* perform the daemon */ + return 0; + + case VINUM_FINDDAEMON: /* check for presence of daemon */ + return vinum_finddaemon(); + return 0; + + case VINUM_SETDAEMON: /* set daemon flags */ + return vinum_setdaemonopts(*(int *) data); + + case VINUM_GETDAEMON: /* get daemon flags */ + *(int *) data = daemon_options; + return 0; + + case VINUM_PARITYOP: /* check/rebuild RAID-4/5 parity */ + parityops((struct vinum_ioctl_msg *) data); + return 0; + + /* move an object */ + case VINUM_MOVE: + moveobject((struct vinum_ioctl_msg *) data); + return 0; + + case VINUM_READPOL: + setreadpol((struct vinum_ioctl_msg *) data); + return 0; + + default: + /* FALLTHROUGH */ + break; + } + return 0; /* to keep the compiler happy */ +} + +/* + * The following four functions check the supplied + * object index and return a pointer to the object + * if it exists. Otherwise they longjump out via + * throw_rude_remark. + */ +struct drive * +validdrive(int driveno, struct _ioctl_reply *reply) +{ + if ((driveno < vinum_conf.drives_allocated) + && (DRIVE[driveno].state > drive_referenced)) + return &DRIVE[driveno]; + strcpy(reply->msg, "No such drive"); + reply->error = ENOENT; + return NULL; +} + +struct sd * +validsd(int sdno, struct _ioctl_reply *reply) +{ + if ((sdno < vinum_conf.subdisks_allocated) + && (SD[sdno].state > sd_referenced)) + return &SD[sdno]; + strcpy(reply->msg, "No such subdisk"); + reply->error = ENOENT; + return NULL; +} + +struct plex * +validplex(int plexno, struct _ioctl_reply *reply) +{ + if ((plexno < vinum_conf.plexes_allocated) + && (PLEX[plexno].state > plex_referenced)) + return &PLEX[plexno]; + strcpy(reply->msg, "No such plex"); + reply->error = ENOENT; + return NULL; +} + +struct volume * +validvol(int volno, struct _ioctl_reply *reply) +{ + if ((volno < vinum_conf.volumes_allocated) + && (VOL[volno].state > volume_uninit)) + return &VOL[volno]; + strcpy(reply->msg, "No such volume"); + reply->error = ENOENT; + return NULL; 
+} + +/* reset an object's stats */ +void +resetstats(struct vinum_ioctl_msg *msg) +{ + struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; + + switch (msg->type) { + case drive_object: + if (msg->index < vinum_conf.drives_allocated) { + struct drive *drive = &DRIVE[msg->index]; + if (drive->state > drive_referenced) { + drive->reads = 0; /* number of reads on this drive */ + drive->writes = 0; /* number of writes on this drive */ + drive->bytes_read = 0; /* number of bytes read */ + drive->bytes_written = 0; /* number of bytes written */ + reply->error = 0; + return; + } + reply->error = EINVAL; + return; + } + case sd_object: + if (msg->index < vinum_conf.subdisks_allocated) { + struct sd *sd = &SD[msg->index]; + if (sd->state > sd_referenced) { + sd->reads = 0; /* number of reads on this subdisk */ + sd->writes = 0; /* number of writes on this subdisk */ + sd->bytes_read = 0; /* number of bytes read */ + sd->bytes_written = 0; /* number of bytes written */ + reply->error = 0; + return; + } + reply->error = EINVAL; + return; + } + break; + + case plex_object: + if (msg->index < vinum_conf.plexes_allocated) { + struct plex *plex = &PLEX[msg->index]; + if (plex->state > plex_referenced) { + plex->reads = 0; + plex->writes = 0; /* number of writes on this plex */ + plex->bytes_read = 0; /* number of bytes read */ + plex->bytes_written = 0; /* number of bytes written */ + plex->recovered_reads = 0; /* number of recovered read operations */ + plex->degraded_writes = 0; /* number of degraded writes */ + plex->parityless_writes = 0; /* number of parityless writes */ + plex->multiblock = 0; /* requests that needed more than one block */ + plex->multistripe = 0; /* requests that needed more than one stripe */ + reply->error = 0; + return; + } + reply->error = EINVAL; + return; + } + break; + + case volume_object: + if (msg->index < vinum_conf.volumes_allocated) { + struct volume *vol = &VOL[msg->index]; + if (vol->state > volume_uninit) { + vol->bytes_read = 0; /* 
number of bytes read */ + vol->bytes_written = 0; /* number of bytes written */ + vol->reads = 0; /* number of reads on this volume */ + vol->writes = 0; /* number of writes on this volume */ + vol->recovered_reads = 0; /* reads recovered from another plex */ + reply->error = 0; + return; + } + reply->error = EINVAL; + return; + } + case invalid_object: /* can't get this */ + reply->error = EINVAL; + return; + } +} + +/* attach an object to a superior object */ +void +attachobject(struct vinum_ioctl_msg *msg) +{ + struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; + int sdno; + struct sd *sd; + struct plex *plex; + struct volume *vol; + + switch (msg->type) { + case drive_object: /* you can't attach a drive to anything */ + case volume_object: /* nor a volume */ + case invalid_object: /* "this can't happen" */ + reply->error = EINVAL; + reply->msg[0] = '\0'; /* vinum(8) doesn't do this */ + return; + + case sd_object: + sd = validsd(msg->index, reply); + if (sd == NULL) /* not a valid subdisk */ + return; + plex = validplex(msg->otherobject, reply); + if (plex) { + /* + * We should be more intelligent about this. + * We should be able to reattach a dead + * subdisk, but if we want to increase the total + * number of subdisks, we have a lot of reshuffling + * to do. 
XXX + */ + if ((plex->organization != plex_concat) /* can't attach to striped and RAID-4/5 */ + &&(!msg->force)) { /* without using force */ + reply->error = EINVAL; /* no message, the user should check */ + strcpy(reply->msg, "Can't attach to this plex organization"); + } else if (sd->plexno >= 0) { /* already belong to a plex */ + reply->error = EBUSY; /* no message, the user should check */ + sprintf(reply->msg, "%s is already attached to %s", + sd->name, + sd[sd->plexno].name); + reply->msg[0] = '\0'; + } else { + sd->plexoffset = msg->offset; /* this is where we want it */ + set_sd_state(sd->sdno, sd_stale, setstate_force); /* make sure it's stale */ + give_sd_to_plex(plex->plexno, sd->sdno); /* and give it to the plex */ + update_sd_config(sd->sdno, 0); + save_config(); + if (sd->state == sd_reviving) + reply->error = EAGAIN; /* need to revive it */ + else + reply->error = 0; + } + } + break; + + case plex_object: + plex = validplex(msg->index, reply); /* get plex */ + if (plex == NULL) + return; + vol = validvol(msg->otherobject, reply); /* and volume information */ + if (vol) { + if (vol->plexes == MAXPLEX) { /* we have too many already */ + reply->error = ENOSPC; /* nowhere to put it */ + strcpy(reply->msg, "Too many plexes"); + } else if (plex->volno >= 0) { /* the plex has an owner */ + reply->error = EBUSY; /* no message, the user should check */ + sprintf(reply->msg, "%s is already attached to %s", + plex->name, + VOL[plex->volno].name); + } else { + for (sdno = 0; sdno < plex->subdisks; sdno++) { + sd = &SD[plex->sdnos[sdno]]; + + if (sd->state > sd_down) /* real subdisk, vaguely accessible */ + set_sd_state(plex->sdnos[sdno], sd_stale, setstate_force); /* make it stale */ + } + set_plex_state(plex->plexno, plex_up, setstate_none); /* update plex state */ + give_plex_to_volume(msg->otherobject, msg->index, 0); /* and give it to the volume */ + update_plex_config(plex->plexno, 0); + save_config(); + reply->error = 0; /* all went well */ + } + } + } +} 
+ +/* + * Detach an object from its superior object: a subdisk from its plex, + * or a plex from its volume. The reply is written over msg: error is 0 + * on success, EINVAL for object types which can't be detached, ENOENT + * if the object isn't attached, and EBUSY if the superior object still + * needs it and msg->force isn't set. + */ +void +detachobject(struct vinum_ioctl_msg *msg) +{ + struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; + struct sd *sd; + struct plex *plex; + struct volume *vol; + int sdno; + int plexno; + + switch (msg->type) { + case drive_object: /* you can't detach a drive from anything */ + case volume_object: /* nor a volume */ + case invalid_object: /* "this can't happen" */ + reply->error = EINVAL; + reply->msg[0] = '\0'; /* vinum(8) doesn't do this */ + return; + + case sd_object: + sd = validsd(msg->index, reply); + if (sd == NULL) + return; + if (sd->plexno < 0) { /* doesn't belong to a plex */ + reply->error = ENOENT; + strcpy(reply->msg, "Subdisk is not attached"); + return; + } else { /* valid plex number */ + plex = &PLEX[sd->plexno]; + if ((!msg->force) /* don't force things */ + &&((plex->state == plex_up) /* and the plex is up */ + ||((plex->state == plex_flaky) && sd->state == sd_up))) { /* or flaky with this sd up */ + reply->error = EBUSY; /* we need this sd */ + reply->msg[0] = '\0'; + return; + } + sd->plexno = -1; /* anonymous sd */ + if (plex->subdisks == 1) { /* this was the only subdisk */ + Free(plex->sdnos); /* free the subdisk array */ + plex->sdnos = NULL; /* and note the fact */ + plex->subdisks_allocated = 0; /* no subdisk space */ + } else { + for (sdno = 0; sdno < plex->subdisks; sdno++) { + if (plex->sdnos[sdno] == msg->index) /* found our subdisk */ + break; + } + if (sdno < (plex->subdisks - 1)) /* not the last one, compact */ + bcopy(&plex->sdnos[sdno + 1], + &plex->sdnos[sdno], + (plex->subdisks - 1 - sdno) * sizeof(int)); + } + plex->subdisks--; + if (!bcmp(plex->name, sd->name, strlen(plex->name) + 1)) + /* the names are identical (the comparison includes the NUL) */ + { + bcopy(sd->name, /* shift the name right by three bytes */ + &sd->name[3], + min(strlen(sd->name) + 1, MAXSDNAME - 3)); + bcopy("ex-", sd->name, 3); /* and put "ex-" in front of it */ + sd->name[MAXSDNAME - 1] = '\0'; /* terminate in case the shift truncated */ + } + update_plex_config(plex->plexno, 0); + if (isstriped(plex)) /* we've just mutilated our 
plex, */ + set_plex_state(plex->plexno, + plex_down, + setstate_force | setstate_configuring); + if (plex->volno >= 0) /* plex attached to volume, */ + update_volume_config(plex->volno); + save_config(); + reply->error = 0; + } + return; + + case plex_object: + plex = validplex(msg->index, reply); /* get plex */ + if (plex == NULL) + return; + if (plex->volno >= 0) { + int volno = plex->volno; + + vol = &VOL[volno]; + if ((!msg->force) /* don't force things */ + &&((vol->state == volume_up) /* and the volume is up */ + &&(vol->plexes == 1))) { /* and this is the last plex */ + /* + * XXX As elsewhere, check whether we will lose + * mapping by removing this plex + */ + reply->error = EBUSY; /* we need this plex */ + reply->msg[0] = '\0'; + return; + } + plex->volno = -1; /* anonymous plex */ + for (plexno = 0; plexno < vol->plexes; plexno++) { + if (vol->plex[plexno] == msg->index) /* found our plex */ + break; + } + if (plexno < (vol->plexes - 1)) /* not the last one, compact */ + bcopy(&vol->plex[plexno + 1], + &vol->plex[plexno], + (vol->plexes - 1 - plexno) * sizeof(int)); + vol->plexes--; + vol->last_plex_read = 0; /* don't go beyond the end */ + if (!bcmp(vol->name, plex->name, strlen(vol->name) + 1)) + /* the plex has exactly the volume's name (NUL included) */ + { + /* With recurse set, first rename any subdisks which have the plex's name */ + if (msg->recurse) { + int sdno; + + for (sdno = 0; sdno < plex->subdisks; sdno++) { + struct sd *sd = &SD[plex->sdnos[sdno]]; + + if (!bcmp(plex->name, sd->name, strlen(plex->name) + 1)) + /* subdisk has exactly the plex's name */ + { + bcopy(sd->name, + &sd->name[3], + min(strlen(sd->name) + 1, MAXSDNAME - 3)); + bcopy("ex-", sd->name, 3); + sd->name[MAXSDNAME - 1] = '\0'; + } + } + } + bcopy(plex->name, /* now do the same for the plex itself */ + &plex->name[3], + min(strlen(plex->name) + 1, MAXPLEXNAME - 3)); + bcopy("ex-", plex->name, 3); + plex->name[MAXPLEXNAME - 1] = '\0'; + } + update_volume_config(volno); + save_config(); + reply->error = 0; + } else { + reply->error = ENOENT; + strcpy(reply->msg, 
"Plex is not attached"); + } + } +} +/* + * Rename the object of type msg->type with index msg->index to + * msg->newname. The reply is written over msg: EEXIST if an object of + * that type already has the name, EINVAL for an invalid object type, 0 + * on success. + * NOTE(review): the name is copied as a fixed-size buffer; this assumes + * msg->newname is NUL-terminated within that size -- confirm against + * the ioctl caller. + */ +void +renameobject(struct vinum_rename_msg *msg) +{ + struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; + struct drive *drive; + struct sd *sd; + struct plex *plex; + struct volume *vol; + + switch (msg->type) { + case drive_object: /* rename a drive */ + if (find_drive(msg->newname, 0) >= 0) { /* we have that name already, */ + reply->error = EEXIST; + reply->msg[0] = '\0'; + return; + } + drive = validdrive(msg->index, reply); + if (drive) { + bcopy(msg->newname, drive->label.name, MAXDRIVENAME); + save_config(); + reply->error = 0; + } + return; + + case sd_object: /* rename a subdisk */ + if (find_subdisk(msg->newname, 0) >= 0) { /* we have that name already, */ + reply->error = EEXIST; + reply->msg[0] = '\0'; + return; + } + sd = validsd(msg->index, reply); + if (sd) { + bcopy(msg->newname, sd->name, MAXSDNAME); + update_sd_config(sd->sdno, 0); + save_config(); + reply->error = 0; + } + return; + + case plex_object: /* rename a plex */ + if (find_plex(msg->newname, 0) >= 0) { /* we have that name already, */ + reply->error = EEXIST; + reply->msg[0] = '\0'; + return; + } + plex = validplex(msg->index, reply); + if (plex) { + bcopy(msg->newname, plex->name, MAXPLEXNAME); + update_plex_config(plex->plexno, 0); + save_config(); + reply->error = 0; + } + return; + + case volume_object: /* rename a volume */ + if (find_volume(msg->newname, 0) >= 0) { /* we have that name already, */ + reply->error = EEXIST; + reply->msg[0] = '\0'; + return; + } + vol = validvol(msg->index, reply); + if (vol) { + bcopy(msg->newname, vol->name, MAXVOLNAME); + update_volume_config(msg->index); + save_config(); + reply->error = 0; + } + return; + + case invalid_object: + reply->error = EINVAL; + reply->msg[0] = '\0'; + } +} + +/* + * Replace one object with another. + * Currently only for drives. 
+ * message->index is the drive number of the old drive + * message->otherobject is the drive number of the new drive + */ +void +replaceobject(struct vinum_ioctl_msg *msg) +{ + struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; + + reply->error = ENODEV; /* until I know how to do this */ + strcpy(reply->msg, "replace not implemented yet"); +/* save_config (); */ +} + +void +moveobject(struct vinum_ioctl_msg *msg) +{ + struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; + struct drive *drive; + struct sd *sd; + + /* Check that our objects are valid (i.e. they exist) */ + drive = validdrive(msg->index, (struct _ioctl_reply *) msg); + if (drive == NULL) + return; + sd = validsd(msg->otherobject, (struct _ioctl_reply *) msg); + if (sd == NULL) + return; + if (sd->driveno == msg->index) /* sd already belongs to drive */ + return; + + if (sd->state > sd_stale) + set_sd_state(sd->sdno, sd_stale, setstate_force); /* make the subdisk stale */ + else + sd->state = sd_empty; + if (sd->plexno >= 0) /* part of a plex, */ + update_plex_state(sd->plexno); /* update its state */ + + /* Return the space on the old drive */ + if ((sd->driveno >= 0) /* we have a drive, */ + &&(sd->sectors > 0)) /* and some space on it */ + return_drive_space(sd->driveno, /* return the space */ + sd->driveoffset, + sd->sectors); + + /* Reassign the old subdisk */ + sd->driveno = msg->index; + sd->driveoffset = -1; /* let the drive decide where to put us */ + give_sd_to_drive(sd->sdno); + reply->error = 0; +} + +void +setreadpol(struct vinum_ioctl_msg *msg) +{ + struct _ioctl_reply *reply = (struct _ioctl_reply *) msg; + struct volume *vol; + struct plex *plex; + int myplexno = -1; + + /* Check that our objects are valid (i.e. 
they exist) */ + vol = validvol(msg->index, reply); + if (vol == NULL) + return; + + /* If a plex was specified, check that is is valid */ + if (msg->otherobject >= 0) { + plex = validplex(msg->otherobject, reply); + if (vol == NULL) + return; + + /* Is it attached to this volume? */ + myplexno = my_plex(msg->index, msg->otherobject); + if (myplexno < 0) { + strcpy(reply->msg, "Plex is not attached to volume"); + reply->error = ENOENT; + return; + } + } + lock_config(); + vol->preferred_plex = myplexno; + save_config(); + unlock_config(); + reply->error = 0; +} + +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumkw.h b/sys/dev/vinum/vinumkw.h new file mode 100644 index 0000000..d7bc7a5 --- /dev/null +++ b/sys/dev/vinum/vinumkw.h @@ -0,0 +1,152 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumkw.h,v 1.20 2003/05/07 03:32:09 grog Exp grog $ + * $FreeBSD$ + */ + +/* + * Command keywords that vinum knows. These include both user-level + * and kernel-level stuff + */ + +/* + * Our complete vocabulary. The names of the commands are + * the same as the identifier without the kw_ at the beginning + * (i.e. kw_create defines the "create" keyword). Preprocessor + * magic in parser.c does the rest. + * + * To add a new word: put it in the table below and one of the + * lists in vinumparser.c (probably keywords). 
+ */ +enum keyword { + kw_create, + kw_modify, + kw_list, + kw_l = kw_list, + kw_ld, /* list drive */ + kw_ls, /* list subdisk */ + kw_lp, /* list plex */ + kw_lv, /* list volume */ + kw_set, + kw_rm, + kw_mv, /* move object */ + kw_move, /* synonym for mv */ + kw_start, + kw_stop, + kw_makedev, /* make /dev/vinum devices */ + kw_setdaemon, /* set daemon flags */ + kw_getdaemon, /* get daemon flags */ + kw_help, + kw_drive, + kw_partition, + kw_sd, + kw_subdisk = kw_sd, + kw_plex, + kw_volume, + kw_vol = kw_volume, + kw_read, + kw_readpol, + kw_org, + kw_name, + kw_concat, + kw_striped, + kw_raid4, + kw_raid5, + kw_driveoffset, + kw_plexoffset, + kw_len, + kw_length = kw_len, + kw_size = kw_len, + kw_state, + kw_setupstate, + kw_d, /* flag names */ + kw_f, + kw_r, + kw_s, + kw_v, + kw_w, + kw_round, /* round robin */ + /* + * The first of these is a volume attribute ("prefer plex"), and the + * second is a plex attribute ("preferred" means that the volume + * prefers this plex). + */ + kw_prefer, /* prefer plex */ + kw_preferred, /* preferred plex */ + kw_device, + kw_init, + kw_resetconfig, + kw_writethrough, + kw_writeback, + kw_replace, + kw_resetstats, + kw_attach, + kw_detach, + kw_rename, + kw_printconfig, + kw_saveconfig, + kw_hotspare, + kw_detached, + kw_debug, /* go into debugger */ + kw_stripe, + kw_mirror, + kw_info, + kw_quit, + kw_max, + kw_setstate, + kw_checkparity, + kw_rebuildparity, + kw_dumpconfig, + kw_retryerrors, + kw_invalid_keyword = -1 /* the only negative value in the enum */ +}; + +/* One vocabulary entry: pairs a name string with its enum keyword value */ +struct _keywords { + char *name; + enum keyword keyword; +}; + +/* A table of _keywords entries; presumably size is the entry count of k -- TODO confirm in the parser */ +struct keywordset { + int size; + struct _keywords *k; +}; + +extern struct _keywords keywords[]; +extern struct _keywords flag_keywords[]; + +extern struct keywordset keyword_set; +extern struct keywordset flag_set; + +/* Parser functions */ + +enum keyword get_keyword(char *, struct keywordset *); +int tokenize(char *, char *[], int); diff --git a/sys/dev/vinum/vinumlock.c b/sys/dev/vinum/vinumlock.c new file mode 100644 index 
0000000..33d9578 --- /dev/null +++ b/sys/dev/vinum/vinumlock.c @@ -0,0 +1,264 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumlock.c,v 1.19 2003/05/23 01:07:18 grog Exp $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> + +/* + * Lock a drive, wait if it's in use. Returns 0 once the lock is held, + * and also (after logging a warning) if the calling process already + * holds it; returns the error from tsleep if the sleep fails. + */ +#ifdef VINUMDEBUG +int +lockdrive(struct drive *drive, char *file, int line) +#else +int +lockdrive(struct drive *drive) +#endif +{ + int error; + + /* XXX get rid of drive->flags |= VF_LOCKING; */ + if ((drive->flags & VF_LOCKED) /* it's locked */ + &&(drive->pid == curproc->p_pid)) { /* by us! */ +#ifdef VINUMDEBUG + log(LOG_WARNING, + "vinum lockdrive: already locking %s from %s:%d, called from %s:%d\n", + drive->label.name, + drive->lockfilename, + drive->lockline, + basename(file), + line); +#else + log(LOG_WARNING, + "vinum lockdrive: already locking %s\n", + drive->label.name); +#endif + return 0; + } + while ((drive->flags & VF_LOCKED) != 0) { + /* + * There are problems sleeping on a unique identifier, + * since the drive structure can move, and the unlock + * function can be called after killing the drive. + * Solve this by waiting on this function; the number + * of conflicts is negligible. 
+ */ + if ((error = tsleep(&lockdrive, + PRIBIO, + "vindrv", + 0)) != 0) + return error; + } + drive->flags |= VF_LOCKED; + drive->pid = curproc->p_pid; /* it's a panic error if curproc is null */ +#ifdef VINUMDEBUG + bcopy(basename(file), drive->lockfilename, 15); /* remember who locked it */ + drive->lockfilename[15] = '\0'; /* truncate if necessary */ + drive->lockline = line; +#endif + return 0; +} + +/* Unlock a drive and let the next one at it */ +void +unlockdrive(struct drive *drive) +{ + drive->flags &= ~VF_LOCKED; + /* we don't reset pid: it's of hysterical interest */ + wakeup(&lockdrive); /* the same channel lockdrive sleeps on */ +} + +/* + * Lock a stripe of a plex, wait if it's in use. Returns a pointer to + * the entry in the plex's lock table which now holds the lock. + */ +struct rangelock * +lockrange(daddr_t stripe, struct buf *bp, struct plex *plex) +{ + struct rangelock *lock; + struct rangelock *pos; /* position of first free lock */ + int foundlocks; /* number of locks found */ + + /* + * We could get by without counting the number + * of locks we find, but we have a linear search + * through a table which in most cases will be + * empty. It's faster to stop when we've found + * all the locks that are there. This is also + * the reason why we put pos at the beginning + * instead of the end, though it requires an + * extra test. + */ + pos = NULL; + foundlocks = 0; + + /* + * we can't use 0 as a valid address, so + * increment all addresses by 1. 
+ */ + stripe++; + mtx_lock(plex->lockmtx); + + /* Wait here if the table is full */ + while (plex->usedlocks == PLEX_LOCKS) /* all in use */ + msleep(&plex->usedlocks, plex->lockmtx, PRIBIO, "vlock", 0); + +#ifdef DIAGNOSTIC + if (plex->usedlocks >= PLEX_LOCKS) + panic("lockrange: Too many locks in use"); +#endif + + lock = plex->lock; /* pointer in lock table */ + if (plex->usedlocks > 0) /* something locked, */ + /* Search the lock table for our stripe */ + for (; lock < &plex->lock[PLEX_LOCKS] + && foundlocks < plex->usedlocks; + lock++) { + if (lock->stripe) { /* in use */ + foundlocks++; /* found another one in use */ + if ((lock->stripe == stripe) /* it's our stripe */ + &&(lock->bp != bp)) { /* but not our request */ +#ifdef VINUMDEBUG + if (debug & DEBUG_LOCKREQS) { + struct rangelockinfo lockinfo; + + lockinfo.stripe = stripe; + lockinfo.bp = bp; + lockinfo.plexno = plex->plexno; + logrq(loginfo_lockwait, (union rqinfou) &lockinfo, bp); + } +#endif + plex->lockwaits++; /* waited one more time */ + msleep(lock, plex->lockmtx, PRIBIO, "vrlock", 0); + lock = &plex->lock[-1]; /* start again: the loop's lock++ brings us back to entry 0 */ + foundlocks = 0; + pos = NULL; + } + } else if (pos == NULL) /* still looking for somewhere? */ + pos = lock; /* a place to put this one */ + } + /* + * This untidy looking code ensures that we'll + * always end up pointing to the first free lock + * entry, thus minimizing the number of + * iterations necessary. + */ + if (pos == NULL) /* didn't find one on the way, */ + pos = lock; /* use the one we're pointing to */ + + /* + * The address range is free, and we're pointing + * to the first unused entry. Make it ours. 
+ */ + pos->stripe = stripe; + pos->bp = bp; + plex->usedlocks++; /* one more lock */ + mtx_unlock(plex->lockmtx); +#ifdef VINUMDEBUG + if (debug & DEBUG_LOCKREQS) { + struct rangelockinfo lockinfo; + + lockinfo.stripe = stripe; + lockinfo.bp = bp; + lockinfo.plexno = plex->plexno; + logrq(loginfo_lock, (union rqinfou) &lockinfo, bp); + } +#endif + return pos; +} + +/* + * Release a range lock on plex plexno and wake up anybody waiting for + * it, or for a free entry if the lock table was full. + * NOTE(review): unlike lockrange, nothing here takes plex->lockmtx -- + * confirm that the callers provide the necessary exclusion. + */ +void +unlockrange(int plexno, struct rangelock *lock) +{ + struct plex *plex; + + plex = &PLEX[plexno]; +#ifdef DIAGNOSTIC + if (lock < &plex->lock[0] || lock >= &plex->lock[PLEX_LOCKS]) + panic("vinum: rangelock %p on plex %d invalid, not between %p and %p", + lock, + plexno, + &plex->lock[0], + &plex->lock[PLEX_LOCKS]); +#endif +#ifdef VINUMDEBUG + if (debug & DEBUG_LOCKREQS) { + struct rangelockinfo lockinfo; + + lockinfo.stripe = lock->stripe; + lockinfo.bp = lock->bp; + lockinfo.plexno = plex->plexno; + logrq(loginfo_lockwait, (union rqinfou) &lockinfo, lock->bp); /* NOTE(review): same event code as the wait in lockrange -- confirm it is intended for an unlock */ + } +#endif + lock->stripe = 0; /* no longer used */ + plex->usedlocks--; /* one less lock */ + if (plex->usedlocks == PLEX_LOCKS - 1) /* we were full, */ + wakeup(&plex->usedlocks); /* get a waiter if one's there */ + wakeup((void *) lock); + +} + +/* + * Get a lock for the global config. Wait if it's not available. + * Returns 0 with the lock held, or the error from tsleep. + */ +int +lock_config(void) +{ + int error; + + while ((vinum_conf.flags & VF_LOCKED) != 0) { + vinum_conf.flags |= VF_LOCKING; /* tell unlock_config that somebody is waiting */ + if ((error = tsleep(&vinum_conf, PRIBIO, "vincfg", 0)) != 0) + return error; + } + vinum_conf.flags |= VF_LOCKED; + return 0; +} + +/* Unlock global config and wake up any waiters. 
*/ +void +unlock_config(void) +{ + vinum_conf.flags &= ~VF_LOCKED; + if ((vinum_conf.flags & VF_LOCKING) != 0) { + vinum_conf.flags &= ~VF_LOCKING; + wakeup(&vinum_conf); + } +} +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinummemory.c b/sys/dev/vinum/vinummemory.c new file mode 100644 index 0000000..b4e9a43 --- /dev/null +++ b/sys/dev/vinum/vinummemory.c @@ -0,0 +1,288 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinummemory.c,v 1.31 2003/05/23 01:08:36 grog Exp $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> + +#ifdef VINUMDEBUG +#include <dev/vinum/request.h> +extern struct rqinfo rqinfo[]; +extern struct rqinfo *rqip; +int rqinfo_size = RQINFO_SIZE; /* for debugger */ + +#undef longjmp /* this was defined as LongJmp */ +#define strrchr rindex +#ifdef __i386__ /* check for validity */ +/* Wrapper for longjmp which panics if the saved esp, ebp or eip is below 0xc0000000 */ +void +LongJmp(jmp_buf buf, int retval) +{ +/* + * longjmp is not documented, not even jmp_buf. 
+ * This is what's in i386/i386/support.s: + * ENTRY(longjmp) + * movl 4(%esp),%eax + * movl (%eax),%ebx restore ebx + * movl 4(%eax),%esp restore esp + * movl 8(%eax),%ebp restore ebp + * movl 12(%eax),%esi restore esi + * movl 16(%eax),%edi restore edi + * movl 20(%eax),%edx get rta + * movl %edx,(%esp) put in return frame + * xorl %eax,%eax return(1); + * incl %eax + * ret + * + * from which we deduce the structure of jmp_buf: + */ + struct JmpBuf { + int jb_ebx; + int jb_esp; + int jb_ebp; + int jb_esi; + int jb_edi; + int jb_eip; + }; + + struct JmpBuf *jb = (struct JmpBuf *) buf; + + if ((jb->jb_esp < 0xc0000000) + || (jb->jb_ebp < 0xc0000000) + || (jb->jb_eip < 0xc0000000)) + panic("Invalid longjmp"); + longjmp(buf, retval); +} + +#else /* not i386 */ +#define LongJmp longjmp /* just use the kernel function */ +#endif /* i386 */ +#endif /* VINUMDEBUG */ + +/* + * Find the base name of a path name: returns a pointer to the + * character after the last '/' in file, or file itself if it + * contains no '/'. The result points into file; nothing is copied. + */ +char * +basename(char *file) +{ + char *f = strrchr(file, '/'); /* chop off dirname if present */ + + if (f == NULL) + return file; + else + return ++f; /* skip the / */ +} +/* + * Grow the table *table from oldsize to newsize. Both sizes are the + * byte counts handed to the allocator, bzero and bcopy. The new table + * is zeroed, the old contents are copied into it and the old table is + * freed. Does nothing unless newsize is greater than oldsize. + */ +#ifdef VINUMDEBUG +void +expand_table(void **table, int oldsize, int newsize, char *file, int line) +#else +void +expand_table(void **table, int oldsize, int newsize) +#endif +{ + if (newsize > oldsize) { + int *temp; + int s; + + s = splhigh(); +#ifdef VINUMDEBUG + temp = (int *) MMalloc(newsize, file, line); /* allocate a new table */ +#else + temp = (int *) Malloc(newsize); /* allocate a new table */ +#endif + CHECKALLOC(temp, "vinum: Can't expand table\n"); + bzero((char *) temp, newsize); /* clean it all out */ + if (*table != NULL) { /* already something there, */ + bcopy((char *) *table, (char *) temp, oldsize); /* copy the old contents to the new table */ +#ifdef VINUMDEBUG + FFree(*table, file, line); +#else + Free(*table); +#endif + } + *table = temp; + splx(s); + } +} + +#ifdef VINUMDEBUG +#define MALLOCENTRIES 16384 +int malloccount = 0; /* number of entries in use in malloced */ +int highwater = 0; /* highest index ever allocated */ 
+struct mc malloced[MALLOCENTRIES]; + +#define FREECOUNT 64 +int freecount = FREECOUNT; /* for debugger */ +int lastfree = 0; +struct mc freeinfo[FREECOUNT]; + +int total_malloced; +static int mallocseq = 0; + +/* + * Allocate size bytes and record the allocation in malloced[]. + * file and line identify the caller. Returns NULL if the trace + * table is full or if malloc fails. + */ +caddr_t +MMalloc(int size, char *file, int line) +{ + int s; + caddr_t result; + int i; + + if (malloccount >= MALLOCENTRIES) { /* too many */ + log(LOG_ERR, "vinum: can't allocate table space to trace memory allocation\n"); + return 0; /* can't continue */ + } + /* Wait for malloc if we can */ + result = malloc(size, + M_DEVBUF, + curthread->td_intr_nesting_level == 0 ? M_WAITOK : M_NOWAIT); + if (result == NULL) + log(LOG_ERR, "vinum: can't allocate %d bytes from %s:%d\n", size, file, line); + else { + s = splhigh(); + for (i = 0; i < malloccount; i++) { + if (((result + size) > malloced[i].address) + && (result < malloced[i].address + malloced[i].size)) /* overlap */ + Debugger("Malloc overlap"); + } + if (result) { + char *f = basename(file); + + i = malloccount++; + total_malloced += size; + microtime(&malloced[i].time); + malloced[i].seq = mallocseq++; + malloced[i].size = size; + malloced[i].line = line; + malloced[i].address = result; + strlcpy(malloced[i].file, f, MCFILENAMELEN); + } + if (malloccount > highwater) + highwater = malloccount; + splx(s); + } + return result; +} + +/* + * Free memory obtained from MMalloc and remove it from malloced[]. + * If DEBUG_MEMFREE is set, the free is also remembered in the + * freeinfo ring. Memory which isn't in the table is not freed: we + * log the fact and enter the debugger. + */ +void +FFree(void *mem, char *file, int line) +{ + int s; + int i; + + s = splhigh(); + for (i = 0; i < malloccount; i++) { + if ((caddr_t) mem == malloced[i].address) { /* found it */ + bzero(mem, malloced[i].size); /* XXX */ + free(mem, M_DEVBUF); + malloccount--; + total_malloced -= malloced[i].size; + if (debug & DEBUG_MEMFREE) { /* keep track of recent frees */ + char *f = basename(file); /* chop off dirname if present */ + + microtime(&freeinfo[lastfree].time); + freeinfo[lastfree].seq = malloced[i].seq; + freeinfo[lastfree].size = malloced[i].size; + freeinfo[lastfree].line = line; + 
freeinfo[lastfree].address = mem; + /* bounded, terminated copy: don't read beyond the end of the name */ + strlcpy(freeinfo[lastfree].file, f, MCFILENAMELEN); + if (++lastfree == FREECOUNT) + lastfree = 0; + } + if (i < malloccount) /* more coming after */ + bcopy(&malloced[i + 1], &malloced[i], (malloccount - i) * sizeof(struct mc)); + splx(s); + return; + } + } + splx(s); + log(LOG_ERR, + "Freeing unallocated data at 0x%p from %s, line %d\n", + mem, + file, + line); + Debugger("Free"); +} + +/* Fill in the struct meminfo at data with the current allocation counters */ +void +vinum_meminfo(caddr_t data) +{ + struct meminfo *m = (struct meminfo *) data; + + m->mallocs = malloccount; + m->total_malloced = total_malloced; + m->malloced = malloced; + m->highwater = highwater; +} + +/* + * Return entry number m->seq of malloced[] in the struct mc at data. + * Returns ENOENT if there is no such entry. + */ +int +vinum_mallocinfo(caddr_t data) +{ + struct mc *m = (struct mc *) data; + unsigned int ent = m->seq; /* index of entry to return */ + + if (ent >= malloccount) + return ENOENT; + m->address = malloced[ent].address; + m->size = malloced[ent].size; + m->line = malloced[ent].line; + m->seq = malloced[ent].seq; + strlcpy(m->file, malloced[ent].file, MCFILENAMELEN); + return 0; +} + +/* + * return the nth request trace buffer entry. This + * is indexed back from the current entry (which + * has index 0). Returns ENOENT if the index is + * negative or beyond the size of the table. + */ +int +vinum_rqinfo(caddr_t data) +{ + struct rqinfo *rq = (struct rqinfo *) data; + int ent = *(int *) data; /* 1st word is index */ + int lastent = rqip - rqinfo; /* entry number of current entry */ + + if ((ent < 0) /* a negative index would end up beyond the table */ + ||(ent >= RQINFO_SIZE)) /* out of the table */ + return ENOENT; + if ((ent = lastent - ent - 1) < 0) + ent += RQINFO_SIZE; /* roll over backwards */ + bcopy(&rqinfo[ent], rq, sizeof(struct rqinfo)); + return 0; +} +#endif diff --git a/sys/dev/vinum/vinumobj.h b/sys/dev/vinum/vinumobj.h new file mode 100644 index 0000000..81087f3 --- /dev/null +++ b/sys/dev/vinum/vinumobj.h @@ -0,0 +1,320 @@ +/*- + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. 
+ * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumobj.h,v 1.7 2003/05/23 01:08:58 grog Exp $ + * $FreeBSD$ + */ + +/* + * Definitions of Vinum objects: drive, subdisk, plex and volume. 
+ * This file is included both by userland programs and by kernel code. + * The userland structures are a subset of the kernel structures, and + * all userland fields are at the beginning, so that a simple copy in + * the length of the userland structure will be sufficient. In order + * to perform this copy, vinumioctl must know both structures, so it + * includes this file again with _KERNEL reset. + */ + +#ifndef _KERNEL +/* + * Flags for all objects. Most of them only apply + * to specific objects, but we currently have + * space for all in any 32 bit flags word. + */ +enum objflags { + VF_LOCKED = 1, /* somebody has locked access to this object */ + VF_LOCKING = 2, /* we want access to this object */ + VF_OPEN = 4, /* object has openers */ + VF_WRITETHROUGH = 8, /* volume: write through */ + VF_INITED = 0x10, /* unit has been initialized */ + VF_WLABEL = 0x20, /* label area is writable */ + VF_LABELLING = 0x40, /* unit is currently being labelled */ + VF_WANTED = 0x80, /* someone is waiting to obtain a lock */ + VF_RAW = 0x100, /* raw volume (no file system) */ + VF_LOADED = 0x200, /* module is loaded */ + VF_CONFIGURING = 0x400, /* somebody is changing the config */ + VF_WILL_CONFIGURE = 0x800, /* somebody wants to change the config */ + VF_CONFIG_INCOMPLETE = 0x1000, /* haven't finished changing the config */ + VF_CONFIG_SETUPSTATE = 0x2000, /* set a volume up if all plexes are empty */ + VF_READING_CONFIG = 0x4000, /* we're reading config database from disk */ + VF_FORCECONFIG = 0x8000, /* configure drives even with different names */ + VF_NEWBORN = 0x10000, /* for objects: we've just created it */ + VF_CONFIGURED = 0x20000, /* for drives: we read the config */ + VF_STOPPING = 0x40000, /* for vinum_conf: stop on last close */ + VF_DAEMONOPEN = 0x80000, /* the daemon has us open (only superdev) */ + VF_CREATED = 0x100000, /* for volumes: freshly created, more than new */ + VF_HOTSPARE = 0x200000, /* for drives: use as hot spare */ + VF_RETRYERRORS = 0x400000, 
/* don't down subdisks on I/O errors */ + VF_HASDEBUG = 0x800000, /* set if we support debug */ +}; + +#endif + +/* Global configuration information for the vinum subsystem */ +#ifdef _KERNEL +struct _vinum_conf +#else +struct __vinum_conf +#endif +{ + int version; /* version of structures */ +#ifdef _KERNEL + /* Pointers to vinum structures */ + struct drive *drive; + struct sd *sd; + struct plex *plex; + struct volume *volume; +#else + /* Pointers to vinum structures (userland names) */ + struct _drive *drive; + struct _sd *sd; + struct _plex *plex; + struct _volume *volume; +#endif + + /* the number allocated of each object */ + int drives_allocated; + int subdisks_allocated; + int plexes_allocated; + int volumes_allocated; + + /* and the number currently in use */ + /* + * Note that drives_used is not valid during drive recognition + * (vinum_scandisk and friends). Many invalid drives are added and + * later removed; the count isn't correct until we leave + * vinum_scandisk. + */ + int drives_used; + int subdisks_used; + int plexes_used; + int volumes_used; + + int flags; /* see above */ + +#define VINUM_MAXACTIVE 30000 /* maximum number of active requests */ + int active; /* current number of requests outstanding */ + int maxactive; /* maximum number of requests ever outstanding */ +#ifdef _KERNEL +#ifdef VINUMDEBUG + struct request *lastrq; + struct buf *lastbuf; +#endif +#endif +}; + +/* Use these defines to simplify code */ +#define DRIVE vinum_conf.drive +#define SD vinum_conf.sd +#define PLEX vinum_conf.plex +#define VOL vinum_conf.volume +#define VFLAGS vinum_conf.flags + +/* + * A drive corresponds to a disk slice. 
We use a different term to show
+ * the difference in usage: it doesn't have to be a slice, and could
+ * theoretically be a complete, unpartitioned disk
+ */
+
+#ifdef _KERNEL
+struct drive
+#else
+struct _drive
+#endif
+{
+    char devicename[MAXDRIVENAME];  /* name of the slice it's on */
+    struct vinum_label label;       /* and the label information */
+    enum drivestate state;          /* current state */
+    int flags;                      /* flags */
+    int subdisks_allocated;         /* number of entries in sd */
+    int subdisks_used;              /* and the number used */
+    int blocksize;                  /* size of fs blocks */
+    int pid;                        /* of locker */
+    u_int64_t sectors_available;    /* number of sectors still available */
+    int secsperblock;               /* NOTE(review): presumably sectors per block; not shown here, confirm */
+    int lasterror;                  /* last error on drive */
+    int driveno;                    /* index of drive in vinum_conf */
+    int opencount;                  /* number of up subdisks */
+    u_int64_t reads;                /* number of reads on this drive */
+    u_int64_t writes;               /* number of writes on this drive */
+    u_int64_t bytes_read;           /* number of bytes read */
+    u_int64_t bytes_written;        /* number of bytes written */
+#define DRIVE_MAXACTIVE 30000       /* maximum number of active requests */
+    int active;                     /* current number of requests outstanding */
+    int maxactive;                  /* maximum number of requests ever outstanding */
+    int freelist_size;              /* number of entries alloced in free list */
+    int freelist_entries;           /* number of entries used in free list */
+    struct drive_freelist *freelist; /* sorted list of free space on drive */
+#ifdef _KERNEL                      /* the remaining members exist only in the kernel variant */
+    u_int sectorsize;               /* NOTE(review): presumably the sector size of the device; units not shown here */
+    off_t mediasize;                /* NOTE(review): presumably the size of the device; units not shown here */
+    dev_t dev;                      /* device information */
+#ifdef VINUMDEBUG
+    char lockfilename[16];          /* name of file from which we were locked */
+    int lockline;                   /* and the line number */
+#endif
+#endif
+};
+
+#ifdef _KERNEL
+struct sd
+#else
+struct _sd
+#endif
+{                                   /* one subdisk; waitlist and dev exist only when _KERNEL is defined */
+    char name[MAXSDNAME];           /* name of subdisk */
+    enum sdstate state;             /* state */
+    int flags;                      /* NOTE(review): presumably enum objflags bits; confirm */
+    int lasterror;                  /* last error occurred */
+    /*
+     * offsets in blocks
+     * NOTE(review): the plexoffset comment below says sectors; confirm
+     * that block and sector mean the same unit here.
+     */
+    int64_t driveoffset;            /* offset on drive */
+    /*
+     * plexoffset is the offset from the beginning
+     * of the plex to the very first part of the
+     * subdisk, in sectors. For striped, RAID-4 and
+     * RAID-5 plexes, only the first stripe is
+     * located at this offset
+     */
+    int64_t plexoffset;             /* offset in plex */
+    u_int64_t sectors;              /* and length in sectors */
+    int sectorsize;                 /* sector size for DIOCGSECTORSIZE */
+    int plexno;                     /* index of plex, if it belongs */
+    int driveno;                    /* index of the drive on which it is located */
+    int sdno;                       /* our index in vinum_conf */
+    int plexsdno;                   /* and our number in our plex */
+    /* (undefined if no plex) */
+    u_int64_t reads;                /* number of reads on this subdisk */
+    u_int64_t writes;               /* number of writes on this subdisk */
+    u_int64_t bytes_read;           /* number of bytes read */
+    u_int64_t bytes_written;        /* number of bytes written */
+    /* revive parameters */
+    u_int64_t revived;              /* block number of current revive request */
+    int revive_blocksize;           /* revive block size (bytes) */
+    int revive_interval;            /* and time to wait between transfers */
+    pid_t reviver;                  /* PID of reviving process */
+    /* init parameters */
+    u_int64_t initialized;          /* block number of current init request */
+    int init_blocksize;             /* init block size (bytes) */
+    int init_interval;              /* and time to wait between transfers */
+#ifdef _KERNEL
+    struct request *waitlist;       /* list of requests waiting on revive op */
+    dev_t dev;                      /* associated device */
+#endif
+};
+
+#ifdef _KERNEL
+struct plex
+#else
+struct _plex
+#endif
+{                                   /* one plex; lock, lockmtx and dev exist only when _KERNEL is defined */
+    enum plexorg organization;      /* Plex organization */
+    enum plexstate state;           /* and current state */
+    u_int64_t length;               /* total length of plex (sectors) */
+    int flags;                      /* NOTE(review): presumably enum objflags bits; confirm */
+    int stripesize;                 /* size of stripe or raid band, in sectors */
+    int sectorsize;                 /* sector size for DIOCGSECTORSIZE */
+    int subdisks;                   /* number of associated subdisks */
+    int subdisks_allocated;         /* number of subdisks allocated space for */
+    int *sdnos;                     /* list of component subdisks (indices into vinum_conf.sd) */
+    int plexno;                     /* index of plex in vinum_conf */
+    int volno;                      /* index of volume */
+    int volplexno;                  /* number of plex in volume */
+    /* Statistics */
+    u_int64_t reads;                /* number of reads on this plex */
+    u_int64_t writes;               /* number of writes on this plex */
+    u_int64_t bytes_read;           /* number of bytes read */
+    u_int64_t bytes_written;        /* number of bytes written */
+    u_int64_t recovered_reads;      /* number of recovered read operations */
+    u_int64_t degraded_writes;      /* number of degraded writes */
+    u_int64_t parityless_writes;    /* number of parityless writes */
+    u_int64_t multiblock;           /* requests that needed more than one block */
+    u_int64_t multistripe;          /* requests that needed more than one stripe */
+    int sddowncount;                /* number of subdisks down */
+    /* Lock information */
+    int usedlocks;                  /* number currently in use */
+    int lockwaits;                  /* and number of waits for locks */
+    off_t checkblock;               /* block number for parity op */
+    char name[MAXPLEXNAME];         /* name of plex */
+#ifdef _KERNEL
+    struct rangelock *lock;         /* ranges of locked addresses */
+    struct mtx *lockmtx;            /* lock mutex, one of plexmutex [] */
+    dev_t dev;                      /* associated device */
+#endif
+};
+
+#ifdef _KERNEL
+struct volume
+#else
+struct _volume
+#endif
+{                                   /* one volume; dev exists only when _KERNEL is defined */
+    char name[MAXVOLNAME];          /* name of volume */
+    enum volumestate state;         /* current state */
+    int plexes;                     /* number of plexes */
+    int preferred_plex;             /* index of plex to read from,
+                                     * -1 for round-robin */
+    /*
+     * index of plex used for last read, for
+     * round-robin.
+     */
+    int last_plex_read;
+    int volno;                      /* volume number */
+    int flags;                      /* status and configuration flags */
+    int openflags;                  /* flags supplied to last open(2) */
+    u_int64_t size;                 /* size of volume */
+    int blocksize;                  /* logical block size */
+    int sectorsize;                 /* sector size for DIOCGSECTORSIZE */
+    int active;                     /* number of outstanding requests active */
+    int subops;                     /* and the number of suboperations */
+    /* Statistics */
+    u_int64_t bytes_read;           /* number of bytes read */
+    u_int64_t bytes_written;        /* number of bytes written */
+    u_int64_t reads;                /* number of reads on this volume */
+    u_int64_t writes;               /* number of writes on this volume */
+    u_int64_t recovered_reads;      /* reads recovered from another plex */
+    /*
+     * Unlike subdisks in the plex, space for the
+     * plex pointers is static.
+     */
+    int plex[MAXPLEX];              /* index of plexes */
+#ifdef _KERNEL
+    dev_t dev;                      /* associated device */
+#endif
+};
diff --git a/sys/dev/vinum/vinumparser.c b/sys/dev/vinum/vinumparser.c
new file mode 100644
index 0000000..2820ffd
--- /dev/null
+++ b/sys/dev/vinum/vinumparser.c
@@ -0,0 +1,234 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Nan Yang Computer
+ * Services Limited.
+ * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumparser.c,v 1.25 2003/05/07 03:33:28 grog Exp grog $ + * $FreeBSD$ + */ + +/* + * This file contains the parser for the configuration routines. It's used + * both in the kernel and in the user interface program, thus the separate file. + */ + +/* + * Go through a text and split up into text tokens. These are either non-blank + * sequences, or any sequence (except \0) enclosed in ' or ". Embedded ' or + * " characters may be escaped by \, which otherwise has no special meaning. + * + * Delimit by following with a \0, and return pointers to the starts at token []. + * Return the number of tokens found as the return value. + * + * This method has the restriction that a closing " or ' must be followed by + * grey space. + * + * Error conditions are end of line before end of quote, or no space after + * a closing quote. In this case, tokenize() returns -1. 
+ */ + +#include <sys/param.h> +#include <dev/vinum/vinumkw.h> +#ifdef _KERNEL +#include <sys/systm.h> +#include <sys/conf.h> +#include <machine/setjmp.h> +/* All this mess for a single struct definition */ +#include <sys/uio.h> +#include <sys/namei.h> +#include <sys/mount.h> + +#include <dev/vinum/vinumvar.h> +#include <dev/vinum/vinumio.h> +#include <dev/vinum/vinumext.h> +#define iswhite(c) ((c == ' ') || (c == '\t')) /* check for white space */ +#else /* userland */ +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#define iswhite isspace /* use the ctype macro */ +#endif + +/* enum keyword is defined in vinumvar.h */ + +#define keypair(x) { #x, kw_##x } /* create pair "foo", kw_foo */ +#define flagkeypair(x) { "-"#x, kw_##x } /* create pair "-foo", kw_foo */ +#define KEYWORDSET(x) {sizeof (x) / sizeof (struct _keywords), x} + +/* Normal keywords. These are all the words that vinum knows. */ +struct _keywords keywords[] = +{keypair(drive), + keypair(partition), + keypair(sd), + keypair(subdisk), + keypair(plex), + keypair(volume), + keypair(vol), + keypair(setupstate), + keypair(readpol), + keypair(org), + keypair(name), + keypair(writethrough), + keypair(writeback), + keypair(device), + keypair(concat), + keypair(raid4), + keypair(raid5), + keypair(striped), + keypair(plexoffset), + keypair(driveoffset), + keypair(length), + keypair(len), + keypair(size), + keypair(state), + keypair(round), + keypair(prefer), + keypair(preferred), + keypair(rename), + keypair(detached), +#ifndef _KERNEL /* for vinum(8) only */ + keypair(debug), + keypair(stripe), + keypair(mirror), +#endif + keypair(attach), + keypair(detach), + keypair(printconfig), + keypair(saveconfig), + keypair(replace), + keypair(create), + keypair(read), + keypair(modify), + keypair(list), + keypair(l), + keypair(ld), + keypair(ls), + keypair(lp), + keypair(lv), + keypair(info), + keypair(set), + keypair(rm), + keypair(mv), + keypair(move), + keypair(init), + 
keypair(resetconfig), + keypair(start), + keypair(stop), + keypair(makedev), + keypair(help), + keypair(quit), + keypair(setdaemon), + keypair(getdaemon), + keypair(max), + keypair(replace), + keypair(readpol), + keypair(resetstats), + keypair(setstate), + keypair(checkparity), + keypair(rebuildparity), + keypair(dumpconfig), + keypair(retryerrors) +}; +struct keywordset keyword_set = KEYWORDSET(keywords); + +#ifndef _KERNEL +struct _keywords flag_keywords[] = +{flagkeypair(f), + flagkeypair(d), + flagkeypair(v), + flagkeypair(s), + flagkeypair(r), + flagkeypair(w) +}; +struct keywordset flag_set = KEYWORDSET(flag_keywords); + +#endif + +/* + * Take a blank separated list of tokens and turn it into a list of + * individual nul-delimited strings. Build a list of pointers at + * token, which must have enough space for the tokens. Return the + * number of tokens, or -1 on error (typically a missing string + * delimiter). + */ +int +tokenize(char *cptr, char *token[], int maxtoken) +{ + char delim; /* delimiter for searching for the partner */ + int tokennr; /* index of this token */ + + for (tokennr = 0; tokennr < maxtoken;) { + while (iswhite(*cptr)) + cptr++; /* skip initial white space */ + if ((*cptr == '\0') || (*cptr == '\n') || (*cptr == '#')) /* end of line */ + return tokennr; /* return number of tokens found */ + delim = *cptr; + token[tokennr] = cptr; /* point to it */ + tokennr++; /* one more */ + if (tokennr == maxtoken) /* run off the end? 
*/ + return tokennr; + if ((delim == '\'') || (delim == '"')) { /* delimitered */ + for (;;) { + cptr++; + if ((*cptr == delim) && (cptr[-1] != '\\')) { /* found the partner */ + cptr++; /* move on past */ + if (!iswhite(*cptr)) /* error, no space after closing quote */ + return -1; + *cptr++ = '\0'; /* delimit */ + } else if ((*cptr == '\0') || (*cptr == '\n')) /* end of line */ + return -1; + } + } else { /* not quoted */ + while ((*cptr != '\0') && (!iswhite(*cptr)) && (*cptr != '\n')) + cptr++; + if (*cptr != '\0') /* not end of the line, */ + *cptr++ = '\0'; /* delimit and move to the next */ + } + } + return maxtoken; /* can't get here */ +} + +/* Find a keyword and return an index */ +enum keyword +get_keyword(char *name, struct keywordset *keywordset) +{ + int i; + struct _keywords *keywords = keywordset->k; /* point to the keywords */ + if (name != NULL) { /* parameter exists */ + for (i = 0; i < keywordset->size; i++) + if (!strcmp(name, keywords[i].name)) + return (enum keyword) keywords[i].keyword; + } + return kw_invalid_keyword; +} diff --git a/sys/dev/vinum/vinumraid5.c b/sys/dev/vinum/vinumraid5.c new file mode 100644 index 0000000..73b024f --- /dev/null +++ b/sys/dev/vinum/vinumraid5.c @@ -0,0 +1,698 @@ +/*- + * Copyright (c) 1997, 1998 + * Cybernet Corporation and Nan Yang Computer Services Limited. + * All rights reserved. + * + * This software was developed as part of the NetMAX project. + * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Cybernet Corporation + * and Nan Yang Computer Services Limited + * 4. Neither the name of the Companies nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumraid5.c,v 1.23 2003/02/08 03:32:45 grog Exp $ + * $FreeBSD$ + */ +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> +#include <sys/resourcevar.h> + +/* + * Parameters which describe the current transfer. 
+ * These are only used for calculation, but they + * need to be passed to other functions, so it's + * tidier to put them in a struct + */ +struct metrics { + daddr_t stripebase; /* base address of stripe (1st subdisk) */ + int stripeoffset; /* offset in stripe */ + int stripesectors; /* total sectors to transfer in this stripe */ + daddr_t sdbase; /* offset in subdisk of stripe base */ + int sdcount; /* number of disks involved in this transfer */ + daddr_t diskstart; /* remember where this transfer starts */ + int psdno; /* number of parity subdisk */ + int badsdno; /* number of down subdisk, if there is one */ + int firstsdno; /* first data subdisk number */ + /* These correspond to the fields in rqelement, sort of */ + int useroffset; + /* + * Initial offset and length values for the first + * data block + */ + int initoffset; /* start address of block to transfer */ + short initlen; /* length in sectors of data transfer */ + /* Define a normal operation */ + int dataoffset; /* start address of block to transfer */ + int datalen; /* length in sectors of data transfer */ + /* Define a group operation */ + int groupoffset; /* subdisk offset of group operation */ + int grouplen; /* length in sectors of group operation */ + /* Define a normal write operation */ + int writeoffset; /* subdisk offset of normal write */ + int writelen; /* length in sectors of write operation */ + enum xferinfo flags; /* to check what we're doing */ + int rqcount; /* number of elements in request */ +}; + +enum requeststatus bre5(struct request *rq, + int plexno, + daddr_t * diskstart, + daddr_t diskend); +void complete_raid5_write(struct rqelement *); +enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex); +void setrqebounds(struct rqelement *rqe, struct metrics *mp); + +/* + * define the low-level requests needed to perform + * a high-level I/O operation for a specific plex + * 'plexno'. 
+ * + * Return 0 if all subdisks involved in the + * request are up, 1 if some subdisks are not up, + * and -1 if the request is at least partially + * outside the bounds of the subdisks. + * + * Modify the pointer *diskstart to point to the + * end address. On read, return on the first bad + * subdisk, so that the caller + * (build_read_request) can try alternatives. + * + * On entry to this routine, the prq structures + * are not assigned. The assignment is performed + * by expandrq(). Strictly speaking, the elements + * rqe->sdno of all entries should be set to -1, + * since 0 (from bzero) is a valid subdisk number. + * We avoid this problem by initializing the ones + * we use, and not looking at the others (index >= + * prq->requests). + */ +enum requeststatus +bre5(struct request *rq, + int plexno, + daddr_t * diskaddr, + daddr_t diskend) +{ + struct metrics m; /* most of the information */ + struct sd *sd; + struct plex *plex; + struct buf *bp; /* user's bp */ + struct rqgroup *rqg; /* the request group that we will create */ + struct rqelement *rqe; /* point to this request information */ + int rsectors; /* sectors remaining in this stripe */ + int mysdno; /* another sd index in loops */ + int rqno; /* request number */ + + rqg = NULL; /* shut up, damn compiler */ + m.diskstart = *diskaddr; /* start of transfer */ + bp = rq->bp; /* buffer pointer */ + plex = &PLEX[plexno]; /* point to the plex */ + + + while (*diskaddr < diskend) { /* until we get it all sorted out */ + if (*diskaddr >= plex->length) /* beyond the end of the plex */ + return REQUEST_EOF; /* can't continue */ + + m.badsdno = -1; /* no bad subdisk yet */ + + /* Part A: Define the request */ + /* + * First, calculate some sizes: + * The offset of the start address from + * the start of the stripe. + */ + m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1)); + + /* + * The plex-relative address of the + * start of the stripe. 
+ */ + m.stripebase = *diskaddr - m.stripeoffset; + + /* subdisk containing the parity stripe */ + if (plex->organization == plex_raid5) + m.psdno = plex->subdisks - 1 + - (*diskaddr / (plex->stripesize * (plex->subdisks - 1))) + % plex->subdisks; + else /* RAID-4 */ + m.psdno = plex->subdisks - 1; + + /* + * The number of the subdisk in which + * the start is located. + */ + m.firstsdno = m.stripeoffset / plex->stripesize; + if (m.firstsdno >= m.psdno) /* at or past parity sd */ + m.firstsdno++; /* increment it */ + + /* + * The offset from the beginning of + * the stripe on this subdisk. + */ + m.initoffset = m.stripeoffset % plex->stripesize; + + /* The offset of the stripe start relative to this subdisk */ + m.sdbase = m.stripebase / (plex->subdisks - 1); + + m.useroffset = *diskaddr - m.diskstart; /* The offset of the start in the user buffer */ + + /* + * The number of sectors to transfer in the + * current (first) subdisk. + */ + m.initlen = min(diskend - *diskaddr, /* the amount remaining to transfer */ + plex->stripesize - m.initoffset); /* and the amount left in this block */ + + /* + * The number of sectors to transfer in this stripe + * is the minumum of the amount remaining to transfer + * and the amount left in this stripe. + */ + m.stripesectors = min(diskend - *diskaddr, + plex->stripesize * (plex->subdisks - 1) - m.stripeoffset); + + /* The number of data subdisks involved in this request */ + m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize; + + /* Part B: decide what kind of transfer this will be. + + * start and end addresses of the transfer in + * the current block. + * + * There are a number of different kinds of + * transfer, each of which relates to a + * specific subdisk: + * + * 1. Normal read. All participating subdisks + * are up, and the transfer can be made + * directly to the user buffer. The bounds + * of the transfer are described by + * m.dataoffset and m.datalen. 
We have + * already calculated m.initoffset and + * m.initlen, which define the parameters + * for the first data block. + * + * 2. Recovery read. One participating + * subdisk is down. To recover data, all + * the other subdisks, including the parity + * subdisk, must be read. The data is + * recovered by exclusive-oring all the + * other blocks. The bounds of the + * transfer are described by m.groupoffset + * and m.grouplen. + * + * 3. A read request may request reading both + * available data (normal read) and + * non-available data (recovery read). + * This can be a problem if the address + * ranges of the two reads do not coincide: + * in this case, the normal read needs to + * be extended to cover the address range + * of the recovery read, and must thus be + * performed out of malloced memory. + * + * 4. Normal write. All the participating + * subdisks are up. The bounds of the + * transfer are described by m.dataoffset + * and m.datalen. Since these values + * differ for each block, we calculate the + * bounds for the parity block + * independently as the maximum of the + * individual blocks and store these values + * in m.writeoffset and m.writelen. This + * write proceeds in four phases: + * + * i. Read the old contents of each block + * and the parity block. + * ii. ``Remove'' the old contents from + * the parity block with exclusive or. + * iii. ``Insert'' the new contents of the + * block in the parity block, again + * with exclusive or. + * + * iv. Write the new contents of the data + * blocks and the parity block. The data + * block transfers can be made directly from + * the user buffer. + * + * 5. Degraded write where the data block is + * not available. The bounds of the + * transfer are described by m.groupoffset + * and m.grouplen. This requires the + * following steps: + * + * i. Read in all the other data blocks, + * excluding the parity block. + * + * ii. 
Recreate the parity block from the + * other data blocks and the data to be + * written. + * + * iii. Write the parity block. + * + * 6. Parityless write, a write where the + * parity block is not available. This is + * in fact the simplest: just write the + * data blocks. This can proceed directly + * from the user buffer. The bounds of the + * transfer are described by m.dataoffset + * and m.datalen. + * + * 7. Combination of degraded data block write + * and normal write. In this case the + * address ranges of the reads may also + * need to be extended to cover all + * participating blocks. + * + * All requests in a group transfer transfer + * the same address range relative to their + * subdisk. The individual transfers may + * vary, but since our group of requests is + * all in a single slice, we can define a + * range in which they all fall. + * + * In the following code section, we determine + * which kind of transfer we will perform. If + * there is a group transfer, we also decide + * its bounds relative to the subdisks. At + * the end, we have the following values: + * + * m.flags indicates the kinds of transfers + * we will perform. + * m.initoffset indicates the offset of the + * beginning of any data operation relative + * to the beginning of the stripe base. + * m.initlen specifies the length of any data + * operation. + * m.dataoffset contains the same value as + * m.initoffset. + * m.datalen contains the same value as + * m.initlen. Initially dataoffset and + * datalen describe the parameters for the + * first data block; while building the data + * block requests, they are updated for each + * block. + * m.groupoffset indicates the offset of any + * group operation relative to the beginning + * of the stripe base. + * m.grouplen specifies the length of any + * group operation. + * m.writeoffset indicates the offset of a + * normal write relative to the beginning of + * the stripe base. 
This value differs from + * m.dataoffset in that it applies to the + * entire operation, and not just the first + * block. + * m.writelen specifies the total span of a + * normal write operation. writeoffset and + * writelen are used to define the parity + * block. + */ + m.groupoffset = 0; /* assume no group... */ + m.grouplen = 0; /* until we know we have one */ + m.writeoffset = m.initoffset; /* start offset of transfer */ + m.writelen = 0; /* nothing to write yet */ + m.flags = 0; /* no flags yet */ + rsectors = m.stripesectors; /* remaining sectors to examine */ + m.dataoffset = m.initoffset; /* start at the beginning of the transfer */ + m.datalen = m.initlen; + + if (m.sdcount > 1) { + plex->multiblock++; /* more than one block for the request */ + /* + * If we have two transfers that don't overlap, + * (one at the end of the first block, the other + * at the beginning of the second block), + * it's cheaper to split them. + */ + if (rsectors < plex->stripesize) { + m.sdcount = 1; /* just one subdisk */ + m.stripesectors = m.initlen; /* and just this many sectors */ + rsectors = m.initlen; /* and in the loop counter */ + } + } + if (SD[plex->sdnos[m.psdno]].state < sd_reborn) /* is our parity subdisk down? 
*/ + m.badsdno = m.psdno; /* note that it's down */ + if (bp->b_iocmd == BIO_READ) { /* read operation */ + for (mysdno = m.firstsdno; rsectors > 0; mysdno++) { + if (mysdno == m.psdno) /* ignore parity on read */ + mysdno++; + if (mysdno == plex->subdisks) /* wraparound */ + mysdno = 0; + if (mysdno == m.psdno) /* parity, */ + mysdno++; /* we've given already */ + + if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */ + if (m.badsdno >= 0) /* we had one already, */ + return REQUEST_DOWN; /* we can't take a second */ + m.badsdno = mysdno; /* got the first */ + m.groupoffset = m.dataoffset; /* define the bounds */ + m.grouplen = m.datalen; + m.flags |= XFR_RECOVERY_READ; /* we need recovery */ + plex->recovered_reads++; /* count another one */ + } else + m.flags |= XFR_NORMAL_READ; /* normal read */ + + /* Update the pointers for the next block */ + m.dataoffset = 0; /* back to the start of the stripe */ + rsectors -= m.datalen; /* remaining sectors to examine */ + m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */ + } + } else { /* write operation */ + for (mysdno = m.firstsdno; rsectors > 0; mysdno++) { + if (mysdno == m.psdno) /* parity stripe, we've dealt with that */ + mysdno++; + if (mysdno == plex->subdisks) /* wraparound */ + mysdno = 0; + if (mysdno == m.psdno) /* parity, */ + mysdno++; /* we've given already */ + + sd = &SD[plex->sdnos[mysdno]]; + if (sd->state != sd_up) { + enum requeststatus s; + + s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ + if (s && (m.badsdno >= 0)) { /* second bad disk, */ + int sdno; + /* + * If the parity disk is down, there's + * no recovery. We make all involved + * subdisks stale. Otherwise, we + * should be able to recover, but it's + * like pulling teeth. Fix it later. 
+ */ + for (sdno = 0; sdno < m.sdcount; sdno++) { + struct sd *sd = &SD[plex->sdnos[sdno]]; + if (sd->state >= sd_reborn) /* sort of up, */ + set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */ + } + return s; /* and crap out */ + } + m.badsdno = mysdno; /* note which one is bad */ + m.flags |= XFR_DEGRADED_WRITE; /* we need recovery */ + plex->degraded_writes++; /* count another one */ + m.groupoffset = m.dataoffset; /* define the bounds */ + m.grouplen = m.datalen; + } else { + m.flags |= XFR_NORMAL_WRITE; /* normal write operation */ + if (m.writeoffset > m.dataoffset) { /* move write operation lower */ + m.writelen = max(m.writeoffset + m.writelen, + m.dataoffset + m.datalen) + - m.dataoffset; + m.writeoffset = m.dataoffset; + } else + m.writelen = max(m.writeoffset + m.writelen, + m.dataoffset + m.datalen) + - m.writeoffset; + } + + /* Update the pointers for the next block */ + m.dataoffset = 0; /* back to the start of the stripe */ + rsectors -= m.datalen; /* remaining sectors to examine */ + m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */ + } + if (m.badsdno == m.psdno) { /* got a bad parity block, */ + struct sd *psd = &SD[plex->sdnos[m.psdno]]; + + if (psd->state == sd_down) + set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */ + else if (psd->state == sd_crashed) + set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */ + m.flags &= ~XFR_NORMAL_WRITE; /* this write isn't normal, */ + m.flags |= XFR_PARITYLESS_WRITE; /* it's parityless */ + plex->parityless_writes++; /* count another one */ + } + } + + /* reset the initial transfer values */ + m.dataoffset = m.initoffset; /* start at the beginning of the transfer */ + m.datalen = m.initlen; + + /* decide how many requests we need */ + if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE)) + /* doing a recovery read or degraded write, */ + m.rqcount = plex->subdisks; /* all subdisks */ + else if (m.flags & 
XFR_NORMAL_WRITE) /* normal write, */ + m.rqcount = m.sdcount + 1; /* all data blocks and the parity block */ + else /* parityless write or normal read */ + m.rqcount = m.sdcount; /* just the data blocks */ + + /* Part C: build the requests */ + rqg = allocrqg(rq, m.rqcount); /* get a request group */ + if (rqg == NULL) { /* malloc failed */ + bp->b_error = ENOMEM; + bp->b_ioflags |= BIO_ERROR; + return REQUEST_ENOMEM; + } + rqg->plexno = plexno; + rqg->flags = m.flags; + rqno = 0; /* index in the request group */ + + /* 1: PARITY BLOCK */ + /* + * Are we performing an operation which requires parity? In that case, + * work out the parameters and define the parity block. + * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE + */ + if (m.flags & XFR_PARITYOP) { /* need parity */ + rqe = &rqg->rqe[rqno]; /* point to element */ + sd = &SD[plex->sdnos[m.psdno]]; /* the subdisk in question */ + rqe->rqg = rqg; /* point back to group */ + rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */ + &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuf */ + setrqebounds(rqe, &m); /* set up the bounds of the transfer */ + rqe->sdno = sd->sdno; /* subdisk number */ + rqe->driveno = sd->driveno; + if (build_rq_buffer(rqe, plex)) /* build the buffer */ + return REQUEST_ENOMEM; /* can't do it */ + rqe->b.b_iocmd = BIO_READ; /* we must read first */ + m.sdcount++; /* adjust the subdisk count */ + rqno++; /* and point to the next request */ + } + /* + * 2: DATA BLOCKS + * Now build up requests for the blocks required + * for individual transfers + */ + for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) { + if (mysdno == m.psdno) /* parity, */ + mysdno++; /* we've given already */ + if (mysdno == plex->subdisks) /* got to the end, */ + mysdno = 0; /* wrap around */ + if (mysdno == m.psdno) /* parity, */ + mysdno++; /* we've given already */ + + rqe = &rqg->rqe[rqno]; /* point to 
element */ + sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */ + rqe->rqg = rqg; /* point to group */ + if (m.flags & XFR_NEEDS_MALLOC) /* we need a malloced buffer first */ + rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */ + else + rqe->flags = m.flags | XFR_DATA_BLOCK; /* transfer flags */ + if (mysdno == m.badsdno) { /* this is the bad subdisk */ + rqg->badsdno = rqno; /* note which one */ + rqe->flags |= XFR_BAD_SUBDISK; /* note that it's dead */ + /* + * we can't read or write from/to it, + * but we don't need to malloc + */ + rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE); + } + setrqebounds(rqe, &m); /* set up the bounds of the transfer */ + rqe->useroffset = m.useroffset; /* offset in user buffer */ + rqe->sdno = sd->sdno; /* subdisk number */ + rqe->driveno = sd->driveno; + if (build_rq_buffer(rqe, plex)) /* build the buffer */ + return REQUEST_ENOMEM; /* can't do it */ + if ((m.flags & XFR_PARITYOP) /* parity operation, */ + &&((m.flags & XFR_BAD_SUBDISK) == 0)) /* and not the bad subdisk, */ + rqe->b.b_iocmd = BIO_READ; /* we must read first */ + + /* Now update pointers for the next block */ + *diskaddr += m.datalen; /* skip past what we've done */ + m.stripesectors -= m.datalen; /* deduct from what's left */ + m.useroffset += m.datalen; /* and move on in the user buffer */ + m.datalen = min(m.stripesectors, plex->stripesize); /* and recalculate */ + m.dataoffset = 0; /* start at the beginning of next block */ + } + + /* + * 3: REMAINING BLOCKS FOR RECOVERY + * Finally, if we have a recovery operation, build + * up transfers for the other subdisks. Follow the + * subdisks around until we get to where we started. + * These requests use only the group parameters. 
+ */ + if ((rqno < m.rqcount) /* haven't done them all already */ + &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) { + for (; rqno < m.rqcount; rqno++, mysdno++) { + if (mysdno == m.psdno) /* parity, */ + mysdno++; /* we've given already */ + if (mysdno == plex->subdisks) /* got to the end, */ + mysdno = 0; /* wrap around */ + if (mysdno == m.psdno) /* parity, */ + mysdno++; /* we've given already */ + + rqe = &rqg->rqe[rqno]; /* point to element */ + sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */ + rqe->rqg = rqg; /* point to group */ + + rqe->sdoffset = m.sdbase + m.groupoffset; /* start of transfer */ + rqe->dataoffset = 0; /* for tidiness' sake */ + rqe->groupoffset = 0; /* group starts at the beginining */ + rqe->datalen = 0; + rqe->grouplen = m.grouplen; + rqe->buflen = m.grouplen; + rqe->flags = (m.flags | XFR_MALLOCED) /* transfer flags without data op stuf */ + &~XFR_DATAOP; + rqe->sdno = sd->sdno; /* subdisk number */ + rqe->driveno = sd->driveno; + if (build_rq_buffer(rqe, plex)) /* build the buffer */ + return REQUEST_ENOMEM; /* can't do it */ + rqe->b.b_iocmd = BIO_READ; /* we must read first */ + } + } + /* + * We need to lock the address range before + * doing anything. We don't have to be + * performing a recovery operation: somebody + * else could be doing so, and the results could + * influence us. Note the fact here, we'll perform + * the lock in launch_requests. + */ + rqg->lockbase = m.stripebase; + if (*diskaddr < diskend) /* didn't finish the request on this stripe */ + plex->multistripe++; /* count another one */ + } + return REQUEST_OK; +} + +/* + * Helper function for rqe5: adjust the bounds of + * the transfers to minimize the buffer + * allocation. + * + * Each request can handle two of three different + * data ranges: + * + * 1. The range described by the parameters + * dataoffset and datalen, for normal read or + * parityless write. + * 2. 
The range described by the parameters + * groupoffset and grouplen, for recovery read + * and degraded write. + * 3. For normal write, the range depends on the + * kind of block. For data blocks, the range + * is defined by dataoffset and datalen. For + * parity blocks, it is defined by writeoffset + * and writelen. + * + * In order not to allocate more memory than + * necessary, this function adjusts the bounds + * parameter for each request to cover just the + * minimum necessary for the function it performs. + * This will normally vary from one request to the + * next. + * + * Things are slightly different for the parity + * block. In this case, the bounds defined by + * mp->writeoffset and mp->writelen also play a + * rôle. This case is selected when rqe->flags + * has both XFR_NORMAL_WRITE and XFR_PARITY_BLOCK + * set; there is no separate parameter for it. + */ +void +setrqebounds(struct rqelement *rqe, struct metrics *mp) +{ + /* parity block of a normal write */ + if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) + == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) { /* case 3 */ + if (rqe->flags & XFR_DEGRADED_WRITE) { /* also degraded write */ + /* + * With a combined normal and degraded write, we + * will zero out the area of the degraded write + * in the second phase, so we don't need to read + * it in. Unfortunately, we need a way to tell + * build_rq_buffer the size of the buffer, + * and currently that's the length of the read. + * As a result, we read everything, even the stuff + * that we're going to nuke. 
+ * FIXME XXX + */ + if (mp->groupoffset < mp->writeoffset) { /* group operation starts lower */ + rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ + rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */ + rqe->groupoffset = 0; /* and the group at the beginning */ + } else { /* individual data starts first */ + rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */ + rqe->dataoffset = 0; /* individual data starts at the beginning */ + rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */ + } + rqe->datalen = mp->writelen; + rqe->grouplen = mp->grouplen; + } else { /* just normal write (case 3) */ + rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */ + rqe->dataoffset = 0; /* degradation starts at the beginning */ + rqe->groupoffset = 0; /* for tidiness' sake */ + rqe->datalen = mp->writelen; + rqe->grouplen = 0; + } + } else if (rqe->flags & XFR_DATAOP) { /* data operation (case 1 or 3) */ + if (rqe->flags & XFR_GROUPOP) { /* also a group operation (case 2) */ + if (mp->groupoffset < mp->dataoffset) { /* group operation starts lower */ + rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ + rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */ + rqe->groupoffset = 0; /* and the group at the beginning */ + } else { /* individual data starts first */ + rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */ + rqe->dataoffset = 0; /* individual data starts at the beginning */ + rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */ + } + rqe->datalen = mp->datalen; + rqe->grouplen = mp->grouplen; + } else { /* just data operation (case 1) */ + rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */ + rqe->dataoffset = 0; /* degradation starts at the beginning */ + rqe->groupoffset = 0; /* for tidiness' sake */ + rqe->datalen = mp->datalen; + rqe->grouplen = 0; + } + } else { /* just 
group operations (case 2) */ + rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */ + rqe->dataoffset = 0; /* for tidiness' sake */ + rqe->groupoffset = 0; /* group starts at the beginining */ + rqe->datalen = 0; + rqe->grouplen = mp->grouplen; + } + rqe->buflen = max(rqe->dataoffset + rqe->datalen, /* total buffer length */ + rqe->groupoffset + rqe->grouplen); +} +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumrequest.c b/sys/dev/vinum/vinumrequest.c new file mode 100644 index 0000000..f74fc89 --- /dev/null +++ b/sys/dev/vinum/vinumrequest.c @@ -0,0 +1,1112 @@ +/*- + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumrequest.c,v 1.36 2003/05/08 04:34:55 grog Exp grog $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> +#include <sys/resourcevar.h> + +enum requeststatus bre(struct request *rq, + int plexno, + daddr_t * diskstart, + daddr_t diskend); +enum requeststatus bre5(struct request *rq, + int plexno, + daddr_t * diskstart, + daddr_t diskend); +enum requeststatus build_read_request(struct request *rq, int volplexno); +enum requeststatus build_write_request(struct request *rq); +enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex); +int find_alternate_sd(struct request *rq); +int check_range_covered(struct request *); +void complete_rqe(struct buf *bp); +void complete_raid5_write(struct rqelement *); +int abortrequest(struct request *rq, int error); +void sdio_done(struct buf *bp); +int vinum_bounds_check(struct buf *bp, struct volume *vol); +caddr_t allocdatabuf(struct rqelement *rqe); +void freedatabuf(struct rqelement *rqe); + +#ifdef VINUMDEBUG +struct rqinfo rqinfo[RQINFO_SIZE]; +struct rqinfo *rqip = rqinfo; + +void +logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp) +{ + int s = splhigh(); + + microtime(&rqip->timestamp); /* when did 
this happen? */ + rqip->type = type; + rqip->bp = ubp; /* user buffer */ + switch (type) { + case loginfo_user_bp: + case loginfo_user_bpl: + case loginfo_sdio: /* subdisk I/O */ + case loginfo_sdiol: /* subdisk I/O launch */ + case loginfo_sdiodone: /* subdisk I/O complete */ + bcopy(info.bp, &rqip->info.b, sizeof(struct buf)); + rqip->devmajor = major(info.bp->b_dev); + rqip->devminor = minor(info.bp->b_dev); + break; + + case loginfo_iodone: + case loginfo_rqe: + case loginfo_raid5_data: + case loginfo_raid5_parity: + bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement)); + rqip->devmajor = major(info.rqe->b.b_dev); + rqip->devminor = minor(info.rqe->b.b_dev); + break; + + case loginfo_lockwait: + case loginfo_lock: + case loginfo_unlock: + bcopy(info.lockinfo, &rqip->info.lockinfo, sizeof(struct rangelock)); + + break; + + case loginfo_unused: + break; + } + rqip++; + if (rqip >= &rqinfo[RQINFO_SIZE]) /* wrap around */ + rqip = rqinfo; + splx(s); +} + +#endif + +/* + * Dispatch an I/O request according to the device + * type of bp->b_dev: subdisk requests go to sdio, + * volume and plex requests go to vinumstart, and + * any other device type is failed with EIO. + */ +void +vinumstrategy(struct bio *biop) +{ + struct buf *bp = (struct buf *) biop; + int volno; + struct volume *vol = NULL; + + switch (DEVTYPE(bp->b_dev)) { + case VINUM_SD_TYPE: + case VINUM_SD2_TYPE: + sdio(bp); + return; + + default: + bp->b_error = EIO; /* I/O error */ + bp->b_io.bio_flags |= BIO_ERROR; + bufdone(bp); + return; + + case VINUM_VOLUME_TYPE: /* volume I/O */ + volno = Volno(bp->b_dev); + vol = &VOL[volno]; + if (vol->state != volume_up) { /* can't access this volume */ + bp->b_error = EIO; /* I/O error */ + bp->b_io.bio_flags |= BIO_ERROR; + bufdone(bp); + return; + } + if (vinum_bounds_check(bp, vol) <= 0) { /* don't like them bounds */ + bufdone(bp); + return; + } + /* FALLTHROUGH */ + /* + * Plex I/O is pretty much the same as volume I/O + * for a single plex. No volume pointer is passed on: + * vinumstart tells the two apart from the device + * type of bp->b_dev. + */ + case VINUM_PLEX_TYPE: + bp->b_resid = bp->b_bcount; /* transfer everything */ + vinumstart(bp, 0); + return; + } +} + +/* + * Start a transfer. 
Return -1 on error, 0 if OK, + * 1 if we need to retry. Parameter reviveok is + * set when doing transfers for revives: it allows + * transfers to be started immediately when a + * revive is in progress. During revive, normal + * transfers are queued if they share address + * space with a currently active revive operation. + */ +int +vinumstart(struct buf *bp, int reviveok) +{ + int plexno; + int maxplex; /* maximum number of plexes to handle */ + struct volume *vol; + struct request *rq; /* build up our request here */ + enum requeststatus status; + +#ifdef VINUMDEBUG + if (debug & DEBUG_LASTREQS) + logrq(loginfo_user_bp, (union rqinfou) bp, bp); +#endif + + if ((bp->b_bcount % DEV_BSIZE) != 0) { /* bad length */ + bp->b_error = EINVAL; /* invalid size */ + bp->b_io.bio_flags |= BIO_ERROR; + bufdone(bp); + return -1; + } + rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */ + if (rq == NULL) { /* can't do it */ + bp->b_error = ENOMEM; /* can't get memory */ + bp->b_io.bio_flags |= BIO_ERROR; + bufdone(bp); + return -1; + } + bzero(rq, sizeof(struct request)); + + /* + * Note the volume ID. This can be NULL, which + * the request building functions use as an + * indication for single plex I/O. + */ + rq->bp = bp; /* and the user buffer struct */ + + if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) { /* it's a volume, */ + rq->volplex.volno = Volno(bp->b_dev); /* get the volume number */ + vol = &VOL[rq->volplex.volno]; /* and point to it */ + vol->active++; /* one more active request */ + maxplex = vol->plexes; /* consider all its plexes */ + } else { + vol = NULL; /* no volume */ + rq->volplex.plexno = Plexno(bp->b_dev); /* point to the plex */ + rq->isplex = 1; /* note that it's a plex */ + maxplex = 1; /* just the one plex */ + } + + if (bp->b_iocmd == BIO_READ) { + /* + * This is a read request. Decide + * which plex to read from. 
+ * + * There's a potential race condition here, + * since we're not locked, and we could end + * up multiply incrementing the round-robin + * counter. This doesn't have any serious + * effects, however. + */ + if (vol != NULL) { + plexno = vol->preferred_plex; /* get the plex to use */ + if (plexno < 0) { /* round robin */ + plexno = vol->last_plex_read; + vol->last_plex_read++; + if (vol->last_plex_read >= vol->plexes) /* got to the end? */ + vol->last_plex_read = 0; /* wrap around */ + } + status = build_read_request(rq, plexno); /* build a request */ + } else { + daddr_t diskaddr = bp->b_blkno; /* start offset of transfer */ + status = bre(rq, /* build a request list */ + rq->volplex.plexno, + &diskaddr, + diskaddr + (bp->b_bcount / DEV_BSIZE)); + } + + if (status > REQUEST_RECOVERED) { /* can't satisfy it */ + if (status == REQUEST_DOWN) { /* not enough subdisks */ + bp->b_error = EIO; /* I/O error */ + bp->b_io.bio_flags |= BIO_ERROR; + } + bufdone(bp); + freerq(rq); + return -1; + } + return launch_requests(rq, reviveok); /* now start the requests if we can */ + } else + /* + * This is a write operation. We write to all plexes. If this is + * a RAID-4 or RAID-5 plex, we must also update the parity stripe. 
+ */ + { + if (vol != NULL) { + if ((vol->plexes > 0) /* multiple plex */ + ||(isparity((&PLEX[vol->plex[0]])))) { /* or RAID-[45], */ + rq->save_data = bp->b_data; /* save the data buffer address */ + bp->b_data = Malloc(bp->b_bcount); + bcopy(rq->save_data, bp->b_data, bp->b_bcount); /* make a copy */ + rq->flags |= XFR_COPYBUF; /* and note that we did it */ + } + status = build_write_request(rq); + } else { /* plex I/O */ + daddr_t diskstart; + + diskstart = bp->b_blkno; /* start offset of transfer */ + status = bre(rq, + Plexno(bp->b_dev), + &diskstart, + bp->b_blkno + (bp->b_bcount / DEV_BSIZE)); /* build requests for the plex */ + } + if (status > REQUEST_RECOVERED) { /* can't satisfy it */ + if (status == REQUEST_DOWN) { /* not enough subdisks */ + bp->b_error = EIO; /* I/O error */ + bp->b_io.bio_flags |= BIO_ERROR; + } + if (rq->flags & XFR_COPYBUF) { + Free(bp->b_data); + bp->b_data = rq->save_data; + } + bufdone(bp); + freerq(rq); + return -1; + } + return launch_requests(rq, reviveok); /* now start the requests if we can */ + } +} + +/* + * Call the low-level strategy routines to + * perform the requests in a struct request + */ +int +launch_requests(struct request *rq, int reviveok) +{ + struct rqgroup *rqg; + int rqno; /* loop index */ + struct rqelement *rqe; /* current element */ + struct drive *drive; + int rcount; /* request count */ + + /* + * First find out whether we're reviving, and + * the request contains a conflict. If so, we + * hang the request off plex->waitlist of the + * first plex we find which is reviving. 
+ */ + + if ((rq->flags & XFR_REVIVECONFLICT) /* possible revive conflict */ + &&(!reviveok)) { /* and we don't want to do it now, */ + struct sd *sd; + struct request *waitlist; /* point to the waitlist */ + + sd = &SD[rq->sdno]; + if (sd->waitlist != NULL) { /* something there already, */ + waitlist = sd->waitlist; + while (waitlist->next != NULL) /* find the end */ + waitlist = waitlist->next; + waitlist->next = rq; /* hook our request there */ + } else + sd->waitlist = rq; /* hook our request at the front */ + +#ifdef VINUMDEBUG + if (debug & DEBUG_REVIVECONFLICT) + log(LOG_DEBUG, + "Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n", + rq->sdno, + rq, + rq->bp->b_iocmd == BIO_READ ? "Read" : "Write", + major(rq->bp->b_dev), + minor(rq->bp->b_dev), + (intmax_t) rq->bp->b_blkno, + rq->bp->b_bcount); +#endif + return 0; /* and get out of here */ + } + rq->active = 0; /* nothing yet */ +#ifdef VINUMDEBUG + /* XXX This is probably due to a bug */ + if (rq->rqg == NULL) { /* no request */ + log(LOG_ERR, "vinum: null rqg\n"); + abortrequest(rq, EINVAL); + return -1; + } +#endif +#ifdef VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + "Request: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n", + rq, + rq->bp->b_iocmd == BIO_READ ? "Read" : "Write", + major(rq->bp->b_dev), + minor(rq->bp->b_dev), + (intmax_t) rq->bp->b_blkno, + rq->bp->b_bcount); + vinum_conf.lastrq = rq; + vinum_conf.lastbuf = rq->bp; + if (debug & DEBUG_LASTREQS) + logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp); +#endif + + /* + * We used to have an splbio() here anyway, out + * of superstition. With the division of labour + * below (first count the requests, then issue + * them), it looks as if we don't need this + * splbio() protection. In fact, as dillon + * points out, there's a race condition + * incrementing and decrementing rq->active and + * rqg->active. This splbio() didn't help + * there, because the device strategy routine + * can sleep. 
Solve this by putting shorter + * duration locks on the code. + */ + /* + * This loop happens without any participation + * of the bottom half, so it requires no + * protection. + */ + for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) { /* through the whole request chain */ + rqg->active = rqg->count; /* they're all active */ + for (rqno = 0; rqno < rqg->count; rqno++) { + rqe = &rqg->rqe[rqno]; + if (rqe->flags & XFR_BAD_SUBDISK) /* this subdisk is bad, */ + rqg->active--; /* one less active request */ + } + if (rqg->active) /* we have at least one active request, */ + rq->active++; /* one more active request group */ + } + + /* + * Now fire off the requests. In this loop the + * bottom half could be completing requests + * before we finish. We avoid splbio() + * protection by ensuring we don't tread in the + * same places that the bottom half does. + */ + for (rqg = rq->rqg; rqg != NULL;) { /* through the whole request chain */ + if (rqg->lockbase >= 0) /* this rqg needs a lock first */ + rqg->lock = lockrange(rqg->lockbase, rqg->rq->bp, &PLEX[rqg->plexno]); + rcount = rqg->count; + for (rqno = 0; rqno < rcount;) { + rqe = &rqg->rqe[rqno]; + + /* + * Point to next rqg before the bottom half + * changes the structures. + */ + if (++rqno >= rcount) + rqg = rqg->next; + if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* this subdisk is good, */ + drive = &DRIVE[rqe->driveno]; /* look at drive */ + drive->active++; + if (drive->active >= drive->maxactive) + drive->maxactive = drive->active; + vinum_conf.active++; + if (vinum_conf.active >= vinum_conf.maxactive) + vinum_conf.maxactive = vinum_conf.active; + +#ifdef VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%jx, length %ld\n", + rqe->b.b_iocmd == BIO_READ ? 
"Read" : "Write", + major(rqe->b.b_dev), + minor(rqe->b.b_dev), + rqe->sdno, + (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), + (intmax_t) rqe->b.b_blkno, + rqe->b.b_bcount); + if (debug & DEBUG_LASTREQS) { + microtime(&rqe->launchtime); /* time we launched this request */ + logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp); + } +#endif + /* fire off the request */ + DEV_STRATEGY(&rqe->b); + } + } + } + return 0; +} + +/* + * define the low-level requests needed to perform a + * high-level I/O operation for a specific plex 'plexno'. + * + * Return REQUEST_OK if all subdisks involved in the request are up, + * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the + * request is at least partially outside the bounds of the subdisks. + * + * Modify the pointer *diskstart to point to the end address. On + * read, return on the first bad subdisk, so that the caller + * (build_read_request) can try alternatives. + * + * On entry to this routine, the rqg structures are not assigned. The + * assignment is performed by expandrq(). Strictly speaking, the + * elements rqe->sdno of all entries should be set to -1, since 0 + * (from bzero) is a valid subdisk number. We avoid this problem by + * initializing the ones we use, and not looking at the others (index + * >= rqg->requests). 
+ */ +enum requeststatus +bre(struct request *rq, + int plexno, + daddr_t * diskaddr, + daddr_t diskend) +{ + int sdno; + struct sd *sd; + struct rqgroup *rqg; + struct buf *bp; /* user's bp */ + struct plex *plex; + enum requeststatus status; /* return value */ + daddr_t plexoffset; /* offset of transfer in plex */ + daddr_t stripebase; /* base address of stripe (1st subdisk) */ + daddr_t stripeoffset; /* offset in stripe */ + daddr_t blockoffset; /* offset in stripe on subdisk */ + struct rqelement *rqe; /* point to this request information */ + daddr_t diskstart = *diskaddr; /* remember where this transfer starts */ + enum requeststatus s; /* temp return value */ + + bp = rq->bp; /* buffer pointer */ + status = REQUEST_OK; /* return value: OK until proven otherwise */ + plex = &PLEX[plexno]; /* point to the plex */ + + switch (plex->organization) { + case plex_concat: + sd = NULL; /* (keep compiler quiet) */ + for (sdno = 0; sdno < plex->subdisks; sdno++) { + sd = &SD[plex->sdnos[sdno]]; + if (*diskaddr < sd->plexoffset) /* we must have a hole, */ + status = REQUEST_DEGRADED; /* note the fact */ + if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */ + rqg = allocrqg(rq, 1); /* space for the request */ + if (rqg == NULL) { /* malloc failed */ + bp->b_error = ENOMEM; + bp->b_io.bio_flags |= BIO_ERROR; + return REQUEST_ENOMEM; + } + rqg->plexno = plexno; + + rqe = &rqg->rqe[0]; /* point to the element */ + rqe->rqg = rqg; /* group */ + rqe->sdno = sd->sdno; /* put in the subdisk number */ + plexoffset = *diskaddr; /* start offset in plex */ + rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */ + rqe->useroffset = plexoffset - diskstart; /* start offset in user buffer */ + rqe->dataoffset = 0; + rqe->datalen = min(diskend - *diskaddr, /* number of sectors to transfer in this sd */ + sd->sectors - rqe->sdoffset); + rqe->groupoffset = 0; /* no groups for concatenated plexes */ + rqe->grouplen = 0; + 
rqe->buflen = rqe->datalen; /* buffer length is data buffer length */ + rqe->flags = 0; + rqe->driveno = sd->driveno; + if (sd->state != sd_up) { /* *now* we find the sd is down */ + s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ + if (s == REQUEST_DOWN) { /* down? */ + rqe->flags = XFR_BAD_SUBDISK; /* yup */ + if (rq->bp->b_iocmd == BIO_READ) /* read request, */ + return REQUEST_DEGRADED; /* give up here */ + /* + * If we're writing, don't give up + * because of a bad subdisk. Go + * through to the bitter end, but note + * which ones we can't access. + */ + status = REQUEST_DEGRADED; /* can't do it all */ + } + } + *diskaddr += rqe->datalen; /* bump the address */ + if (build_rq_buffer(rqe, plex)) { /* build the buffer */ + deallocrqg(rqg); + bp->b_error = ENOMEM; + bp->b_io.bio_flags |= BIO_ERROR; + return REQUEST_ENOMEM; /* can't do it */ + } + } + if (*diskaddr == diskend) /* we're finished, */ + break; /* get out of here */ + } + /* + * We've got to the end of the plex. Have we got to the end of + * the transfer? It would seem that having an offset beyond the + * end of the subdisk is an error, but in fact it can happen if + * the volume has another plex of different size. There's a valid + * question as to why you would want to do this, but currently + * it's allowed. + * + * In a previous version, I returned REQUEST_DOWN here. I think + * REQUEST_EOF is more appropriate now. + */ + if (diskend > sd->sectors + sd->plexoffset) /* pointing beyond EOF? */ + status = REQUEST_EOF; + break; + + case plex_striped: + { + while (*diskaddr < diskend) { /* until we get it all sorted out */ + if (*diskaddr >= plex->length) /* beyond the end of the plex */ + return REQUEST_EOF; /* can't continue */ + + /* The offset of the start address from the start of the stripe. */ + stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks); + + /* The plex-relative address of the start of the stripe. 
*/ + stripebase = *diskaddr - stripeoffset; + + /* The number of the subdisk in which the start is located. */ + sdno = stripeoffset / plex->stripesize; + + /* The offset from the beginning of the stripe on this subdisk. */ + blockoffset = stripeoffset % plex->stripesize; + + sd = &SD[plex->sdnos[sdno]]; /* the subdisk in question */ + rqg = allocrqg(rq, 1); /* space for the request */ + if (rqg == NULL) { /* malloc failed */ + bp->b_error = ENOMEM; + bp->b_io.bio_flags |= BIO_ERROR; + return REQUEST_ENOMEM; + } + rqg->plexno = plexno; + + rqe = &rqg->rqe[0]; /* point to the element */ + rqe->rqg = rqg; + rqe->sdoffset = stripebase / plex->subdisks + blockoffset; /* start offset in this subdisk */ + rqe->useroffset = *diskaddr - diskstart; /* The offset of the start in the user buffer */ + rqe->dataoffset = 0; + rqe->datalen = min(diskend - *diskaddr, /* the amount remaining to transfer */ + plex->stripesize - blockoffset); /* and the amount left in this stripe */ + rqe->groupoffset = 0; /* no groups for striped plexes */ + rqe->grouplen = 0; + rqe->buflen = rqe->datalen; /* buffer length is data buffer length */ + rqe->flags = 0; + rqe->sdno = sd->sdno; /* put in the subdisk number */ + rqe->driveno = sd->driveno; + + if (sd->state != sd_up) { /* *now* we find the sd is down */ + s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */ + if (s == REQUEST_DOWN) { /* down? */ + rqe->flags = XFR_BAD_SUBDISK; /* yup */ + if (rq->bp->b_iocmd == BIO_READ) /* read request, */ + return REQUEST_DEGRADED; /* give up here */ + /* + * If we're writing, don't give up + * because of a bad subdisk. Go through + * to the bitter end, but note which + * ones we can't access. + */ + status = REQUEST_DEGRADED; /* can't do it all */ + } + } + /* + * It would seem that having an offset + * beyond the end of the subdisk is an + * error, but in fact it can happen if the + * volume has another plex of different + * size. 
There's a valid question as to why + * you would want to do this, but currently + * it's allowed. + */ + if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */ + rqe->datalen = sd->sectors - rqe->sdoffset; /* truncate */ +#ifdef VINUMDEBUG + if (debug & DEBUG_EOFINFO) { /* tell on the request */ + log(LOG_DEBUG, + "vinum: EOF on plex %s, sd %s offset %x (user offset 0x%jx)\n", + plex->name, + sd->name, + (u_int) sd->sectors, + (intmax_t) bp->b_blkno); + log(LOG_DEBUG, + "vinum: stripebase %#jx, stripeoffset %#jx, blockoffset %#jx\n", + (intmax_t) stripebase, + (intmax_t) stripeoffset, + (intmax_t) blockoffset); + } +#endif + } + if (build_rq_buffer(rqe, plex)) { /* build the buffer */ + deallocrqg(rqg); + bp->b_error = ENOMEM; + bp->b_io.bio_flags |= BIO_ERROR; + return REQUEST_ENOMEM; /* can't do it */ + } + *diskaddr += rqe->datalen; /* look at the remainder */ + if ((*diskaddr < diskend) /* didn't finish the request on this stripe */ + &&(*diskaddr < plex->length)) { /* and there's more to come */ + plex->multiblock++; /* count another one */ + if (sdno == plex->subdisks - 1) /* last subdisk, */ + plex->multistripe++; /* another stripe as well */ + } + } + } + break; + + /* + * RAID-4 and RAID-5 are complicated enough to have their own + * function. + */ + case plex_raid4: + case plex_raid5: + status = bre5(rq, plexno, diskaddr, diskend); + break; + + default: + log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization); + status = REQUEST_DOWN; /* can't access it */ + } + + return status; +} + +/* + * Build up a request structure for reading volumes. + * This function is not needed for plex reads, since there's + * no recovery if a plex read can't be satisified. 
+ */ +enum requeststatus +build_read_request(struct request *rq, /* request */ + int plexindex) +{ /* index in the volume's plex table */ + struct buf *bp; + daddr_t startaddr; /* offset of previous part of transfer */ + daddr_t diskaddr; /* offset of current part of transfer */ + daddr_t diskend; /* and end offset of transfer */ + int plexno; /* plex index in vinum_conf */ + struct rqgroup *rqg; /* point to the request we're working on */ + struct volume *vol; /* volume in question */ + int recovered = 0; /* set if we recover a read */ + enum requeststatus status = REQUEST_OK; + int plexmask; /* bit mask of plexes, for recovery */ + + bp = rq->bp; /* buffer pointer */ + diskaddr = bp->b_blkno; /* start offset of transfer */ + diskend = diskaddr + (bp->b_bcount / DEV_BSIZE); /* and end offset of transfer */ + rqg = &rq->rqg[plexindex]; /* plex request */ + vol = &VOL[rq->volplex.volno]; /* point to volume */ + + while (diskaddr < diskend) { /* build up request components */ + startaddr = diskaddr; + status = bre(rq, vol->plex[plexindex], &diskaddr, diskend); /* build up a request */ + switch (status) { + case REQUEST_OK: + continue; + + case REQUEST_RECOVERED: + /* + * XXX FIXME if we have more than one plex, and we can + * satisfy the request from another, don't use the + * recovered request, since it's more expensive. + */ + recovered = 1; + break; + + case REQUEST_ENOMEM: + return status; + /* + * If we get here, our request is not complete. Try + * to fill in the missing parts from another plex. + * This can happen multiple times in this function, + * and we reinitialize the plex mask each time, since + * we could have a hole in our plexes. 
+ */ + case REQUEST_EOF: + case REQUEST_DOWN: /* can't access the plex */ + case REQUEST_DEGRADED: /* can't access the plex */ + plexmask = ((1 << vol->plexes) - 1) /* all plexes in the volume */ + &~(1 << plexindex); /* except for the one we were looking at */ + for (plexno = 0; plexno < vol->plexes; plexno++) { + if (plexmask == 0) /* no plexes left to try */ + return REQUEST_DOWN; /* failed */ + diskaddr = startaddr; /* start at the beginning again */ + if (plexmask & (1 << plexno)) { /* we haven't tried this plex yet */ + bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */ + if (diskaddr > startaddr) { /* we satisfied another part */ + recovered = 1; /* we recovered from the problem */ + status = REQUEST_OK; /* don't complain about it */ + break; + } + } + } + if (diskaddr == startaddr) /* didn't get any further, */ + return status; + } + if (recovered) + vol->recovered_reads += recovered; /* adjust our recovery count */ + } + return status; +} + +/* + * Build up a request structure for writes. + * Return 0 if all subdisks involved in the request are up, 1 if some + * subdisks are not up, and -1 if the request is at least partially + * outside the bounds of the subdisks. + */ +enum requeststatus +build_write_request(struct request *rq) +{ /* request */ + struct buf *bp; + daddr_t diskstart; /* offset of current part of transfer */ + daddr_t diskend; /* and end offset of transfer */ + int plexno; /* plex index in vinum_conf */ + struct volume *vol; /* volume in question */ + enum requeststatus status; + + bp = rq->bp; /* buffer pointer */ + vol = &VOL[rq->volplex.volno]; /* point to volume */ + diskend = bp->b_blkno + (bp->b_bcount / DEV_BSIZE); /* end offset of transfer */ + status = REQUEST_DOWN; /* assume the worst */ + for (plexno = 0; plexno < vol->plexes; plexno++) { + diskstart = bp->b_blkno; /* start offset of transfer */ + /* + * Build requests for the plex. 
+ * We take the best possible result here (min, + * not max): we're happy if we can write at all + */ + status = min(status, bre(rq, + vol->plex[plexno], + &diskstart, + diskend)); + } + return status; +} + +/* Fill in the struct buf part of a request element. */ +enum requeststatus +build_rq_buffer(struct rqelement *rqe, struct plex *plex) +{ + struct sd *sd; /* point to subdisk */ + struct volume *vol; + struct buf *bp; + struct buf *ubp; /* user (high level) buffer header */ + + vol = &VOL[rqe->rqg->rq->volplex.volno]; + sd = &SD[rqe->sdno]; /* point to subdisk */ + bp = &rqe->b; + ubp = rqe->rqg->rq->bp; /* pointer to user buffer header */ + + /* Initialize the buf struct */ + /* copy these flags from user bp */ + bp->b_flags = ubp->b_flags & (B_NOCACHE | B_ASYNC); + bp->b_io.bio_flags = 0; + bp->b_iocmd = ubp->b_iocmd; +#ifdef VINUMDEBUG + if (rqe->flags & XFR_BUFLOCKED) /* paranoia */ + panic("build_rq_buffer: rqe already locked"); /* XXX remove this when we're sure */ +#endif + BUF_LOCKINIT(bp); /* get a lock for the buffer */ + BUF_LOCK(bp, LK_EXCLUSIVE, NULL); /* and lock it */ + BUF_KERNPROC(bp); + rqe->flags |= XFR_BUFLOCKED; + bp->b_iodone = complete_rqe; + /* + * You'd think that we wouldn't need to even + * build the request buffer for a dead subdisk, + * but in some cases we need information like + * the user buffer address. Err on the side of + * generosity and supply what we can. That + * obviously doesn't include drive information + * when the drive is dead. 
+ */ + if ((rqe->flags & XFR_BAD_SUBDISK) == 0) /* subdisk is accessible, */ + bp->b_dev = DRIVE[rqe->driveno].dev; /* drive device */ + bp->b_blkno = rqe->sdoffset + sd->driveoffset; /* start address */ + bp->b_bcount = rqe->buflen << DEV_BSHIFT; /* number of bytes to transfer */ + bp->b_resid = bp->b_bcount; /* and it's still all waiting */ + bp->b_bufsize = bp->b_bcount; /* and buffer size */ + bp->b_rcred = FSCRED; /* we have the file system credentials */ + bp->b_wcred = FSCRED; /* we have the file system credentials */ + + if (rqe->flags & XFR_MALLOCED) { /* this operation requires a malloced buffer */ + bp->b_data = Malloc(bp->b_bcount); /* get a buffer to put it in */ + if (bp->b_data == NULL) { /* failed */ + abortrequest(rqe->rqg->rq, ENOMEM); + return REQUEST_ENOMEM; /* no memory */ + } + } else + /* + * Point directly to user buffer data. This means + * that we don't need to do anything when we have + * finished the transfer + */ + bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE; + /* + * On a recovery read, we perform an XOR of + * all blocks to the user buffer. To make + * this work, we first clean out the buffer + */ + if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) + == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) { /* bad subdisk of a recovery read */ + int length = rqe->grouplen << DEV_BSHIFT; /* and count involved */ + char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */ + + bzero(data, length); /* clean it out */ + } + return 0; +} + +/* + * Abort a request: free resources and complete the + * user request with the specified error + */ +int +abortrequest(struct request *rq, int error) +{ + struct buf *bp = rq->bp; /* user buffer */ + + bp->b_error = error; + freerq(rq); /* free everything we're doing */ + bp->b_io.bio_flags |= BIO_ERROR; + return error; /* and give up */ +} + +/* + * Check that our transfer will cover the + * complete address space of the user request. 
+ * + * Return 1 if it can, otherwise 0 + */ +int +check_range_covered(struct request *rq) +{ + return 1; +} + +/* Perform I/O on a subdisk */ +void +sdio(struct buf *bp) +{ + int s; /* spl */ + struct sd *sd; + struct sdbuf *sbp; + daddr_t endoffset; + struct drive *drive; + +#ifdef VINUMDEBUG + if (debug & DEBUG_LASTREQS) + logrq(loginfo_sdio, (union rqinfou) bp, bp); +#endif + sd = &SD[Sdno(bp->b_dev)]; /* point to the subdisk */ + drive = &DRIVE[sd->driveno]; + + if (drive->state != drive_up) { + if (sd->state >= sd_crashed) { + if (bp->b_iocmd == BIO_WRITE) /* writing, */ + set_sd_state(sd->sdno, sd_stale, setstate_force); + else + set_sd_state(sd->sdno, sd_crashed, setstate_force); + } + bp->b_error = EIO; + bp->b_io.bio_flags |= BIO_ERROR; + bufdone(bp); + return; + } + /* + * We allow access to any kind of subdisk as long as we can expect + * to get the I/O performed. + */ + if (sd->state < sd_empty) { /* nothing to talk to, */ + bp->b_error = EIO; + bp->b_io.bio_flags |= BIO_ERROR; + bufdone(bp); + return; + } + /* Get a buffer */ + sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf)); + if (sbp == NULL) { + bp->b_error = ENOMEM; + bp->b_io.bio_flags |= BIO_ERROR; + bufdone(bp); + return; + } + bzero(sbp, sizeof(struct sdbuf)); /* start with nothing */ + sbp->b.b_flags = bp->b_flags; + sbp->b.b_iocmd = bp->b_iocmd; + sbp->b.b_bufsize = bp->b_bcount; /* buffer size */ + sbp->b.b_bcount = bp->b_bcount; /* number of bytes to transfer */ + sbp->b.b_resid = bp->b_resid; /* and amount waiting */ + sbp->b.b_dev = DRIVE[sd->driveno].dev; /* device */ + sbp->b.b_data = bp->b_data; /* data buffer */ + sbp->b.b_blkno = bp->b_blkno + sd->driveoffset; + sbp->b.b_iodone = sdio_done; /* come here on completion */ + BUF_LOCKINIT(&sbp->b); /* get a lock for the buffer */ + BUF_LOCK(&sbp->b, LK_EXCLUSIVE, NULL); /* and lock it */ + BUF_KERNPROC(&sbp->b); + sbp->bp = bp; /* note the address of the original header */ + sbp->sdno = sd->sdno; /* note for statistics */ + 
sbp->driveno = sd->driveno; + endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE; /* final sector offset */ + if (endoffset > sd->sectors) { /* beyond the end */ + sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */ + if (sbp->b.b_bcount <= 0) { /* nothing to transfer */ + bp->b_resid = bp->b_bcount; /* nothing transferred */ + bufdone(bp); + BUF_UNLOCK(&sbp->b); + BUF_LOCKFREE(&sbp->b); + Free(sbp); + return; + } + } +#ifdef VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + " %s dev %d.%d, sd %d, offset 0x%jx, devoffset 0x%jx, length %ld\n", + sbp->b.b_iocmd == BIO_READ ? "Read" : "Write", + major(sbp->b.b_dev), + minor(sbp->b.b_dev), + sbp->sdno, + (intmax_t) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset), + (intmax_t) sbp->b.b_blkno, + sbp->b.b_bcount); +#endif + s = splbio(); +#ifdef VINUMDEBUG + if (debug & DEBUG_LASTREQS) + logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b); +#endif + DEV_STRATEGY(&sbp->b); + splx(s); +} + +/* + * Simplified version of bounds_check_with_label + * Determine the size of the transfer, and make sure it is + * within the boundaries of the partition. Adjust transfer + * if needed, and signal errors or early completion. + * + * Volumes are simpler than disk slices: they only contain + * one component (though we call them a, b and c to make + * system utilities happy), and they always take up the + * complete space of the "partition". + * + * I'm still not happy with this: why should the label be + * protected? If it weren't so damned difficult to write + * one in the first pleace (because it's protected), it wouldn't + * be a problem. + */ +int +vinum_bounds_check(struct buf *bp, struct volume *vol) +{ + int maxsize = vol->size; /* size of the partition (sectors) */ + int size = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* size of this request (sectors) */ + +#ifdef LABELSECTOR + /* Would this transfer overwrite the disk label? 
*/
+    if (bp->b_blkno <= LABELSECTOR /* starts before or at the label */
+#if LABELSECTOR != 0
+        && bp->b_blkno + size > LABELSECTOR /* and finishes after */
+#endif
+        && (bp->b_iocmd == BIO_WRITE) /* and it's a write */
+        && !(vol->flags & (VF_WLABEL | VF_LABELLING))) { /* and we're not allowed to write the label */
+        /*
+         * The flag test must be !(flags & mask).  Unary !
+         * binds tighter than &, so without the parentheses
+         * the test is (!flags) & mask, which lets the write
+         * through whenever any volume flag at all is set.
+         */
+        bp->b_error = EROFS; /* read-only */
+        bp->b_io.bio_flags |= BIO_ERROR;
+        return -1;
+    }
+#endif
+    if (size == 0) /* no transfer specified, */
+        return 0; /* treat as EOF */
+    /* beyond partition? */
+    if (bp->b_blkno < 0 /* negative start */
+        || bp->b_blkno + size > maxsize) { /* or goes beyond the end of the partition */
+        /* if exactly at end of disk, return an EOF */
+        if (bp->b_blkno == maxsize) {
+            bp->b_resid = bp->b_bcount;
+            return 0;
+        }
+        /* or truncate if part of it fits */
+        size = maxsize - bp->b_blkno;
+        if (size <= 0) { /* nothing to transfer */
+            bp->b_error = EINVAL;
+            bp->b_io.bio_flags |= BIO_ERROR;
+            return -1;
+        }
+        bp->b_bcount = size << DEV_BSHIFT;
+    }
+    bp->b_pblkno = bp->b_blkno;
+    return 1;
+}
+
+/*
+ * Allocate a request group and hook
+ * it into the list for rq
+ */
+struct rqgroup *
+allocrqg(struct request *rq, int elements)
+{
+    struct rqgroup *rqg; /* the one we're going to allocate */
+    int size = sizeof(struct rqgroup) + elements * sizeof(struct rqelement);
+
+    rqg = (struct rqgroup *) Malloc(size);
+    if (rqg != NULL) { /* malloc OK, */
+        if (rq->rqg) /* we already have requests */
+            rq->lrqg->next = rqg; /* hang it off the end */
+        else /* first request */
+            rq->rqg = rqg; /* at the start */
+        rq->lrqg = rqg; /* this one is the last in the list */
+
+        bzero(rqg, size); /* no old junk */
+        rqg->rq = rq; /* point back to the parent request */
+        rqg->count = elements; /* number of requests in the group */
+        rqg->lockbase = -1; /* no lock required yet */
+    }
+    return rqg;
+}
+
+/*
+ * Deallocate a request group out of a chain. 
We do
+ * this by linear search: the chain is short, this
+ * almost never happens, and currently it can only
+ * happen to the first member of the chain.
+ *
+ * The request's tail pointer (rq->lrqg) is kept
+ * consistent: allocrqg appends through it, so it
+ * must never be left pointing at the group we free.
+ */
+void
+deallocrqg(struct rqgroup *rqg)
+{
+    struct request *rq = rqg->rq; /* the request we belong to */
+    struct rqgroup *rqgc = rq->rqg; /* point to the request chain */
+
+    if (rqg->lock) /* got a lock? */
+        unlockrange(rqg->plexno, rqg->lock); /* yes, free it */
+    if (rqgc == rqg) { /* we're first in line */
+        rq->rqg = rqg->next; /* unhook ourselves */
+        if (rq->lrqg == rqg) /* we were the last one as well, */
+            rq->lrqg = NULL; /* so the chain is now empty */
+    } else {
+        while ((rqgc->next != NULL) /* find the group */
+            &&(rqgc->next != rqg))
+            rqgc = rqgc->next;
+        if (rqgc->next == NULL)
+            log(LOG_ERR,
+                "vinum deallocrqg: rqg %p not found in request %p\n",
+                rqg, /* arguments in the order the format names them */
+                rq);
+        else {
+            rqgc->next = rqg->next; /* make the chain jump over us */
+            if (rq->lrqg == rqg) /* we were the tail, */
+                rq->lrqg = rqgc; /* our predecessor takes over */
+        }
+    }
+    Free(rqg);
+}
+
+/* Local Variables: */
+/* fill-column: 50 */
+/* End: */
diff --git a/sys/dev/vinum/vinumrevive.c b/sys/dev/vinum/vinumrevive.c
new file mode 100644
index 0000000..03e16f9
--- /dev/null
+++ b/sys/dev/vinum/vinumrevive.c
@@ -0,0 +1,622 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999
+ * Nan Yang Computer Services Limited. All rights reserved.
+ *
+ * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumrevive.c,v 1.18 2003/04/28 02:54:43 grog Exp $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> + +/* + * Revive a block of a subdisk. Return an error + * indication. EAGAIN means successful copy, but + * that more blocks remain to be copied. EINVAL + * means that the subdisk isn't associated with a + * plex (which means a programming error if we get + * here at all; FIXME). 
+ */ + +int +revive_block(int sdno) +{ + int s; /* priority level */ + struct sd *sd; + struct plex *plex; + struct volume *vol; + struct buf *bp; + int error = EAGAIN; + int size; /* size of revive block, bytes */ + daddr_t plexblkno; /* lblkno in plex */ + int psd; /* parity subdisk number */ + u_int64_t stripe; /* stripe number */ + int paritysd = 0; /* set if this is the parity stripe */ + struct rangelock *lock; /* for locking */ + daddr_t stripeoffset; /* offset in stripe */ + + plexblkno = 0; /* to keep the compiler happy */ + sd = &SD[sdno]; + lock = NULL; + if (sd->plexno < 0) /* no plex? */ + return EINVAL; + plex = &PLEX[sd->plexno]; /* point to plex */ + if (plex->volno >= 0) + vol = &VOL[plex->volno]; + else + vol = NULL; + + if ((sd->revive_blocksize == 0) /* no block size */ + ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1))) /* or invalid block size */ + sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE; + else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE) + sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE; + size = min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT; + sd->reviver = curproc->p_pid; /* note who last had a bash at it */ + + /* Now decide where to read from */ + switch (plex->organization) { + case plex_concat: + plexblkno = sd->revived + sd->plexoffset; /* corresponding address in plex */ + break; + + case plex_striped: + stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */ + if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize) + size = (plex->stripesize - stripeoffset) << DEV_BSHIFT; + plexblkno = sd->plexoffset /* base */ + + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */ + + stripeoffset; /* offset from beginning of stripe */ + break; + + case plex_raid4: + case plex_raid5: + stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */ + plexblkno = sd->plexoffset /* base */ + + (sd->revived - stripeoffset) 
* (plex->subdisks - 1) /* offset to beginning of stripe */ + +stripeoffset; /* offset from beginning of stripe */ + stripe = (sd->revived / plex->stripesize); /* stripe number */ + + /* Make sure we don't go beyond the end of the band. */ + size = min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT); + if (plex->organization == plex_raid4) + psd = plex->subdisks - 1; /* parity subdisk for this stripe */ + else + psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ + paritysd = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */ + + /* + * Now adjust for the strangenesses + * in RAID-4 and RAID-5 striping. + */ + if (sd->plexsdno > psd) /* beyond the parity stripe, */ + plexblkno -= plex->stripesize; /* one stripe less */ + else if (paritysd) + plexblkno -= plex->stripesize * sd->plexsdno; /* go back to the beginning of the band */ + break; + + case plex_disorg: /* to keep the compiler happy */ + break; /* to keep the pedants happy */ + } + + if (paritysd) { /* we're reviving a parity block, */ + bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */ + if (bp == NULL) /* no buffer space */ + return ENOMEM; /* chicken out */ + } else { /* data block */ + s = splbio(); + bp = geteblk(size); /* Get a buffer */ + splx(s); + if (bp == NULL) + return ENOMEM; + + /* + * Amount to transfer: block size, unless it + * would overlap the end. + */ + bp->b_bcount = size; + bp->b_resid = bp->b_bcount; + bp->b_blkno = plexblkno; /* start here */ + if (isstriped(plex)) /* we need to lock striped plexes */ + lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */ + if (vol != NULL) /* it's part of a volume, */ + /* + * First, read the data from the volume. We + * don't care which plex, that's bre's job. 
+ */ + bp->b_dev = VINUM_VOL(plex->volno); /* create the device number */ + else /* it's an unattached plex */ + bp->b_dev = VINUM_PLEX(sd->plexno); /* create the device number */ + + bp->b_iocmd = BIO_READ; /* either way, read it */ + bp->b_flags = 0; + vinumstart(bp, 1); + bufwait(bp); + } + + if (bp->b_ioflags & BIO_ERROR) { + error = bp->b_error; + if (lock) /* we took a lock, */ + unlockrange(sd->plexno, lock); /* give it back */ + } else + /* Now write to the subdisk */ + { + bp->b_dev = VINUM_SD(sdno); /* create the device number */ + bp->b_flags &= ~B_DONE; /* no longer done */ + bp->b_ioflags = 0; + bp->b_iocmd = BIO_WRITE; + bp->b_resid = bp->b_bcount; + bp->b_blkno = sd->revived; /* write it to here */ + sdio(bp); /* perform the I/O */ + bufwait(bp); + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + else { + sd->revived += bp->b_bcount >> DEV_BSHIFT; /* moved this much further down */ + if (sd->revived >= sd->sectors) { /* finished */ + sd->revived = 0; + set_sd_state(sdno, sd_up, setstate_force); /* bring the sd up */ + log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state)); + save_config(); /* and save the updated configuration */ + error = 0; /* we're done */ + } + } + if (lock) /* we took a lock, */ + unlockrange(sd->plexno, lock); /* give it back */ + while (sd->waitlist) { /* we have waiting requests */ +#ifdef VINUMDEBUG + struct request *rq = sd->waitlist; + + if (debug & DEBUG_REVIVECONFLICT) + log(LOG_DEBUG, + "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%jx, length %ld\n", + rq->sdno, + rq, + rq->bp->b_iocmd == BIO_READ ? "Read" : "Write", + major(rq->bp->b_dev), + minor(rq->bp->b_dev), + (intmax_t)rq->bp->b_blkno, + rq->bp->b_bcount); +#endif + launch_requests(sd->waitlist, 1); /* do them now */ + sd->waitlist = sd->waitlist->next; /* and move on to the next */ + } + } + if (bp->b_qindex == 0) { /* not on a queue, */ + bp->b_flags |= B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + brelse(bp); /* is this kosher? 
*/ + } + return error; +} + +/* + * Check or rebuild the parity blocks of a RAID-4 + * or RAID-5 plex. + * + * The variables plex->checkblock and + * plex->rebuildblock represent the + * subdisk-relative address of the stripe we're + * looking at, not the plex-relative address. We + * store it in the plex and not as a local + * variable because this function could be + * stopped, and we don't want to repeat the part + * we've already done. This is also the reason + * why we don't initialize it here except at the + * end. It gets initialized with the plex on + * creation. + * + * Each call to this function processes at most + * one stripe. We can't loop in this function, + * because we're unstoppable, so we have to be + * called repeatedly from userland. + */ +void +parityops(struct vinum_ioctl_msg *data) +{ + int plexno; + struct plex *plex; + int size; /* I/O transfer size, bytes */ + int stripe; /* stripe number in plex */ + int psd; /* parity subdisk number */ + struct rangelock *lock; /* lock on stripe */ + struct _ioctl_reply *reply; + off_t pstripe; /* pointer to our stripe counter */ + struct buf *pbp; + off_t errorloc; /* offset of parity error */ + enum parityop op; /* operation to perform */ + + plexno = data->index; + op = data->op; + pbp = NULL; + reply = (struct _ioctl_reply *) data; + reply->error = EAGAIN; /* expect to repeat this call */ + plex = &PLEX[plexno]; + if (!isparity(plex)) { /* not RAID-4 or RAID-5 */ + reply->error = EINVAL; + return; + } else if (plex->state < plex_flaky) { + reply->error = EIO; + strcpy(reply->msg, "Plex is not completely accessible\n"); + return; + } + pstripe = data->offset; + stripe = pstripe / plex->stripesize; /* stripe number */ + psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ + size = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */ + plex->stripesize << DEV_BSHIFT); + + pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */ + if 
(pbp == NULL) { /* no buffer space */ + reply->error = ENOMEM; + return; /* chicken out */ + } + /* + * Now we have a result in the data buffer of + * the parity buffer header, which we have kept. + * Decide what to do with it. + */ + reply->msg[0] = '\0'; /* until shown otherwise */ + if ((pbp->b_ioflags & BIO_ERROR) == 0) { /* no error */ + if ((op == rebuildparity) + || (op == rebuildandcheckparity)) { + pbp->b_iocmd = BIO_WRITE; + pbp->b_resid = pbp->b_bcount; + sdio(pbp); /* write the parity block */ + bufwait(pbp); + } + if (((op == checkparity) + || (op == rebuildandcheckparity)) + && (errorloc != -1)) { + if (op == checkparity) + reply->error = EIO; + sprintf(reply->msg, + "Parity incorrect at offset 0x%jx\n", + (intmax_t)errorloc); + } + if (reply->error == EAGAIN) { /* still OK, */ + plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT); /* moved this much further down */ + if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */ + plex->checkblock = 0; + reply->error = 0; + } + } + } + if (pbp->b_ioflags & BIO_ERROR) + reply->error = pbp->b_error; + pbp->b_flags |= B_INVAL; + pbp->b_ioflags &= ~BIO_ERROR; + brelse(pbp); + unlockrange(plexno, lock); +} + +/* + * Rebuild a parity stripe. Return pointer to + * parity bp. On return, + * + * 1. The band is locked. The caller must unlock + * the band and release the buffer header. + * + * 2. All buffer headers except php have been + * released. The caller must release pbp. + * + * 3. For checkparity and rebuildandcheckparity, + * the parity is compared with the current + * parity block. If it's different, the + * offset of the error is returned to + * errorloc. The caller can set the value of + * the pointer to NULL if this is called for + * rebuilding parity. + * + * pstripe is the subdisk-relative base address of + * the data to be reconstructed, size is the size + * of the transfer in bytes. 
+ */ +struct buf * +parityrebuild(struct plex *plex, + u_int64_t pstripe, + int size, + enum parityop op, + struct rangelock **lockp, + off_t * errorloc) +{ + int error; + int s; + int sdno; + u_int64_t stripe; /* stripe number */ + int *parity_buf; /* buffer address for current parity block */ + int *newparity_buf; /* and for new parity block */ + int mysize; /* I/O transfer size for this transfer */ + int isize; /* mysize in ints */ + int i; + int psd; /* parity subdisk number */ + int newpsd; /* and "subdisk number" of new parity */ + struct buf **bpp; /* pointers to our bps */ + struct buf *pbp; /* buffer header for parity stripe */ + int *sbuf; + int bufcount; /* number of buffers we need */ + + stripe = pstripe / plex->stripesize; /* stripe number */ + psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ + parity_buf = NULL; /* to keep the compiler happy */ + error = 0; + + /* + * It's possible that the default transfer size + * we chose is not a factor of the stripe size. + * We *must* limit this operation to a single + * stripe, at least for RAID-5 rebuild, since + * the parity subdisk changes between stripes, + * so in this case we need to perform a short + * transfer. Set variable mysize to reflect + * this. + */ + mysize = min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT); + isize = mysize / (sizeof(int)); /* number of ints in the buffer */ + bufcount = plex->subdisks + 1; /* sd buffers plus result buffer */ + newpsd = plex->subdisks; + bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */ + + /* First, build requests for all subdisks */ + for (sdno = 0; sdno < bufcount; sdno++) { /* for each subdisk */ + if ((sdno != psd) || (op != rebuildparity)) { + /* Get a buffer header and initialize it. 
*/ + s = splbio(); + bpp[sdno] = geteblk(mysize); /* Get a buffer */ + if (bpp[sdno] == NULL) { + while (sdno-- > 0) { /* release the ones we got */ + bpp[sdno]->b_flags |= B_INVAL; + brelse(bpp[sdno]); /* give back our resources */ + } + splx(s); + printf("vinum: can't allocate buffer space for parity op.\n"); + return NULL; /* no bpps */ + } + splx(s); + if (sdno == psd) + parity_buf = (int *) bpp[sdno]->b_data; + if (sdno == newpsd) /* the new one? */ + bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[psd]); /* write back to the parity SD */ + else + bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[sdno]); /* device number */ + bpp[sdno]->b_iocmd = BIO_READ; /* either way, read it */ + bpp[sdno]->b_flags = 0; + bpp[sdno]->b_bcount = mysize; + bpp[sdno]->b_resid = bpp[sdno]->b_bcount; + bpp[sdno]->b_blkno = pstripe; /* transfer from here */ + } + } + + /* Initialize result buffer */ + pbp = bpp[newpsd]; + newparity_buf = (int *) bpp[newpsd]->b_data; + bzero(newparity_buf, mysize); + + /* + * Now lock the stripe with the first non-parity + * bp as locking bp. + */ + *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1), + bpp[psd ? 0 : 1], + plex); + + /* + * Then issue requests for all subdisks in + * parallel. Don't transfer the parity stripe + * if we're rebuilding parity, unless we also + * want to check it. + */ + for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each real subdisk */ + if ((sdno != psd) || (op != rebuildparity)) { + sdio(bpp[sdno]); + } + } + + /* + * Next, wait for the requests to complete. + * We wait in the order in which they were + * issued, which isn't necessarily the order in + * which they complete, but we don't have a + * convenient way of doing the latter, and the + * delay is minimal. 
+ */ + for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */ + if ((sdno != psd) || (op != rebuildparity)) { + bufwait(bpp[sdno]); + if (bpp[sdno]->b_ioflags & BIO_ERROR) /* can't read, */ + error = bpp[sdno]->b_error; + else if (sdno != psd) { /* update parity */ + sbuf = (int *) bpp[sdno]->b_data; + for (i = 0; i < isize; i++) + ((int *) newparity_buf)[i] ^= sbuf[i]; /* xor in the buffer */ + } + } + if (sdno != psd) { /* release all bps except parity */ + bpp[sdno]->b_flags |= B_INVAL; + brelse(bpp[sdno]); /* give back our resources */ + } + } + + /* + * If we're checking, compare the calculated + * and the read parity block. If they're + * different, return the plex-relative offset; + * otherwise return -1. + */ + if ((op == checkparity) + || (op == rebuildandcheckparity)) { + *errorloc = -1; /* no error yet */ + for (i = 0; i < isize; i++) { + if (parity_buf[i] != newparity_buf[i]) { + *errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1) + + i * sizeof(int); + break; + } + } + bpp[psd]->b_flags |= B_INVAL; + brelse(bpp[psd]); /* give back our resources */ + } + /* release our resources */ + Free(bpp); + if (error) { + pbp->b_ioflags |= BIO_ERROR; + pbp->b_error = error; + } + return pbp; +} + +/* + * Initialize a subdisk by writing zeroes to the + * complete address space. If verify is set, + * check each transfer for correctness. + * + * Each call to this function writes (and maybe + * checks) a single block. + */ +int +initsd(int sdno, int verify) +{ + int s; /* priority level */ + struct sd *sd; + struct plex *plex; + struct volume *vol; + struct buf *bp; + int error; + int size; /* size of init block, bytes */ + daddr_t plexblkno; /* lblkno in plex */ + int verified; /* set when we're happy with what we wrote */ + + error = 0; + plexblkno = 0; /* to keep the compiler happy */ + sd = &SD[sdno]; + if (sd->plexno < 0) /* no plex? 
*/ + return EINVAL; + plex = &PLEX[sd->plexno]; /* point to plex */ + if (plex->volno >= 0) + vol = &VOL[plex->volno]; + else + vol = NULL; + + if (sd->init_blocksize == 0) { + if (plex->stripesize != 0) /* we're striped, don't init more than */ + sd->init_blocksize = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */ + plex->stripesize << DEV_BSHIFT); + else + sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE; + } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE) + sd->init_blocksize = MAX_REVIVE_BLOCKSIZE; + + size = min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT; + + verified = 0; + while (!verified) { /* until we're happy with it, */ + s = splbio(); + bp = geteblk(size); /* Get a buffer */ + splx(s); + if (bp == NULL) + return ENOMEM; + + bp->b_bcount = size; + bp->b_resid = bp->b_bcount; + bp->b_blkno = sd->initialized; /* write it to here */ + bzero(bp->b_data, bp->b_bcount); + bp->b_dev = VINUM_SD(sdno); /* create the device number */ + bp->b_iocmd = BIO_WRITE; + sdio(bp); /* perform the I/O */ + bufwait(bp); + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + if (bp->b_qindex == 0) { /* not on a queue, */ + bp->b_flags |= B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + brelse(bp); /* is this kosher? */ + } + if ((error == 0) && verify) { /* check that it got there */ + s = splbio(); + bp = geteblk(size); /* get a buffer */ + if (bp == NULL) { + splx(s); + error = ENOMEM; + } else { + bp->b_bcount = size; + bp->b_resid = bp->b_bcount; + bp->b_blkno = sd->initialized; /* read from here */ + bp->b_dev = VINUM_SD(sdno); /* create the device number */ + bp->b_iocmd = BIO_READ; /* read it back */ + splx(s); + sdio(bp); + bufwait(bp); + /* + * XXX Bug fix code. This is hopefully no + * longer needed (21 February 2000). 
+ */ + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + else if ((*bp->b_data != 0) /* first word spammed */ + ||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */ + printf("vinum: init error on %s, offset 0x%llx sectors\n", + sd->name, + (long long) sd->initialized); + verified = 0; + } else + verified = 1; + if (bp->b_qindex == 0) { /* not on a queue, */ + bp->b_flags |= B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + brelse(bp); /* is this kosher? */ + } + } + } else + verified = 1; + } + if (error == 0) { /* did it, */ + sd->initialized += size >> DEV_BSHIFT; /* moved this much further down */ + if (sd->initialized >= sd->sectors) { /* finished */ + sd->initialized = 0; + set_sd_state(sdno, sd_initialized, setstate_force); /* bring the sd up */ + log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state)); + save_config(); /* and save the updated configuration */ + } else /* more to go, */ + error = EAGAIN; /* ya'll come back, see? */ + } + return error; +} + +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumstate.c b/sys/dev/vinum/vinumstate.c new file mode 100644 index 0000000..59c9860 --- /dev/null +++ b/sys/dev/vinum/vinumstate.c @@ -0,0 +1,1093 @@ +/*- + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. 
+ * + * $Id: vinumstate.c,v 2.21 2003/04/28 02:54:43 grog Exp $ + * $FreeBSD$ + */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/request.h> + +/* Update drive state */ +/* Return 1 if the state changes, otherwise 0 */ +int +set_drive_state(int driveno, enum drivestate newstate, enum setstateflags flags) +{ + struct drive *drive = &DRIVE[driveno]; + int oldstate = drive->state; + int sdno; + + if (drive->state == drive_unallocated) /* no drive to do anything with, */ + return 0; + + if (newstate == oldstate) /* don't change it if it's not different */ + return 1; /* all OK */ + if ((newstate == drive_down) /* the drive's going down */ + &&(!(flags & setstate_force)) + && (drive->opencount != 0)) /* we can't do it */ + return 0; /* don't do it */ + drive->state = newstate; /* set the state */ + if (drive->label.name[0] != '\0') /* we have a name, */ + log(LOG_INFO, + "vinum: drive %s is %s\n", + drive->label.name, + drive_state(drive->state)); + if (drive->state != oldstate) { /* state has changed */ + for (sdno = 0; sdno < vinum_conf.subdisks_allocated; sdno++) { /* find this drive's subdisks */ + if ((SD[sdno].state >= sd_referenced) + && (SD[sdno].driveno == driveno)) /* belongs to this drive */ + update_sd_state(sdno); /* update the state */ + } + } + if (newstate == drive_up) { /* want to bring it up */ + if ((drive->flags & VF_OPEN) == 0) /* should be open, but we're not */ + init_drive(drive, 1); /* which changes the state again */ + } else /* taking it down or worse */ + queue_daemon_request(daemonrq_closedrive, /* get the daemon to close it */ + (union daemoninfo) drive); + if ((flags & setstate_configuring) == 0) /* configuring? */ + save_config(); /* no: save the updated configuration now */ + return 1; +} + +/* + * Try to set the subdisk state. Return 1 if + * state changed to what we wanted, -1 if it + * changed to something else, and 0 if no change. + * + * This routine is called both from the user (up, + * down states only) and internally. 
+ * + * The setstate_force bit in the flags enables the + * state change even if it could be dangerous to + * data consistency. It shouldn't allow nonsense. + */ +int +set_sd_state(int sdno, enum sdstate newstate, enum setstateflags flags) +{ + struct sd *sd = &SD[sdno]; + struct plex *plex; + struct volume *vol; + int oldstate = sd->state; + int status = 1; /* status to return */ + + if (newstate == oldstate) /* already there, */ + return 1; + else if (sd->state == sd_unallocated) /* no subdisk to do anything with, */ + return 0; /* can't do it */ + + if (sd->driveoffset < 0) { /* not allocated space */ + sd->state = sd_down; + if (newstate != sd_down) { + if (sd->plexno >= 0) + sdstatemap(&PLEX[sd->plexno]); /* count up subdisks */ + return -1; + } + } else { /* space allocated */ + switch (newstate) { + case sd_down: /* take it down? */ + /* + * If we're attached to a plex, and we're + * not reborn, we won't go down without + * use of force. Test the force bit + * itself: "!flags & setstate_force" would + * only be true with no flags set at all, + * so any other flag bit would count as + * force. + */ + if ((!(flags & setstate_force)) + && (sd->plexno >= 0) + && (sd->state != sd_reborn)) + return 0; /* don't do it */ + break; + + case sd_initialized: + if ((sd->state == sd_initializing) /* we were initializing */ + ||(flags & setstate_force)) /* or we forced it */ + break; + return 0; /* can't do it otherwise */ + + case sd_up: + if (DRIVE[sd->driveno].state != drive_up) /* can't bring the sd up if the drive isn't, */ + return 0; /* not even by force */ + if (flags & setstate_force) /* forcing it, */ + break; /* just do it, and damn the consequences */ + switch (sd->state) { + /* + * Perform the necessary tests. To allow + * the state transition, just break out of + * the switch. + */ + case sd_crashed: + case sd_reborn: + case sd_down: /* been down, no data lost */ + /* + * If we're associated with a plex, and + * the plex isn't up, or we're the only + * subdisk in the plex, we can do it. 
+ */ + if ((sd->plexno >= 0) + && (((PLEX[sd->plexno].state < plex_firstup) + || (PLEX[sd->plexno].subdisks > 1)))) + break; /* do it */ + if (oldstate != sd_reborn) { + sd->state = sd_reborn; /* here it is again */ + log(LOG_INFO, + "vinum: %s is %s, not %s\n", + sd->name, + sd_state(sd->state), + sd_state(newstate)); + } + status = -1; + break; + + case sd_init: /* brand new */ + if (flags & setstate_configuring) /* we're doing this while configuring */ + break; + /* otherwise it's like being empty */ + /* FALLTHROUGH */ + + case sd_empty: + case sd_initialized: + /* + * If we're not part of a plex, or the + * plex is not part of a volume with other + * plexes which are up, we can come up + * without being inconsistent. + * + * If we're part of a parity plex, we'll + * come up if the caller uses force. This + * is the way we bring them up after + * initialization. + */ + if ((sd->plexno < 0) + || ((vpstate(&PLEX[sd->plexno]) & volplex_otherup) == 0) + || (isparity((&PLEX[sd->plexno])) + && (flags & setstate_force))) + break; + + /* Otherwise it's just out of date */ + /* FALLTHROUGH */ + + case sd_stale: /* out of date info, need reviving */ + case sd_obsolete: + /* + + * 1. If the subdisk is not part of a + * plex, bring it up, don't revive. + * + * 2. If the subdisk is part of a + * one-plex volume or an unattached + * plex, and it's not RAID-4 or + * RAID-5, we *can't revive*. The + * subdisk doesn't change its state. + * + * 3. If the subdisk is part of a + * one-plex volume or an unattached + * plex, and it's RAID-4 or RAID-5, + * but more than one subdisk is down, + * we *still can't revive*. The + * subdisk doesn't change its state. + * + * 4. If the subdisk is part of a + * multi-plex volume, we'll change to + * reviving and let the revive + * routines find out whether it will + * work or not. If they don't, the + * revive stops with an error message, + * but the state doesn't change + * (FWIW). 
+ */ + if (sd->plexno < 0) /* no plex associated, */ + break; /* bring it up */ + plex = &PLEX[sd->plexno]; + if (plex->volno >= 0) /* have a volume */ + vol = &VOL[plex->volno]; + else + vol = NULL; + /* + * We can't do it if: + * + * 1: we don't have a volume + * 2: we're the only plex in the volume + * 3: we're a RAID-4 or RAID-5 plex, and + * more than one subdisk is down. + */ + if (((vol == NULL) + || (vol->plexes == 1)) + && ((!isparity(plex)) + || (plex->sddowncount > 1))) { + if (sd->state == sd_initializing) /* it's finished initializing */ + sd->state = sd_initialized; + else + return 0; /* can't do it */ + } else { + sd->state = sd_reviving; /* put in reviving state */ + sd->revived = 0; /* nothing done yet */ + status = EAGAIN; /* need to repeat */ + } + break; + + case sd_reviving: + if (flags & setstate_force) /* insist, */ + break; + return EAGAIN; /* no, try again */ + + default: /* can't do it */ + /* + * There's no way to bring subdisks up directly from + * other states. First they need to be initialized + * or revived. + */ + return 0; + } + break; + + default: /* other ones, only internal with force */ + if ((flags & setstate_force) == 0) /* no force? What's this? */ + return 0; /* don't do it */ + } + } + if (status == 1) { /* we can do it, */ + sd->state = newstate; + if (flags & setstate_force) + log(LOG_INFO, "vinum: %s is %s by force\n", sd->name, sd_state(sd->state)); + else + log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state)); + } else /* we don't get here with status 0 */ + log(LOG_INFO, + "vinum: %s is %s, not %s\n", + sd->name, + sd_state(sd->state), + sd_state(newstate)); + if (sd->plexno >= 0) /* we belong to a plex */ + update_plex_state(sd->plexno); /* update plex state */ + if ((flags & setstate_configuring) == 0) /* save config now */ + save_config(); + return status; +} + +/* + * Set the state of a plex dependent on its subdisks. 
+ * This time round, we'll let plex state just reflect + * aggregate subdisk state, so this becomes an order of + * magnitude less complicated. In particular, a request + * for plex_up is only a hint: update_plex_state decides + * from the subdisk states whether the plex comes up. + * + * Return 0 if the plex is unallocated, if the request is + * refused for lack of force, or if the requested state is + * not one handled here; otherwise return 1. Note that a + * plex_up request returns 1 whether or not the plex + * actually came up. + */ +int +set_plex_state(int plexno, enum plexstate state, enum setstateflags flags) +{ + struct plex *plex; /* point to our plex */ + enum plexstate oldstate; + enum volplexstate vps; /* how do we compare with the other plexes? */ + + plex = &PLEX[plexno]; /* point to our plex */ + oldstate = plex->state; + + /* If the plex isn't allocated, we can't do it. */ + if (plex->state == plex_unallocated) + return 0; + + /* + * If it's already in the state we want, + * and it's not up, just return. If it's up, + * we still need to do some housekeeping. + */ + if ((state == oldstate) + && (state != plex_up)) + return 1; + vps = vpstate(plex); /* how do we compare with the other plexes? */ + switch (state) { + /* + * We can't bring the plex up, even by force, + * unless it's ready. update_plex_state + * checks that. + */ + case plex_up: /* bring the plex up */ + update_plex_state(plex->plexno); /* it'll come up if it can */ + break; + + case plex_down: /* want to take it down */ + /* + * If we're the only one, or the only one + * which is up, we need force to do it. + */ + if (((vps == volplex_onlyus) + || (vps == volplex_onlyusup)) + && (!(flags & setstate_force))) + return 0; /* can't do it */ + plex->state = state; /* do it */ + invalidate_subdisks(plex, sd_down); /* and down all up subdisks */ + break; + + /* + * This is only requested internally. + * Trust ourselves + */ + case plex_faulty: + plex->state = state; /* do it */ + invalidate_subdisks(plex, sd_crashed); /* and crash all up subdisks */ + break; + + case plex_initializing: + /* XXX consider what safeguards we need here */ + if ((flags & setstate_force) == 0) + return 0; + plex->state = state; /* do it */ + break; + + /* What's this? 
*/ + default: + return 0; + } + if (plex->state != oldstate) /* we've changed, */ + log(LOG_INFO, /* tell them about it */ + "vinum: %s is %s\n", + plex->name, + plex_state(plex->state)); + /* + * Now see what we have left, and whether + * we're taking the volume down + */ + if (plex->volno >= 0) /* we have a volume */ + update_volume_state(plex->volno); /* update its state */ + if ((flags & setstate_configuring) == 0) /* save config now */ + save_config(); /* yes: save the updated configuration */ + return 1; +} + +/* + * Set the state of a volume. A request for volume_up + * just calls update_volume_state, which derives the + * state from the plexes; a request for volume_down is + * honoured if the volume is not open or force is set. + * + * Return 1 if the volume is already in the requested + * state or was taken down here, otherwise 0. + * NOTE(review): a volume_up request returns 0 even when + * update_volume_state brings the volume up, so callers + * need to look at the volume state itself. + */ +int +set_volume_state(int volno, enum volumestate state, enum setstateflags flags) +{ + struct volume *vol = &VOL[volno]; /* point to our volume */ + + if (vol->state == volume_unallocated) /* no volume to do anything with, */ + return 0; + if (vol->state == state) /* we're there already */ + return 1; + + if (state == volume_up) /* want to come up */ + update_volume_state(volno); + else if (state == volume_down) { /* want to go down */ + if (((vol->flags & VF_OPEN) == 0) /* not open */ + ||((flags & setstate_force) != 0)) { /* or we're forcing */ + vol->state = volume_down; + log(LOG_INFO, + "vinum: volume %s is %s\n", + vol->name, + volume_state(vol->state)); + if ((flags & setstate_configuring) == 0) /* save config now */ + save_config(); /* yes: save the updated configuration */ + return 1; + } + } + return 0; /* no change */ +} + +/* Set the state of a subdisk based on its environment */ +void +update_sd_state(int sdno) +{ + struct sd *sd; + struct drive *drive; + enum sdstate oldstate; + + sd = &SD[sdno]; + oldstate = sd->state; + drive = &DRIVE[sd->driveno]; + + if (drive->state == drive_up) { + switch (sd->state) { + case sd_down: + case sd_crashed: + sd->state = sd_reborn; /* back up again with no loss */ + break; + + default: + break; + } + } else { /* down or worse */ + switch (sd->state) { + case sd_up: + case sd_reborn: + case sd_reviving: + case sd_empty: + sd->state = sd_crashed; 
/* lost our drive */ + break; + + default: + break; + } + } + if (sd->state != oldstate) /* state has changed, */ + log(LOG_INFO, /* say so */ + "vinum: %s is %s\n", + sd->name, + sd_state(sd->state)); + if (sd->plexno >= 0) /* we're part of a plex, */ + update_plex_state(sd->plexno); /* update its state */ +} + +/* + * Force a plex and all its subdisks + * into an 'up' state. This is a helper + * for update_plex_state. + */ +void +forceup(int plexno) +{ + struct plex *plex; + int sdno; + + plex = &PLEX[plexno]; /* point to the plex */ + plex->state = plex_up; /* and bring it up */ + + /* change the subdisks to up state */ + for (sdno = 0; sdno < plex->subdisks; sdno++) { + SD[plex->sdnos[sdno]].state = sd_up; + log(LOG_INFO, /* tell them about it */ + "vinum: %s is up\n", + SD[plex->sdnos[sdno]].name); + } +} + +/* Set the state of a plex based on its environment */ +void +update_plex_state(int plexno) +{ + struct plex *plex; /* point to our plex */ + enum plexstate oldstate; + enum sdstates statemap; /* get a map of the subdisk states */ + enum volplexstate vps; /* how do we compare with the other plexes? */ + + plex = &PLEX[plexno]; /* point to our plex */ + oldstate = plex->state; + statemap = sdstatemap(plex); /* get a map of the subdisk states */ + vps = vpstate(plex); /* how do we compare with the other plexes? */ + + if (statemap & sd_initstate) /* something initializing? */ + plex->state = plex_initializing; /* yup, that makes the plex the same */ + else if (statemap == sd_upstate) + /* + * All the subdisks are up. 
This also means that + * they are consistent, so we can just bring + * the plex up + */ + plex->state = plex_up; + else if (isparity(plex) /* RAID-4 or RAID-5 plex */ + &&(plex->sddowncount == 1)) /* and exactly one subdisk down */ + plex->state = plex_degraded; /* limping a bit */ + else if (((statemap & ~sd_downstate) == sd_emptystate) /* all subdisks empty */ + ||((statemap & ~sd_downstate) + == (statemap & ~sd_downstate & (sd_initializedstate | sd_upstate)))) { + if ((vps & volplex_otherup) == 0) { /* no other plex is up */ + struct volume *vol = &VOL[plex->volno]; /* possible volume to which it points */ + + /* + * If we're a striped or concat plex + * associated with a volume, none of whose + * plexes are up, and we're new and untested, + * and the volume has the setupstate bit set, + * we can pretend to be in a consistent state. + * + * We need to do this in one swell foop: on + * the next call we will no longer be just + * empty. + * + * This code assumes that all the other plexes + * are also capable of coming up (i.e. all the + * sds are up), but that's OK: we'll come back + * to this function for the remaining plexes + * in the volume. + */ + if ((plex->state == plex_init) + && (plex->volno >= 0) + && (vol->flags & VF_CONFIG_SETUPSTATE)) { + for (plexno = 0; plexno < vol->plexes; plexno++) + forceup(VOL[plex->volno].plex[plexno]); + } else if ((statemap == sd_initializedstate) /* if it's initialized (not empty) */ + ||(plex->organization == plex_concat) /* and we're not RAID-4 or RAID-5 */ + ||(plex->organization == plex_striped)) + forceup(plexno); /* we'll do it */ + /* + * This leaves a case where things don't get + * done: the plex is RAID-4 or RAID-5, and + * the subdisks are all empty. They need to + * be initialized first. 
+ */ + } else { + if (statemap == sd_upstate) /* all subdisks up */ + plex->state = plex_up; /* we can come up too */ + else + plex->state = plex_faulty; + } + } else if ((statemap & (sd_upstate | sd_rebornstate)) == statemap) /* all up or reborn */ + plex->state = plex_flaky; + else if (statemap & (sd_upstate | sd_rebornstate)) /* some up or reborn */ + plex->state = plex_corrupt; /* corrupt */ + else if (statemap & (sd_initstate | sd_emptystate)) /* some subdisks empty or initializing */ + plex->state = plex_initializing; + else /* nothing at all up */ + plex->state = plex_faulty; + + if (plex->state != oldstate) /* state has changed, */ + log(LOG_INFO, /* tell them about it */ + "vinum: %s is %s\n", + plex->name, + plex_state(plex->state)); + if (plex->volno >= 0) /* we're part of a volume, */ + update_volume_state(plex->volno); /* update its state */ +} + +/* Set volume state based on its components */ +void +update_volume_state(int volno) +{ + struct volume *vol; /* our volume */ + int plexno; + enum volumestate oldstate; + + vol = &VOL[volno]; /* point to our volume */ + oldstate = vol->state; + + for (plexno = 0; plexno < vol->plexes; plexno++) { + struct plex *plex = &PLEX[vol->plex[plexno]]; /* point to the plex */ + if (plex->state >= plex_corrupt) { /* something accessible, */ + vol->state = volume_up; + break; + } + } + if (plexno == vol->plexes) /* didn't find an up plex */ + vol->state = volume_down; + + if (vol->state != oldstate) { /* state changed */ + log(LOG_INFO, "vinum: %s is %s\n", vol->name, volume_state(vol->state)); + save_config(); /* save the updated configuration */ + } +} + +/* + * Called from request routines when they find + * a subdisk which is not kosher. Decide whether + * it warrants changing the state. Return + * REQUEST_DOWN if we can't use the subdisk, + * REQUEST_OK if we can. + */ +/* + * A prior version of this function checked the plex + * state as well. At the moment, consider plex states + * information for the user only. 
We'll ignore them + * and use the subdisk state only. The last version of + * this file with the old logic was 2.7. XXX + */ +enum requeststatus +checksdstate(struct sd *sd, struct request *rq, daddr_t diskaddr, daddr_t diskend) +{ + struct plex *plex = &PLEX[sd->plexno]; + int writeop = (rq->bp->b_iocmd == BIO_WRITE); /* note if we're writing */ + + switch (sd->state) { + /* We shouldn't get called if the subdisk is up */ + case sd_up: + return REQUEST_OK; + + case sd_reviving: + /* + * Access to a reviving subdisk depends on the + * organization of the plex: + * + * - If it's concatenated, access the subdisk + * up to its current revive point. If we + * want to write to the subdisk overlapping + * the current revive block, set the + * conflict flag in the request, asking the + * caller to put the request on the wait + * list, which will be attended to by + * revive_block when it's done. + * - if it's striped, we can't do it (we could + * do some hairy calculations, but it's + * unlikely to work). + * - if it's RAID-4 or RAID-5, we can do it as + * long as only one subdisk is down + */ + if (plex->organization == plex_striped) /* plex is striped, */ + return REQUEST_DOWN; + else if (isparity(plex)) { /* RAID-4 or RAID-5 plex */ + if (plex->sddowncount > 1) /* with more than one sd down, */ + return REQUEST_DOWN; + else + /* + * XXX We shouldn't do this if we can find a + * better way. 
Check the other plexes + * first, and return a DOWN if another + * plex will do it better + */ + return REQUEST_OK; /* OK, we'll find a way */ + } + if (diskaddr > (sd->revived + + sd->plexoffset + + (sd->revive_blocksize >> DEV_BSHIFT))) /* we're beyond the end */ + return REQUEST_DOWN; + else if (diskend > (sd->revived + sd->plexoffset)) { /* we finish beyond the end */ + if (writeop) { + rq->flags |= XFR_REVIVECONFLICT; /* note a potential conflict */ + rq->sdno = sd->sdno; /* and which sd last caused it */ + } else + return REQUEST_DOWN; + } + return REQUEST_OK; + + case sd_reborn: + if (writeop) + return REQUEST_OK; /* always write to a reborn disk */ + else /* don't allow a read */ + /* + * Handle the mapping. We don't want to reject + * a read request to a reborn subdisk if that's + * all we have. XXX + */ + return REQUEST_DOWN; + + case sd_down: + if (writeop) /* writing to a consistent down disk */ + set_sd_state(sd->sdno, sd_obsolete, setstate_force); /* it's not consistent now */ + return REQUEST_DOWN; + + case sd_crashed: + if (writeop) /* writing to a consistent down disk */ + set_sd_state(sd->sdno, sd_stale, setstate_force); /* it's not consistent now */ + return REQUEST_DOWN; + + default: + return REQUEST_DOWN; + } +} + +/* return a state map for the subdisks of a plex */ +enum sdstates +sdstatemap(struct plex *plex) +{ + int sdno; + enum sdstates statemap = 0; /* note the states we find */ + + plex->sddowncount = 0; /* no subdisks down yet */ + for (sdno = 0; sdno < plex->subdisks; sdno++) { + struct sd *sd = &SD[plex->sdnos[sdno]]; /* point to the subdisk */ + + switch (sd->state) { + case sd_empty: + statemap |= sd_emptystate; + (plex->sddowncount)++; /* another unusable subdisk */ + break; + + case sd_init: + statemap |= sd_initstate; + (plex->sddowncount)++; /* another unusable subdisk */ + break; + + case sd_down: + statemap |= sd_downstate; + (plex->sddowncount)++; /* another unusable subdisk */ + break; + + case sd_crashed: + statemap |= 
sd_crashedstate; + (plex->sddowncount)++; /* another unusable subdisk */ + break; + + case sd_obsolete: + statemap |= sd_obsoletestate; + (plex->sddowncount)++; /* another unusable subdisk */ + break; + + case sd_stale: + statemap |= sd_stalestate; + (plex->sddowncount)++; /* another unusable subdisk */ + break; + + case sd_reborn: + statemap |= sd_rebornstate; + break; + + case sd_up: + statemap |= sd_upstate; + break; + + case sd_initializing: + statemap |= sd_initstate; + (plex->sddowncount)++; /* another unusable subdisk */ + break; + + case sd_initialized: + statemap |= sd_initializedstate; + (plex->sddowncount)++; /* another unusable subdisk */ + break; + + case sd_unallocated: + case sd_uninit: + case sd_reviving: + case sd_referenced: + statemap |= sd_otherstate; + (plex->sddowncount)++; /* another unusable subdisk */ + } + } + return statemap; +} + +/* determine the state of the volume relative to this plex */ +enum volplexstate +vpstate(struct plex *plex) +{ + struct volume *vol; + enum volplexstate state = volplex_onlyusdown; /* state to return */ + int plexno; + + if (plex->volno < 0) { /* not associated with a volume */ + if (plex->state > plex_degraded) + return volplex_onlyus; /* just us */ + else + return volplex_onlyusdown; /* assume the worst */ + } + vol = &VOL[plex->volno]; /* point to our volume */ + for (plexno = 0; plexno < vol->plexes; plexno++) { + if (&PLEX[vol->plex[plexno]] == plex) { /* us */ + if (PLEX[vol->plex[plexno]].state >= plex_degraded) /* are we up? 
*/ + state |= volplex_onlyus; /* yes */ + } else { + if (PLEX[vol->plex[plexno]].state >= plex_degraded) /* not us */ + state |= volplex_otherup; /* and when they were up, they were up */ + else + state |= volplex_alldown; /* and when they were down, they were down */ + } + } + return state; /* and when they were only halfway up */ +} /* they were neither up nor down */ + +/* Check if all bits b are set in a */ +int allset(int a, int b); + +int +allset(int a, int b) +{ + return (a & b) == b; +} + +/* Invalidate the subdisks belonging to a plex */ +void +invalidate_subdisks(struct plex *plex, enum sdstate state) +{ + int sdno; + + for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */ + struct sd *sd = &SD[plex->sdnos[sdno]]; + + switch (sd->state) { + case sd_unallocated: + case sd_uninit: + case sd_init: + case sd_initializing: + case sd_initialized: + case sd_empty: + case sd_obsolete: + case sd_stale: + case sd_crashed: + case sd_down: + case sd_referenced: + break; + + case sd_reviving: + case sd_reborn: + case sd_up: + set_sd_state(plex->sdnos[sdno], state, setstate_force); + } + } +} + +/* + * Start an object, in other words do what we can to get it up. + * This is called from vinumioctl (VINUMSTART). + * Return error indications via ioctl_reply + */ +void +start_object(struct vinum_ioctl_msg *data) +{ + int status; + int objindex = data->index; /* data gets overwritten */ + struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) data; /* format for returning replies */ + enum setstateflags flags; + + if (data->force != 0) /* are we going to use force? 
*/ + flags = setstate_force; /* yes */ + else + flags = setstate_none; /* no */ + + switch (data->type) { + case drive_object: + status = set_drive_state(objindex, drive_up, flags); + if (DRIVE[objindex].state != drive_up) /* set status on whether we really did it */ + ioctl_reply->error = EBUSY; + else + ioctl_reply->error = 0; + break; + + case sd_object: + if (DRIVE[SD[objindex].driveno].state != drive_up) { + ioctl_reply->error = EIO; + strcpy(ioctl_reply->msg, "Drive is down"); + return; + } + if (data->blocksize) + SD[objindex].revive_blocksize = data->blocksize; + if ((SD[objindex].state == sd_reviving) /* reviving, */ + ||(SD[objindex].state == sd_stale)) { /* or stale, will revive */ + SD[objindex].state = sd_reviving; /* make sure we're reviving */ + ioctl_reply->error = revive_block(objindex); /* revive another block */ + ioctl_reply->msg[0] = '\0'; /* no comment */ + return; + } else if (SD[objindex].state == sd_initializing) { /* initializing, */ + if (data->blocksize) + SD[objindex].init_blocksize = data->blocksize; + ioctl_reply->error = initsd(objindex, data->verify); /* initialize another block */ + ioctl_reply->msg[0] = '\0'; /* no comment */ + return; + } + status = set_sd_state(objindex, sd_up, flags); /* set state */ + if (status != EAGAIN) { /* not first revive or initialize, */ + if (SD[objindex].state != sd_up) /* set status on whether we really did it */ + ioctl_reply->error = EBUSY; + else + ioctl_reply->error = 0; + } else + ioctl_reply->error = status; + break; + + case plex_object: + status = set_plex_state(objindex, plex_up, flags); + if (PLEX[objindex].state != plex_up) /* set status on whether we really did it */ + ioctl_reply->error = EBUSY; + else + ioctl_reply->error = 0; + break; + + case volume_object: + status = set_volume_state(objindex, volume_up, flags); + if (VOL[objindex].state != volume_up) /* set status on whether we really did it */ + ioctl_reply->error = EBUSY; + else + ioctl_reply->error = 0; + break; + + default: + 
ioctl_reply->error = EINVAL; + strcpy(ioctl_reply->msg, "Invalid object type"); + return; + } + /* + * There's no point in saying anything here: + * the userland program does it better + */ + ioctl_reply->msg[0] = '\0'; +} + +/* + * Stop an object, in other words do what we can to get it down + * This is called from vinumioctl (VINUMSTOP). + * Return error indications via ioctl_reply. + */ +void +stop_object(struct vinum_ioctl_msg *data) +{ + int status = 1; + int objindex = data->index; /* save the number from change */ + struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) data; /* format for returning replies */ + + switch (data->type) { + case drive_object: + status = set_drive_state(objindex, drive_down, data->force); + break; + + case sd_object: + status = set_sd_state(objindex, sd_down, data->force); + break; + + case plex_object: + status = set_plex_state(objindex, plex_down, data->force); + break; + + case volume_object: + status = set_volume_state(objindex, volume_down, data->force); + break; + + default: + ioctl_reply->error = EINVAL; + strcpy(ioctl_reply->msg, "Invalid object type"); + return; + } + ioctl_reply->msg[0] = '\0'; + if (status == 0) /* couldn't do it */ + ioctl_reply->error = EBUSY; + else + ioctl_reply->error = 0; +} + +/* + * VINUM_SETSTATE ioctl: set an object state. + * msg is the message passed by the user. 
+ */ +void +setstate(struct vinum_ioctl_msg *msg) +{ + int sdno; + struct sd *sd; + struct plex *plex; + struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) msg; /* format for returning replies */ + + switch (msg->state) { + case object_down: + stop_object(msg); + break; + + case object_initializing: + switch (msg->type) { + case sd_object: + sd = &SD[msg->index]; + if ((msg->index >= vinum_conf.subdisks_allocated) + || (sd->state <= sd_referenced)) { + sprintf(ioctl_reply->msg, "Invalid subdisk %d", msg->index); + ioctl_reply->error = EFAULT; + return; + } + set_sd_state(msg->index, sd_initializing, msg->force); + if (sd->state != sd_initializing) { + strcpy(ioctl_reply->msg, "Can't set state"); + ioctl_reply->error = EBUSY; + } else + ioctl_reply->error = 0; + break; + + case plex_object: + plex = &PLEX[msg->index]; + if ((msg->index >= vinum_conf.plexes_allocated) + || (plex->state <= plex_unallocated)) { + sprintf(ioctl_reply->msg, "Invalid plex %d", msg->index); + ioctl_reply->error = EFAULT; + return; + } + set_plex_state(msg->index, plex_initializing, msg->force); + if (plex->state != plex_initializing) { + strcpy(ioctl_reply->msg, "Can't set state"); + ioctl_reply->error = EBUSY; + } else { + ioctl_reply->error = 0; + for (sdno = 0; sdno < plex->subdisks; sdno++) { + sd = &SD[plex->sdnos[sdno]]; + set_sd_state(plex->sdnos[sdno], sd_initializing, msg->force); + if (sd->state != sd_initializing) { + strcpy(ioctl_reply->msg, "Can't set state"); + ioctl_reply->error = EBUSY; + break; + } + } + } + break; + + default: + strcpy(ioctl_reply->msg, "Invalid object"); + ioctl_reply->error = EINVAL; + } + break; + + case object_initialized: + if (msg->type == sd_object) { + sd = &SD[msg->index]; + if ((msg->index >= vinum_conf.subdisks_allocated) + || (sd->state <= sd_referenced)) { + sprintf(ioctl_reply->msg, "Invalid subdisk %d", msg->index); + ioctl_reply->error = EFAULT; + return; + } + set_sd_state(msg->index, sd_initialized, msg->force); + if (sd->state 
!= sd_initializing) { + strcpy(ioctl_reply->msg, "Can't set state"); + ioctl_reply->error = EBUSY; + } else + ioctl_reply->error = 0; + } else { + strcpy(ioctl_reply->msg, "Invalid object"); + ioctl_reply->error = EINVAL; + } + break; + + case object_up: + start_object(msg); + } +} + +/* + * Brute force set state function. Don't look at + * any dependencies, just do it. This is mainly + * intended for testing and recovery. + */ +void +setstate_by_force(struct vinum_ioctl_msg *msg) +{ + struct _ioctl_reply *ioctl_reply = (struct _ioctl_reply *) msg; /* format for returning replies */ + + switch (msg->type) { + case drive_object: + DRIVE[msg->index].state = msg->state; + break; + + case sd_object: + SD[msg->index].state = msg->state; + break; + + case plex_object: + PLEX[msg->index].state = msg->state; + break; + + case volume_object: + VOL[msg->index].state = msg->state; + break; + + default: + break; + } + ioctl_reply->error = 0; +} +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumstate.h b/sys/dev/vinum/vinumstate.h new file mode 100644 index 0000000..572f317 --- /dev/null +++ b/sys/dev/vinum/vinumstate.h @@ -0,0 +1,257 @@ +/*- + * Copyright (c) 1997, 1998 + * Nan Yang Computer Services Limited. All rights reserved. + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $FreeBSD$ + */ + +/* + * This file gets read by makestatetext to create text files + * with the names of the states, so don't change the file + * format + */ + +enum volumestate { + volume_unallocated, + /* present but unused. Must be 0 */ + + volume_uninit, + /* mentioned elsewhere but not known to the configuration */ + + volume_down, + + /* The volume is up and functional, but not all plexes may be available */ + volume_up, + volume_laststate = volume_up /* last value, for table dimensions */ +}; + +enum plexstate { + /* An empty entry, not a plex at all. */ + plex_unallocated, + + /* The plex has been referenced by a volume */ + plex_referenced, + /* + * The plex has been allocated, but its configuration + * is not complete + */ + plex_init, + + /* + * A plex which has gone completely down because of + * I/O errors.
+ */ + plex_faulty, + + /* + * A plex which has been taken down by the + * administrator. + */ + plex_down, + + /* A plex which is being initialized */ + plex_initializing, + + /* + * *** The remaining states represent plexes which are + * at least partially up. Keep these separate so that + * they can be checked more easily. + */ + + /* + * A plex entry which is at least partially up. Not + * all subdisks are available, and an inconsistency + * has occurred. If no other plex is uncorrupted, + * the volume is no longer consistent. + */ + plex_corrupt, + + plex_firstup = plex_corrupt, /* first "up" state */ + + /* + * A RAID-5 plex entry which is accessible, but one + * subdisk is down, requiring recovery for many + * I/O requests. + */ + plex_degraded, + + /* + * A plex which is really up, but which has a reborn + * subdisk which we don't completely trust, and + * which we don't want to read if we can avoid it + */ + plex_flaky, + + /* + * A plex entry which is completely up. All subdisks + * are up. + */ + plex_up, + + plex_laststate = plex_up /* last value, for table dimensions */ +}; + +/* subdisk states */ +enum sdstate { + /* An empty entry, not a subdisk at all. */ + sd_unallocated, + + /* + * A subdisk entry which has not been created + * completely. Some fields may be empty. + */ + sd_uninit, + + /* The subdisk has been referenced by a plex */ + sd_referenced, + + /* + * A subdisk entry which has been created completely. + * All fields are correct, but the disk hasn't + * been updated. + */ + sd_init, + + /* + * A subdisk entry which has been created completely. + * All fields are correct, and the disk has been + * updated, but there is no data on the disk. + */ + sd_empty, + + /* + * A subdisk entry which has been created completely and + * which is currently being initialized + */ + sd_initializing, + + /* + * A subdisk entry which has been initialized, + * but which can't come up because it would + * cause inconsistencies. 
+ */ + sd_initialized, + + /* *** The following states represent invalid data */ + /* + * A subdisk entry which has been created completely. + * All fields are correct, the config on disk has been + * updated, and the data was valid, but since then the + * drive has been taken down, and as a result updates + * have been missed. + */ + sd_obsolete, + + /* + * A subdisk entry which has been created completely. + * All fields are correct, the disk has been updated, + * and the data was valid, but since then the drive + * has been crashed and updates have been lost. + */ + sd_stale, + + /* *** The following states represent valid, inaccessible data */ + + /* + * A subdisk entry which has been created completely. + * All fields are correct, the disk has been updated, + * and the data was valid, but since then the drive + * has gone down. No attempt has been made to write + * to the subdisk since the crash, so the data is valid. + */ + sd_crashed, + + /* + * A subdisk entry which was up, which contained + * valid data, and which was taken down by the + * administrator. The data is valid. + */ + sd_down, + + /* + * *** This is invalid data (the subdisk previously had + * a numerically lower state), but it is currently in the + * process of being revived. We can write but not read. + */ + sd_reviving, + + /* + * *** The following states represent accessible subdisks + * with valid data + */ + + /* + * A subdisk entry which has been created completely. + * All fields are correct, the disk has been updated, + * and the data was valid, but since then the drive + * has gone down and up again. No updates were lost, + * but it is possible that the subdisk has been + * damaged. We won't read from this subdisk if we + * have a choice. If this is the only subdisk which + * covers this address space in the plex, we set its + * state to sd_up under these circumstances, so this + * status implies that there is another subdisk to + * fulfil the request. 
+ */ + sd_reborn, + + /* + * A subdisk entry which has been created completely. + * All fields are correct, the disk has been updated, + * and the data is valid. + */ + sd_up, + + sd_laststate = sd_up /* last value, for table dimensions */ +}; + +enum drivestate { + drive_unallocated, + /* present but unused. Must be 0 */ + + drive_referenced, + /* just mentioned in some other config entry */ + + drive_down, + /* not accessible */ + + drive_up, + /* up and running */ + + drive_laststate = drive_up /* last value, for table dimensions */ +}; + +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ diff --git a/sys/dev/vinum/vinumutil.c b/sys/dev/vinum/vinumutil.c new file mode 100644 index 0000000..5d3fe82 --- /dev/null +++ b/sys/dev/vinum/vinumutil.c @@ -0,0 +1,304 @@ +/*- + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumutil.c,v 1.17 2003/04/28 02:54:43 grog Exp $ + * $FreeBSD$ + */ + +/* This file contains utility routines used both in kernel and user context */ + +#include <dev/vinum/vinumhdr.h> +#include <dev/vinum/statetexts.h> +#ifndef _KERNEL +#include <stdio.h> +#include <string.h> +extern jmp_buf command_fail; /* return on a failed command */ +#endif + +static char numeric_state[32]; /* temporary buffer for ASCII conversions; one buffer is shared by all the state/org to string functions in this file, so a returned "Invalid ..." string is overwritten by the next out-of-range conversion */ +#define STATECOUNT(x) (sizeof (x##statetext) / sizeof (char *)) +/* Return drive state as a string */ +char * +drive_state(enum drivestate state) +{ + if (((unsigned) state) >= STATECOUNT(drive)) { + sprintf(numeric_state, "Invalid state %d", (int) state); + return numeric_state; + } else + return drivestatetext[state]; +} + +/* Return volume state as a string */ +char * +volume_state(enum volumestate state) +{ + if (((unsigned) state) >= STATECOUNT(vol)) { + sprintf(numeric_state, "Invalid state %d", (int) state); + return numeric_state; + } else + return volstatetext[state]; +} + +/* Return plex state as a string */ +char * +plex_state(enum plexstate state) +{ + if (((unsigned) state) >= STATECOUNT(plex)) { + sprintf(numeric_state, "Invalid state %d", (int) state); + return
numeric_state; + } else + return plexstatetext[state]; +} + +/* Return plex organization as a string */ +char * +plex_org(enum plexorg org) +{ + switch (org) { + case plex_disorg: /* disorganized */ + return "disorg"; + break; + + case plex_concat: /* concatenated plex */ + return "concat"; + break; + + case plex_striped: /* striped plex */ + return "striped"; + break; + + case plex_raid4: /* RAID-4 plex */ + return "raid4"; + + case plex_raid5: /* RAID-5 plex */ + return "raid5"; + break; + + default: + sprintf(numeric_state, "Invalid org %d", (int) org); + return numeric_state; + } +} + +/* Return sd state as a string */ +char * +sd_state(enum sdstate state) +{ + if (((unsigned) state) >= STATECOUNT(sd)) { + sprintf(numeric_state, "Invalid state %d", (int) state); + return numeric_state; + } else + return sdstatetext[state]; +} + +/* Now convert in the other direction */ +/* + * These are currently used only internally, + * so we don't do too much error checking. + * + * Each function compares text with strcmp against + * the entries of the matching statetext table and + * returns the index of the first match as the enum + * value. If nothing matches, it returns -1 + * converted to the enum type. + */ +enum drivestate +DriveState(char *text) +{ + int i; + for (i = 0; i < STATECOUNT(drive); i++) + if (strcmp(text, drivestatetext[i]) == 0) /* found it */ + return (enum drivestate) i; + return -1; +} + +enum sdstate +SdState(char *text) +{ + int i; + for (i = 0; i < STATECOUNT(sd); i++) + if (strcmp(text, sdstatetext[i]) == 0) /* found it */ + return (enum sdstate) i; + return -1; +} + +enum plexstate +PlexState(char *text) +{ + int i; + for (i = 0; i < STATECOUNT(plex); i++) + if (strcmp(text, plexstatetext[i]) == 0) /* found it */ + return (enum plexstate) i; + return -1; +} + +enum volumestate +VolState(char *text) +{ + int i; + for (i = 0; i < STATECOUNT(vol); i++) + if (strcmp(text, volstatetext[i]) == 0) /* found it */ + return (enum volumestate) i; + return -1; +} + +/* + * Take a number with an optional scale factor and convert + * it to a number of bytes. + * + * The scale factors are: + * + * s sectors (of 512 bytes) + * b blocks (of 512 bytes).
This unit is deprecated, + * because it's confusing, but maintained to avoid + * confusing Veritas users. + * k kilobytes (1024 bytes) + * m megabytes (of 1024 * 1024 bytes) + * g gigabytes (of 1024 * 1024 * 1024 bytes) + * + * The scale letter may be given in upper or lower case, and only + * the first character after the digits is examined. A leading + * minus sign negates the value; since the return type is unsigned, + * a negated value wraps around. No overflow checking is done. + * + * If spec is NULL, or is not a number followed by nothing or by a + * recognized scale letter, the kernel build calls + * throw_rude_remark with EINVAL, and the userland build prints a + * message to stderr and longjmps to command_fail. + */ +u_int64_t +sizespec(char *spec) +{ + u_int64_t size; + char *s; + int sign = 1; /* -1 if negative */ + + size = 0; + if (spec != NULL) { /* we have a parameter */ + s = spec; + if (*s == '-') { /* negative, */ + sign = -1; + s++; /* skip */ + } + if ((*s >= '0') && (*s <= '9')) { /* it's numeric */ + while ((*s >= '0') && (*s <= '9')) /* it's numeric */ + size = size * 10 + *s++ - '0'; /* convert it */ + switch (*s) { + case '\0': + return size * sign; + + case 'B': + case 'b': + case 'S': + case 's': + return size * sign * 512; + + case 'K': + case 'k': + return size * sign * 1024; + + case 'M': + case 'm': + return size * sign * 1024 * 1024; + + case 'G': + case 'g': + return size * sign * 1024 * 1024 * 1024; + } + } +#ifdef _KERNEL + throw_rude_remark(EINVAL, "Invalid length specification: %s", spec); +#else + fprintf(stderr, "Invalid length specification: %s", spec); + longjmp(command_fail, 1); +#endif + } +#ifdef _KERNEL + throw_rude_remark(EINVAL, "Missing length specification"); +#else + fprintf(stderr, "Missing length specification"); + longjmp(command_fail, 1); +#endif + /* NOTREACHED */ + return -1; +} + +/* + * Extract the volume number from a device number. Check that it's + * the correct type, and that it isn't one of the superdevs. + */ +int +Volno(dev_t dev) +{ + int volno = minor(dev); + + if (OBJTYPE(dev) != VINUM_VOLUME_TYPE) + return -1; + else + volno = ((volno & 0x3fff0000) >> 8) | (volno & 0xff); + if ((volno == VINUM_SUPERDEV_VOL) + || (volno == VINUM_DAEMON_VOL)) + return -1; + else + return volno; +} + +/* + * Extract a plex number from a device number. + * Don't check the major number, but check the + * type. Return -1 for invalid types.
+ */ +int +Plexno(dev_t dev) +{ + int plexno = minor(dev); + + if (OBJTYPE(dev) != VINUM_PLEX_TYPE) + return -1; + else + return ((plexno & 0x3fff0000) >> 8) | (plexno & 0xff); +} + +/* + * Extract a subdisk number from a device number. + * Don't check the major number, but check the + * type. Return -1 for invalid types. + */ +int +Sdno(dev_t dev) +{ + int sdno = minor(dev); + + /* + * Care: VINUM_SD_TYPE is 2 or 3, which is why we use < instead of + * !=. It's not clear that this makes any sense abstracting it to + * this level. + */ + if (OBJTYPE(dev) < VINUM_SD_TYPE) + return -1; + else +/* + * Note that the number we return includes the low-order bit of the + * type field. This gives us twice as many potential subdisks as + * plexes or volumes. + */ + return ((sdno & 0x7fff0000) >> 8) | (sdno & 0xff); +} diff --git a/sys/dev/vinum/vinumutil.h b/sys/dev/vinum/vinumutil.h new file mode 100644 index 0000000..2efa42c --- /dev/null +++ b/sys/dev/vinum/vinumutil.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. 
Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumutil.h,v 1.1 2001/05/22 04:07:22 grog Exp grog $ + * $FreeBSD$ + */ + +/* + * Functions defined in vinumutil.c, which is used both in userland + * and in the kernel. + */ +char *drive_state(enum drivestate); +char *volume_state(enum volumestate); +char *plex_state(enum plexstate); +char *plex_org(enum plexorg); +char *sd_state(enum sdstate); +enum drivestate DriveState(char *text); +enum sdstate SdState(char *text); +enum plexstate PlexState(char *text); +enum volumestate VolState(char *text); diff --git a/sys/dev/vinum/vinumvar.h b/sys/dev/vinum/vinumvar.h new file mode 100644 index 0000000..8c6a07b --- /dev/null +++ b/sys/dev/vinum/vinumvar.h @@ -0,0 +1,400 @@ +/*- + * Copyright (c) 1997, 1998, 1999 + * Nan Yang Computer Services Limited. All rights reserved. + * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. 
+ * + * Written by Greg Lehey + * + * This software is distributed under the so-called ``Berkeley + * License'': + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nan Yang Computer + * Services Limited. + * 4. Neither the name of the Company nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided ``as is'', and any express or implied + * warranties, including, but not limited to, the implied warranties of + * merchantability and fitness for a particular purpose are disclaimed. + * In no event shall the company or contributors be liable for any + * direct, indirect, incidental, special, exemplary, or consequential + * damages (including, but not limited to, procurement of substitute + * goods or services; loss of use, data, or profits; or business + * interruption) however caused and on any theory of liability, whether + * in contract, strict liability, or tort (including negligence or + * otherwise) arising in any way out of the use of this software, even if + * advised of the possibility of such damage. + * + * $Id: vinumvar.h,v 1.33 2003/05/23 01:09:23 grog Exp $ + * $FreeBSD$ + */ + +#include <sys/time.h> +#include <dev/vinum/vinumstate.h> +#include <sys/mutex.h> + +/* Directory for device nodes. 
*/ +#define VINUM_DIR "/dev/vinum" + +/* + * Some configuration maxima. They're an enum because + * we can't define global constants. Sorry about that. + * + * These aren't as bad as they look: most of them are soft limits. + */ + +#define VINUMROOT +enum constants { + /* + * Current version of the data structures. This + * is used to ensure synchronization between + * kernel module and userland vinum(8). + */ + VINUMVERSION = 1, + VINUM_HEADER = 512, /* size of header on disk */ + MAXCONFIGLINE = 1024, /* maximum size of a single config line */ + MINVINUMSLICE = 1048576, /* minimum size of a slice */ + + VINUM_CDEV_MAJOR = 91, /* major number for character device */ + + ROUND_ROBIN_READPOL = -1, /* round robin read policy */ + + /* + * Type field in high-order two bits of minor + * number. Subdisks are in fact both type 2 and + * type 3, giving twice the number of subdisks. + * This causes some ugliness in the code. + */ + VINUM_VOLUME_TYPE = 0, + VINUM_PLEX_TYPE = 1, + VINUM_SD_TYPE = 2, + VINUM_SD2_TYPE = 3, + + + /* + * Define a minor device number. + * This is not used directly; instead, it's + * called by the other macros. + * + * o is the object number, t the type. The low 8 + * bits of o stay in place, the next 14 bits are + * moved up by 8, and t is shifted to + * VINUM_TYPE_SHIFT. The arguments are + * parenthesized so that expressions can be + * passed safely. + * + * NOTE(review): for t == VINUM_SD_TYPE or + * VINUM_SD2_TYPE, (t) << 30 sets bit 31, which + * does not fit in a 32 bit signed int. Confirm + * that callers rely only on the bit pattern. + */ +#define VINUMMINOR(o,t) (((o) & 0xff) | (((o) & 0x3fff00) << 8) | ((t) << VINUM_TYPE_SHIFT)) + + VINUM_TYPE_SHIFT = 30, + VINUM_MAXVOL = 0x3ffffd, /* highest numbered volume */ + + /* + * The super device and the daemon device are + * magic: they're the two highest-numbered + * volumes. + */ + VINUM_SUPERDEV_VOL = 0x3ffffe, + VINUM_DAEMON_VOL = 0x3fffff, + VINUM_MAXPLEX = 0x3fffff, + VINUM_MAXSD = 0x7fffff, + +#define VINUM_SUPERDEV_MINOR VINUMMINOR (VINUM_SUPERDEV_VOL, VINUM_VOLUME_TYPE) +#define VINUM_DAEMON_MINOR VINUMMINOR (VINUM_DAEMON_VOL, VINUM_VOLUME_TYPE) + + /* + * Mask for the number part of each object. + * Plexes and volumes are the same, subdisks use + * the low-order bit of the type field and thus + * have twice the number.
+ */ + + MAJORDEV_SHIFT = 8, + + MAXPLEX = 8, /* maximum number of plexes in a volume */ + MAXSD = 256, /* maximum number of subdisks in a plex */ + MAXDRIVENAME = 32, /* maximum length of a device name */ + MAXSDNAME = 64, /* maximum length of a subdisk name */ + MAXPLEXNAME = 64, /* maximum length of a plex name */ + MAXVOLNAME = 64, /* maximum length of a volume name */ + MAXNAME = 64, /* maximum length of any name */ + + +#define OBJTYPE(x) ((minor(x) >> VINUM_TYPE_SHIFT) & 3) + + /* Create device minor numbers */ +#define VINUMDEV(o, t) makedev (VINUM_CDEV_MAJOR, VINUMMINOR (o, t)) + +#define VINUM_VOL(v) makedev (VINUM_CDEV_MAJOR, \ + VINUMMINOR (v, VINUM_VOLUME_TYPE)) +#define VINUM_PLEX(p) makedev (VINUM_CDEV_MAJOR, \ + VINUMMINOR (p, VINUM_PLEX_TYPE)) +#define VINUM_SD(s) makedev (VINUM_CDEV_MAJOR, \ + VINUMMINOR (s, VINUM_SD_TYPE)) + + /* extract device type */ +#define DEVTYPE(x) ((minor (x) >> VINUM_TYPE_SHIFT) & 3) + +#define VINUM_SUPERDEV_NAME VINUM_DIR"/control" /* normal super device */ +#define VINUM_DAEMON_DEV_NAME VINUM_DIR"/controld" /* super device for daemon only */ + + /* + * the number of object entries to cater for initially, and also the + * value by which they are incremented. It doesn't take long + * to extend them, so theoretically we could start with 1 of each, but + * it's untidy to allocate such small areas. These values are + * probably too small. 
+ */ + + INITIAL_DRIVES = 4, + INITIAL_VOLUMES = 4, + INITIAL_PLEXES = 8, + INITIAL_SUBDISKS = 16, + INITIAL_SUBDISKS_IN_PLEX = 4, /* number of subdisks to allocate to a plex */ + INITIAL_SUBDISKS_IN_DRIVE = 4, /* number of subdisks to allocate to a drive */ + INITIAL_DRIVE_FREELIST = 16, /* number of entries in drive freelist */ + PLEX_REGION_TABLE_SIZE = 8, /* number of entries in plex region tables */ + PLEX_LOCKS = 256, /* number of locks to allocate to a plex */ + PLEXMUTEXES = 32, + MAX_REVIVE_BLOCKSIZE = MAXPHYS, /* maximum revive block size */ + DEFAULT_REVIVE_BLOCKSIZE = 65536, /* default revive block size */ + VINUMHOSTNAMELEN = 32, /* host name field in label */ +}; + +/* + * Slice header + * + * Vinum drives start with this structure: + * + *\ Sector + * |--------------------------------------| + * | PDP-11 memorial boot block | 0 + * |--------------------------------------| + * | Disk label, maybe | 1 + * |--------------------------------------| + * | Slice definition (vinum_hdr) | 8 + * |--------------------------------------| + * | | + * | Configuration info, first copy | 9 + * | | + * |--------------------------------------| + * | | + * | Configuration info, second copy | 9 + size of config + * | | + * |--------------------------------------| + */ + +/* Sizes and offsets of our information */ +enum { + VINUM_LABEL_OFFSET = 4096, /* offset of vinum label */ + VINUMHEADERLEN = 512, /* size of vinum label */ + VINUM_CONFIG_OFFSET = 4608, /* offset of first config copy */ + MAXCONFIG = 65536, /* and size of config copy */ + DATASTART = (MAXCONFIG * 2 + VINUM_CONFIG_OFFSET) / DEV_BSIZE /* this is where the data starts */ +}; + +/* + * hostname is 256 bytes long, but we don't need to shlep + * multiple copies in vinum. 
We use the host name just + * to identify this system, and 32 bytes should be ample + * for that purpose + */ + +struct vinum_label { + char sysname[VINUMHOSTNAMELEN]; /* system name at time of creation */ + char name[MAXDRIVENAME]; /* our name of the drive */ + struct timeval date_of_birth; /* the time it was created */ + struct timeval last_update; /* and the time of last update */ + /* + * total size in bytes of the drive. This value + * includes the headers. + */ + off_t drive_size; +}; + +struct vinum_hdr { + uint64_t magic; /* we're long on magic numbers */ +#define VINUM_MAGIC 22322600044678729LL /* should be this */ +#define VINUM_NOMAGIC 22322600044678990LL /* becomes this after obliteration */ + /* + * Size in bytes of each copy of the + * configuration info. This must be a multiple + * of the sector size. + */ + int config_length; + struct vinum_label label; /* unique label */ +}; + +/* Information returned from read_drive_label */ +enum drive_label_info { + DL_CANT_OPEN, /* invalid partition */ + DL_NOT_OURS, /* valid partition, but no vinum label */ + DL_DELETED_LABEL, /* valid partition, deleted label found */ + DL_WRONG_DRIVE, /* drive name doesn't match */ + DL_OURS /* valid partition and label found */ +}; + +/* kinds of plex organization */ +enum plexorg { + plex_disorg, /* disorganized */ + plex_concat, /* concatenated plex */ + plex_striped, /* striped plex */ + plex_raid4, /* RAID4 plex */ + plex_raid5 /* RAID5 plex */ +}; + +/* + * Recognize plex organizations. Both tests rely on + * the order of the values in enum plexorg. p is a + * pointer to a plex; it is parenthesized so that an + * expression can be passed. + */ +#define isstriped(p) ((p)->organization >= plex_striped) /* striped, RAID-4 or RAID-5 */ +#define isparity(p) ((p)->organization >= plex_raid4) /* RAID-4 or RAID-5 */ + +/* Address range definitions, for locking volumes */ +struct rangelock { + daddr_t stripe; /* address + 1 of the range being locked */ + struct buf *bp; /* user's buffer pointer */ +}; + +struct drive_freelist { /* sorted list of free space on drive */ + u_int64_t offset; /* offset of entry */ + u_int64_t sectors; /* and length in sectors
*/ +}; + +/* + * Include the structure definitions shared + * between userland and kernel. + */ + +#ifdef _KERNEL +#include <dev/vinum/vinumobj.h> +#undef _KERNEL +#include <dev/vinum/vinumobj.h> +#define _KERNEL +#else +#include <dev/vinum/vinumobj.h> +#endif + +/* + * Table expansion. Expand table, which contains oldcount + * entries of type element, by increment entries, and change + * oldcount accordingly + */ +#ifdef VINUMDEBUG +#define EXPAND(table, element, oldcount, increment) \ +{ \ + expand_table ((void **) &table, \ + oldcount * sizeof (element), \ + (oldcount + increment) * sizeof (element), \ + __FILE__, \ + __LINE__ ); \ + oldcount += increment; \ + } +#else +#define EXPAND(table, element, oldcount, increment) \ +{ \ + expand_table ((void **) &table, \ + oldcount * sizeof (element), \ + (oldcount + increment) * sizeof (element)); \ + oldcount += increment; \ + } +#endif + +/* Information on vinum's memory usage */ +struct meminfo { + int mallocs; /* number of malloced blocks */ + int total_malloced; /* total amount malloced */ + int highwater; /* maximum number of mallocs */ + struct mc *malloced; /* pointer to kernel table */ +}; + +#define MCFILENAMELEN 16 +struct mc { + struct timeval time; + int seq; + int size; + short line; + caddr_t address; + char file[MCFILENAMELEN]; +}; + +/* + * These enums are used by the state transition + * routines. 
They're in bit map format: + * + * Bit 0: Other plexes in the volume are down + * Bit 1: Other plexes in the volume are up + * Bit 2: The current plex is up + * Maybe they should be local to + * state.c + */ +enum volplexstate { + volplex_onlyusdown = 0, /* 0: we're the only plex, and we're down */ + volplex_alldown, /* 1: another plex is down, and so are we */ + volplex_otherup, /* 2: another plex is up */ + volplex_otherupdown, /* 3: other plexes are up and down */ + volplex_onlyus, /* 4: we're up and alone */ + volplex_onlyusup, /* 5: only we are up, others are down */ + volplex_allup, /* 6: all plexes are up */ + volplex_someup /* 7: some plexes are up, including us */ +}; + +/* state map for plex */ +enum sdstates { + sd_emptystate = 1, + sd_downstate = 2, /* SD is down */ + sd_crashedstate = 4, /* SD is crashed */ + sd_obsoletestate = 8, /* SD is obsolete */ + sd_stalestate = 16, /* SD is stale */ + sd_rebornstate = 32, /* SD is reborn */ + sd_upstate = 64, /* SD is up */ + sd_initstate = 128, /* SD is initializing */ + sd_initializedstate = 256, /* SD is initialized */ + sd_otherstate = 512, /* SD is in some other state */ +}; + +/* + * This is really just a parameter to pass to + * set_<foo>_state, but since it needs to be known + * in the external definitions, we need to define + * it here + */ +enum setstateflags { + setstate_none = 0, /* no flags */ + setstate_force = 1, /* force the state change */ + setstate_configuring = 2, /* we're currently configuring, don't save */ +}; + +/* Operations for parityops to perform. 
*/ +enum parityop { + checkparity, + rebuildparity, + rebuildandcheckparity, /* rebuildparity with the -v option */ +}; + +#ifdef VINUMDEBUG +/* Debugging stuff */ +enum debugflags { + DEBUG_ADDRESSES = 1, /* show buffer information during requests */ + DEBUG_NUMOUTPUT = 2, /* show the value of vp->v_numoutput */ + DEBUG_RESID = 4, /* go into debugger in complete_rqe */ + DEBUG_LASTREQS = 8, /* keep a circular buffer of last requests */ + DEBUG_REVIVECONFLICT = 16, /* print info about revive conflicts */ + DEBUG_EOFINFO = 32, /* print info about EOF detection */ + DEBUG_MEMFREE = 64, /* keep info about Frees */ + DEBUG_BIGDRIVE = 128, /* pretend our drives are 100 times the size */ + DEBUG_REMOTEGDB = 256, /* go into remote gdb */ + DEBUG_WARNINGS = 512, /* log various relatively harmless warnings */ + DEBUG_LOCKREQS = 1024, /* log locking requests */ +}; + +#ifdef _KERNEL +#ifdef __i386__ +#define longjmp LongJmp /* test our longjmps */ +#endif +#endif +#endif +/* Local Variables: */ +/* fill-column: 50 */ +/* End: */ |