/*- * Copyright (c) 1997, 1998 * Nan Yang Computer Services Limited. All rights reserved. * * This software is distributed under the so-called ``Berkeley * License'': * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nan Yang Computer * Services Limited. * 4. Neither the name of the Company nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided ``as is'', and any express or implied * warranties, including, but not limited to, the implied warranties of * merchantability and fitness for a particular purpose are disclaimed. * In no event shall the company or contributors be liable for any * direct, indirect, incidental, special, exemplary, or consequential * damages (including, but not limited to, procurement of substitute * goods or services; loss of use, data, or profits; or business * interruption) however caused and on any theory of liability, whether * in contract, strict liability, or tort (including negligence or * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * * $Id: vinumvar.h,v 1.16 1998/10/26 05:50:43 grog Exp grog $ */ /* XXX gdb can't find our global pointers, so use this kludge to * point to them locally. Remove after testing */ #define BROKEN_GDB struct _vinum_conf *VC = &vinum_conf #include #include "vinumstate.h" /* Some configuration maxima. They're an enum because * we can't define global constants. Sorry about that. * * These aren't as bad as they look: most of them are soft limits. */ enum constants { VINUM_HEADER = 512, /* size of header on disk */ MAXCONFIGLINE = 1024, /* maximum size of a single config line */ /* XXX Do we still need this? */ MINVINUMSLICE = 1048576, /* minimum size of a slice */ CDEV_MAJOR = 91, /* major number for character device */ BDEV_MAJOR = 25, /* and block device */ ROUND_ROBIN_READPOL = -1, /* round robin read policy */ /* type field in minor number */ VINUM_VOLUME_TYPE = 0, VINUM_PLEX_TYPE = 1, VINUM_SD_TYPE = 2, VINUM_DRIVE_TYPE = 3, VINUM_SUPERDEV_TYPE = 4, /* super device. */ /* Shifts for the individual fields in the device */ VINUM_TYPE_SHIFT = 28, VINUM_VOL_SHIFT = 0, VINUM_PLEX_SHIFT = 16, VINUM_SD_SHIFT = 20, VINUM_VOL_WIDTH = 8, VINUM_PLEX_WIDTH = 3, VINUM_SD_WIDTH = 8, MAJORDEV_SHIFT = 8, MAXPLEX = 8, /* maximum number of plexes in a volume */ MAXSD = 256, /* maximum number of subdisks in a plex */ MAXDRIVENAME = 32, /* maximum length of a device name */ MAXSDNAME = 64, /* maximum length of a subdisk name */ MAXPLEXNAME = 64, /* maximum length of a plex name */ MAXVOLNAME = 64, /* maximum length of a volume name */ MAXNAME = 64, /* maximum length of any name */ /* Create a block device number */ #define VINUMBDEV(v,p,s,t) ((BDEV_MAJOR << MAJORDEV_SHIFT) \ | (v << VINUM_VOL_SHIFT) \ | (p << VINUM_PLEX_SHIFT) \ | (s << VINUM_SD_SHIFT) \ | (t << VINUM_TYPE_SHIFT) ) /* And a character device number */ #define VINUMCDEV(v,p,s,t) ((CDEV_MAJOR << MAJORDEV_SHIFT) \ | (v << VINUM_VOL_SHIFT) \ | (p << VINUM_PLEX_SHIFT) \ | (s << VINUM_SD_SHIFT) \ | (t << VINUM_TYPE_SHIFT) ) /* extract device type */ #define DEVTYPE(x) ((x >> VINUM_TYPE_SHIFT) & 7) /* extract volume number */ #define VOLNO(x) (x & ((1 << VINUM_VOL_WIDTH) - 1)) /* extract plex number */ #define PLEXNO(x) (VOL [VOLNO (x)].plex [(x >> VINUM_PLEX_SHIFT) & ((1 << VINUM_PLEX_WIDTH) - 1)]) /* extract subdisk number */ #define SDNO(x) (PLEX [PLEXNO (x)].sdnos [(x >> VINUM_SD_SHIFT) & ((1 << VINUM_SD_WIDTH) - 1)]) /* extract drive number */ #define DRIVENO(x) (SD [SDNO (x)].driveno) VINUM_SUPERDEV = VINUMBDEV(0, 0, 0, VINUM_SUPERDEV_TYPE), /* superdevice number */ /* the number of object entries to cater for initially, and also the * value by which they are incremented. It doesn't take long * to extend them, so theoretically we could start with 1 of each, but * it's untidy to allocate such small areas. These values are * probably too small. */ INITIAL_DRIVES = 4, INITIAL_VOLUMES = 4, INITIAL_PLEXES = 8, INITIAL_SUBDISKS = 16, INITIAL_SUBDISKS_IN_PLEX = 4, /* number of subdisks to allocate to a plex */ INITIAL_SUBDISKS_IN_DRIVE = 4, /* number of subdisks to allocate to a drive */ INITIAL_DRIVE_FREELIST = 16, /* number of entries in drive freelist */ PLEX_REGION_TABLE_SIZE = 8, /* number of entries in plex region tables */ INITIAL_LOCKS = 8, /* number of locks to allocate to a volume */ DEFAULT_REVIVE_BLOCKSIZE = 32768, /* size of block to transfer in one op */ VINUMHOSTNAMELEN = 32, /* host name field in label */ }; /* device numbers */ /* * 31 30 28 27 20 19 18 16 15 8 7 0 * |-----------------------------------------------------------------------------------------------| * |X | Type | Subdisk number | X| Plex | Major number | volume number | * |-----------------------------------------------------------------------------------------------| * * 0x2 03 1 19 06 */ struct devcode { /* CARE. These fields assume a big-endian word. On a * little-endian system, they're the wrong way around */ unsigned volume:8; /* up to 256 volumes */ unsigned major:8; /* this is where the major number fits */ unsigned plex:3; /* up to 8 plexes per volume */ unsigned unused:1; /* up for grabs */ unsigned sd:8; /* up to 256 subdisks per plex */ unsigned type:3; /* type of object */ /* type field VINUM_VOLUME = 0, VINUM_PLEX = 1, VINUM_SUBDISK = 2, VINUM_DRIVE = 3, VINUM_SUPERDEV = 4, */ unsigned signbit:1; /* to make 32 bits */ }; #define VINUM_DIR "/dev/vinum" #define VINUM_RDIR "/dev/rvinum" #define VINUM_SUPERDEV_NAME VINUM_DIR"/control" /* Flags for all objects. Most of them only apply to * specific objects, but we have space for all in any * 32 bit flags word. */ enum objflags { VF_LOCKED = 1, /* somebody has locked access to this object */ VF_LOCKING = 2, /* we want access to this object */ VF_WRITETHROUGH = 8, /* volume: write through */ VF_INITED = 0x10, /* unit has been initialized */ VF_WLABEL = 0x20, /* label area is writable */ VF_LABELLING = 0x40, /* unit is currently being labelled */ VF_WANTED = 0x80, /* someone is waiting to obtain a lock */ VF_RAW = 0x100, /* raw volume (no file system) */ VF_LOADED = 0x200, /* module is loaded */ VF_CONFIGURING = 0x400, /* somebody is changing the config */ VF_WILL_CONFIGURE = 0x800, /* somebody wants to change the config */ VF_CONFIG_INCOMPLETE = 0x1000, /* haven't finished changing the config */ VF_CONFIG_SETUPSTATE = 0x2000, /* set a volume up if all plexes are empty */ VF_READING_CONFIG = 0x4000, /* we're reading config database from disk */ VF_KERNELOP = 0x8000, /* we're performing ops from kernel space */ VF_DIRTYCONFIG = 0x10000, /* config needs updating */ }; /* Global configuration information for the vinum subsystem */ struct _vinum_conf { /* Pointers to vinum structures */ struct drive *drive; struct sd *sd; struct plex *plex; struct volume *volume; /* the number allocated */ int drives_allocated; int subdisks_allocated; int plexes_allocated; int volumes_allocated; /* and the number currently in use */ int drives_used; int subdisks_used; int plexes_used; int volumes_used; int flags; int opencount; /* number of times we've been opened */ #if DEBUG int lastrq; struct buf *lastbuf; struct rqinfo **rqipp; struct rqinfo *rqinfop; #endif }; /* Use these defines to simplify code */ #define DRIVE vinum_conf.drive #define SD vinum_conf.sd #define PLEX vinum_conf.plex #define VOL vinum_conf.volume #define VFLAGS vinum_conf.flags /* Slice header * Vinum drives start with this structure: * *\ Sector * |--------------------------------------| * | PDP-11 memorial boot block | 0 * |--------------------------------------| * | Disk label, maybe | 1 * |--------------------------------------| * | Slice definition (vinum_hdr) | 2 * |--------------------------------------| * | | * | Configuration info, first copy | 3 * | | * |--------------------------------------| * | | * | Configuration info, second copy | 3 + size of config * | | * |--------------------------------------| */ /* Sizes and offsets of our information */ enum { VINUM_LABEL_OFFSET = 4096, /* offset of vinum label */ VINUMHEADERLEN = 512, /* size of vinum label */ VINUM_CONFIG_OFFSET = 4608, /* offset of first config copy */ MAXCONFIG = 65536, /* and size of config copy */ DATASTART = (MAXCONFIG * 2 + VINUM_CONFIG_OFFSET) / DEV_BSIZE /* this is where the data starts */ }; /* hostname is 256 bytes long, but we don't need to shlep * multiple copies in vinum. We use the host name just * to identify this system, and 32 bytes should be ample * for that purpose */ struct vinum_label { char sysname[VINUMHOSTNAMELEN]; /* system name at time of creation */ char name[MAXDRIVENAME]; /* our name of the drive */ struct timeval date_of_birth; /* the time it was created */ struct timeval last_update; /* and the time of last update */ off_t drive_size; /* total size in bytes of the drive. * This value includes the headers */ }; struct vinum_hdr { long long magic; /* we're long on magic numbers */ /* XXX Get these right for big-endian */ #define VINUM_MAGIC 22322600044678729LL /* should be this */ #define VINUM_NOMAGIC 22322600044678990LL /* becomes this after obliteration */ int config_length; /* size in bytes of each copy of the * configuration info. * This must be a multiple of the sector size. */ struct vinum_label label; /* unique label */ }; /* Information returned from read_drive_label */ enum drive_label_info { DL_CANT_OPEN, /* invalid partition */ DL_NOT_OURS, /* valid partition, but no vinum label */ DL_DELETED_LABEL, /* valid partition, deleted label found */ DL_WRONG_DRIVE, /* drive name doesn't match */ DL_OURS /* valid partition and label found */ }; /*** Drive definitions ***/ /* A drive corresponds to a disk slice. We use a different term to show * the difference in usage: it doesn't have to be a slice, and could * theroretically be a complete, unpartitioned disk */ struct drive { enum drivestate state; /* current state */ int subdisks_allocated; /* number of entries in sd */ int subdisks_used; /* and the number used */ int blocksize; /* size of fs blocks */ u_int64_t sectors_available; /* number of sectors still available */ int secsperblock; int lasterror; /* last error on drive */ int driveno; /* index of drive in vinum_conf */ int opencount; /* number of up subdisks */ u_int64_t reads; /* number of reads on this drive */ u_int64_t writes; /* number of writes on this drive */ u_int64_t bytes_read; /* number of bytes read */ u_int64_t bytes_written; /* number of bytes written */ dev_t dev; /* and device number */ char devicename[MAXDRIVENAME]; /* name of the slice it's on */ struct vnode *vp; /* vnode pointer */ struct proc *p; struct vinum_label label; /* and the label information */ struct partinfo partinfo; /* partition information */ int freelist_size; /* number of entries alloced in free list */ int freelist_entries; /* number of entries used in free list */ struct drive_freelist { /* sorted list of free space on drive */ u_int64_t offset; long sectors; } *freelist; }; /*** Subdisk definitions ***/ struct sd { enum sdstate state; /* state */ /* offsets in blocks */ int64_t driveoffset; /* offset on drive */ int64_t plexoffset; /* offset in plex */ u_int64_t sectors; /* and length in sectors */ int plexno; /* index of plex, if it belongs */ int driveno; /* index of the drive on which it is located */ int sdno; /* our index in vinum_conf */ int pid; /* pid of process which opened us */ u_int64_t reads; /* number of reads on this subdisk */ u_int64_t writes; /* number of writes on this subdisk */ u_int64_t bytes_read; /* number of bytes read */ u_int64_t bytes_written; /* number of bytes written */ char name[MAXSDNAME]; /* name of subdisk */ }; /*** Plex definitions ***/ /* kinds of plex organization */ enum plexorg { plex_disorg, /* disorganized */ plex_concat, /* concatenated plex */ plex_striped, /* striped plex */ plex_raid5 /* RAID5 plex */ }; /* Region in plex (either defective or unmapped) */ struct plexregion { u_int64_t offset; /* start of region */ u_int64_t length; /* length */ }; struct plex { enum plexorg organization; /* Plex organization */ enum plexstate state; /* and current state */ u_int64_t length; /* total length of plex (max offset) */ int flags; int stripesize; /* size of stripe or raid band, in sectors */ int subdisks; /* number of associated subdisks */ int subdisks_allocated; /* number of subdisks allocated space for */ int *sdnos; /* list of component subdisks */ int plexno; /* index of plex in vinum_conf */ int volno; /* index of volume */ int volplexno; /* number of plex in volume */ int pid; /* pid of process which opened us */ /* Lock information */ int locks; /* number of locks used */ int alloclocks; /* number of locks allocated */ struct rangelock *lock; /* ranges of locked addresses */ /* Statistics */ u_int64_t reads; /* number of reads on this plex */ u_int64_t writes; /* number of writes on this plex */ u_int64_t bytes_read; /* number of bytes read */ u_int64_t bytes_written; /* number of bytes written */ u_int64_t multiblock; /* requests that needed more than one block */ u_int64_t multistripe; /* requests that needed more than one stripe */ /* revive parameters */ u_int64_t revived; /* block number of current revive request */ int revive_blocksize; /* revive block size (bytes) */ int revive_interval; /* and time to wait between transfers */ struct request *waitlist; /* list of requests waiting on revive op */ /* geometry control */ int defective_regions; /* number of regions which are defective */ int defective_region_count; /* number of entries in defective_region */ struct plexregion *defective_region; /* list of offset/length pairs: defective sds */ int unmapped_regions; /* number of regions which are missing */ int unmapped_region_count; /* number of entries in unmapped_region */ struct plexregion *unmapped_region; /* list of offset/length pairs: missing sds */ char name[MAXPLEXNAME]; /* name of plex */ }; /*** Volume definitions ***/ struct volume { enum volumestate state; /* current state */ int plexes; /* number of plexes */ int preferred_plex; /* plex to read from, -1 for round-robin */ int last_plex_read; /* index of plex used for last read, * for round-robin */ dev_t devno; /* device number */ int flags; /* status and configuration flags */ int opencount; /* number of opens (all the same process) */ int openflags; /* flags supplied to last open(2) */ u_int64_t size; /* size of volume */ int disk; /* disk index */ int blocksize; /* logical block size */ int active; /* number of outstanding requests active */ int subops; /* and the number of suboperations */ pid_t pid; /* pid of locker */ /* Statistics */ u_int64_t bytes_read; /* number of bytes read */ u_int64_t bytes_written; /* number of bytes written */ u_int64_t reads; /* number of reads on this volume */ u_int64_t writes; /* number of writes on this volume */ u_int64_t recovered_reads; /* reads recovered from another plex */ /* Unlike subdisks in the plex, space for the plex pointers is static */ int plex[MAXPLEX]; /* index of plexes */ char name[MAXVOLNAME]; /* name of volume */ struct disklabel label; /* for DIOCGPART */ }; /* Table expansion. Expand table, which contains oldcount * entries of type element, by increment entries, and change * oldcount accordingly */ #define EXPAND(table, element, oldcount, increment) \ { \ expand_table ((void **) &table, \ oldcount * sizeof (element), \ (oldcount + increment) * sizeof (element) ); \ oldcount += increment; \ } /* Information on vinum's memory usage */ struct meminfo { int mallocs; /* number of malloced blocks */ int total_malloced; /* total amount malloced */ int highwater; /* maximum number of mallocs */ struct mc *malloced; /* pointer to kernel table */ }; struct mc { int seq; int size; short line; short flags; #define ALLOC_KVA 1 /* allocated via kva calls */ int *databuf; /* really vm_object_t */ caddr_t address; char file[16]; }; /* These enums are used by the state transition * routines. They're in bit map format: * * Bit 0: Other plexes in the volume are down * Bit 1: Other plexes in the volume are up * Bit 2: The current plex is up * Maybe they should be local to * state.c */ enum volplexstate { volplex_onlyusdown = 0, /* we're the only plex, and we're down */ volplex_alldown, /* 1: another plex is down, and so are we */ volplex_otherup, /* 2: another plex is up */ volplex_otherupdown, /* other plexes are up and down */ volplex_onlyus, /* 4: we're up and alone */ volplex_onlyusup, /* only we are up, others are down */ volplex_allup, /* all plexes are up */ volplex_someup /* some plexes are up, including us */ }; /* state map for plex */ enum sdstates { sd_emptystate = 1, sd_downstate = 2, /* found an SD which is down */ sd_crashedstate = 4, /* found an SD which is crashed */ sd_obsoletestate = 8, /* found an SD which is obsolete */ sd_stalestate = 16, /* found an SD which is stale */ sd_rebornstate = 32, /* found an SD which is reborn */ sd_upstate = 64, /* found an SD which is up */ sd_initstate = 128, /* found an SD which is init */ sd_otherstate = 256 /* found an SD in some other state */ }; /* This is really just a parameter to pass to * set__state, but since it needs to be known * in the external definitions, we need to define * it here */ enum setstateflags { setstate_none = 0, /* no flags */ setstate_force = 1, /* force the state change */ setstate_configuring = 2, /* we're currently configuring, don't save */ setstate_recursing = 4, /* we're called from another setstate function */ setstate_norecurse = 8, /* don't call other setstate functions */ setstate_noupdate = 16 /* don't update config */ }; #ifdef DEBUG /* Debugging stuff */ #define DEBUG_ADDRESSES 1 #define DEBUG_NUMOUTPUT 2 #define DEBUG_RESID 4 /* go into debugger in complete_rqe */ #define DEBUG_LASTREQS 8 /* keep a circular buffer of last requests */ #define DEBUG_REMOTEGDB 256 /* go into remote gdb */ #endif