diff options
author | grog <grog@FreeBSD.org> | 1999-08-07 08:11:22 +0000 |
---|---|---|
committer | grog <grog@FreeBSD.org> | 1999-08-07 08:11:22 +0000 |
commit | 1068a4df87234f0f1203dfa8a37ce355fe656ec9 (patch) | |
tree | 845ed9a26acd293392d0166147aaa0f8567c9e88 /sys/dev/vinum | |
parent | 8b624b0301181444da4c23345f9e8a153b2776b5 (diff) | |
download | FreeBSD-src-1068a4df87234f0f1203dfa8a37ce355fe656ec9.zip FreeBSD-src-1068a4df87234f0f1203dfa8a37ce355fe656ec9.tar.gz |
Import RAID-5 code.
Add Cybernet copyright.
OK'd-by: Chuck Jacobus <chuck@cybernet.com>
Diffstat (limited to 'sys/dev/vinum')
-rw-r--r-- | sys/dev/vinum/vinuminterrupt.c | 227 | ||||
-rw-r--r-- | sys/dev/vinum/vinumlock.c | 68 | ||||
-rw-r--r-- | sys/dev/vinum/vinumrevive.c | 87 |
3 files changed, 375 insertions, 7 deletions
diff --git a/sys/dev/vinum/vinuminterrupt.c b/sys/dev/vinum/vinuminterrupt.c index bc3f56c..963a11e 100644 --- a/sys/dev/vinum/vinuminterrupt.c +++ b/sys/dev/vinum/vinuminterrupt.c @@ -1,9 +1,13 @@ -/* interrupt.c: bottom half of the driver */ +/* vinuminterrupt.c: bottom half of the driver */ /*- - * Copyright (c) 1997, 1998 + * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey + * * This software is distributed under the so-called ``Berkeley * License'': * @@ -35,7 +39,7 @@ * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: vinuminterrupt.c,v 1.5 1999/03/16 03:40:25 grog Exp grog $ + * $Id: vinuminterrupt.c,v 1.6 1999/06/18 00:50:53 grog Exp grog $ */ #include <dev/vinum/vinumhdr.h> @@ -112,6 +116,46 @@ complete_rqe(struct buf *bp) PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount; } rqg->active--; /* one less request active */ + if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */ + int *sdata; /* source */ + int *data; /* and group data */ + int length; /* and count involved */ + int count; /* loop counter */ + struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */ + + /* XOR destination is the user data */ + sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */ + data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */ + length = urqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ + + for (count = 0; count < length; count++) + data[count] ^= sdata[count]; + +#ifdef VINUMDEBUG + if (debug & DEBUG_RESID) { + if ((rqg->active == 0) /* XXXX finished this group */ + &&(*(char *) data != '<')) /* and not what we expected */ + Debugger("complete_request checksum"); + } +#endif + + /* + * In a normal read, we will normally read directly + * into the user buffer. This doesn't work if + * we're also doing a recovery, so we have to + * copy it + */ + if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */ + char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */ + char *dst; + + dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */ + length = rqe->datalen << DEV_BSHIFT; /* and count involved */ + bcopy(src, dst, length); /* move it */ + } + } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 5 group write operation */ + &&(rqg->active == 0)) /* and we've finished phase 1 */ + complete_raid5_write(rqe); if (rqg->active == 0) /* request group finished, */ rq->active--; /* one less */ if (rq->active == 0) { /* request finished, */ @@ -208,3 +252,180 @@ sdio_done(struct buf *bp) } Free(sbp); } + +/* Start the second phase of a RAID5 group write operation. */ +/* + * XXX This could be improved on. It's quite CPU intensive, + * and doing it at the end tends to lump it all together. + * We should do this a transfer at a time + */ +void +complete_raid5_write(struct rqelement *rqe) +{ + int *sdata; /* source */ + int *pdata; /* and parity block data */ + int length; /* and count involved */ + int count; /* loop counter */ + int rqno; /* request index */ + int rqoffset; /* offset of request data from parity data */ + struct buf *bp; /* user buffer header */ + struct request *rq; /* pointer to our request */ + struct rqgroup *rqg; /* and to the request group */ + struct rqelement *prqe; /* point to the parity block */ + struct drive *drive; /* drive to access */ + + rqg = rqe->rqg; /* and to our request group */ + rq = rqg->rq; /* point to our request */ + bp = rq->bp; /* user's buffer header */ + prqe = &rqg->rqe[0]; /* point to the parity block */ + + /* + * If we get to this function, we have normal or + * degraded writes, or a combination of both. We do + * the same thing in each case: we perform an + * exclusive or to the parity block. The only + * difference is the origin of the data and the + * address range. + */ + + if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */ + pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */ + bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */ + + /* Now get what data we need from each block */ + for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ + /* + * This can do with improvement. If we're doing + * both a degraded and a normal write, we don't + * need to xor (nor to read) the part of the block + * that we're going to overwrite. FIXME XXX + */ + rqe = &rqg->rqe[rqno]; /* this request */ + sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */ + length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ + + /* + * add the data block to the parity block. Before + * we started the request, we zeroed the parity + * block, so the result of adding all the other + * blocks and the block we want to write will be + * the correct parity block. + */ + /* XXX do this in assembler */ + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */ + &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */ + Free(rqe->b.b_data); /* free it now */ + rqe->flags &= ~XFR_MALLOCED; + } + } + } + if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */ + /* Get what data we need from each block */ + for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ + rqe = &rqg->rqe[rqno]; /* this request */ + if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE)) + == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */ + sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */ + rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */ + pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */ + length = rqe->datalen << (DEV_BSHIFT - 2); /* and count involved */ + /* + * "remove" the old data block + * from the parity block + */ + /* XXX do this in assembler */ + if ((pdata < ((int *) prqe->b.b_data)) + || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount))) + || (sdata < ((int *) rqe->b.b_data)) + || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount)))) + Debugger("Bounds overflow"); /* XXX */ + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + + /* "add" the new data block */ + sdata = (int *) (&bp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */ + if ((sdata < ((int *) bp->b_data)) + || (&sdata[length] > ((int *) (bp->b_data + bp->b_bcount)))) + Debugger("Bounds overflow"); /* XXX */ + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + + /* Free the malloced buffer */ + if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */ + Free(rqe->b.b_data); /* free it */ + rqe->flags &= ~XFR_MALLOCED; + } else + Debugger("not malloced"); /* XXX */ + + if ((rqe->b.b_flags & B_READ) /* this was a read */ + &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */ + rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */ + rqe->b.b_flags |= B_CALL; /* call us when you're done */ + rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ + rqe->b.b_data = &bp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */ + rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */ + rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */ + rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ + rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */ + rqg->active++; /* another active request */ + rqe->b.b_vp->v_numoutput++; /* one more output going */ + drive = &DRIVE[rqe->driveno]; /* drive to access */ +#if VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", + rqe->b.b_flags & B_READ ? "Read" : "Write", + major(rqe->b.b_dev), + minor(rqe->b.b_dev), + rqe->sdno, + (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), + rqe->b.b_blkno, + rqe->b.b_bcount); /* XXX */ + if (debug & DEBUG_NUMOUTPUT) + log(LOG_DEBUG, + " raid5.2 sd %d numoutput %ld\n", + rqe->sdno, + rqe->b.b_vp->v_numoutput); + if (debug & DEBUG_LASTREQS) + logrq(loginfo_raid5_data, (union rqinfou) rqe, bp); +#endif + (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b); + } + } + } + } + /* Finally, write the parity block */ + rqe = &rqg->rqe[0]; + rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */ + rqe->b.b_flags |= B_CALL; /* call us when you're done */ + rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ + rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ + rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */ + rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */ + rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ + rqg->active++; /* another active request */ + rqe->b.b_vp->v_numoutput++; /* one more output going */ + drive = &DRIVE[rqe->driveno]; /* drive to access */ +#if VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", + rqe->b.b_flags & B_READ ? "Read" : "Write", + major(rqe->b.b_dev), + minor(rqe->b.b_dev), + rqe->sdno, + (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), + rqe->b.b_blkno, + rqe->b.b_bcount); /* XXX */ + if (debug & DEBUG_NUMOUTPUT) + log(LOG_DEBUG, + " raid5.3 sd %d numoutput %ld\n", + rqe->sdno, + rqe->b.b_vp->v_numoutput); + if (debug & DEBUG_LASTREQS) + logrq(loginfo_raid5_parity, (union rqinfou) rqe, bp); +#endif + (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b); +} diff --git a/sys/dev/vinum/vinumlock.c b/sys/dev/vinum/vinumlock.c index 01e9812..fe03157 100644 --- a/sys/dev/vinum/vinumlock.c +++ b/sys/dev/vinum/vinumlock.c @@ -2,6 +2,10 @@ * Copyright (c) 1997, 1998 * Nan Yang Computer Services Limited. All rights reserved. * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey + * * This software is distributed under the so-called ``Berkeley * License'': * @@ -33,7 +37,7 @@ * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: vinumlock.c,v 1.9 1999/03/13 03:26:00 grog Exp grog $ + * $Id: vinumlock.c,v 1.10 1999/05/15 03:47:45 grog Exp grog $ */ #include <dev/vinum/vinumhdr.h> @@ -176,6 +180,68 @@ unlockplex(struct plex *plex) } } +#define LOCK_UNALLOC -1 /* mark unused lock entries */ + +/* Lock an address range in a plex, wait if it's in use */ +int +lockrange(struct plex *plex, off_t first, off_t last) +{ + int lock; + int pos = -1; /* place to insert */ + + lockplex(plex); /* diddle one at a time */ + if (plex->locks >= plex->alloclocks) + EXPAND(plex->lock, struct rangelock, plex->alloclocks, INITIAL_LOCKS) + unlockplex(plex); + for (;;) { + lockplex(plex); + for (lock = 0; lock < plex->locks; lock++) { + if (plex->lock[lock].first == LOCK_UNALLOC) /* empty place */ + pos = lock; /* a place to put this one */ + else if ((plex->lock[lock].first < last) + && (plex->lock[lock].last > first)) { /* overlap, */ + unlockplex(plex); + tsleep(((caddr_t *) & lockrange) + plex->sdnos[0], PRIBIO | PCATCH, "vrlock", 0); + break; /* out of the inner level loop */ + } + } + if (lock == plex->locks) /* made it to the end, */ + break; + } + + /* + * The address range is free, and the plex is locked. + * Add our lock entry + */ + if (pos == -1) { /* no free space, */ + pos = lock; /* put it at the end */ + plex->locks++; + } + plex->lock[pos].first = first; + plex->lock[pos].last = last; + unlockplex(plex); + return 0; +} + +/* Unlock a volume and let the next one at it */ +void +unlockrange(struct plex *plex, off_t first, off_t last) +{ + int lock; + + lockplex(plex); + for (lock = 0; lock < plex->locks; lock++) { + if ((plex->lock[lock].first == first) + && (plex->lock[lock].last == last)) { /* found our lock */ + plex->lock[lock].first = LOCK_UNALLOC; /* not used */ + break; /* out of the inner level loop */ + } + } + if (lock == plex->locks) /* made it to the end, */ + panic("vinum: unlock without lock"); + + unlockplex(plex); +} /* Get a lock for the global config, wait if it's not available */ int diff --git a/sys/dev/vinum/vinumrevive.c b/sys/dev/vinum/vinumrevive.c index 9eec5f5..32a30d2 100644 --- a/sys/dev/vinum/vinumrevive.c +++ b/sys/dev/vinum/vinumrevive.c @@ -1,7 +1,11 @@ /*- - * Copyright (c) 1997, 1998 + * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey + * * This software is distributed under the so-called ``Berkeley * License'': * @@ -33,7 +37,7 @@ * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: vinumrevive.c,v 1.7 1999/02/28 02:12:18 grog Exp grog $ + * $Id: vinumrevive.c,v 1.8 1999/06/28 01:57:50 grog Exp grog $ */ #include <dev/vinum/vinumhdr.h> @@ -60,6 +64,9 @@ revive_block(int sdno) int size; /* size of revive block, bytes */ int s; /* priority level */ daddr_t plexblkno; /* lblkno in plex */ + int psd; /* parity subdisk number */ + int stripe; /* stripe number */ + int isparity = 0; /* set if this is the parity stripe */ plexblkno = 0; /* to keep the compiler happy */ sd = &SD[sdno]; @@ -116,10 +123,84 @@ revive_block(int sdno) break; case plex_raid5: + stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */ + plexblkno = sd->plexoffset /* base */ + + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */ + +sd->revived % plex->stripesize; /* offset from beginning of stripe */ + stripe = (sd->revived / plex->stripesize); /* stripe number */ + psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ + isparity = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */ + /* + * Now adjust for the strangenesses + * in RAID-5 striping + */ + if (sd->plexsdno > psd) /* beyond the parity stripe, */ + plexblkno -= plex->stripesize; /* one stripe less */ + break; case plex_disorg: /* to keep the compiler happy */ } - { + if (isparity) { /* we're reviving a parity block, */ + int mysdno; + int *tbuf; /* temporary buffer to read the stuff in to */ + caddr_t parity_buf; /* the address supplied by geteblk */ + int isize; + int i; + + tbuf = (int *) Malloc(size); + isize = size / (sizeof(int)); /* number of ints in the buffer */ + /* + * We have calculated plexblkno assuming it + * was a data block. Go back to the beginning + * of the band + */ + plexblkno -= plex->stripesize * sd->plexsdno; + + /* + * Read each subdisk in turn, except for + * this one, and xor them together + */ + parity_buf = bp->b_data; /* save the buffer getblk gave us */ + bzero(parity_buf, size); /* start with nothing */ + bp->b_data = (caddr_t) tbuf; /* read into here */ + for (mysdno = 0; mysdno < plex->subdisks; mysdno++) { /* for each subdisk */ + if (mysdno != sdno) { /* not our subdisk */ + if (vol != NULL) /* it's part of a volume, */ + /* + * First, read the data from the volume. We don't + * care which plex, that's the driver's job + */ + bp->b_dev = VINUMBDEV(plex->volno, 0, 0, VINUM_VOLUME_TYPE); /* create the device number */ + else /* it's an unattached plex */ + bp->b_dev = VINUMRBDEV(sd->plexno, VINUM_RAWPLEX_TYPE); /* create the device number */ + + bp->b_blkno = plexblkno; /* read from here */ + bp->b_flags = B_READ; /* either way, read it */ + BUF_LOCKINIT(bp); /* get a lock for the buffer */ + BUF_LOCK(bp, LK_EXCLUSIVE); /* and lock it */ + vinumstart(bp, 1); + biowait(bp); + if (bp->b_flags & B_ERROR) /* can't read, */ + /* + * If we have a read error, there's nothing + * we can do. By this time, the daemon has + * already run out of magic + */ + break; + /* + * To save time, we do the XOR wordwise. This + * requires sectors to be a multiple of the + * length of an int, which is currently always + * the case + */ + for (i = 0; i < isize; i++) + ((int *) parity_buf)[i] ^= tbuf[i]; /* xor in the buffer */ + plexblkno += plex->stripesize; /* move on to the next subdisk */ + } + } + bp->b_data = parity_buf; /* put the buf header back the way it was */ + Free(tbuf); + } else { bp->b_blkno = plexblkno; /* start here */ if (vol != NULL) /* it's part of a volume, */ /* |