diff options
author | grog <grog@FreeBSD.org> | 1999-08-07 08:11:22 +0000 |
---|---|---|
committer | grog <grog@FreeBSD.org> | 1999-08-07 08:11:22 +0000 |
commit | 1068a4df87234f0f1203dfa8a37ce355fe656ec9 (patch) | |
tree | 845ed9a26acd293392d0166147aaa0f8567c9e88 /sys/dev/vinum/vinuminterrupt.c | |
parent | 8b624b0301181444da4c23345f9e8a153b2776b5 (diff) | |
download | FreeBSD-src-1068a4df87234f0f1203dfa8a37ce355fe656ec9.zip FreeBSD-src-1068a4df87234f0f1203dfa8a37ce355fe656ec9.tar.gz |
Import RAID-5 code.
Add Cybernet copyright.
OK'd-by: Chuck Jacobus <chuck@cybernet.com>
Diffstat (limited to 'sys/dev/vinum/vinuminterrupt.c')
-rw-r--r-- | sys/dev/vinum/vinuminterrupt.c | 227 |
1 file changed, 224 insertions, 3 deletions
diff --git a/sys/dev/vinum/vinuminterrupt.c b/sys/dev/vinum/vinuminterrupt.c index bc3f56c..963a11e 100644 --- a/sys/dev/vinum/vinuminterrupt.c +++ b/sys/dev/vinum/vinuminterrupt.c @@ -1,9 +1,13 @@ -/* interrupt.c: bottom half of the driver */ +/* vinuminterrupt.c: bottom half of the driver */ /*- - * Copyright (c) 1997, 1998 + * Copyright (c) 1997, 1998, 1999 * Nan Yang Computer Services Limited. All rights reserved. * + * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. + * + * Written by Greg Lehey + * * This software is distributed under the so-called ``Berkeley * License'': * @@ -35,7 +39,7 @@ * otherwise) arising in any way out of the use of this software, even if * advised of the possibility of such damage. * - * $Id: vinuminterrupt.c,v 1.5 1999/03/16 03:40:25 grog Exp grog $ + * $Id: vinuminterrupt.c,v 1.6 1999/06/18 00:50:53 grog Exp grog $ */ #include <dev/vinum/vinumhdr.h> @@ -112,6 +116,46 @@ complete_rqe(struct buf *bp) PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount; } rqg->active--; /* one less request active */ + if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */ + int *sdata; /* source */ + int *data; /* and group data */ + int length; /* and count involved */ + int count; /* loop counter */ + struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */ + + /* XOR destination is the user data */ + sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */ + data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */ + length = urqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ + + for (count = 0; count < length; count++) + data[count] ^= sdata[count]; + +#ifdef VINUMDEBUG + if (debug & DEBUG_RESID) { + if ((rqg->active == 0) /* XXXX finished this group */ + &&(*(char *) data != '<')) /* and not what we expected */ + Debugger("complete_request checksum"); + } +#endif + + /* + * In a normal read, we will normally read directly + * into the 
user buffer. This doesn't work if + * we're also doing a recovery, so we have to + * copy it + */ + if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */ + char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */ + char *dst; + + dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */ + length = rqe->datalen << DEV_BSHIFT; /* and count involved */ + bcopy(src, dst, length); /* move it */ + } + } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 5 group write operation */ + &&(rqg->active == 0)) /* and we've finished phase 1 */ + complete_raid5_write(rqe); if (rqg->active == 0) /* request group finished, */ rq->active--; /* one less */ if (rq->active == 0) { /* request finished, */ @@ -208,3 +252,180 @@ sdio_done(struct buf *bp) } Free(sbp); } + +/* Start the second phase of a RAID5 group write operation. */ +/* + * XXX This could be improved on. It's quite CPU intensive, + * and doing it at the end tends to lump it all together. + * We should do this a transfer at a time + */ +void +complete_raid5_write(struct rqelement *rqe) +{ + int *sdata; /* source */ + int *pdata; /* and parity block data */ + int length; /* and count involved */ + int count; /* loop counter */ + int rqno; /* request index */ + int rqoffset; /* offset of request data from parity data */ + struct buf *bp; /* user buffer header */ + struct request *rq; /* pointer to our request */ + struct rqgroup *rqg; /* and to the request group */ + struct rqelement *prqe; /* point to the parity block */ + struct drive *drive; /* drive to access */ + + rqg = rqe->rqg; /* and to our request group */ + rq = rqg->rq; /* point to our request */ + bp = rq->bp; /* user's buffer header */ + prqe = &rqg->rqe[0]; /* point to the parity block */ + + /* + * If we get to this function, we have normal or + * degraded writes, or a combination of both. 
We do + * the same thing in each case: we perform an + * exclusive or to the parity block. The only + * difference is the origin of the data and the + * address range. + */ + + if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */ + pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */ + bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */ + + /* Now get what data we need from each block */ + for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ + /* + * This can do with improvement. If we're doing + * both a degraded and a normal write, we don't + * need to xor (nor to read) the part of the block + * that we're going to overwrite. FIXME XXX + */ + rqe = &rqg->rqe[rqno]; /* this request */ + sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */ + length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ + + /* + * add the data block to the parity block. Before + * we started the request, we zeroed the parity + * block, so the result of adding all the other + * blocks and the block we want to write will be + * the correct parity block. 
+ */ + /* XXX do this in assembler */ + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */ + &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */ + Free(rqe->b.b_data); /* free it now */ + rqe->flags &= ~XFR_MALLOCED; + } + } + } + if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */ + /* Get what data we need from each block */ + for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ + rqe = &rqg->rqe[rqno]; /* this request */ + if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE)) + == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */ + sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */ + rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */ + pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */ + length = rqe->datalen << (DEV_BSHIFT - 2); /* and count involved */ + /* + * "remove" the old data block + * from the parity block + */ + /* XXX do this in assembler */ + if ((pdata < ((int *) prqe->b.b_data)) + || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount))) + || (sdata < ((int *) rqe->b.b_data)) + || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount)))) + Debugger("Bounds overflow"); /* XXX */ + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + + /* "add" the new data block */ + sdata = (int *) (&bp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */ + if ((sdata < ((int *) bp->b_data)) + || (&sdata[length] > ((int *) (bp->b_data + bp->b_bcount)))) + Debugger("Bounds overflow"); /* XXX */ + for (count = 0; count < length; count++) + pdata[count] ^= sdata[count]; + + /* Free the malloced buffer */ + if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */ + Free(rqe->b.b_data); /* free it */ + rqe->flags &= 
~XFR_MALLOCED; + } else + Debugger("not malloced"); /* XXX */ + + if ((rqe->b.b_flags & B_READ) /* this was a read */ + &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */ + rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */ + rqe->b.b_flags |= B_CALL; /* call us when you're done */ + rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ + rqe->b.b_data = &bp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */ + rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */ + rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */ + rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ + rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */ + rqg->active++; /* another active request */ + rqe->b.b_vp->v_numoutput++; /* one more output going */ + drive = &DRIVE[rqe->driveno]; /* drive to access */ +#if VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", + rqe->b.b_flags & B_READ ? 
"Read" : "Write", + major(rqe->b.b_dev), + minor(rqe->b.b_dev), + rqe->sdno, + (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), + rqe->b.b_blkno, + rqe->b.b_bcount); /* XXX */ + if (debug & DEBUG_NUMOUTPUT) + log(LOG_DEBUG, + " raid5.2 sd %d numoutput %ld\n", + rqe->sdno, + rqe->b.b_vp->v_numoutput); + if (debug & DEBUG_LASTREQS) + logrq(loginfo_raid5_data, (union rqinfou) rqe, bp); +#endif + (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b); + } + } + } + } + /* Finally, write the parity block */ + rqe = &rqg->rqe[0]; + rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */ + rqe->b.b_flags |= B_CALL; /* call us when you're done */ + rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ + rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought use here */ + rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */ + rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */ + rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ + rqg->active++; /* another active request */ + rqe->b.b_vp->v_numoutput++; /* one more output going */ + drive = &DRIVE[rqe->driveno]; /* drive to access */ +#if VINUMDEBUG + if (debug & DEBUG_ADDRESSES) + log(LOG_DEBUG, + " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", + rqe->b.b_flags & B_READ ? "Read" : "Write", + major(rqe->b.b_dev), + minor(rqe->b.b_dev), + rqe->sdno, + (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), + rqe->b.b_blkno, + rqe->b.b_bcount); /* XXX */ + if (debug & DEBUG_NUMOUTPUT) + log(LOG_DEBUG, + " raid5.3 sd %d numoutput %ld\n", + rqe->sdno, + rqe->b.b_vp->v_numoutput); + if (debug & DEBUG_LASTREQS) + logrq(loginfo_raid5_parity, (union rqinfou) rqe, bp); +#endif + (*bdevsw(rqe->b.b_dev)->d_strategy) (&rqe->b); +} |