Diffstat (limited to 'sys/dev/raidframe/rf_pqdegdags.c')
-rw-r--r-- | sys/dev/raidframe/rf_pqdegdags.c | 430 |
1 files changed, 430 insertions, 0 deletions
diff --git a/sys/dev/raidframe/rf_pqdegdags.c b/sys/dev/raidframe/rf_pqdegdags.c
new file mode 100644
index 0000000..e0d97ed
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdegdags.c
@@ -0,0 +1,430 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_pqdegdags.c
+ * Degraded mode dags for double fault cases.
+*/
+
+
+#include <dev/raidframe/rf_archs.h>
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_pqdegdags.h>
+#include <dev/raidframe/rf_pq.h>
+
+static void
+applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
+    RF_PhysDiskAddr_t * qpda, void *bp);
+
+/*
+   Two data drives have failed, and we are doing a read that covers one of them.
+   We may also be reading some of the surviving drives.
+
+
+ *****************************************************************************************
+ *
+ * creates a DAG to perform a degraded-mode read of data within one stripe.
+ * This DAG is as follows:
+ *
+ *                        Hdr
+ *                         |
+ *                       Block
+ *              / /        \  \  \  \
+ *            Rud ... Rud Rrd ... Rrd Rp Rq
+ *            | \     | \ | \     | \ | \ | \
+ *
+ *            |                   |
+ *         Unblock                X
+ *              \                /
+ *               ------ T ------
+ *
+ * Each R node is a successor of the L node
+ * One successor arc from each R node goes to U, and the other to X
+ * There is one Rud for each chunk of surviving user data requested by the user,
+ * and one Rrd for each chunk of surviving user data _not_ being read by the user
+ * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data
+ * X = pq recovery node, T = terminate
+ *
+ * The block & unblock nodes are leftovers from a previous version.  They
+ * do nothing, but I haven't deleted them because it would be a tremendous
+ * effort to put them back in.
+ *
+ * Note:  The target buffer for the XOR node is set to the actual user buffer where the
+ * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
+ * if you create a degraded read dag, use it, and then re-use it, you have to be sure to
+ * zero the target buffer prior to the re-use.
+ *
+ * Every buffer read is passed to the pq recovery node, whose job it is to sort out
+ * what's needed and what's not.
+ ****************************************************************************************/
+/* init a disk node with 2 successors and one predecessor */
+#define INIT_DISK_NODE(node,name) \
+rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
+(node)->succedents[0] = unblockNode; \
+(node)->succedents[1] = recoveryNode; \
+(node)->antecedents[0] = blockNode; \
+(node)->antType[0] = rf_control
+
+#define DISK_NODE_PARAMS(_node_,_p_) \
+  (_node_).params[0].p = _p_ ; \
+  (_node_).params[1].p = (_p_)->bufPtr; \
+  (_node_).params[2].v = parityStripeID; \
+  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
+
+#define DISK_NODE_PDA(node) ((node)->params[0].p)
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
+{
+        rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
+            "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
+}
+
+static void
+applyPDA(raidPtr, pda, ppda, qpda, bp)
+        RF_Raid_t *raidPtr;
+        RF_PhysDiskAddr_t *pda;
+        RF_PhysDiskAddr_t *ppda;
+        RF_PhysDiskAddr_t *qpda;
+        void *bp;
+{
+        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+        RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
+        RF_SectorCount_t s0len = ppda->numSector, len;
+        RF_SectorNum_t suoffset;
+        unsigned coeff;
+        char *pbuf = ppda->bufPtr;
+        char *qbuf = qpda->bufPtr;
+        char *buf;
+        int delta;
+
+        suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+        len = pda->numSector;
+        /* see if pda intersects a recovery pda */
+        if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
+                buf = pda->bufPtr;
+                coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
+                coeff = (coeff % raidPtr->Layout.numDataCol);
+
+                if (suoffset < s0off) {
+                        delta = s0off - suoffset;
+                        buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
+                        suoffset = s0off;
+                        len -= delta;
+                }
+                if (suoffset > s0off) {
+                        delta = suoffset - s0off;
+                        pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
+                        qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
+                }
+                if ((suoffset + len) > (s0len + s0off))
+                        len = s0len + s0off - suoffset;
+
+                /* src, dest, len */
+                rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
+
+                /* dest, src, len, coeff */
+                rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
+        }
+}
+/*
+   Recover data in the case of a double failure.  There can be two
+   result buffers, one for each chunk of data trying to be recovered.
+   The params are pda's that have not been range restricted or otherwise
+   politely massaged - this should be done here.  The last params are the
+   pdas of P and Q, followed by the raidPtr.  The list can look like
+
+   pda, pda, ... , p pda, q pda, raidptr, asm
+
+   or
+
+   pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
+
+   depending on whether two chunks of recovery data were required.
+
+   The second condition only arises if there are two failed buffers
+   whose lengths do not add up to a stripe unit.
+*/
+
+
+int
+rf_PQDoubleRecoveryFunc(node)
+        RF_DagNode_t *node;
+{
+        int np = node->numParams;
+        RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
+        RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
+        RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+        int d, i;
+        unsigned coeff;
+        RF_RaidAddr_t sosAddr, suoffset;
+        RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
+        int two = 0;
+        RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
+        char *buf;
+        int numDataCol = layoutPtr->numDataCol;
+        RF_Etimer_t timer;
+        RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+        RF_ETIMER_START(timer);
+
+        if (asmap->failedPDAs[1] &&
+            (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
+                RF_ASSERT(0);
+                ppda = node->params[np - 6].p;
+                ppda2 = node->params[np - 5].p;
+                qpda = node->params[np - 4].p;
+                qpda2 = node->params[np - 3].p;
+                d = (np - 6);
+                two = 1;
+        } else {
+                ppda = node->params[np - 4].p;
+                qpda = node->params[np - 3].p;
+                d = (np - 4);
+        }
+
+        for (i = 0; i < d; i++) {
+                pda = node->params[i].p;
+                buf = pda->bufPtr;
+                suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+                len = pda->numSector;
+                coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+                /* compute the data unit offset within the column */
+                coeff = (coeff % raidPtr->Layout.numDataCol);
+                /* see if pda intersects a recovery pda */
+                applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
+                if (two)
+                        applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
+        }
+
+        /* ok, we got the parity back to the point where we can recover.  We
+         * now need to determine the coeff of the columns that need to be
+         * recovered.  We may also need to recover only a single stripe unit. */
+
+        if (asmap->failedPDAs[1] == NULL) {     /* only a single stripe unit
+                                                 * to recover. */
+                pda = asmap->failedPDAs[0];
+                sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+                /* need to determine the column of the other failed disk */
+                coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+                /* compute the data unit offset within the column */
+                coeff = (coeff % raidPtr->Layout.numDataCol);
+                for (i = 0; i < numDataCol; i++) {
+                        npda.raidAddress = sosAddr + (i * secPerSU);
+                        (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+                        /* skip over dead disks */
+                        if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+                                if (i != coeff)
+                                        break;
+                }
+                RF_ASSERT(i < numDataCol);
+                RF_ASSERT(two == 0);
+                /* recover the data.  Since we only need to recover one
+                 * column, we overwrite the parity with the other one. */
+                if (coeff < i)  /* recovering 'a' */
+                        rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
+                else            /* recovering 'b' */
+                        rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
+        } else
+                RF_PANIC();
+
+        RF_ETIMER_STOP(timer);
+        RF_ETIMER_EVAL(timer);
+        if (tracerec)
+                tracerec->q_us += RF_ETIMER_VAL_US(timer);
+        rf_GenericWakeupFunc(node, 0);
+        return (0);
+}
+
+int
+rf_PQWriteDoubleRecoveryFunc(node)
+        RF_DagNode_t *node;
+{
+        /* The situation:
+         *
+         * We are doing a write that hits only one failed data unit.
+         * The other failed data unit is not being overwritten, so we need to
+         * generate it.
+         *
+         * For the moment, we assume all the nonfailed data being written is in
+         * the shadow of the failed data unit.  (i.e., either a single data
+         * unit write or the entire failed stripe unit is being overwritten. )
+         *
+         * Recovery strategy: apply the recovery data to the parity and q.  Use P
+         * & Q to recover the second failed data unit in P.  Zero fill Q, then
+         * apply the recovered data to q.  Then apply the data being written to
+         * the failed drive.  Then walk through the surviving drives, applying
+         * new data when it exists, otherwise the recovery data.  Quite a mess.
+         *
+         *
+         * The params
+         *
+         * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
+         * write pda (numStripeUnitAccess - numDataFailed), failed pda,
+         * raidPtr, asmap */
+
+        int np = node->numParams;
+        RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
+        RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
+        RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+        int i;
+        RF_RaidAddr_t sosAddr;
+        unsigned coeff;
+        RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+        RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
+        int numDataCol = layoutPtr->numDataCol;
+        RF_Etimer_t timer;
+        RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+        RF_ASSERT(node->numResults == 2);
+        RF_ASSERT(asmap->failedPDAs[1] == NULL);
+        RF_ETIMER_START(timer);
+        ppda = node->results[0];
+        qpda = node->results[1];
+        /* apply the recovery data */
+        for (i = 0; i < numDataCol - 2; i++)
+                applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
+
+        /* determine the other failed data unit */
+        pda = asmap->failedPDAs[0];
+        sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+        /* need to determine the column of the other failed disk */
+        coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+        /* compute the data unit offset within the column */
+        coeff = (coeff % raidPtr->Layout.numDataCol);
+        for (i = 0; i < numDataCol; i++) {
+                npda.raidAddress = sosAddr + (i * secPerSU);
+                (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+                /* skip over dead disks */
+                if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+                        if (i != coeff)
+                                break;
+        }
+        RF_ASSERT(i < numDataCol);
+        /* recover the data.  The column we want to recover we write over the
+         * parity.  The column we don't care about we dump in q. */
+        if (coeff < i)          /* recovering 'a' */
+                rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
+        else                    /* recovering 'b' */
+                rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
+
+        /* OK.  The valid data is in P.  Zero fill Q, then inc P into it. */
+        bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
+        rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
+
+        /* now apply all the write data to the buffer */
+        /* single stripe unit write case: the failed data is the only thing we
+         * are writing.
+         */
+        RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
+        /* dest, src, len, coeff */
+        rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
+        rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
+
+        /* now apply all the recovery data */
+        for (i = 0; i < numDataCol - 2; i++)
+                applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
+
+        RF_ETIMER_STOP(timer);
+        RF_ETIMER_EVAL(timer);
+        if (tracerec)
+                tracerec->q_us += RF_ETIMER_VAL_US(timer);
+
+        rf_GenericWakeupFunc(node, 0);
+        return (0);
+}
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
+{
+        RF_PANIC();
+}
+/*
+   Two lost data unit write case.
+
+   There are really two cases here:
+
+   (1) The write completely covers the two lost data units.
+       In that case, a reconstruct write that doesn't write the
+       failed data units will do the correct thing.  So in this case,
+       the dag looks like
+
+            full stripe read of surviving data units (not being overwritten)
+            write new data (ignoring failed units) compute P&Q
+            write P&Q
+
+
+   (2) The write does not completely cover both failed data units
+       (but touches at least one of them).  Then we need to do the
+       equivalent of a reconstruct read to recover the missing data
+       unit from the other stripe.
+
+       For any data we are writing that is not in the "shadow"
+       of the failed units, we need to do a four cycle update.
+       PANIC on this case for now.
+
+*/
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
+{
+        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+        RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
+        int sum;
+        int nf = asmap->numDataFailed;
+
+        sum = asmap->failedPDAs[0]->numSector;
+        if (nf == 2)
+                sum += asmap->failedPDAs[1]->numSector;
+
+        if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
+                /* large write case */
+                rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
+                return;
+        }
+        if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
+                /* small write case, no user data not in shadow */
+                rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
+                return;
+        }
+        RF_PANIC();
+}
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
+{
+        rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
+}
+#endif                          /* (RF_INCLUDE_DECL_PQ > 0) ||
+                                 * (RF_INCLUDE_RAID6 > 0) */
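
Editor's note: as a reading aid for the recovery arithmetic performed by rf_PQDoubleRecoveryFunc and rf_PQWriteDoubleRecoveryFunc above, the following is a minimal, self-contained sketch (not RAIDframe code) of how two erased data columns can be reconstructed from P, Q, and the surviving columns. It assumes a conventional RAID-6-style Q computed over GF(2^8) with generator 2 and the 0x11d reducing polynomial; RAIDframe's actual field arithmetic, coefficients, and buffer handling live in rf_pq.c and related files and may differ in detail.

/*
 * Sketch only: two-column P/Q reconstruction over GF(2^8).
 * The field polynomial (0x11d) and generator (2) are illustrative
 * assumptions and are not taken from the RAIDframe sources.
 */
#include <assert.h>

static unsigned char gf_exp[512], gf_log[256];

static void
gf_init(void)
{
        unsigned v = 1;
        int i;

        for (i = 0; i < 255; i++) {
                gf_exp[i] = (unsigned char) v;
                gf_log[v] = (unsigned char) i;
                v <<= 1;
                if (v & 0x100)
                        v ^= 0x11d;     /* reduce modulo x^8+x^4+x^3+x^2+1 */
        }
        for (i = 255; i < 512; i++)     /* duplicate so gf_mul needs no modulo */
                gf_exp[i] = gf_exp[i - 255];
}

static unsigned char
gf_mul(unsigned char a, unsigned char b)
{
        if (a == 0 || b == 0)
                return (0);
        return (gf_exp[gf_log[a] + gf_log[b]]);
}

static unsigned char
gf_div(unsigned char a, unsigned char b)        /* b must be nonzero */
{
        if (a == 0)
                return (0);
        return (gf_exp[(gf_log[a] + 255 - gf_log[b]) % 255]);
}

/*
 * d[] holds one byte per data column; columns fa and fb are erased.
 * p and q were computed from the original data as
 *   p = XOR of all d[i],   q = XOR of g^i * d[i].
 */
static void
pq_recover_two(unsigned char *d, int ndata, int fa, int fb,
    unsigned char p, unsigned char q)
{
        unsigned char pxy = p, qxy = q, ga, gb, da;
        int i;

        for (i = 0; i < ndata; i++) {   /* fold surviving columns out of P and Q */
                if (i == fa || i == fb)
                        continue;
                pxy ^= d[i];
                qxy ^= gf_mul(gf_exp[i], d[i]);
        }
        /* Now  da ^ db = pxy  and  g^fa*da ^ g^fb*db = qxy. */
        ga = gf_exp[fa];
        gb = gf_exp[fb];
        da = gf_div(qxy ^ gf_mul(gb, pxy), ga ^ gb);
        d[fa] = da;
        d[fb] = da ^ pxy;
}

int
main(void)
{
        unsigned char d[4] = { 0xde, 0xad, 0xbe, 0xef };
        unsigned char p = 0, q = 0;
        int i;

        gf_init();
        for (i = 0; i < 4; i++) {       /* encode P and Q */
                p ^= d[i];
                q ^= gf_mul(gf_exp[i], d[i]);
        }
        d[1] = d[3] = 0;                /* lose two data columns */
        pq_recover_two(d, 4, 1, 3, p, q);
        assert(d[1] == 0xad && d[3] == 0xef);
        return (0);
}

After the 2x2 system is solved for the first missing column, the second falls out by XOR with the folded parity, which matches the shape of the rf_PQ_recover call sites in the diff: P buffer, Q buffer, the two target buffers, a byte length, and the two column coefficients.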