Diffstat (limited to 'sys/dev/raidframe/rf_pqdegdags.c')
-rw-r--r--	sys/dev/raidframe/rf_pqdegdags.c	430
1 file changed, 430 insertions, 0 deletions
diff --git a/sys/dev/raidframe/rf_pqdegdags.c b/sys/dev/raidframe/rf_pqdegdags.c
new file mode 100644
index 0000000..e0d97ed
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdegdags.c
@@ -0,0 +1,430 @@
+/* $FreeBSD$ */
+/* $NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_pqdegdags.c
+ * Degraded mode dags for double fault cases.
+ */
+
+
+#include <dev/raidframe/rf_archs.h>
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+#include <dev/raidframe/rf_types.h>
+#include <dev/raidframe/rf_raid.h>
+#include <dev/raidframe/rf_dag.h>
+#include <dev/raidframe/rf_dagdegrd.h>
+#include <dev/raidframe/rf_dagdegwr.h>
+#include <dev/raidframe/rf_dagfuncs.h>
+#include <dev/raidframe/rf_dagutils.h>
+#include <dev/raidframe/rf_etimer.h>
+#include <dev/raidframe/rf_acctrace.h>
+#include <dev/raidframe/rf_general.h>
+#include <dev/raidframe/rf_pqdegdags.h>
+#include <dev/raidframe/rf_pq.h>
+
+static void
+applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
+ RF_PhysDiskAddr_t * qpda, void *bp);
+
+/*
+ Two data drives have failed, and we are doing a read that covers one of them.
+ We may also be reading some of the surviving drives.
+
+
+ *****************************************************************************************
+ *
+ * creates a DAG to perform a degraded-mode read of data within one stripe.
+ * This DAG is as follows:
+ *
+ *                               Hdr
+ *                                |
+ *                              Block
+ *                  / /         / \ \ \          \ \
+ *                Rud ... Rud  Rrd ... Rrd       Rp  Rq
+ *                | \     | \  | \     | \       | \ | \
+ *
+ *                |                                    |
+ *             Unblock                                 X
+ *                  \                                 /
+ *                   ---------------- T --------------
+ *
+ * Each R node is a successor of the Block node.
+ * One successor arc from each R node goes to Unblock, and the other to X.
+ * There is one Rud for each chunk of surviving user data requested by the user,
+ * and one Rrd for each chunk of surviving user data _not_ being read by the user.
+ * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data,
+ * X = pq recovery node, T = terminate.
+ *
+ * The block & unblock nodes are leftovers from a previous version. They
+ * do nothing, but I haven't deleted them because it would be a tremendous
+ * effort to put them back in.
+ *
+ * Note: The target buffer for the recovery node is set to the actual user buffer
+ * where the failed data is supposed to end up. This buffer is zeroed by the code
+ * here. Thus, if you create a degraded-read dag, use it, and then re-use it, you
+ * have to be sure to zero the target buffer prior to the re-use.
+ *
+ * Every buffer read is passed to the pq recovery node, whose job it is to sort
+ * out what's needed and what's not.
+ ****************************************************************************************/
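+
+/*
+ * For reference, a sketch of the algebra the recovery node relies on
+ * (coefficients and multiplication live in a Galois field; see rf_pq.c):
+ *
+ *	P  = d_0 ^ d_1 ^ ... ^ d_(n-1)
+ *	Q  = c_0*d_0 ^ c_1*d_1 ^ ... ^ c_(n-1)*d_(n-1)
+ *
+ * With data units a and b lost, xor'ing/inc'ing every surviving d_i into
+ * the P and Q buffers (rf_bxor/rf_IncQ) cancels those terms, leaving
+ *
+ *	P' = d_a ^ d_b
+ *	Q' = c_a*d_a ^ c_b*d_b
+ *
+ * a two-unknown linear system that rf_PQ_recover solves for d_a and d_b.
+ */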
+/* init a disk node with 2 successors and one predecessor */
+#define INIT_DISK_NODE(node,name) \
+rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
+(node)->succedents[0] = unblockNode; \
+(node)->succedents[1] = recoveryNode; \
+(node)->antecedents[0] = blockNode; \
+(node)->antType[0] = rf_control
+
+#define DISK_NODE_PARAMS(_node_,_p_) \
+ (_node_).params[0].p = _p_ ; \
+ (_node_).params[1].p = (_p_)->bufPtr; \
+ (_node_).params[2].v = parityStripeID; \
+ (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
+
+#define DISK_NODE_PDA(node) ((node)->params[0].p)
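+
+/*
+ * An illustrative (hypothetical) use of the helpers above when hand-building
+ * a DAG -- "rudNode" and "pda" are made-up names, and the macros also pick up
+ * blockNode, unblockNode, recoveryNode, dag_h, allocList, parityStripeID and
+ * which_ru from the enclosing scope:
+ *
+ *	RF_DagNode_t rudNode;
+ *	INIT_DISK_NODE(&rudNode, "Rud");   wires Block -> Rud -> {Unblock, X}
+ *	DISK_NODE_PARAMS(rudNode, pda);    params[0] = pda, params[1] = its buffer
+ */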
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
+{
+ rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
+ "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
+}
+
+static void
+applyPDA(raidPtr, pda, ppda, qpda, bp)
+ RF_Raid_t *raidPtr;
+ RF_PhysDiskAddr_t *pda;
+ RF_PhysDiskAddr_t *ppda;
+ RF_PhysDiskAddr_t *qpda;
+ void *bp;
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
+ RF_SectorCount_t s0len = ppda->numSector, len;
+ RF_SectorNum_t suoffset;
+ unsigned coeff;
+ char *pbuf = ppda->bufPtr;
+ char *qbuf = qpda->bufPtr;
+ char *buf;
+ int delta;
+
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ len = pda->numSector;
+ /* see if pda intersects a recovery pda */
+ if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
+ buf = pda->bufPtr;
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+
+		if (suoffset < s0off) {
+			/* pda starts before the recovery pda: clip the front
+			 * and advance the data buffer (byte, not SU, units) */
+			delta = s0off - suoffset;
+			buf += rf_RaidAddressToByte(raidPtr, delta);
+			suoffset = s0off;
+			len -= delta;
+		}
+		if (suoffset > s0off) {
+			/* pda starts inside the recovery pda: advance the P
+			 * and Q buffers to the matching byte offset */
+			delta = suoffset - s0off;
+			pbuf += rf_RaidAddressToByte(raidPtr, delta);
+			qbuf += rf_RaidAddressToByte(raidPtr, delta);
+		}
+ if ((suoffset + len) > (s0len + s0off))
+ len = s0len + s0off - suoffset;
+
+ /* src, dest, len */
+ rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
+
+ /* dest, src, len, coeff */
+ rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
+ }
+}
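+
+/*
+ * A worked example of the clipping in applyPDA (illustrative numbers only):
+ * with a recovery pda covering stripe-unit sectors 16..47 (s0off=16,
+ * s0len=32) and a data pda covering sectors 0..31 (suoffset=0, len=32), the
+ * ranges intersect; the first branch advances buf past the uncovered 16
+ * sectors and shrinks len to 16, the final clamp leaves len at 16, and
+ * sectors 16..31 are xor'd into P (rf_bxor) and inc'd into Q with the
+ * column's coefficient (rf_IncQ).
+ */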
+/*
+ Recover data in the case of a double failure. There can be two
+ result buffers, one for each chunk of data trying to be recovered.
+ The params are pda's that have not been range restricted or otherwise
+ politely massaged - this should be done here. The last params are the
+ pdas of P and Q, followed by the raidPtr. The list can look like
+
+ pda, pda, ... , p pda, q pda, raidptr, asm
+
+ or
+
+ pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
+
+ depending on whether two chunks of recovery data were required.
+
+ The second condition only arises if there are two failed buffers
+ whose lengths do not add up to a stripe unit.
+*/
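+
+/*
+ * Index arithmetic implied by the single-chunk layout above: with np total
+ * params, params[np-4] is the P pda, params[np-3] the Q pda, params[np-2]
+ * the raidPtr and params[np-1] the asmap, leaving params[0 .. np-5] as the
+ * surviving-data pdas that the loop below walks.
+ */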
+
+
+int
+rf_PQDoubleRecoveryFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+ int d, i;
+ unsigned coeff;
+ RF_RaidAddr_t sosAddr, suoffset;
+ RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
+ int two = 0;
+ RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
+ char *buf;
+ int numDataCol = layoutPtr->numDataCol;
+ RF_Etimer_t timer;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+ RF_ETIMER_START(timer);
+
+ if (asmap->failedPDAs[1] &&
+ (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
+ RF_ASSERT(0);
+ ppda = node->params[np - 6].p;
+ ppda2 = node->params[np - 5].p;
+ qpda = node->params[np - 4].p;
+ qpda2 = node->params[np - 3].p;
+ d = (np - 6);
+ two = 1;
+ } else {
+ ppda = node->params[np - 4].p;
+ qpda = node->params[np - 3].p;
+ d = (np - 4);
+ }
+
+ for (i = 0; i < d; i++) {
+ pda = node->params[i].p;
+ buf = pda->bufPtr;
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ len = pda->numSector;
+ coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+ /* compute the data unit offset within the column */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ /* see if pda intersects a recovery pda */
+		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
+		if (two)
+			/* second recovery chunk: apply to its own P and Q pdas */
+			applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
+ }
+
+	/* OK, we've got the parity back to the point where we can recover. We
+	 * now need to determine the coefficients of the columns that need to
+	 * be recovered. Only the single-stripe-unit case is handled here; the
+	 * two-unit case panics below. */
+
+ if (asmap->failedPDAs[1] == NULL) { /* only a single stripe unit
+ * to recover. */
+ pda = asmap->failedPDAs[0];
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ /* need to determine the column of the other failed disk */
+ coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+ /* compute the data unit offset within the column */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ for (i = 0; i < numDataCol; i++) {
+ npda.raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+			/* find the other dead disk (the failed column other than coeff) */
+ if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+ if (i != coeff)
+ break;
+ }
+ RF_ASSERT(i < numDataCol);
+ RF_ASSERT(two == 0);
+		/* recover the data. Since we only want to recover one
+		 * column, we overwrite the parity with the other one. */
+ if (coeff < i) /* recovering 'a' */
+ rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
+ else /* recovering 'b' */
+ rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
+ } else
+ RF_PANIC();
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ if (tracerec)
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node, 0);
+ return (0);
+}
+
+int
+rf_PQWriteDoubleRecoveryFunc(node)
+ RF_DagNode_t *node;
+{
+ /* The situation:
+ *
+ * We are doing a write that hits only one failed data unit. The other
+ * failed data unit is not being overwritten, so we need to generate
+ * it.
+ *
+	 * For the moment, we assume all the nonfailed data being written is in
+	 * the shadow of the failed data unit (i.e., either a single data
+	 * unit write or the entire failed stripe unit is being overwritten).
+	 *
+	 * Recovery strategy: apply the recovery data to the parity and Q. Use P
+	 * & Q to recover the second failed data unit in P. Zero fill Q, then
+	 * inc the recovered data into Q. Then apply the data being written to
+	 * the failed drive. Then walk through the surviving drives, applying
+	 * new data when it exists, otherwise the recovery data. Quite a mess.
+	 *
+	 * The params:
+	 *
+	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
+	 * write pda (numStripeUnitsAccessed - numDataFailed), failed pda,
+	 * raidPtr, asmap */
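+
+	/*
+	 * Mapping those steps onto the code below (a reader's sketch): the
+	 * first applyPDA() loop cancels the surviving columns out of P and Q;
+	 * rf_PQ_recover() then regenerates the other failed unit into the P
+	 * buffer; bzero() + rf_IncQ() re-seed Q from the recovered data;
+	 * rf_IncQ()/rf_bxor() fold in the data being written; and the second
+	 * applyPDA() loop folds the surviving columns back into the rebuilt
+	 * P and Q.
+	 */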
+
+ int np = node->numParams;
+ RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
+ int i;
+ RF_RaidAddr_t sosAddr;
+ unsigned coeff;
+ RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+ RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
+ int numDataCol = layoutPtr->numDataCol;
+ RF_Etimer_t timer;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+ RF_ASSERT(node->numResults == 2);
+ RF_ASSERT(asmap->failedPDAs[1] == NULL);
+ RF_ETIMER_START(timer);
+ ppda = node->results[0];
+ qpda = node->results[1];
+ /* apply the recovery data */
+ for (i = 0; i < numDataCol - 2; i++)
+ applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
+
+ /* determine the other failed data unit */
+ pda = asmap->failedPDAs[0];
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ /* need to determine the column of the other failed disk */
+ coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
+ /* compute the data unit offset within the column */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ for (i = 0; i < numDataCol; i++) {
+ npda.raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+		/* find the other dead disk (the failed column other than coeff) */
+ if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+ if (i != coeff)
+ break;
+ }
+ RF_ASSERT(i < numDataCol);
+	/* recover the data. We write the column we want to recover over the
+	 * parity, and dump the column we don't care about into Q. */
+ if (coeff < i) /* recovering 'a' */
+ rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
+ else /* recovering 'b' */
+ rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
+
+	/* OK. The valid data is in P. Zero fill Q, then inc the recovered
+	 * data into it (rebuilding that column's Q contribution). */
+ bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
+ rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
+
+ /* now apply all the write data to the buffer */
+	/* single stripe unit write case: the failed data is the only thing
+	 * we are writing. */
+ RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
+ /* dest, src, len, coeff */
+ rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
+ rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
+
+ /* now apply all the recovery data */
+ for (i = 0; i < numDataCol - 2; i++)
+ applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ if (tracerec)
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+
+ rf_GenericWakeupFunc(node, 0);
+ return (0);
+}
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
+{
+ RF_PANIC();
+}
+/*
+ Two lost data unit write case.
+
+ There are really two cases here:
+
+ (1) The write completely covers the two lost data units.
+ In that case, a reconstruct write that doesn't write the
+ failed data units will do the correct thing. So in this case,
+ the dag looks like
+
+     full stripe read of surviving data units (not being overwritten),
+     write new data (ignoring failed units), compute P&Q,
+     write P&Q
+
+
+ (2) The write does not completely cover both failed data units
+ (but touches at least one of them). Then we need to do the
+ equivalent of a reconstruct read to recover the missing data
+ unit from the other stripe.
+
+ For any data we are writing that is not in the "shadow"
+ of the failed units, we need to do a four cycle update.
+ PANIC in this case, for now.
+
+*/
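+
+/*
+ * The dispatch in rf_PQ_200_CreateWriteDAG below reduces to this sketch:
+ *
+ *	sum = sectors covered by the failed pdas
+ *	if (nf == 2 && sum == 2*sectorsPerSU)		-> case (1), large write
+ *	else if (nf == numStripeUnitsAccessed ||
+ *		 sum >= sectorsPerSU)			-> small write
+ *	else						-> unimplemented; panic
+ */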
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
+ int sum;
+ int nf = asmap->numDataFailed;
+
+ sum = asmap->failedPDAs[0]->numSector;
+ if (nf == 2)
+ sum += asmap->failedPDAs[1]->numSector;
+
+ if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
+ /* large write case */
+ rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
+ return;
+ }
+ if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
+		/* small write case: all user data is in the shadow of the failed units */
+ rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
+ return;
+ }
+ RF_PANIC();
+}
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
+{
+ rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
+}
+#endif /* (RF_INCLUDE_DECL_PQ > 0) ||
+ * (RF_INCLUDE_RAID6 > 0) */