-rw-r--r--	sys/dev/vinum/vinumraid5.c	638
1 file changed, 638 insertions, 0 deletions
diff --git a/sys/dev/vinum/vinumraid5.c b/sys/dev/vinum/vinumraid5.c
new file mode 100644
index 0000000..0d3af63
--- /dev/null
+++ b/sys/dev/vinum/vinumraid5.c
@@ -0,0 +1,638 @@
+/*-
+ * Copyright (c) 1997, 1998
+ * Cybernet Corporation and Nan Yang Computer Services Limited.
+ * All rights reserved.
+ *
+ * This software was developed as part of the NetMAX project.
+ *
+ * Written by Greg Lehey
+ *
+ * This software is distributed under the so-called ``Berkeley
+ * License'':
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Cybernet Corporation
+ * and Nan Yang Computer Services Limited
+ * 4. Neither the name of the Companies nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided ``as is'', and any express or implied
+ * warranties, including, but not limited to, the implied warranties of
+ * merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall the company or contributors be liable for any
+ * direct, indirect, incidental, special, exemplary, or consequential
+ * damages (including, but not limited to, procurement of substitute
+ * goods or services; loss of use, data, or profits; or business
+ * interruption) however caused and on any theory of liability, whether
+ * in contract, strict liability, or tort (including negligence or
+ * otherwise) arising in any way out of the use of this software, even if
+ * advised of the possibility of such damage.
+ *
+ * $Id: raid5.c,v 1.15 1999/07/07 03:46:01 grog Exp grog $
+ */
+/*
+ * XXX To do:
+ *
+ * lock ranges while calculating parity
+ */
+
+#include <dev/vinum/vinumhdr.h>
+#include <dev/vinum/request.h>
+#include <miscfs/specfs/specdev.h>
+#include <sys/resourcevar.h>
+
+/*
+ * Parameters which describe the current transfer.
+ * These are only used for calculation, but they
+ * need to be passed to other functions, so it's
+ * tidier to put them in a struct
+ */
+struct metrics {
+ daddr_t stripebase; /* base address of stripe (1st subdisk) */
+ int stripeoffset; /* offset in stripe */
+ int stripesectors; /* total sectors to transfer in this stripe */
+ daddr_t sdbase; /* offset in subdisk of stripe base */
+ int sdcount; /* number of disks involved in this transfer */
+ daddr_t diskstart; /* remember where this transfer starts */
+ int psdno; /* number of parity subdisk */
+ int badsdno; /* number of down subdisk, if there is one */
+ int firstsdno; /* first data subdisk number */
+ /* These correspond to the fields in rqelement, sort of */
+ int useroffset;
+ /*
+ * Initial offset and length values for the first
+ * data block
+ */
+ int initoffset; /* start address of block to transfer */
+ short initlen; /* length in sectors of data transfer */
+ /* Define a normal operation */
+ int dataoffset; /* start address of block to transfer */
+ int datalen; /* length in sectors of data transfer */
+ /* Define a group operation */
+ int groupoffset; /* subdisk offset of group operation */
+ int grouplen; /* length in sectors of group operation */
+ /* Define a normal write operation */
+ int writeoffset; /* subdisk offset of normal write */
+ int writelen; /* length in sectors of write operation */
+ enum xferinfo flags; /* to check what we're doing */
+ int rqcount; /* number of elements in request */
+};
+
+enum requeststatus bre5(struct request *rq,
+ int plexno,
    daddr_t * diskaddr,
+ daddr_t diskend);
+void complete_raid5_write(struct rqelement *);
+enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
+void setrqebounds(struct rqelement *rqe, struct metrics *mp);
+
+/*
+ * define the low-level requests needed to perform a
+ * high-level I/O operation for a specific plex 'plexno'.
+ *
+ * Return REQUEST_OK if all subdisks involved in the request are up,
+ * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the
+ * request is at least partially outside the bounds of the subdisks.
+ *
+ * On return, *diskaddr has been advanced to the end of the area
+ * processed. On read, return on the first bad subdisk, so that the
+ * caller (build_read_request) can try alternatives.
+ *
+ * On entry to this routine, the prq structures are not assigned. The
+ * assignment is performed by expandrq(). Strictly speaking, the
+ * elements rqe->sdno of all entries should be set to -1, since 0
+ * (from bzero) is a valid subdisk number. We avoid this problem by
+ * initializing the ones we use, and not looking at the others (index
+ * >= prq->requests).
+ */
+enum requeststatus
+bre5(struct request *rq,
+ int plexno,
+ daddr_t * diskaddr,
+ daddr_t diskend)
+{
+ struct metrics m; /* most of the information */
+ struct sd *sd;
+ struct plex *plex;
+ struct buf *bp; /* user's bp */
+ struct rqgroup *rqg; /* the request group that we will create */
+ struct rqelement *rqe; /* point to this request information */
+ int rsectors; /* sectors remaining in this stripe */
+ int mysdno; /* another sd index in loops */
+ int rqno; /* request number */
+
+ m.diskstart = *diskaddr; /* start of transfer */
+ bp = rq->bp; /* buffer pointer */
+ plex = &PLEX[plexno]; /* point to the plex */
+
+ while (*diskaddr < diskend) { /* until we get it all sorted out */
+ struct rqelement *prqe = NULL; /* XXX */
+ m.badsdno = -1; /* no bad subdisk yet */
+
+ /* Part A: Define the request */
+ /*
+ * First, calculate some sizes:
+ * The offset of the start address from
+ * the start of the stripe
+ */
+ m.stripeoffset = *diskaddr % (plex->stripesize * (plex->subdisks - 1));
+
+ /*
+ * The plex-relative address of the
+ * start of the stripe
+ */
+ m.stripebase = *diskaddr - m.stripeoffset;
+
+ /* subdisk containing the parity stripe */
+ m.psdno = plex->subdisks - 1 - (*diskaddr / (plex->stripesize * (plex->subdisks - 1))) % plex->subdisks;
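+ /*
+ * For illustration (assumed figures, not in the original): with 4
+ * subdisks, the parity subdisk moves down one subdisk per stripe,
+ * wrapping around:
+ *
+ * stripe 0: parity on sd 3, data on sd 0, 1, 2
+ * stripe 1: parity on sd 2, data on sd 0, 1, 3
+ * stripe 2: parity on sd 1, data on sd 0, 2, 3
+ * stripe 3: parity on sd 0, data on sd 1, 2, 3
+ * stripe 4: parity on sd 3 again, and so on.
+ */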
+
+ /*
+ * The number of the subdisk in which
+ * the start is located
+ */
+ m.firstsdno = m.stripeoffset / plex->stripesize;
+ if (m.firstsdno >= m.psdno) /* at or past parity sd */
+ m.firstsdno++; /* increment it */
+
+ /*
+ * The offset from the beginning of
+ * the stripe on this subdisk
+ */
+ m.initoffset = m.stripeoffset % plex->stripesize;
+
+ /* The offset of the stripe start relative to this subdisk */
+ m.sdbase = m.stripebase / (plex->subdisks - 1);
+
+ m.useroffset = *diskaddr - m.diskstart; /* The offset of the start in the user buffer */
+
+ /*
+ * The number of sectors to transfer in the
+ * current (first) subdisk
+ */
+ m.initlen = min(diskend - *diskaddr, /* the amount remaining to transfer */
+ plex->stripesize - m.initoffset); /* and the amount left in this block */
+
+ /*
+ * The number of sectors to transfer in this stripe
+ * is the minimum of the amount remaining to transfer
+ * and the amount left in this stripe
+ */
+ m.stripesectors = min(diskend - *diskaddr,
+ plex->stripesize * (plex->subdisks - 1) - m.stripeoffset);
+
+ /* The number of data subdisks involved in this request */
+ m.sdcount = (m.stripesectors + m.initoffset + plex->stripesize - 1) / plex->stripesize;
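+ /*
+ * A worked example with assumed figures (not from the original):
+ * stripesize = 128 sectors, subdisks = 4 (3 data + 1 parity per
+ * stripe), *diskaddr = 1000, diskend = 1100. Then:
+ *
+ * data per stripe = 128 * 3 = 384 sectors
+ * m.stripeoffset = 1000 % 384 = 232
+ * m.stripebase = 1000 - 232 = 768
+ * m.psdno = 3 - (1000 / 384) % 4 = 3 - 2 = 1
+ * m.firstsdno = 232 / 128 = 1, bumped past parity to 2
+ * m.initoffset = 232 % 128 = 104
+ * m.sdbase = 768 / 3 = 256
+ * m.initlen = min(100, 128 - 104) = 24
+ * m.stripesectors = min(100, 384 - 232) = 100
+ * m.sdcount = (100 + 104 + 127) / 128 = 2
+ */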
+
+ /* Part B: decide what kind of transfer this will be */
+ /*
+ * start and end addresses of the transfer in
+ * the current block.
+ *
+ * There are a number of different kinds of transfer, each of which relates to a
+ * specific subdisk:
+ *
+ * 1. Normal read. All participating subdisks are up, and the transfer can be
+ * made directly to the user buffer. The bounds of the transfer are described
+ * by m.dataoffset and m.datalen. We have already calculated m.initoffset and
+ * m.initlen, which define the parameters for the first data block.
+ *
+ * 2. Recovery read. One participating subdisk is down. To recover data, all
+ * the other subdisks, including the parity subdisk, must be read. The data is
+ * recovered by exclusive-oring all the other blocks. The bounds of the transfer
+ * are described by m.groupoffset and m.grouplen.
+ *
+ * 3. A read request may request reading both available data (normal read) and
+ * non-available data (recovery read). This can be a problem if the address ranges
+ * of the two reads do not coincide: in this case, the normal read needs to be
+ * extended to cover the address range of the recovery read, and must thus be
+ * performed out of malloced memory.
+ *
+ * 4. Normal write. All the participating subdisks are up. The bounds of the transfer
+ * are described by m.dataoffset and m.datalen. Since these values differ for each
+ * block, we calculate the bounds for the parity block independently as the maximum
+ * of the individual blocks and store these values in m.writeoffset and m.writelen.
+ * This write proceeds in four phases:
+ *
+ * i. Read the old contents of each block and the parity block.
+ *
+ * ii. ``Remove'' the old contents from the parity block with exclusive or.
+ *
+ * iii. ``Insert'' the new contents of the block in the parity block, again with
+ * exclusive or.
+ *
+ * iv. Write the new contents of the data blocks and the parity block. The data block
+ * transfers can be made directly from the user buffer.
+ *
+ * 5. Degraded write where the data block is not available. The bounds of the
+ * transfer are described by m.groupoffset and m.grouplen. This requires the
+ * following steps:
+ *
+ * i. Read in all the other data blocks, excluding the parity block.
+ *
+ * ii. Recreate the parity block from the other data blocks and the data to be written.
+ *
+ * iii. Write the parity block.
+ *
+ * 6. Parityless write, a write where the parity block is not available. This
+ * is in fact the simplest: just write the data blocks. This can proceed directly
+ * from the user buffer. The bounds of the transfer are described
+ * by m.dataoffset and m.datalen.
+ *
+ * 7. Combination of degraded data block write and normal write. In this case the
+ * address ranges of the reads may also need to be extended to cover all
+ * participating blocks.
+ *
+ * All requests in a group transfer cover the same address range relative
+ * to their subdisk. The individual transfers may vary, but since our group of
+ * requests is all in a single slice, we can define a range in which they all
+ * fall.
+ *
+ * In the following code section, we determine which kind of transfer we will perform.
+ * If there is a group transfer, we also decide its bounds relative to the subdisks.
+ * At the end, we have the following values:
+ *
+ * m.flags indicates the kinds of transfers we will perform
+ * m.initoffset indicates the offset of the beginning of any data
+ * operation relative to the beginning of the stripe base.
+ * m.initlen specifies the length of any data operation.
+ * m.dataoffset contains the same value as m.initoffset.
+ * m.datalen contains the same value as m.initlen. Initially
+ * dataoffset and datalen describe the parameters for the first
+ * data block; while building the data block requests, they are
+ * updated for each block.
+ * m.groupoffset indicates the offset of any group operation relative
+ * to the beginning of the stripe base
+ * m.grouplen specifies the length of any group operation
+ * m.writeoffset indicates the offset of a normal write relative
+ * to the beginning of the stripe base. This value differs from
+ * m.dataoffset in that it applies to the entire operation, and
+ * not just the first block.
+ * m.writelen specifies the total span of a normal write operation.
+ * writeoffset and writelen are used to define the parity block.
+ */
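+ /*
+ * The read-modify-write of case 4 above reduces to a sector-wise
+ * exclusive or. A minimal sketch (illustration only, not part of the
+ * driver; name and signature are hypothetical):
+ *
+ *	void
+ *	update_parity(u_char *parity, u_char *olddata, u_char *newdata, int len)
+ *	{
+ *	    int i;
+ *
+ *	    for (i = 0; i < len; i++)
+ *	        parity[i] = parity[i] ^ olddata[i] ^ newdata[i];
+ *	}
+ *
+ * Phases ii and iii collapse into a single pass because xor is its
+ * own inverse: removing the old data and inserting the new data are
+ * both xors into the parity block.
+ */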
+ m.groupoffset = 0; /* assume no group... */
+ m.grouplen = 0; /* until we know we have one */
+ m.writeoffset = m.initoffset; /* start offset of transfer */
+ m.writelen = 0; /* nothing to write yet */
+ m.flags = 0; /* no flags yet */
+ rsectors = m.stripesectors; /* remaining sectors to examine */
+ m.dataoffset = m.initoffset; /* start at the beginning of the transfer */
+ m.datalen = m.initlen;
+
+ if (m.sdcount > 1) {
+ plex->multiblock++; /* more than one block for the request */
+ /*
+ * If we have two transfers that don't overlap,
+ * (one at the end of the first block, the other
+ * at the beginning of the second block),
+ * it's cheaper to split them
+ */
+ if (rsectors < plex->stripesize) {
+ m.sdcount = 1; /* just one subdisk */
+ m.stripesectors = m.initlen; /* and just this many sectors */
+ rsectors = m.initlen; /* and in the loop counter */
+ }
+ }
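+ /*
+ * Example with assumed figures: stripesize 128, a 32-sector
+ * transfer starting at initoffset 112. It touches the last 16
+ * sectors of one block and the first 16 of the next (sdcount 2),
+ * but since rsectors (32) < stripesize (128) we trim the request
+ * to the first 16 sectors (initlen) and let the enclosing while
+ * loop handle the remainder as a separate transfer.
+ */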
+ if (SD[plex->sdnos[m.psdno]].state < sd_reborn) /* is our parity subdisk down? */
+ m.badsdno = m.psdno; /* note that it's down */
+ if (bp->b_flags & B_READ) { /* read operation */
+ for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
+ if (mysdno == m.psdno) /* ignore parity on read */
+ mysdno++;
+ if (mysdno == plex->subdisks) /* wraparound */
+ mysdno = 0;
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+
+ if (SD[plex->sdnos[mysdno]].state < sd_reborn) { /* got a bad subdisk, */
+ if (m.badsdno >= 0) /* we had one already, */
+ /*
+ * XXX be cleverer here. We can still
+ * read what we can read.
+ */
+ return REQUEST_DOWN; /* we can't take a second */
+ m.badsdno = mysdno; /* got the first */
+ m.groupoffset = m.dataoffset; /* define the bounds */
+ m.grouplen = m.datalen;
+ m.flags |= XFR_RECOVERY_READ; /* we need recovery */
+ plex->recovered_reads++; /* count another one */
+ } else
+ m.flags |= XFR_NORMAL_READ; /* normal read */
+
+ /* Update the pointers for the next block */
+ m.dataoffset = 0; /* back to the start of the stripe */
+ rsectors -= m.datalen; /* remaining sectors to examine */
+ m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
+ }
+ } else { /* write operation */
+ for (mysdno = m.firstsdno; rsectors > 0; mysdno++) {
+ if (mysdno == m.psdno) /* parity stripe, we've dealt with that */
+ mysdno++;
+ if (mysdno == plex->subdisks) /* wraparound */
+ mysdno = 0;
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+
+ sd = &SD[plex->sdnos[mysdno]];
+ if (sd->state != sd_up) {
+ enum requeststatus s;
+
+ s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
+ if (s && (m.badsdno >= 0)) { /* second bad disk, */
+ int sdno;
+ /*
+ * If the parity disk is down, there's
+ * no recovery. We make all involved
+ * subdisks stale. Otherwise, we
+ * should be able to recover, but it's
+ * like pulling teeth. Fix it later.
+ *
+ * XXX be cleverer here. We should
+ * still write what we can write.
+ */
+ for (sdno = 0; sdno < m.sdcount; sdno++) {
+ struct sd *sd = &SD[plex->sdnos[sdno]];
+ if (sd->state >= sd_reborn) /* sort of up, */
+ set_sd_state(sd->sdno, sd_stale, setstate_force); /* make it stale */
+ }
+ return s; /* and crap out */
+ }
+ m.badsdno = mysdno; /* note which one is bad */
+ m.flags |= XFR_DEGRADED_WRITE; /* we need recovery */
+ plex->degraded_writes++; /* count another one */
+ m.groupoffset = m.dataoffset; /* define the bounds */
+ m.grouplen = m.datalen;
+ } else {
+ m.flags |= XFR_NORMAL_WRITE; /* normal write operation */
+ if (m.writeoffset > m.dataoffset) { /* move write operation lower */
+ m.writelen = max(m.writeoffset + m.writelen,
+ m.dataoffset + m.datalen)
+ - m.dataoffset;
+ m.writeoffset = m.dataoffset;
+ } else
+ m.writelen = max(m.writeoffset + m.writelen,
+ m.dataoffset + m.datalen)
+ - m.writeoffset;
+ }
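+ /*
+ * Example with assumed figures: merging an existing write range
+ * [writeoffset 100, length 20) with a data block at
+ * [dataoffset 40, length 50) gives writeoffset 40 and
+ * writelen = max(120, 90) - 40 = 80, i.e. the union [40, 120)
+ * of the two ranges.
+ */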
+
+ /* Update the pointers for the next block */
+ m.dataoffset = 0; /* back to the start of the stripe */
+ rsectors -= m.datalen; /* remaining sectors to examine */
+ m.datalen = min(rsectors, plex->stripesize); /* amount that will fit in this block */
+ }
+ if (m.badsdno == m.psdno) { /* got a bad parity block, */
+ struct sd *psd = &SD[plex->sdnos[m.psdno]];
+
+ if (psd->state == sd_down)
+ set_sd_state(psd->sdno, sd_obsolete, setstate_force); /* it's obsolete now */
+ else if (psd->state == sd_crashed)
+ set_sd_state(psd->sdno, sd_stale, setstate_force); /* it's stale now */
+ m.flags &= ~XFR_NORMAL_WRITE; /* this write isn't normal, */
+ m.flags |= XFR_PARITYLESS_WRITE; /* it's parityless */
+ plex->parityless_writes++; /* count another one */
+ }
+ }
+
+ /* reset the initial transfer values */
+ m.dataoffset = m.initoffset; /* start at the beginning of the transfer */
+ m.datalen = m.initlen;
+
+ /*
+ * XXX see if we can satisfy a recovery_read from a
+ * different plex. If so, return from here with no requests WRITEME
+ */
+
+ /* decide how many requests we need */
+ if (m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE)) /* doing a recovery read or degraded write, */
+ m.rqcount = plex->subdisks; /* all subdisks */
+ else if (m.flags & XFR_NORMAL_WRITE) /* normal write, */
+ m.rqcount = m.sdcount + 1; /* all data blocks and the parity block */
+ else /* parityless write or normal read */
+ m.rqcount = m.sdcount; /* just the data blocks */
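+ /*
+ * For instance (assumed figures): on a 4-subdisk plex, a normal
+ * write touching 2 data blocks gives m.rqcount = 3 (two data
+ * blocks plus the parity block), while a recovery read or
+ * degraded write always involves all 4 subdisks, however few
+ * blocks the user asked for.
+ */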
+
+ /* Part C: build the requests */
+ rqg = allocrqg(rq, m.rqcount); /* get a request group */
+ if (rqg == NULL) { /* malloc failed */
+ bp->b_flags |= B_ERROR;
+ bp->b_error = ENOMEM;
+ biodone(bp);
+ return REQUEST_ENOMEM;
+ }
+ rqg->plexno = plexno;
+ rqg->flags = m.flags;
+ rqno = 0; /* index in the request group */
+
+ /* 1: PARITY BLOCK */
+ /*
+ * Are we performing an operation which requires parity? In that case,
+ * work out the parameters and define the parity block.
+ * XFR_PARITYOP is XFR_NORMAL_WRITE | XFR_RECOVERY_READ | XFR_DEGRADED_WRITE
+ */
+ if (m.flags & XFR_PARITYOP) { /* need parity */
+ rqe = &rqg->rqe[rqno]; /* point to element */
+ sd = &SD[plex->sdnos[m.psdno]]; /* the subdisk in question */
+ rqe->rqg = rqg; /* point back to group */
+ rqe->flags = (m.flags | XFR_PARITY_BLOCK | XFR_MALLOCED) /* always malloc parity block */
+ &~(XFR_NORMAL_READ | XFR_PARITYLESS_WRITE); /* transfer flags without data op stuff */
+ setrqebounds(rqe, &m); /* set up the bounds of the transfer */
+ rqe->sdno = sd->sdno; /* subdisk number */
+ rqe->driveno = sd->driveno;
+ prqe = rqe; /* debug XXX */
+ if (build_rq_buffer(rqe, plex)) /* build the buffer */
+ return REQUEST_ENOMEM; /* can't do it */
+ rqe->b.b_flags |= B_READ; /* we must read first */
+ m.sdcount++; /* adjust the subdisk count */
+ rqno++; /* and point to the next request */
+ }
+ /*
+ * 2: DATA BLOCKS
+ * Now build up requests for the blocks required
+ * for individual transfers
+ */
+ for (mysdno = m.firstsdno; rqno < m.sdcount; mysdno++, rqno++) {
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+ if (mysdno == plex->subdisks) /* got to the end, */
+ mysdno = 0; /* wrap around */
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+
+ rqe = &rqg->rqe[rqno]; /* point to element */
+ sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */
+ rqe->rqg = rqg; /* point to group */
+ if (m.flags & XFR_NEEDS_MALLOC) /* we need a malloced buffer first */
+ rqe->flags = m.flags | XFR_DATA_BLOCK | XFR_MALLOCED; /* transfer flags */
+ else
+ rqe->flags = m.flags | XFR_DATA_BLOCK; /* transfer flags */
+ if (mysdno == m.badsdno) { /* this is the bad subdisk */
+ rqg->badsdno = rqno; /* note which one */
+ rqe->flags |= XFR_BAD_SUBDISK; /* note that it's dead */
+ /*
+ * we can't read or write from/to it,
+ * but we don't need to malloc
+ */
+ rqe->flags &= ~(XFR_MALLOCED | XFR_NORMAL_READ | XFR_NORMAL_WRITE);
+ }
+ setrqebounds(rqe, &m); /* set up the bounds of the transfer */
+#if VINUMDEBUG
+ if (prqe
+ && (rqe->groupoffset + rqe->sdoffset) < prqe->sdoffset) /* XXX */
+ Debugger("Low data block"); /* XXX */
+#endif
+ rqe->useroffset = m.useroffset; /* offset in user buffer */
+ rqe->sdno = sd->sdno; /* subdisk number */
+ rqe->driveno = sd->driveno;
+ if (build_rq_buffer(rqe, plex)) /* build the buffer */
+ return REQUEST_ENOMEM; /* can't do it */
+ if ((m.flags & XFR_PARITYOP) /* parity operation, */
+ &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) /* and not the bad subdisk, */
+ rqe->b.b_flags |= B_READ; /* we must read first */
+
+ /* Now update pointers for the next block */
+ *diskaddr += m.datalen; /* skip past what we've done */
+ m.stripesectors -= m.datalen; /* deduct from what's left */
+ m.useroffset += m.datalen; /* and move on in the user buffer */
+ m.datalen = min(m.stripesectors, plex->stripesize); /* and recalculate */
+ m.dataoffset = 0; /* start at the beginning of next block */
+ }
+
+ /*
+ * 3: REMAINING BLOCKS FOR RECOVERY
+ * Finally, if we have a recovery operation, build
+ * up transfers for the other subdisks. Follow the
+ * subdisks around until we get to where we started.
+ * These requests use only the group parameters.
+ */
+ if ((rqno < m.rqcount) /* haven't done them all already */
+ &&(m.flags & (XFR_RECOVERY_READ | XFR_DEGRADED_WRITE))) {
+ for (; rqno < m.rqcount; rqno++, mysdno++) {
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+ if (mysdno == plex->subdisks) /* got to the end, */
+ mysdno = 0; /* wrap around */
+ if (mysdno == m.psdno) /* parity, */
+ mysdno++; /* we've given already */
+
+ rqe = &rqg->rqe[rqno]; /* point to element */
+ sd = &SD[plex->sdnos[mysdno]]; /* the subdisk in question */
+ rqe->rqg = rqg; /* point to group */
+
+ rqe->sdoffset = m.sdbase + m.groupoffset; /* start of transfer */
+ rqe->dataoffset = 0; /* for tidiness' sake */
+ rqe->groupoffset = 0; /* group starts at the beginning */
+ rqe->datalen = 0;
+ rqe->grouplen = m.grouplen;
+ rqe->buflen = m.grouplen;
+ rqe->flags = (m.flags | XFR_MALLOCED) & ~XFR_DATAOP; /* transfer flags without data op stuff */
+ rqe->sdno = sd->sdno; /* subdisk number */
+ rqe->driveno = sd->driveno;
+ if (build_rq_buffer(rqe, plex)) /* build the buffer */
+ return REQUEST_ENOMEM; /* can't do it */
+ rqe->b.b_flags |= B_READ; /* we must read first */
+ }
+ }
+ if (*diskaddr < diskend) /* didn't finish the request on this stripe */
+ plex->multistripe++; /* count another one */
+ }
+ return REQUEST_OK;
+}
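+
+/*
+ * A minimal caller sketch (hypothetical, for illustration only): bre5
+ * itself iterates over stripes until the whole range is covered, so a
+ * caller need only supply the plex-relative start and end in sectors:
+ *
+ *	daddr_t diskaddr = bp->b_blkno;
+ *	daddr_t diskend = diskaddr + bp->b_bcount / DEV_BSIZE;
+ *	enum requeststatus status = bre5(rq, plexno, &diskaddr, diskend);
+ */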
+
+/*
+ * Helper function for bre5: adjust the bounds of the transfers to minimize
+ * the buffer allocation.
+ *
+ * Each request can handle up to two of three different data ranges:
+ *
+ * 1. The range described by the parameters dataoffset and datalen,
+ * for normal read or parityless write.
+ * 2. The range described by the parameters groupoffset and grouplen,
+ * for recovery read and degraded write.
+ * 3. For normal write, the range depends on the kind of block. For
+ * data blocks, the range is defined by dataoffset and datalen. For
+ * parity blocks, it is defined by writeoffset and writelen.
+ *
+ * In order not to allocate more memory than necessary, this function
+ * adjusts the bounds parameter for each request to cover just the minimum
+ * necessary for the function it performs. This will normally vary from one
+ * request to the next.
+ *
+ * Things are slightly different for the parity block. In this case, the bounds
+ * defined by mp->writeoffset and mp->writelen also play a rôle. This case
+ * is selected by the XFR_PARITY_BLOCK flag in rqe->flags.
+ */
+void
+setrqebounds(struct rqelement *rqe, struct metrics *mp)
+{
+ /* parity block of a normal write */
+ if ((rqe->flags & (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) == (XFR_NORMAL_WRITE | XFR_PARITY_BLOCK)) { /* case 3 */
+ if (rqe->flags & XFR_DEGRADED_WRITE) { /* also degraded write */
+ /*
+ * With a combined normal and degraded write, we
+ * will zero out the area of the degraded write
+ * in the second phase, so we don't need to read
+ * it in. Unfortunately, we need a way to tell
+ * build_request_buffer the size of the buffer,
+ * and currently that's the length of the read.
+ * As a result, we read everything, even the stuff
+ * that we're going to nuke.
+ * FIXME XXX
+ */
+ if (mp->groupoffset < mp->writeoffset) { /* group operation starts lower */
+ rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
+ rqe->dataoffset = mp->writeoffset - mp->groupoffset; /* data starts here */
+ rqe->groupoffset = 0; /* and the group at the beginning */
+ } else { /* individual data starts first */
+ rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
+ rqe->dataoffset = 0; /* individual data starts at the beginning */
+ rqe->groupoffset = mp->groupoffset - mp->writeoffset; /* group starts here */
+ }
+ rqe->datalen = mp->writelen;
+ rqe->grouplen = mp->grouplen;
+ } else { /* just normal write (case 3) */
+ rqe->sdoffset = mp->sdbase + mp->writeoffset; /* start of transfer */
+ rqe->dataoffset = 0; /* data starts at the beginning */
+ rqe->groupoffset = 0; /* for tidiness' sake */
+ rqe->datalen = mp->writelen;
+ rqe->grouplen = 0;
+ }
+ } else if (rqe->flags & XFR_DATAOP) { /* data operation (case 1 or 3) */
+ if (rqe->flags & XFR_GROUPOP) { /* also a group operation (case 2) */
+ if (mp->groupoffset < mp->dataoffset) { /* group operation starts lower */
+ rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
+ rqe->dataoffset = mp->dataoffset - mp->groupoffset; /* data starts here */
+ rqe->groupoffset = 0; /* and the group at the beginning */
+ } else { /* individual data starts first */
+ rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
+ rqe->dataoffset = 0; /* individual data starts at the beginning */
+ rqe->groupoffset = mp->groupoffset - mp->dataoffset; /* group starts here */
+ }
+ rqe->datalen = mp->datalen;
+ rqe->grouplen = mp->grouplen;
+ } else { /* just data operation (case 1) */
+ rqe->sdoffset = mp->sdbase + mp->dataoffset; /* start of transfer */
+ rqe->dataoffset = 0; /* data starts at the beginning */
+ rqe->groupoffset = 0; /* for tidiness' sake */
+ rqe->datalen = mp->datalen;
+ rqe->grouplen = 0;
+ }
+ } else { /* just group operations (case 2) */
+ rqe->sdoffset = mp->sdbase + mp->groupoffset; /* start of transfer */
+ rqe->dataoffset = 0; /* for tidiness' sake */
+ rqe->groupoffset = 0; /* group starts at the beginning */
+ rqe->datalen = 0;
+ rqe->grouplen = mp->grouplen;
+ }
+ rqe->buflen = max(rqe->dataoffset + rqe->datalen, /* total buffer length */
+ rqe->groupoffset + rqe->grouplen);
+}
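+
+/*
+ * A worked example for setrqebounds with assumed figures: a recovery
+ * read combined with a normal read (cases 1 + 2), mp->sdbase = 256,
+ * mp->dataoffset = 104, mp->datalen = 24, mp->groupoffset = 0,
+ * mp->grouplen = 128. Since groupoffset (0) < dataoffset (104):
+ *
+ *	rqe->sdoffset    = 256 + 0 = 256
+ *	rqe->dataoffset  = 104 - 0 = 104
+ *	rqe->groupoffset = 0
+ *	rqe->datalen     = 24
+ *	rqe->grouplen    = 128
+ *	rqe->buflen      = max(104 + 24, 0 + 128) = 128 sectors
+ */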