summaryrefslogtreecommitdiffstats
path: root/sys/geom/vinum/geom_vinum_raid5.c
diff options
context:
space:
mode:
authorle <le@FreeBSD.org>2004-06-12 21:16:10 +0000
committerle <le@FreeBSD.org>2004-06-12 21:16:10 +0000
commitcf31d52b42bd2309bb855b34e8260283eabfc570 (patch)
treeb37e9b83eff28125aba7f626ab2e3bea5b487658 /sys/geom/vinum/geom_vinum_raid5.c
parentf66d897510d4772f7c5efd834cd66203558e9cb5 (diff)
downloadFreeBSD-src-cf31d52b42bd2309bb855b34e8260283eabfc570.zip
FreeBSD-src-cf31d52b42bd2309bb855b34e8260283eabfc570.tar.gz
Add a first version of a GEOMified vinum.
Diffstat (limited to 'sys/geom/vinum/geom_vinum_raid5.c')
-rw-r--r--sys/geom/vinum/geom_vinum_raid5.c616
1 files changed, 616 insertions, 0 deletions
diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c
new file mode 100644
index 0000000..0c604fe
--- /dev/null
+++ b/sys/geom/vinum/geom_vinum_raid5.c
@@ -0,0 +1,616 @@
+/*-
+ * Copyright (c) 2004 Lukas Ertl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+#include <geom/geom.h>
+#include <geom/vinum/geom_vinum_var.h>
+#include <geom/vinum/geom_vinum_raid5.h>
+#include <geom/vinum/geom_vinum.h>
+
+int gv_raid5_parity(struct gv_raid5_packet *);
+int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);
+
+struct gv_raid5_bit *
+gv_new_raid5_bit(void)
+{
+ struct gv_raid5_bit *r;
+ r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
+ KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
+ return (r);
+}
+
+struct gv_raid5_packet *
+gv_new_raid5_packet(void)
+{
+ struct gv_raid5_packet *wp;
+
+ wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
+ KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
+ wp->state = SETUP;
+ wp->type = JUNK;
+ TAILQ_INIT(&wp->bits);
+
+ return (wp);
+}
+
+/*
+ * Check if the stripe that the work packet wants is already being used by
+ * some other work packet.
+ */
+int
+gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
+{
+ struct gv_raid5_packet *wpa;
+
+ TAILQ_FOREACH(wpa, &sc->worklist, list) {
+ if (wpa->lockbase == wp->lockbase) {
+ if (wpa->bio == wp->bio)
+ return (0);
+ return (1);
+ }
+ }
+ return (0);
+}
+
/*
 * The "worker" thread that runs through the worklist and fires off the
 * "subrequests" needed to fulfill a RAID5 read or write request.
 *
 * One instance runs per RAID5 plex (arg is the plex's g_geom).  The
 * worklist mutex is held while scanning and dropped only around calls
 * that may block or recurse into GEOM (g_io_request, g_io_deliver).
 * The thread exits when the plex sets GV_PLEX_THREAD_DIE.
 */
void
gv_raid5_worker(void *arg)
{
	struct bio *bp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp, *wpt;
	struct gv_raid5_bit *rbp, *rbpt;
	int error, restart;

	gp = arg;
	p = gp->softc;

	mtx_lock(&p->worklist_mtx);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan");
		TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
			/* This request packet is already being processed. */
			if (wp->state == IO)
				continue;
			/* This request packet is ready for processing. */
			if (wp->state == VALID) {
				/* Couldn't get the lock, try again. */
				if ((wp->lockbase != -1) &&
				    gv_stripe_active(wp, p))
					continue;

				wp->state = IO;
				/* Drop the mutex while issuing I/O. */
				mtx_unlock(&p->worklist_mtx);
				TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
					g_io_request(rbp->bio, rbp->consumer);
				mtx_lock(&p->worklist_mtx);
				continue;
			}
			if (wp->state == FINISH) {
				bp = wp->bio;
				/* Account this packet's share of the bio. */
				bp->bio_completed += wp->length;
				/*
				 * Deliver the original request if we have
				 * finished.
				 */
				if (bp->bio_completed == bp->bio_length) {
					mtx_unlock(&p->worklist_mtx);
					g_io_deliver(bp, 0);
					mtx_lock(&p->worklist_mtx);
				}
				TAILQ_REMOVE(&p->worklist, wp, list);
				if (wp->bufmalloc == 1)
					g_free(wp->buf);
				g_free(wp);
				/* Rescan: removal may unblock a waiter. */
				restart++;
				/*break;*/
			}
		}
		if (!restart) {
			/* Self-destruct. */
			if (p->flags & GV_PLEX_THREAD_DIE)
				break;
			g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep");
			/* Sleep ~10ms; wakeup() on p also rouses us. */
			error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
			    hz/100);
		}
	}
	mtx_unlock(&p->worklist_mtx);

	g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");

	/* Signal our plex that we are dead. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);
	kthread_exit(0);
}
+
+/* Final bio transaction to write out the parity data. */
+int
+gv_raid5_parity(struct gv_raid5_packet *wp)
+{
+ struct bio *bp;
+
+ bp = g_new_bio();
+ if (bp == NULL)
+ return (ENOMEM);
+
+ wp->type = ISPARITY;
+ bp->bio_cmd = BIO_WRITE;
+ bp->bio_data = wp->buf;
+ bp->bio_offset = wp->offset;
+ bp->bio_length = wp->length;
+ bp->bio_done = gv_raid5_done;
+ bp->bio_caller1 = wp;
+ bp->bio_caller2 = NULL;
+ g_io_request(bp, wp->parity);
+
+ return (0);
+}
+
/*
 * We end up here after each subrequest.
 *
 * Bio completion callback: decrements the packet's active-subrequest
 * count, XORs read data into the packet buffer for parity handling,
 * frees the finished subrequest, and marks the packet FINISH once the
 * last subrequest completes so the worker thread can deliver it.
 *
 * NOTE(review): wp->active is modified here without the worklist
 * mutex -- presumably all completions for a packet arrive serialized
 * in GEOM's up-thread; verify against the GEOM event model.
 */
void
gv_raid5_done(struct bio *bp)
{
	struct bio *obp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp;
	struct gv_raid5_bit *rbp;
	off_t i;
	int error;

	wp = bp->bio_caller1;
	rbp = bp->bio_caller2;	/* NULL for the parity write itself. */
	obp = wp->bio;		/* The original top-level request. */
	gp = bp->bio_from->geom;
	p = gp->softc;

	/* One less active subrequest. */
	wp->active--;

	switch (obp->bio_cmd) {
	case BIO_READ:
		/* Degraded reads need to handle parity data. */
		if (wp->type == DEGRADED) {
			/* Fold this stripe into the reconstruction XOR. */
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* When we're finished copy back the data we want. */
			if (wp->active == 0)
				bcopy(wp->buf, wp->data, wp->length);
		}

		break;

	case BIO_WRITE:
		/* Handle the parity data, if needed. */
		if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
			for (i = 0; i < wp->length; i++)
				wp->buf[i] ^= bp->bio_data[i];

			/* Write out the parity data we calculated. */
			if (wp->active == 0) {
				wp->active++;
				/*
				 * NOTE(review): an ENOMEM return is
				 * ignored here; the packet would then
				 * stay at active == 1 and never reach
				 * FINISH -- confirm intended handling.
				 */
				error = gv_raid5_parity(wp);
			}
		}
		break;
	}

	g_destroy_bio(bp);

	/* Tear down the subrequest descriptor and its private buffer. */
	if (rbp != NULL) {
		if (rbp->malloc == 1)
			g_free(rbp->buf);
		TAILQ_REMOVE(&wp->bits, rbp, list);
		g_free(rbp);
	}

	/* This request group is done. */
	if (wp->active == 0)
		wp->state = FINISH;
}
+
/*
 * Build a request group to perform (part of) a RAID5 request.
 *
 * Maps the plex-relative span (addr, bcount, boff) onto the subdisks,
 * classifies the packet (NORMAL, DEGRADED, NOPARITY, COMBINED) from the
 * subdisk states, and queues the needed read/write subrequests on wp.
 * The request is clamped to the current stripe (wp->length <= len_left),
 * so the caller is expected to iterate for spans crossing stripes.
 *
 * Returns 0 with wp->state = VALID on success; ENXIO when the needed
 * subdisks are missing or a second failure makes the request
 * unservable; ENOMEM when a bio allocation fails; EINVAL for commands
 * other than BIO_READ/BIO_WRITE.
 *
 * NOTE(review): on the ENOMEM returns, subrequests already queued on
 * wp->bits (and wp->buf) are not freed here -- presumably the caller
 * tears the packet down; verify.
 */
int
gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
    long bcount, off_t boff)
{
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_bit *rbp;
	struct gv_sd *broken, *original, *parity, *s;
	int i, psdno, sdno;
	off_t len_left, real_off, stripeend, stripeoff, stripestart;

	gp = bp->bio_to->geom;
	p = gp->softc;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
	wp->type = NORMAL;
	original = parity = broken = NULL;

	/* The number of the subdisk containing the parity stripe. */
	psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
	    p->sdcount;
	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The number of the subdisk where the stripe resides. */
	sdno = stripeoff / p->stripesize;
	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));

	/* At or past parity subdisk. */
	if (sdno >= psdno)
		sdno++;

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	/* Reduce to the offset within the data column, if past it. */
	if (stripeoff >= p->stripesize)
		stripeoff -= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	/* How much of the request fits into this stripe. */
	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));

	/* Find the right subdisks. */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		wp->type = DEGRADED;
	/* Our parity stripe is missing. */
	if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (wp->type != NORMAL)
			return (ENXIO);
		else
			wp->type = NOPARITY;
	}

	/*
	 * A combined write is necessary when the original data subdisk and the
	 * parity subdisk are both up, but one of the other subdisks isn't.
	 */
	if ((broken != NULL) && (broken != parity) && (broken != original))
		wp->type = COMBINED;

	wp->offset = real_off;
	wp->length = (bcount <= len_left) ? bcount : len_left;
	wp->data = addr;
	wp->original = original->consumer;
	wp->parity = parity->consumer;
	wp->lockbase = stripestart;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (wp->type == DEGRADED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_cmd = BIO_READ;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_READ;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			/* Read directly into the caller's buffer. */
			rbp->buf = addr;
			rbp->bio->bio_data = rbp->buf;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;
		}
		/* Reads need no stripe lock unless a rebuild is involved. */
		if (wp->type != COMBINED)
			wp->lockbase = -1;
		break;

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk. Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (wp->type == DEGRADED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;

			/* Copy the original data. */
			bcopy(wp->data, wp->buf, wp->length);

			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) ||
				    (s->consumer == wp->parity))
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_cmd = BIO_READ;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

		/*
		 * When we don't have the parity stripe we just write out the
		 * data.
		 */
		} else if (wp->type == NOPARITY) {
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_WRITE;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->bio->bio_data = addr;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;

		/*
		 * A combined write means that our data subdisk and the parity
		 * subdisks are both up, but another subdisk isn't. We need to
		 * read all valid stripes including the parity to recalculate
		 * the data of the stripe that is missing. Then we write our
		 * original data, and together with the other data stripes
		 * recalculate the parity again.
		 */
		} else if (wp->type == COMBINED) {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;

			/* Get the data from all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->bio->bio_cmd = BIO_READ;
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}

			/* Write the original data. */
			rbp = gv_new_raid5_bit();
			rbp->consumer = wp->original;
			rbp->buf = addr;
			rbp->bio = g_new_bio();
			if (rbp->bio == NULL)
				return (ENOMEM);
			rbp->bio->bio_cmd = BIO_WRITE;
			rbp->bio->bio_data = rbp->buf;
			rbp->bio->bio_offset = wp->offset;
			rbp->bio->bio_length = wp->length;
			rbp->bio->bio_done = gv_raid5_done;
			rbp->bio->bio_caller1 = wp;
			rbp->bio->bio_caller2 = rbp;
			/*
			 * Insert at the tail, because we want to read the old
			 * data first.
			 */
			TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
			wp->active++;
			wp->rqcount++;

			/* Get the rest of the data again. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/*
				 * Skip the broken subdisk, the parity, and the
				 * one we just wrote.
				 */
				if ((s == broken) ||
				    (s->consumer == wp->parity) ||
				    (s->consumer == wp->original))
					continue;
				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				rbp->bio->bio_cmd = BIO_READ;
				rbp->buf = g_malloc(wp->length,
				    M_WAITOK | M_ZERO);
				rbp->malloc = 1;
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				/*
				 * Again, insert at the tail to keep correct
				 * order.
				 */
				TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}


		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.
		 */
		} else {
			wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO);
			wp->bufmalloc = 1;
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the parity stripe. */
				if (s->consumer == wp->parity)
					continue;

				rbp = gv_new_raid5_bit();
				rbp->consumer = s->consumer;
				rbp->bio = g_new_bio();
				if (rbp->bio == NULL)
					return (ENOMEM);
				/*
				 * The data for the original stripe is written,
				 * the others need to be read in for the parity
				 * calculation.
				 */
				if (s->consumer == wp->original) {
					rbp->bio->bio_cmd = BIO_WRITE;
					rbp->buf = addr;
				} else {
					rbp->bio->bio_cmd = BIO_READ;
					rbp->buf = g_malloc(wp->length,
					    M_WAITOK | M_ZERO);
					rbp->malloc = 1;
				}
				rbp->bio->bio_data = rbp->buf;
				rbp->bio->bio_offset = wp->offset;
				rbp->bio->bio_length = wp->length;
				rbp->bio->bio_done = gv_raid5_done;
				rbp->bio->bio_caller1 = wp;
				rbp->bio->bio_caller2 = rbp;
				TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
				wp->active++;
				wp->rqcount++;
			}
		}
		break;
	default:
		return (EINVAL);
	}

	/* Hand the packet over to the worker thread. */
	wp->state = VALID;
	return (0);
}
OpenPOWER on IntegriCloud