diff options
author | le <le@FreeBSD.org> | 2004-06-12 21:16:10 +0000 |
---|---|---|
committer | le <le@FreeBSD.org> | 2004-06-12 21:16:10 +0000 |
commit | cf31d52b42bd2309bb855b34e8260283eabfc570 (patch) | |
tree | b37e9b83eff28125aba7f626ab2e3bea5b487658 /sys/geom/vinum/geom_vinum_raid5.c | |
parent | f66d897510d4772f7c5efd834cd66203558e9cb5 (diff) | |
download | FreeBSD-src-cf31d52b42bd2309bb855b34e8260283eabfc570.zip FreeBSD-src-cf31d52b42bd2309bb855b34e8260283eabfc570.tar.gz |
Add a first version of a GEOMified vinum.
Diffstat (limited to 'sys/geom/vinum/geom_vinum_raid5.c')
-rw-r--r-- | sys/geom/vinum/geom_vinum_raid5.c | 616 |
1 files changed, 616 insertions, 0 deletions
diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c new file mode 100644 index 0000000..0c604fe --- /dev/null +++ b/sys/geom/vinum/geom_vinum_raid5.c @@ -0,0 +1,616 @@ +/*- + * Copyright (c) 2004 Lukas Ertl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/bio.h> +#include <sys/conf.h> +#include <sys/errno.h> +#include <sys/kernel.h> +#include <sys/kthread.h> +#include <sys/libkern.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/systm.h> + +#include <geom/geom.h> +#include <geom/vinum/geom_vinum_var.h> +#include <geom/vinum/geom_vinum_raid5.h> +#include <geom/vinum/geom_vinum.h> + +int gv_raid5_parity(struct gv_raid5_packet *); +int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *); + +struct gv_raid5_bit * +gv_new_raid5_bit(void) +{ + struct gv_raid5_bit *r; + r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO); + KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r")); + return (r); +} + +struct gv_raid5_packet * +gv_new_raid5_packet(void) +{ + struct gv_raid5_packet *wp; + + wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO); + KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp")); + wp->state = SETUP; + wp->type = JUNK; + TAILQ_INIT(&wp->bits); + + return (wp); +} + +/* + * Check if the stripe that the work packet wants is already being used by + * some other work packet. + */ +int +gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc) +{ + struct gv_raid5_packet *wpa; + + TAILQ_FOREACH(wpa, &sc->worklist, list) { + if (wpa->lockbase == wp->lockbase) { + if (wpa->bio == wp->bio) + return (0); + return (1); + } + } + return (0); +} + +/* + * The "worker" thread that runs through the worklist and fires off the + * "subrequests" needed to fulfill a RAID5 read or write request. + */ +void +gv_raid5_worker(void *arg) +{ + struct bio *bp; + struct g_geom *gp; + struct gv_plex *p; + struct gv_raid5_packet *wp, *wpt; + struct gv_raid5_bit *rbp, *rbpt; + int error, restart; + + gp = arg; + p = gp->softc; + + mtx_lock(&p->worklist_mtx); + for (;;) { + restart = 0; + g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan"); + TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { + /* This request packet is already being processed. */ + if (wp->state == IO) + continue; + /* This request packet is ready for processing. */ + if (wp->state == VALID) { + /* Couldn't get the lock, try again. */ + if ((wp->lockbase != -1) && + gv_stripe_active(wp, p)) + continue; + + wp->state = IO; + mtx_unlock(&p->worklist_mtx); + TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) + g_io_request(rbp->bio, rbp->consumer); + mtx_lock(&p->worklist_mtx); + continue; + } + if (wp->state == FINISH) { + bp = wp->bio; + bp->bio_completed += wp->length; + /* + * Deliver the original request if we have + * finished. + */ + if (bp->bio_completed == bp->bio_length) { + mtx_unlock(&p->worklist_mtx); + g_io_deliver(bp, 0); + mtx_lock(&p->worklist_mtx); + } + TAILQ_REMOVE(&p->worklist, wp, list); + if (wp->bufmalloc == 1) + g_free(wp->buf); + g_free(wp); + restart++; + /*break;*/ + } + } + if (!restart) { + /* Self-destruct. */ + if (p->flags & GV_PLEX_THREAD_DIE) + break; + g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep"); + error = msleep(p, &p->worklist_mtx, PRIBIO, "-", + hz/100); + } + } + mtx_unlock(&p->worklist_mtx); + + g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); + + /* Signal our plex that we are dead. */ + p->flags |= GV_PLEX_THREAD_DEAD; + wakeup(p); + kthread_exit(0); +} + +/* Final bio transaction to write out the parity data. */ +int +gv_raid5_parity(struct gv_raid5_packet *wp) +{ + struct bio *bp; + + bp = g_new_bio(); + if (bp == NULL) + return (ENOMEM); + + wp->type = ISPARITY; + bp->bio_cmd = BIO_WRITE; + bp->bio_data = wp->buf; + bp->bio_offset = wp->offset; + bp->bio_length = wp->length; + bp->bio_done = gv_raid5_done; + bp->bio_caller1 = wp; + bp->bio_caller2 = NULL; + g_io_request(bp, wp->parity); + + return (0); +} + +/* We end up here after each subrequest. */ +void +gv_raid5_done(struct bio *bp) +{ + struct bio *obp; + struct g_geom *gp; + struct gv_plex *p; + struct gv_raid5_packet *wp; + struct gv_raid5_bit *rbp; + off_t i; + int error; + + wp = bp->bio_caller1; + rbp = bp->bio_caller2; + obp = wp->bio; + gp = bp->bio_from->geom; + p = gp->softc; + + /* One less active subrequest. */ + wp->active--; + + switch (obp->bio_cmd) { + case BIO_READ: + /* Degraded reads need to handle parity data. */ + if (wp->type == DEGRADED) { + for (i = 0; i < wp->length; i++) + wp->buf[i] ^= bp->bio_data[i]; + + /* When we're finished copy back the data we want. */ + if (wp->active == 0) + bcopy(wp->buf, wp->data, wp->length); + } + + break; + + case BIO_WRITE: + /* Handle the parity data, if needed. */ + if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { + for (i = 0; i < wp->length; i++) + wp->buf[i] ^= bp->bio_data[i]; + + /* Write out the parity data we calculated. */ + if (wp->active == 0) { + wp->active++; + error = gv_raid5_parity(wp); + } + } + break; + } + + g_destroy_bio(bp); + + if (rbp != NULL) { + if (rbp->malloc == 1) + g_free(rbp->buf); + TAILQ_REMOVE(&wp->bits, rbp, list); + g_free(rbp); + } + + /* This request group is done. */ + if (wp->active == 0) + wp->state = FINISH; +} + +/* Build a request group to perform (part of) a RAID5 request. */ +int +gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, + long bcount, off_t boff) +{ + struct g_geom *gp; + struct gv_plex *p; + struct gv_raid5_bit *rbp; + struct gv_sd *broken, *original, *parity, *s; + int i, psdno, sdno; + off_t len_left, real_off, stripeend, stripeoff, stripestart; + + gp = bp->bio_to->geom; + p = gp->softc; + + if (p == NULL || LIST_EMPTY(&p->subdisks)) + return (ENXIO); + + /* We are optimistic and assume that this request will be OK. */ + wp->type = NORMAL; + original = parity = broken = NULL; + + /* The number of the subdisk containing the parity stripe. */ + psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % + p->sdcount; + KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); + + /* Offset of the start address from the start of the stripe. */ + stripeoff = boff % (p->stripesize * (p->sdcount - 1)); + KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); + + /* The number of the subdisk where the stripe resides. */ + sdno = stripeoff / p->stripesize; + KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); + + /* At or past parity subdisk. */ + if (sdno >= psdno) + sdno++; + + /* The offset of the stripe on this subdisk. */ + stripestart = (boff - stripeoff) / (p->sdcount - 1); + KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); + + if (stripeoff >= p->stripesize) + stripeoff -= p->stripesize; + + /* The offset of the request on this subdisk. */ + real_off = stripestart + stripeoff; + + stripeend = stripestart + p->stripesize; + len_left = stripeend - real_off; + KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); + + /* Find the right subdisks. */ + i = 0; + LIST_FOREACH(s, &p->subdisks, in_plex) { + if (i == sdno) + original = s; + if (i == psdno) + parity = s; + if (s->state != GV_SD_UP) + broken = s; + i++; + } + + if ((original == NULL) || (parity == NULL)) + return (ENXIO); + + /* Our data stripe is missing. */ + if (original->state != GV_SD_UP) + wp->type = DEGRADED; + /* Our parity stripe is missing. */ + if (parity->state != GV_SD_UP) { + /* We cannot take another failure if we're already degraded. */ + if (wp->type != NORMAL) + return (ENXIO); + else + wp->type = NOPARITY; + } + + /* + * A combined write is necessary when the original data subdisk and the + * parity subdisk are both up, but one of the other subdisks isn't. + */ + if ((broken != NULL) && (broken != parity) && (broken != original)) + wp->type = COMBINED; + + wp->offset = real_off; + wp->length = (bcount <= len_left) ? bcount : len_left; + wp->data = addr; + wp->original = original->consumer; + wp->parity = parity->consumer; + wp->lockbase = stripestart; + + KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); + + switch (bp->bio_cmd) { + case BIO_READ: + /* + * For a degraded read we need to read in all stripes except + * the broken one plus the parity stripe and then recalculate + * the desired data. + */ + if (wp->type == DEGRADED) { + wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); + wp->bufmalloc = 1; + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Skip the broken subdisk. */ + if (s == broken) + continue; + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + rbp->bio->bio_cmd = BIO_READ; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + + /* A normal read can be fulfilled with the original subdisk. */ + } else { + rbp = gv_new_raid5_bit(); + rbp->consumer = wp->original; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_READ; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->buf = addr; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + if (wp->type != COMBINED) + wp->lockbase = -1; + break; + + case BIO_WRITE: + /* + * A degraded write means we cannot write to the original data + * subdisk. Thus we need to read in all valid stripes, + * recalculate the parity from the original data, and then + * write the parity stripe back out. + */ + if (wp->type == DEGRADED) { + wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); + wp->bufmalloc = 1; + + /* Copy the original data. */ + bcopy(wp->data, wp->buf, wp->length); + + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Skip the broken and the parity subdisk. */ + if ((s == broken) || + (s->consumer == wp->parity)) + continue; + + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + rbp->bio->bio_cmd = BIO_READ; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + + /* + * When we don't have the parity stripe we just write out the + * data. + */ + } else if (wp->type == NOPARITY) { + rbp = gv_new_raid5_bit(); + rbp->consumer = wp->original; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_WRITE; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_data = addr; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + + /* + * A combined write means that our data subdisk and the parity + * subdisks are both up, but another subdisk isn't. We need to + * read all valid stripes including the parity to recalculate + * the data of the stripe that is missing. Then we write our + * original data, and together with the other data stripes + * recalculate the parity again. + */ + } else if (wp->type == COMBINED) { + wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); + wp->bufmalloc = 1; + + /* Get the data from all subdisks. */ + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Skip the broken subdisk. */ + if (s == broken) + continue; + + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_READ; + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + + /* Write the original data. */ + rbp = gv_new_raid5_bit(); + rbp->consumer = wp->original; + rbp->buf = addr; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_WRITE; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + /* + * Insert at the tail, because we want to read the old + * data first. + */ + TAILQ_INSERT_TAIL(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + + /* Get the rest of the data again. */ + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* + * Skip the broken subdisk, the parity, and the + * one we just wrote. + */ + if ((s == broken) || + (s->consumer == wp->parity) || + (s->consumer == wp->original)) + continue; + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + rbp->bio->bio_cmd = BIO_READ; + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + /* + * Again, insert at the tail to keep correct + * order. + */ + TAILQ_INSERT_TAIL(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + + + /* + * A normal write request goes to the original subdisk, then we + * read in all other stripes, recalculate the parity and write + * out the parity again. + */ + } else { + wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); + wp->bufmalloc = 1; + LIST_FOREACH(s, &p->subdisks, in_plex) { + /* Skip the parity stripe. */ + if (s->consumer == wp->parity) + continue; + + rbp = gv_new_raid5_bit(); + rbp->consumer = s->consumer; + rbp->bio = g_new_bio(); + if (rbp->bio == NULL) + return (ENOMEM); + /* + * The data for the original stripe is written, + * the others need to be read in for the parity + * calculation. + */ + if (s->consumer == wp->original) { + rbp->bio->bio_cmd = BIO_WRITE; + rbp->buf = addr; + } else { + rbp->bio->bio_cmd = BIO_READ; + rbp->buf = g_malloc(wp->length, + M_WAITOK | M_ZERO); + rbp->malloc = 1; + } + rbp->bio->bio_data = rbp->buf; + rbp->bio->bio_offset = wp->offset; + rbp->bio->bio_length = wp->length; + rbp->bio->bio_done = gv_raid5_done; + rbp->bio->bio_caller1 = wp; + rbp->bio->bio_caller2 = rbp; + TAILQ_INSERT_HEAD(&wp->bits, rbp, list); + wp->active++; + wp->rqcount++; + } + } + break; + default: + return (EINVAL); + } + + wp->state = VALID; + return (0); +} |