author     le <le@FreeBSD.org>  2004-09-18 13:44:43 +0000
committer  le <le@FreeBSD.org>  2004-09-18 13:44:43 +0000
commit     18ba8315a7f99725d24144b4ba8f020a09eb6c80 (patch)
tree       b26a5245ec9f7555a82415414ac98257151bf50c /sys/geom/vinum/geom_vinum_raid5.c
parent     1991acc23e3343dcdae467ab20ed1b0e729ea778 (diff)
Re-vamp how I/O is handled in volumes and plexes.
Analogous to the drive level, give each volume and plex a worker thread that picks up and processes incoming and completed BIOs. This should fix the data corruption issues that came up a few weeks ago and improve performance, especially of RAID5 plexes. The volume level still needs a little work, though.
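
The model described above (one kernel thread per plex that sleeps on a queue, pulls incoming and completed BIOs off it under a mutex, and dispatches them to the plex request logic) can be summarised in a short sketch. The sketch below is illustrative only and uses assumed names (gv_plex_worker, p->bqueue, p->bqueue_mtx, GV_BIO_DONE, gv_plex_completed_request, gv_plex_normal_request); it is not the code introduced by this commit.

/*
 * Minimal sketch of a per-plex worker thread, following the model the
 * commit message describes.  Queue, flag and helper names here are
 * assumptions for illustration and do not reproduce geom_vinum itself.
 */
static void
gv_plex_worker(void *arg)
{
	struct gv_plex *p = arg;
	struct bio *bp;

	mtx_lock(&p->bqueue_mtx);
	for (;;) {
		/* Shut down when the plex asks us to. */
		if (p->flags & GV_PLEX_THREAD_DIE)
			break;

		bp = bioq_first(&p->bqueue);
		if (bp == NULL) {
			/* Nothing queued; wait for new or completed BIOs. */
			msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz / 10);
			continue;
		}
		bioq_remove(&p->bqueue, bp);
		mtx_unlock(&p->bqueue_mtx);

		if (bp->bio_cflags & GV_BIO_DONE)
			/* A sub-request finished; collect its result. */
			gv_plex_completed_request(p, bp);
		else
			/* A fresh request; build and fire sub-requests. */
			gv_plex_normal_request(p, bp);

		mtx_lock(&p->bqueue_mtx);
	}
	mtx_unlock(&p->bqueue_mtx);

	/* Tell anyone waiting on us that this worker is gone. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);
	kthread_exit(0);
}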
Diffstat (limited to 'sys/geom/vinum/geom_vinum_raid5.c')
-rw-r--r--   sys/geom/vinum/geom_vinum_raid5.c   633
1 file changed, 181 insertions, 452 deletions
diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c
index 8dfe8ab..62fb246 100644
--- a/sys/geom/vinum/geom_vinum_raid5.c
+++ b/sys/geom/vinum/geom_vinum_raid5.c
@@ -44,243 +44,62 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>
-int gv_raid5_parity(struct gv_raid5_packet *);
-int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);
-
-struct gv_raid5_bit *
-gv_new_raid5_bit(void)
-{
- struct gv_raid5_bit *r;
- r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
- KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
- return (r);
-}
-
-struct gv_raid5_packet *
-gv_new_raid5_packet(void)
-{
- struct gv_raid5_packet *wp;
-
- wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
- KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
- wp->state = SETUP;
- wp->type = JUNK;
- TAILQ_INIT(&wp->bits);
-
- return (wp);
-}
-
-void
-gv_free_raid5_packet(struct gv_raid5_packet *wp)
-{
- struct gv_raid5_bit *r, *r2;
-
- /* Remove all the bits from this work packet. */
- TAILQ_FOREACH_SAFE(r, &wp->bits, list, r2) {
- TAILQ_REMOVE(&wp->bits, r, list);
- if (r->malloc)
- g_free(r->buf);
- if (r->bio != NULL)
- g_destroy_bio(r->bio);
- g_free(r);
- }
-
- if (wp->bufmalloc == 1)
- g_free(wp->buf);
- g_free(wp);
-}
-
/*
* Check if the stripe that the work packet wants is already being used by
* some other work packet.
*/
int
-gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
-{
- struct gv_raid5_packet *wpa;
-
- TAILQ_FOREACH(wpa, &sc->worklist, list) {
- if (wpa->lockbase == wp->lockbase) {
- if (wpa == wp)
- return (0);
- return (1);
- }
- }
- return (0);
-}
-
-/*
- * The "worker" thread that runs through the worklist and fires off the
- * "subrequests" needed to fulfill a RAID5 read or write request.
- */
-void
-gv_raid5_worker(void *arg)
+gv_stripe_active(struct gv_plex *p, struct bio *bp)
{
- struct bio *bp;
- struct g_geom *gp;
- struct gv_plex *p;
- struct gv_raid5_packet *wp, *wpt;
- struct gv_raid5_bit *rbp, *rbpt;
- int error, restart;
-
- gp = arg;
- p = gp->softc;
-
- mtx_lock(&p->worklist_mtx);
- for (;;) {
- restart = 0;
- TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
- /* This request packet is already being processed. */
- if (wp->state == IO)
- continue;
- /* This request packet is ready for processing. */
- if (wp->state == VALID) {
- /* Couldn't get the lock, try again. */
- if ((wp->lockbase != -1) &&
- gv_stripe_active(wp, p))
- continue;
-
- wp->state = IO;
- mtx_unlock(&p->worklist_mtx);
- TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
- g_io_request(rbp->bio, rbp->consumer);
- mtx_lock(&p->worklist_mtx);
- continue;
- }
- if (wp->state == FINISH) {
- bp = wp->bio;
- bp->bio_completed += wp->length;
- /*
- * Deliver the original request if we have
- * finished.
- */
- if (bp->bio_completed == bp->bio_length) {
- mtx_unlock(&p->worklist_mtx);
- g_io_deliver(bp, 0);
- mtx_lock(&p->worklist_mtx);
- }
- TAILQ_REMOVE(&p->worklist, wp, list);
- gv_free_raid5_packet(wp);
- restart++;
- /*break;*/
- }
+ struct gv_raid5_packet *wp, *owp;
+ int overlap;
+
+ wp = bp->bio_driver1;
+ if (wp->lockbase == -1)
+ return (0);
+
+ overlap = 0;
+ TAILQ_FOREACH(owp, &p->packets, list) {
+ if (owp == wp)
+ break;
+ if ((wp->lockbase >= owp->lockbase) &&
+ (wp->lockbase <= owp->lockbase + owp->length)) {
+ overlap++;
+ break;
}
- if (!restart) {
- /* Self-destruct. */
- if (p->flags & GV_PLEX_THREAD_DIE)
- break;
- error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
- hz/100);
+ if ((wp->lockbase <= owp->lockbase) &&
+ (wp->lockbase + wp->length >= owp->lockbase)) {
+ overlap++;
+ break;
}
}
- mtx_unlock(&p->worklist_mtx);
-
- g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");
- /* Signal our plex that we are dead. */
- p->flags |= GV_PLEX_THREAD_DEAD;
- wakeup(p);
- kthread_exit(0);
-}
-
-/* Final bio transaction to write out the parity data. */
-int
-gv_raid5_parity(struct gv_raid5_packet *wp)
-{
- struct bio *bp;
-
- bp = g_new_bio();
- if (bp == NULL)
- return (ENOMEM);
-
- wp->type = ISPARITY;
- bp->bio_cmd = BIO_WRITE;
- bp->bio_data = wp->buf;
- bp->bio_offset = wp->offset;
- bp->bio_length = wp->length;
- bp->bio_done = gv_raid5_done;
- bp->bio_caller1 = wp;
- bp->bio_caller2 = NULL;
- g_io_request(bp, wp->parity);
-
- return (0);
-}
-
-/* We end up here after each subrequest. */
-void
-gv_raid5_done(struct bio *bp)
-{
- struct bio *obp;
- struct g_geom *gp;
- struct gv_plex *p;
- struct gv_raid5_packet *wp;
- struct gv_raid5_bit *rbp;
- off_t i;
- int error;
-
- wp = bp->bio_caller1;
- rbp = bp->bio_caller2;
- obp = wp->bio;
- gp = bp->bio_from->geom;
- p = gp->softc;
-
- /* One less active subrequest. */
- wp->active--;
-
- switch (obp->bio_cmd) {
- case BIO_READ:
- /* Degraded reads need to handle parity data. */
- if (wp->type == DEGRADED) {
- for (i = 0; i < wp->length; i++)
- wp->buf[i] ^= bp->bio_data[i];
-
- /* When we're finished copy back the data we want. */
- if (wp->active == 0)
- bcopy(wp->buf, wp->data, wp->length);
- }
-
- break;
-
- case BIO_WRITE:
- /* Handle the parity data, if needed. */
- if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
- for (i = 0; i < wp->length; i++)
- wp->buf[i] ^= bp->bio_data[i];
-
- /* Write out the parity data we calculated. */
- if (wp->active == 0) {
- wp->active++;
- error = gv_raid5_parity(wp);
- }
- }
- break;
- }
-
- /* This request group is done. */
- if (wp->active == 0)
- wp->state = FINISH;
+ return (overlap);
}
/* Build a request group to perform (part of) a RAID5 request. */
int
-gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
- long bcount, off_t boff)
+gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
+ struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
struct g_geom *gp;
- struct gv_plex *p;
- struct gv_raid5_bit *rbp;
struct gv_sd *broken, *original, *parity, *s;
- int i, psdno, sdno;
- off_t len_left, real_off, stripeend, stripeoff, stripestart;
+ struct gv_bioq *bq;
+ struct bio *cbp, *pbp;
+ int i, psdno, sdno, type;
+ off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;
gp = bp->bio_to->geom;
- p = gp->softc;
if (p == NULL || LIST_EMPTY(&p->subdisks))
return (ENXIO);
/* We are optimistic and assume that this request will be OK. */
- wp->type = NORMAL;
+#define REQ_TYPE_NORMAL 0
+#define REQ_TYPE_DEGRADED 1
+#define REQ_TYPE_NOPARITY 2
+
+ type = REQ_TYPE_NORMAL;
original = parity = broken = NULL;
/* The number of the subdisk containing the parity stripe. */
@@ -330,29 +149,20 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
/* Our data stripe is missing. */
if (original->state != GV_SD_UP)
- wp->type = DEGRADED;
+ type = REQ_TYPE_DEGRADED;
/* Our parity stripe is missing. */
if (parity->state != GV_SD_UP) {
/* We cannot take another failure if we're already degraded. */
- if (wp->type != NORMAL)
+ if (type != REQ_TYPE_NORMAL)
return (ENXIO);
else
- wp->type = NOPARITY;
+ type = REQ_TYPE_NOPARITY;
}
- /*
- * A combined write is necessary when the original data subdisk and the
- * parity subdisk are both up, but one of the other subdisks isn't.
- */
- if ((broken != NULL) && (broken != parity) && (broken != original))
- wp->type = COMBINED;
-
- wp->offset = real_off;
- wp->length = (bcount <= len_left) ? bcount : len_left;
+ real_len = (bcount <= len_left) ? bcount : len_left;
+ wp->length = real_len;
wp->data = addr;
- wp->original = original->consumer;
- wp->parity = parity->consumer;
- wp->lockbase = stripestart;
+ wp->lockbase = real_off;
KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
@@ -363,58 +173,45 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* the broken one plus the parity stripe and then recalculate
* the desired data.
*/
- if (wp->type == DEGRADED) {
- wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
- if (wp->buf == NULL)
- return (ENOMEM);
- wp->bufmalloc = 1;
+ if (type == REQ_TYPE_DEGRADED) {
+ bzero(wp->data, wp->length);
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken subdisk. */
if (s == broken)
continue;
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- rbp->malloc = 1;
- rbp->bio->bio_cmd = BIO_READ;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = s->consumer;
+ cbp->bio_driver1 = wp;
+
+ GV_ENQUEUE(bp, cbp, pbp);
+
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
}
/* A normal read can be fulfilled with the original subdisk. */
} else {
- rbp = gv_new_raid5_bit();
- rbp->consumer = wp->original;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- rbp->bio->bio_cmd = BIO_READ;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->buf = addr;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_data = addr;
+ cbp->bio_done = g_std_done;
+ cbp->bio_caller2 = original->consumer;
+
+ GV_ENQUEUE(bp, cbp, pbp);
}
- if (wp->type != COMBINED)
- wp->lockbase = -1;
+ wp->lockbase = -1;
+
break;
case BIO_WRITE:
@@ -424,164 +221,65 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* recalculate the parity from the original data, and then
* write the parity stripe back out.
*/
- if (wp->type == DEGRADED) {
- wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
- if (wp->buf == NULL)
- return (ENOMEM);
- wp->bufmalloc = 1;
-
- /* Copy the original data. */
- bcopy(wp->data, wp->buf, wp->length);
-
+ if (type == REQ_TYPE_DEGRADED) {
+ /* Read all subdisks. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken and the parity subdisk. */
- if ((s == broken) ||
- (s->consumer == wp->parity))
+ if ((s == broken) || (s == parity))
continue;
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- rbp->malloc = 1;
- rbp->bio->bio_cmd = BIO_READ;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
+ cbp->bio_cmd = BIO_READ;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = s->consumer;
+ cbp->bio_driver1 = wp;
+
+ GV_ENQUEUE(bp, cbp, pbp);
+
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
}
- /*
- * When we don't have the parity stripe we just write out the
- * data.
- */
- } else if (wp->type == NOPARITY) {
- rbp = gv_new_raid5_bit();
- rbp->consumer = wp->original;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
+ /* Write the parity data. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- rbp->bio->bio_cmd = BIO_WRITE;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_data = addr;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ bcopy(addr, cbp->bio_data, real_len);
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = parity->consumer;
+ cbp->bio_driver1 = wp;
+ wp->parity = cbp;
/*
- * A combined write means that our data subdisk and the parity
- * subdisks are both up, but another subdisk isn't. We need to
- * read all valid stripes including the parity to recalculate
- * the data of the stripe that is missing. Then we write our
- * original data, and together with the other data stripes
- * recalculate the parity again.
+ * When the parity stripe is missing we just write out the data.
*/
- } else if (wp->type == COMBINED) {
- wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
- if (wp->buf == NULL)
+ } else if (type == REQ_TYPE_NOPARITY) {
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- wp->bufmalloc = 1;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_data = addr;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = original->consumer;
+ cbp->bio_driver1 = wp;
- /* Get the data from all subdisks. */
- LIST_FOREACH(s, &p->subdisks, in_plex) {
- /* Skip the broken subdisk. */
- if (s == broken)
- continue;
+ GV_ENQUEUE(bp, cbp, pbp);
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->bio->bio_cmd = BIO_READ;
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
- return (ENOMEM);
- rbp->malloc = 1;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
- }
-
- /* Write the original data. */
- rbp = gv_new_raid5_bit();
- rbp->consumer = wp->original;
- rbp->buf = addr;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->bio->bio_cmd = BIO_WRITE;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- /*
- * Insert at the tail, because we want to read the old
- * data first.
- */
- TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
-
- /* Get the rest of the data again. */
- LIST_FOREACH(s, &p->subdisks, in_plex) {
- /*
- * Skip the broken subdisk, the parity, and the
- * one we just wrote.
- */
- if ((s == broken) ||
- (s->consumer == wp->parity) ||
- (s->consumer == wp->original))
- continue;
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->bio->bio_cmd = BIO_READ;
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
- return (ENOMEM);
- rbp->malloc = 1;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- /*
- * Again, insert at the tail to keep correct
- * order.
- */
- TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
- }
-
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/*
* A normal write request goes to the original subdisk, then we
@@ -589,52 +287,83 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* out the parity again.
*/
} else {
- wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
- if (wp->buf == NULL)
+ /* Read old parity. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- wp->bufmalloc = 1;
- LIST_FOREACH(s, &p->subdisks, in_plex) {
- /* Skip the parity stripe. */
- if (s->consumer == wp->parity)
- continue;
+ cbp->bio_cmd = BIO_READ;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = parity->consumer;
+ cbp->bio_driver1 = wp;
+
+ GV_ENQUEUE(bp, cbp, pbp);
+
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
+
+ /* Read old data. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
+ return (ENOMEM);
+ cbp->bio_cmd = BIO_READ;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = original->consumer;
+ cbp->bio_driver1 = wp;
+
+ GV_ENQUEUE(bp, cbp, pbp);
+
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
+
+ /* Write new data. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
+ return (ENOMEM);
+ cbp->bio_data = addr;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = original->consumer;
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- /*
- * The data for the original stripe is written,
- * the others need to be read in for the parity
- * calculation.
- */
- if (s->consumer == wp->original) {
- rbp->bio->bio_cmd = BIO_WRITE;
- rbp->buf = addr;
- } else {
- rbp->bio->bio_cmd = BIO_READ;
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
- return (ENOMEM);
- rbp->malloc = 1;
- }
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
- }
+ cbp->bio_driver1 = wp;
+
+ /*
+ * We must not write the new data until the old data
+ * was read, so hold this BIO back until we're ready
+ * for it.
+ */
+ wp->waiting = cbp;
+
+ /* The final bio for the parity. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
+ return (ENOMEM);
+ cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = parity->consumer;
+ cbp->bio_driver1 = wp;
+
+ /* Remember that this is the BIO for the parity data. */
+ wp->parity = cbp;
}
break;
+
default:
return (EINVAL);
}
- wp->state = VALID;
return (0);
}
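
For reference, the range check in the reworked gv_stripe_active() above is a plain interval-overlap test on the byte ranges locked by two work packets. A standalone sketch of the same predicate, with illustrative names only (not taken from the driver), looks like this:

#include <stdbool.h>
#include <sys/types.h>

/*
 * Return true when the byte ranges [base1, base1 + len1] and
 * [base2, base2 + len2] overlap.  These are the same two conditions
 * gv_stripe_active() evaluates against each earlier packet in the list.
 */
static bool
ranges_overlap(off_t base1, off_t len1, off_t base2, off_t len2)
{
	/* The new range starts inside the old one... */
	if (base1 >= base2 && base1 <= base2 + len2)
		return (true);
	/* ...or the new range starts before the old one and reaches it. */
	if (base1 <= base2 && base1 + len1 >= base2)
		return (true);
	return (false);
}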