author:    le <le@FreeBSD.org>  2004-09-18 13:44:43 +0000
committer: le <le@FreeBSD.org>  2004-09-18 13:44:43 +0000
commit:    18ba8315a7f99725d24144b4ba8f020a09eb6c80 (patch)
tree:      b26a5245ec9f7555a82415414ac98257151bf50c /sys/geom/vinum
parent:    1991acc23e3343dcdae467ab20ed1b0e729ea778 (diff)
Re-vamp how I/O is handled in volumes and plexes.
Analogous to the drive level, give each volume and plex a worker thread that picks up and processes incoming and completed BIOs. This should fix the data corruption issues that came up a few weeks ago and improve performance, especially of RAID5 plexes. The volume level needs a little work, though.
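For readers skimming the diff below: the core of the change is that every plex and volume now owns a BIO queue protected by a mutex, plus a kernel thread that drains it. A simplified userland sketch of that pattern follows (pthreads stand in for mtx_lock()/msleep()/wakeup(); struct bioq_entry and all names here are illustrative, not the kernel API):

/*
 * Simplified userland sketch of the per-object worker pattern this
 * commit introduces.  pthreads stand in for mtx_lock()/msleep()/
 * wakeup(); all names here are illustrative.
 */
#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

struct bioq_entry {
	void *bp;				/* would be a struct bio * */
	TAILQ_ENTRY(bioq_entry) queue;
};

struct worker {
	pthread_mutex_t mtx;			/* bqueue_mtx analogue */
	pthread_cond_t cv;
	int die;				/* GV_PLEX_THREAD_DIE analogue */
	TAILQ_HEAD(, bioq_entry) bqueue;
};

static void
worker_init(struct worker *w)
{
	pthread_mutex_init(&w->mtx, NULL);
	pthread_cond_init(&w->cv, NULL);
	w->die = 0;
	TAILQ_INIT(&w->bqueue);
}

/* Producer side: what gv_plex_start() and gv_plex_done() boil down to. */
static void
worker_enqueue(struct worker *w, void *bp)
{
	struct bioq_entry *bq;

	bq = calloc(1, sizeof(*bq));
	bq->bp = bp;
	pthread_mutex_lock(&w->mtx);
	TAILQ_INSERT_TAIL(&w->bqueue, bq, queue);
	pthread_cond_signal(&w->cv);		/* wakeup(p) analogue */
	pthread_mutex_unlock(&w->mtx);
}

/* Consumer side: the shape of gv_plex_worker()'s main loop. */
static void *
worker_loop(void *arg)
{
	struct worker *w = arg;
	struct bioq_entry *bq;

	pthread_mutex_lock(&w->mtx);
	for (;;) {
		if (w->die)			/* signaled to exit */
			break;
		bq = TAILQ_FIRST(&w->bqueue);
		if (bq == NULL) {
			/* msleep(p, &p->bqueue_mtx, ...) analogue */
			pthread_cond_wait(&w->cv, &w->mtx);
			continue;
		}
		TAILQ_REMOVE(&w->bqueue, bq, queue);
		pthread_mutex_unlock(&w->mtx);

		/* Process bq->bp: completed, on-hold or normal request. */
		free(bq);

		pthread_mutex_lock(&w->mtx);
	}
	pthread_mutex_unlock(&w->mtx);
	return (NULL);
}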
Diffstat (limited to 'sys/geom/vinum')
-rw-r--r--  sys/geom/vinum/geom_vinum.h        |   1
-rw-r--r--  sys/geom/vinum/geom_vinum_init.c   |   2
-rw-r--r--  sys/geom/vinum/geom_vinum_plex.c   | 475
-rw-r--r--  sys/geom/vinum/geom_vinum_raid5.c  | 633
-rw-r--r--  sys/geom/vinum/geom_vinum_raid5.h  |  63
-rw-r--r--  sys/geom/vinum/geom_vinum_rm.c     |   1
-rw-r--r--  sys/geom/vinum/geom_vinum_subr.c   |  17
-rw-r--r--  sys/geom/vinum/geom_vinum_var.h    |  15
-rw-r--r--  sys/geom/vinum/geom_vinum_volume.c | 228
9 files changed, 772 insertions, 663 deletions
diff --git a/sys/geom/vinum/geom_vinum.h b/sys/geom/vinum/geom_vinum.h
index a507d73..ddbf5cf 100644
--- a/sys/geom/vinum/geom_vinum.h
+++ b/sys/geom/vinum/geom_vinum.h
@@ -70,6 +70,7 @@ int gv_is_striped(struct gv_plex *);
int gv_is_open(struct g_geom *);
void gv_kill_drive_thread(struct gv_drive *);
void gv_kill_plex_thread(struct gv_plex *);
+void gv_kill_vol_thread(struct gv_volume *);
int gv_object_type(struct gv_softc *, char *);
void gv_parse_config(struct gv_softc *, u_char *, int);
const char *gv_roughlength(off_t, int);
diff --git a/sys/geom/vinum/geom_vinum_init.c b/sys/geom/vinum/geom_vinum_init.c
index 4ad7a03..382ea15 100644
--- a/sys/geom/vinum/geom_vinum_init.c
+++ b/sys/geom/vinum/geom_vinum_init.c
@@ -293,7 +293,7 @@ gv_sync_td(void *arg)
* This hack declares this bio as part of an initialization
* process, so that the lower levels allow it to get through.
*/
- bp->bio_caller1 = p;
+ bp->bio_cflags |= GV_BIO_SYNCREQ;
/* Schedule it down ... */
g_io_request(bp, to);
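The hunk above drops the old trick of smuggling the plex pointer through bio_caller1 and instead tags synchronization requests with an explicit flag, GV_BIO_SYNCREQ, in bio_cflags (added later in this commit in geom_vinum_var.h). A trimmed sketch of the idiom; only the flag values are taken from the diff, struct bio is reduced to one field:

#define GV_BIO_DONE	0x01
#define GV_BIO_SYNCREQ	0x08

struct bio_sketch {
	int bio_cflags;
};

static int
is_sync_request(struct bio_sketch *bp)
{
	/*
	 * The lower levels let initialization I/O through based on an
	 * explicit flag instead of comparing an overloaded pointer.
	 */
	return ((bp->bio_cflags & GV_BIO_SYNCREQ) != 0);
}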
diff --git a/sys/geom/vinum/geom_vinum_plex.c b/sys/geom/vinum/geom_vinum_plex.c
index 8cfa6be..494ec2c 100644
--- a/sys/geom/vinum/geom_vinum_plex.c
+++ b/sys/geom/vinum/geom_vinum_plex.c
@@ -43,6 +43,10 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>
+static void gv_plex_completed_request(struct gv_plex *, struct bio *);
+static void gv_plex_normal_request(struct gv_plex *, struct bio *);
+static void gv_plex_worker(void *);
+
/* XXX: is this the place to catch dying subdisks? */
static void
gv_plex_orphan(struct g_consumer *cp)
@@ -76,48 +80,39 @@ gv_plex_orphan(struct g_consumer *cp)
g_wither_geom(gp, error);
}
-static void
+void
gv_plex_done(struct bio *bp)
{
- struct g_geom *gp;
- struct gv_sd *s;
-
- gp = bp->bio_to->geom;
-
- s = bp->bio_caller1;
- KASSERT(s != NULL, ("gv_plex_done: NULL s"));
-
- if (bp->bio_error == 0)
- s->initialized += bp->bio_length;
-
- if (s->initialized >= s->size) {
- gv_set_sd_state(s, GV_SD_UP, 0);
- s->initialized = 0;
- }
-
- g_std_done(bp);
+ struct gv_plex *p;
+ struct gv_bioq *bq;
+
+ p = bp->bio_from->geom->softc;
+ bp->bio_cflags |= GV_BIO_DONE;
+ bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
+ bq->bp = bp;
+ mtx_lock(&p->bqueue_mtx);
+ TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
+ wakeup(p);
+ mtx_unlock(&p->bqueue_mtx);
}
/* Find the correct subdisk to send the bio to and build a bio to send. */
static int
-gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
- caddr_t addr, long bcount, off_t boff)
+gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
struct g_geom *gp;
- struct gv_plex *p;
struct gv_sd *s;
- struct bio *cbp;
+ struct bio *cbp, *pbp;
int i, sdno;
- off_t len_left, real_len, real_off, stripeend, stripeno, stripestart;
-
- s = NULL;
-
- gp = bp->bio_to->geom;
- p = gp->softc;
+ off_t len_left, real_len, real_off;
+ off_t stripeend, stripeno, stripestart;
if (p == NULL || LIST_EMPTY(&p->subdisks))
return (ENXIO);
+ s = NULL;
+ gp = bp->bio_to->geom;
+
/*
* We only handle concatenated and striped plexes here. RAID5 plexes
* are handled in build_raid5_request().
@@ -190,10 +185,10 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
break;
case GV_SD_STALE:
- if (bp->bio_caller1 != p)
+ if (!(bp->bio_cflags & GV_BIO_SYNCREQ))
return (ENXIO);
- printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name);
+ printf("GEOM_VINUM: sd %s is initializing\n", s->name);
gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
break;
@@ -214,103 +209,365 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_data = addr;
- if (bp->bio_caller1 == p) {
- cbp->bio_caller1 = s;
+ cbp->bio_done = g_std_done;
+ cbp->bio_caller2 = s->consumer;
+ if ((bp->bio_cflags & GV_BIO_SYNCREQ)) {
+ cbp->bio_cflags |= GV_BIO_SYNCREQ;
cbp->bio_done = gv_plex_done;
- } else
- cbp->bio_done = g_std_done;
- *bp2 = cbp;
- *cp = s->consumer;
+ }
+
+ if (bp->bio_driver1 == NULL) {
+ bp->bio_driver1 = cbp;
+ } else {
+ pbp = bp->bio_driver1;
+ while (pbp->bio_caller1 != NULL)
+ pbp = pbp->bio_caller1;
+ pbp->bio_caller1 = cbp;
+ }
+
return (0);
}
static void
gv_plex_start(struct bio *bp)
{
- struct g_geom *gp;
- struct g_consumer *cp;
struct gv_plex *p;
- struct gv_raid5_packet *wp;
- struct bio *bp2;
- caddr_t addr;
- off_t boff;
- long bcount, rcount;
- int err;
+ struct gv_bioq *bq;
- gp = bp->bio_to->geom;
- p = gp->softc;
+ switch(bp->bio_cmd) {
+ case BIO_READ:
+ case BIO_WRITE:
+ case BIO_DELETE:
+ break;
+ case BIO_GETATTR:
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ return;
+ }
/*
* We cannot handle this request if too many of our subdisks are
* inaccessible.
*/
- if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) {
- g_io_deliver(bp, ENXIO); /* XXX: correct way? */
+ p = bp->bio_to->geom->softc;
+ if ((p->state < GV_PLEX_DEGRADED) &&
+ !(bp->bio_cflags & GV_BIO_SYNCREQ)) {
+ g_io_deliver(bp, ENXIO);
return;
}
- switch(bp->bio_cmd) {
- case BIO_READ:
- case BIO_WRITE:
- case BIO_DELETE:
+ bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
+ bq->bp = bp;
+ mtx_lock(&p->bqueue_mtx);
+ TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
+ wakeup(p);
+ mtx_unlock(&p->bqueue_mtx);
+}
+
+static void
+gv_plex_worker(void *arg)
+{
+ struct bio *bp;
+ struct gv_plex *p;
+ struct gv_sd *s;
+ struct gv_bioq *bq;
+
+ p = arg;
+ KASSERT(p != NULL, ("NULL p"));
+
+ mtx_lock(&p->bqueue_mtx);
+ for (;;) {
+ /* We were signaled to exit. */
+ if (p->flags & GV_PLEX_THREAD_DIE)
+ break;
+
+ /* Take the first BIO from our queue. */
+ bq = TAILQ_FIRST(&p->bqueue);
+ if (bq == NULL) {
+ msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10);
+ continue;
+ }
+ TAILQ_REMOVE(&p->bqueue, bq, queue);
+ mtx_unlock(&p->bqueue_mtx);
+
+ bp = bq->bp;
+
+ /* A completed request. */
+ if (bp->bio_cflags & GV_BIO_DONE) {
+ g_free(bq);
+ if (bp->bio_cflags & GV_BIO_SYNCREQ) {
+ s = bp->bio_to->private;
+ if (bp->bio_error == 0)
+ s->initialized += bp->bio_length;
+ if (s->initialized >= s->size) {
+ g_topology_lock();
+ gv_set_sd_state(s, GV_SD_UP,
+ GV_SETSTATE_CONFIG);
+ g_topology_unlock();
+ s->initialized = 0;
+ }
+ g_std_done(bp);
+ } else
+ gv_plex_completed_request(p, bp);
/*
- * We split up the request in smaller packets and hand them
- * down to our subdisks.
+ * A sub-request that was held back because it interfered with
+ * another sub-request.
*/
- wp = NULL;
- addr = bp->bio_data;
- boff = bp->bio_offset;
- for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
- /*
- * RAID5 requests usually need to be split up in
- * several subrequests.
- */
- if (p->org == GV_PLEX_RAID5) {
- wp = gv_new_raid5_packet();
- wp->bio = bp;
- err = gv_build_raid5_req(wp, bp, addr, bcount,
- boff);
- } else
- err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount,
- boff);
+ } else if (bp->bio_cflags & GV_BIO_ONHOLD) {
+ /* Is it still locked out? */
+ if (gv_stripe_active(p, bp)) {
+ mtx_lock(&p->bqueue_mtx);
+ TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
+ mtx_unlock(&p->bqueue_mtx);
+ } else {
+ g_free(bq);
+ bp->bio_cflags &= ~GV_BIO_ONHOLD;
+ g_io_request(bp, bp->bio_caller2);
+ }
- if (err) {
- if (p->org == GV_PLEX_RAID5)
- gv_free_raid5_packet(wp);
- bp->bio_completed += bcount;
- if (bp->bio_error == 0)
- bp->bio_error = err;
- if (bp->bio_completed == bp->bio_length)
- g_io_deliver(bp, bp->bio_error);
- return;
+ /* A normal request to this plex. */
+ } else {
+ g_free(bq);
+ gv_plex_normal_request(p, bp);
+ }
+
+ mtx_lock(&p->bqueue_mtx);
+ }
+ mtx_unlock(&p->bqueue_mtx);
+ p->flags |= GV_PLEX_THREAD_DEAD;
+ wakeup(p);
+
+ kthread_exit(ENXIO);
+}
+
+void
+gv_plex_completed_request(struct gv_plex *p, struct bio *bp)
+{
+ struct bio *cbp, *pbp;
+ struct gv_bioq *bq, *bq2;
+ struct gv_raid5_packet *wp;
+ int i;
+
+ wp = bp->bio_driver1;
+
+ switch (bp->bio_parent->bio_cmd) {
+ case BIO_READ:
+ if (wp == NULL)
+ break;
+
+ TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
+ if (bq->bp == bp) {
+ TAILQ_REMOVE(&wp->bits, bq, queue);
+ g_free(bq);
+ for (i = 0; i < wp->length; i++)
+ wp->data[i] ^= bp->bio_data[i];
+ break;
+ }
+ }
+ if (TAILQ_EMPTY(&wp->bits)) {
+ bp->bio_parent->bio_completed += wp->length;
+ if (wp->lockbase != -1)
+ TAILQ_REMOVE(&p->packets, wp, list);
+ g_free(wp);
+ }
+
+ break;
+
+ case BIO_WRITE:
+ if (wp == NULL)
+ break;
+
+ /* Check if we need to handle parity data. */
+ TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
+ if (bq->bp == bp) {
+ TAILQ_REMOVE(&wp->bits, bq, queue);
+ g_free(bq);
+ cbp = wp->parity;
+ if (cbp != NULL) {
+ for (i = 0; i < wp->length; i++)
+ cbp->bio_data[i] ^=
+ bp->bio_data[i];
+ }
+ break;
}
-
- if (p->org != GV_PLEX_RAID5) {
- rcount = bp2->bio_length;
- g_io_request(bp2, cp);
-
- /*
- * RAID5 subrequests are queued on a worklist
- * and picked up from the worker thread. This
- * ensures correct order.
- */
+ }
+
+ /* Handle parity data. */
+ if (TAILQ_EMPTY(&wp->bits)) {
+ if (wp->waiting != NULL) {
+ pbp = wp->waiting;
+ wp->waiting = NULL;
+ cbp = wp->parity;
+ for (i = 0; i < wp->length; i++)
+ cbp->bio_data[i] ^= pbp->bio_data[i];
+ g_io_request(pbp, pbp->bio_caller2);
+ } else if (wp->parity != NULL) {
+ cbp = wp->parity;
+ wp->parity = NULL;
+ g_io_request(cbp, cbp->bio_caller2);
} else {
- mtx_lock(&p->worklist_mtx);
- TAILQ_INSERT_TAIL(&p->worklist, wp,
- list);
- mtx_unlock(&p->worklist_mtx);
- wakeup(&p);
- rcount = wp->length;
+ bp->bio_parent->bio_completed += wp->length;
+ TAILQ_REMOVE(&p->packets, wp, list);
+ g_free(wp);
}
+ }
+
+ break;
+ }
- boff += rcount;
- addr += rcount;
+ pbp = bp->bio_parent;
+ if (pbp->bio_error == 0)
+ pbp->bio_error = bp->bio_error;
+
+ /* When the original request is finished, we deliver it. */
+ pbp->bio_inbed++;
+ if (pbp->bio_inbed == pbp->bio_children)
+ g_io_deliver(pbp, pbp->bio_error);
+
+ /* Clean up what we allocated. */
+ if (bp->bio_cflags & GV_BIO_MALLOC)
+ g_free(bp->bio_data);
+ g_destroy_bio(bp);
+}
+
+void
+gv_plex_normal_request(struct gv_plex *p, struct bio *bp)
+{
+ struct bio *cbp, *pbp;
+ struct gv_bioq *bq, *bq2;
+ struct gv_raid5_packet *wp, *wp2;
+ caddr_t addr;
+ off_t bcount, boff;
+ int err;
+
+ bcount = bp->bio_length;
+ addr = bp->bio_data;
+ boff = bp->bio_offset;
+
+ /* Walk over the whole length of the request, we might split it up. */
+ while (bcount > 0) {
+ wp = NULL;
+
+ /*
+ * RAID5 plexes need special treatment, as a single write
+ * request involves several read/write sub-requests.
+ */
+ if (p->org == GV_PLEX_RAID5) {
+ wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
+ wp->bio = bp;
+ TAILQ_INIT(&wp->bits);
+
+ err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount);
+
+ /*
+ * Building the sub-request failed, we probably need to
+ * clean up a lot.
+ */
+ if (err) {
+ printf("GEOM_VINUM: plex request failed for ");
+ g_print_bio(bp);
+ printf("\n");
+ TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
+ TAILQ_REMOVE(&wp->bits, bq, queue);
+ g_free(bq);
+ }
+ if (wp->waiting != NULL) {
+ if (wp->waiting->bio_cflags &
+ GV_BIO_MALLOC)
+ g_free(wp->waiting->bio_data);
+ g_destroy_bio(wp->waiting);
+ }
+ if (wp->parity != NULL) {
+ if (wp->parity->bio_cflags &
+ GV_BIO_MALLOC)
+ g_free(wp->parity->bio_data);
+ g_destroy_bio(wp->parity);
+ }
+ g_free(wp);
+
+ TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
+ if (wp->bio == bp) {
+ TAILQ_REMOVE(&p->packets, wp,
+ list);
+ TAILQ_FOREACH_SAFE(bq,
+ &wp->bits, queue, bq2) {
+ TAILQ_REMOVE(&wp->bits,
+ bq, queue);
+ g_free(bq);
+ }
+ g_free(wp);
+ }
+ }
+
+ cbp = bp->bio_driver1;
+ while (cbp != NULL) {
+ pbp = cbp->bio_caller1;
+ if (cbp->bio_cflags & GV_BIO_MALLOC)
+ g_free(cbp->bio_data);
+ g_destroy_bio(cbp);
+ cbp = pbp;
+ }
+
+ g_io_deliver(bp, err);
+ return;
+ }
+
+ if (TAILQ_EMPTY(&wp->bits))
+ g_free(wp);
+ else if (wp->lockbase != -1)
+ TAILQ_INSERT_TAIL(&p->packets, wp, list);
+
+ /*
+ * Requests to concatenated and striped plexes go straight
+ * through.
+ */
+ } else {
+ err = gv_plexbuffer(p, bp, addr, boff, bcount);
+
+ /* Building the sub-request failed. */
+ if (err) {
+ printf("GEOM_VINUM: plex request failed for ");
+ g_print_bio(bp);
+ printf("\n");
+ cbp = bp->bio_driver1;
+ while (cbp != NULL) {
+ pbp = cbp->bio_caller1;
+ g_destroy_bio(cbp);
+ cbp = pbp;
+ }
+ g_io_deliver(bp, err);
+ return;
+ }
}
- return;
+
+ /* Abuse bio_caller1 as a linked list. */
+ pbp = bp->bio_driver1;
+ while (pbp->bio_caller1 != NULL)
+ pbp = pbp->bio_caller1;
+ bcount -= pbp->bio_length;
+ addr += pbp->bio_length;
+ boff += pbp->bio_length;
+ }
- default:
- g_io_deliver(bp, EOPNOTSUPP);
- return;
+ /* Fire off all sub-requests. */
+ pbp = bp->bio_driver1;
+ while (pbp != NULL) {
+ /*
+ * RAID5 sub-requests need to come in correct order, otherwise
+ * we trip over the parity, as it might be overwritten by
+ * another sub-request.
+ */
+ if (pbp->bio_driver1 != NULL &&
+ gv_stripe_active(p, pbp)) {
+ pbp->bio_cflags |= GV_BIO_ONHOLD;
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = pbp;
+ mtx_lock(&p->bqueue_mtx);
+ TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
+ mtx_unlock(&p->bqueue_mtx);
+ } else
+ g_io_request(pbp, pbp->bio_caller2);
+ pbp = pbp->bio_caller1;
}
}
@@ -425,16 +682,12 @@ gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
gp->softc = p;
p->geom = gp;
- /* RAID5 plexes need a 'worker' thread, where IO is handled. */
- if (p->org == GV_PLEX_RAID5) {
- TAILQ_INIT(&p->worklist);
- mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL,
- MTX_DEF);
- p->flags &= ~GV_PLEX_THREAD_DIE;
- kthread_create(gv_raid5_worker, gp, NULL, 0, 0,
- "gv_raid5");
- p->flags |= GV_PLEX_THREAD_ACTIVE;
- }
+ TAILQ_INIT(&p->packets);
+ TAILQ_INIT(&p->bqueue);
+ mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
+ kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s",
+ p->name);
+ p->flags |= GV_PLEX_THREAD_ACTIVE;
/* Attach a consumer to this provider. */
cp = g_new_consumer(gp);
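gv_plex_completed_request() above delivers the parent BIO only once every issued child has reported back, propagating the first error seen. A minimal sketch of that accounting; the three field names mirror struct bio, the function itself is illustrative:

struct parent_bio {
	int bio_error;
	int bio_inbed;		/* children that have completed */
	int bio_children;	/* children that were issued */
};

static void
child_done(struct parent_bio *pbp, int error)
{
	if (pbp->bio_error == 0)
		pbp->bio_error = error;	/* remember the first error */
	pbp->bio_inbed++;
	if (pbp->bio_inbed == pbp->bio_children) {
		/* g_io_deliver(pbp, pbp->bio_error) in the real code. */
	}
}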
diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c
index 8dfe8ab..62fb246 100644
--- a/sys/geom/vinum/geom_vinum_raid5.c
+++ b/sys/geom/vinum/geom_vinum_raid5.c
@@ -44,243 +44,62 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>
-int gv_raid5_parity(struct gv_raid5_packet *);
-int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);
-
-struct gv_raid5_bit *
-gv_new_raid5_bit(void)
-{
- struct gv_raid5_bit *r;
- r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
- KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
- return (r);
-}
-
-struct gv_raid5_packet *
-gv_new_raid5_packet(void)
-{
- struct gv_raid5_packet *wp;
-
- wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
- KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
- wp->state = SETUP;
- wp->type = JUNK;
- TAILQ_INIT(&wp->bits);
-
- return (wp);
-}
-
-void
-gv_free_raid5_packet(struct gv_raid5_packet *wp)
-{
- struct gv_raid5_bit *r, *r2;
-
- /* Remove all the bits from this work packet. */
- TAILQ_FOREACH_SAFE(r, &wp->bits, list, r2) {
- TAILQ_REMOVE(&wp->bits, r, list);
- if (r->malloc)
- g_free(r->buf);
- if (r->bio != NULL)
- g_destroy_bio(r->bio);
- g_free(r);
- }
-
- if (wp->bufmalloc == 1)
- g_free(wp->buf);
- g_free(wp);
-}
-
/*
* Check if the stripe that the work packet wants is already being used by
* some other work packet.
*/
int
-gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
-{
- struct gv_raid5_packet *wpa;
-
- TAILQ_FOREACH(wpa, &sc->worklist, list) {
- if (wpa->lockbase == wp->lockbase) {
- if (wpa == wp)
- return (0);
- return (1);
- }
- }
- return (0);
-}
-
-/*
- * The "worker" thread that runs through the worklist and fires off the
- * "subrequests" needed to fulfill a RAID5 read or write request.
- */
-void
-gv_raid5_worker(void *arg)
+gv_stripe_active(struct gv_plex *p, struct bio *bp)
{
- struct bio *bp;
- struct g_geom *gp;
- struct gv_plex *p;
- struct gv_raid5_packet *wp, *wpt;
- struct gv_raid5_bit *rbp, *rbpt;
- int error, restart;
-
- gp = arg;
- p = gp->softc;
-
- mtx_lock(&p->worklist_mtx);
- for (;;) {
- restart = 0;
- TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
- /* This request packet is already being processed. */
- if (wp->state == IO)
- continue;
- /* This request packet is ready for processing. */
- if (wp->state == VALID) {
- /* Couldn't get the lock, try again. */
- if ((wp->lockbase != -1) &&
- gv_stripe_active(wp, p))
- continue;
-
- wp->state = IO;
- mtx_unlock(&p->worklist_mtx);
- TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
- g_io_request(rbp->bio, rbp->consumer);
- mtx_lock(&p->worklist_mtx);
- continue;
- }
- if (wp->state == FINISH) {
- bp = wp->bio;
- bp->bio_completed += wp->length;
- /*
- * Deliver the original request if we have
- * finished.
- */
- if (bp->bio_completed == bp->bio_length) {
- mtx_unlock(&p->worklist_mtx);
- g_io_deliver(bp, 0);
- mtx_lock(&p->worklist_mtx);
- }
- TAILQ_REMOVE(&p->worklist, wp, list);
- gv_free_raid5_packet(wp);
- restart++;
- /*break;*/
- }
+ struct gv_raid5_packet *wp, *owp;
+ int overlap;
+
+ wp = bp->bio_driver1;
+ if (wp->lockbase == -1)
+ return (0);
+
+ overlap = 0;
+ TAILQ_FOREACH(owp, &p->packets, list) {
+ if (owp == wp)
+ break;
+ if ((wp->lockbase >= owp->lockbase) &&
+ (wp->lockbase <= owp->lockbase + owp->length)) {
+ overlap++;
+ break;
}
- if (!restart) {
- /* Self-destruct. */
- if (p->flags & GV_PLEX_THREAD_DIE)
- break;
- error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
- hz/100);
+ if ((wp->lockbase <= owp->lockbase) &&
+ (wp->lockbase + wp->length >= owp->lockbase)) {
+ overlap++;
+ break;
}
}
- mtx_unlock(&p->worklist_mtx);
-
- g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");
- /* Signal our plex that we are dead. */
- p->flags |= GV_PLEX_THREAD_DEAD;
- wakeup(p);
- kthread_exit(0);
-}
-
-/* Final bio transaction to write out the parity data. */
-int
-gv_raid5_parity(struct gv_raid5_packet *wp)
-{
- struct bio *bp;
-
- bp = g_new_bio();
- if (bp == NULL)
- return (ENOMEM);
-
- wp->type = ISPARITY;
- bp->bio_cmd = BIO_WRITE;
- bp->bio_data = wp->buf;
- bp->bio_offset = wp->offset;
- bp->bio_length = wp->length;
- bp->bio_done = gv_raid5_done;
- bp->bio_caller1 = wp;
- bp->bio_caller2 = NULL;
- g_io_request(bp, wp->parity);
-
- return (0);
-}
-
-/* We end up here after each subrequest. */
-void
-gv_raid5_done(struct bio *bp)
-{
- struct bio *obp;
- struct g_geom *gp;
- struct gv_plex *p;
- struct gv_raid5_packet *wp;
- struct gv_raid5_bit *rbp;
- off_t i;
- int error;
-
- wp = bp->bio_caller1;
- rbp = bp->bio_caller2;
- obp = wp->bio;
- gp = bp->bio_from->geom;
- p = gp->softc;
-
- /* One less active subrequest. */
- wp->active--;
-
- switch (obp->bio_cmd) {
- case BIO_READ:
- /* Degraded reads need to handle parity data. */
- if (wp->type == DEGRADED) {
- for (i = 0; i < wp->length; i++)
- wp->buf[i] ^= bp->bio_data[i];
-
- /* When we're finished copy back the data we want. */
- if (wp->active == 0)
- bcopy(wp->buf, wp->data, wp->length);
- }
-
- break;
-
- case BIO_WRITE:
- /* Handle the parity data, if needed. */
- if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
- for (i = 0; i < wp->length; i++)
- wp->buf[i] ^= bp->bio_data[i];
-
- /* Write out the parity data we calculated. */
- if (wp->active == 0) {
- wp->active++;
- error = gv_raid5_parity(wp);
- }
- }
- break;
- }
-
- /* This request group is done. */
- if (wp->active == 0)
- wp->state = FINISH;
+ return (overlap);
}
/* Build a request group to perform (part of) a RAID5 request. */
int
-gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
- long bcount, off_t boff)
+gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
+ struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
struct g_geom *gp;
- struct gv_plex *p;
- struct gv_raid5_bit *rbp;
struct gv_sd *broken, *original, *parity, *s;
- int i, psdno, sdno;
- off_t len_left, real_off, stripeend, stripeoff, stripestart;
+ struct gv_bioq *bq;
+ struct bio *cbp, *pbp;
+ int i, psdno, sdno, type;
+ off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;
gp = bp->bio_to->geom;
- p = gp->softc;
if (p == NULL || LIST_EMPTY(&p->subdisks))
return (ENXIO);
/* We are optimistic and assume that this request will be OK. */
- wp->type = NORMAL;
+#define REQ_TYPE_NORMAL 0
+#define REQ_TYPE_DEGRADED 1
+#define REQ_TYPE_NOPARITY 2
+
+ type = REQ_TYPE_NORMAL;
original = parity = broken = NULL;
/* The number of the subdisk containing the parity stripe. */
@@ -330,29 +149,20 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
/* Our data stripe is missing. */
if (original->state != GV_SD_UP)
- wp->type = DEGRADED;
+ type = REQ_TYPE_DEGRADED;
/* Our parity stripe is missing. */
if (parity->state != GV_SD_UP) {
/* We cannot take another failure if we're already degraded. */
- if (wp->type != NORMAL)
+ if (type != REQ_TYPE_NORMAL)
return (ENXIO);
else
- wp->type = NOPARITY;
+ type = REQ_TYPE_NOPARITY;
}
- /*
- * A combined write is necessary when the original data subdisk and the
- * parity subdisk are both up, but one of the other subdisks isn't.
- */
- if ((broken != NULL) && (broken != parity) && (broken != original))
- wp->type = COMBINED;
-
- wp->offset = real_off;
- wp->length = (bcount <= len_left) ? bcount : len_left;
+ real_len = (bcount <= len_left) ? bcount : len_left;
+ wp->length = real_len;
wp->data = addr;
- wp->original = original->consumer;
- wp->parity = parity->consumer;
- wp->lockbase = stripestart;
+ wp->lockbase = real_off;
KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
@@ -363,58 +173,45 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* the broken one plus the parity stripe and then recalculate
* the desired data.
*/
- if (wp->type == DEGRADED) {
- wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
- if (wp->buf == NULL)
- return (ENOMEM);
- wp->bufmalloc = 1;
+ if (type == REQ_TYPE_DEGRADED) {
+ bzero(wp->data, wp->length);
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken subdisk. */
if (s == broken)
continue;
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- rbp->malloc = 1;
- rbp->bio->bio_cmd = BIO_READ;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = s->consumer;
+ cbp->bio_driver1 = wp;
+
+ GV_ENQUEUE(bp, cbp, pbp);
+
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
}
/* A normal read can be fulfilled with the original subdisk. */
} else {
- rbp = gv_new_raid5_bit();
- rbp->consumer = wp->original;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- rbp->bio->bio_cmd = BIO_READ;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->buf = addr;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_data = addr;
+ cbp->bio_done = g_std_done;
+ cbp->bio_caller2 = original->consumer;
+
+ GV_ENQUEUE(bp, cbp, pbp);
}
- if (wp->type != COMBINED)
- wp->lockbase = -1;
+ wp->lockbase = -1;
+
break;
case BIO_WRITE:
@@ -424,164 +221,65 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* recalculate the parity from the original data, and then
* write the parity stripe back out.
*/
- if (wp->type == DEGRADED) {
- wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
- if (wp->buf == NULL)
- return (ENOMEM);
- wp->bufmalloc = 1;
-
- /* Copy the original data. */
- bcopy(wp->data, wp->buf, wp->length);
-
+ if (type == REQ_TYPE_DEGRADED) {
+ /* Read all subdisks. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken and the parity subdisk. */
- if ((s == broken) ||
- (s->consumer == wp->parity))
+ if ((s == broken) || (s == parity))
continue;
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- rbp->malloc = 1;
- rbp->bio->bio_cmd = BIO_READ;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
+ cbp->bio_cmd = BIO_READ;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = s->consumer;
+ cbp->bio_driver1 = wp;
+
+ GV_ENQUEUE(bp, cbp, pbp);
+
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
}
- /*
- * When we don't have the parity stripe we just write out the
- * data.
- */
- } else if (wp->type == NOPARITY) {
- rbp = gv_new_raid5_bit();
- rbp->consumer = wp->original;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
+ /* Write the parity data. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- rbp->bio->bio_cmd = BIO_WRITE;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_data = addr;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ bcopy(addr, cbp->bio_data, real_len);
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = parity->consumer;
+ cbp->bio_driver1 = wp;
+ wp->parity = cbp;
/*
- * A combined write means that our data subdisk and the parity
- * subdisks are both up, but another subdisk isn't. We need to
- * read all valid stripes including the parity to recalculate
- * the data of the stripe that is missing. Then we write our
- * original data, and together with the other data stripes
- * recalculate the parity again.
+ * When the parity stripe is missing we just write out the data.
*/
- } else if (wp->type == COMBINED) {
- wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
- if (wp->buf == NULL)
+ } else if (type == REQ_TYPE_NOPARITY) {
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- wp->bufmalloc = 1;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_data = addr;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = original->consumer;
+ cbp->bio_driver1 = wp;
- /* Get the data from all subdisks. */
- LIST_FOREACH(s, &p->subdisks, in_plex) {
- /* Skip the broken subdisk. */
- if (s == broken)
- continue;
+ GV_ENQUEUE(bp, cbp, pbp);
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->bio->bio_cmd = BIO_READ;
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
- return (ENOMEM);
- rbp->malloc = 1;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
- }
-
- /* Write the original data. */
- rbp = gv_new_raid5_bit();
- rbp->consumer = wp->original;
- rbp->buf = addr;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->bio->bio_cmd = BIO_WRITE;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- /*
- * Insert at the tail, because we want to read the old
- * data first.
- */
- TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
-
- /* Get the rest of the data again. */
- LIST_FOREACH(s, &p->subdisks, in_plex) {
- /*
- * Skip the broken subdisk, the parity, and the
- * one we just wrote.
- */
- if ((s == broken) ||
- (s->consumer == wp->parity) ||
- (s->consumer == wp->original))
- continue;
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- rbp->bio->bio_cmd = BIO_READ;
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
- return (ENOMEM);
- rbp->malloc = 1;
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- /*
- * Again, insert at the tail to keep correct
- * order.
- */
- TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
- }
-
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/*
* A normal write request goes to the original subdisk, then we
@@ -589,52 +287,83 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* out the parity again.
*/
} else {
- wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
- if (wp->buf == NULL)
+ /* Read old parity. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
return (ENOMEM);
- wp->bufmalloc = 1;
- LIST_FOREACH(s, &p->subdisks, in_plex) {
- /* Skip the parity stripe. */
- if (s->consumer == wp->parity)
- continue;
+ cbp->bio_cmd = BIO_READ;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = parity->consumer;
+ cbp->bio_driver1 = wp;
+
+ GV_ENQUEUE(bp, cbp, pbp);
+
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
+
+ /* Read old data. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
+ return (ENOMEM);
+ cbp->bio_cmd = BIO_READ;
+ cbp->bio_data = g_malloc(real_len, M_WAITOK);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = original->consumer;
+ cbp->bio_driver1 = wp;
+
+ GV_ENQUEUE(bp, cbp, pbp);
+
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = cbp;
+ TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
+
+ /* Write new data. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
+ return (ENOMEM);
+ cbp->bio_data = addr;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = original->consumer;
- rbp = gv_new_raid5_bit();
- rbp->consumer = s->consumer;
- rbp->bio = g_new_bio();
- if (rbp->bio == NULL)
- return (ENOMEM);
- /*
- * The data for the original stripe is written,
- * the others need to be read in for the parity
- * calculation.
- */
- if (s->consumer == wp->original) {
- rbp->bio->bio_cmd = BIO_WRITE;
- rbp->buf = addr;
- } else {
- rbp->bio->bio_cmd = BIO_READ;
- rbp->buf = g_malloc(wp->length,
- M_NOWAIT | M_ZERO);
- if (rbp->buf == NULL)
- return (ENOMEM);
- rbp->malloc = 1;
- }
- rbp->bio->bio_data = rbp->buf;
- rbp->bio->bio_offset = wp->offset;
- rbp->bio->bio_length = wp->length;
- rbp->bio->bio_done = gv_raid5_done;
- rbp->bio->bio_caller1 = wp;
- rbp->bio->bio_caller2 = rbp;
- TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
- wp->active++;
- wp->rqcount++;
- }
+ cbp->bio_driver1 = wp;
+
+ /*
+ * We must not write the new data until the old data
+ * was read, so hold this BIO back until we're ready
+ * for it.
+ */
+ wp->waiting = cbp;
+
+ /* The final bio for the parity. */
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL)
+ return (ENOMEM);
+ cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
+ cbp->bio_cflags |= GV_BIO_MALLOC;
+ cbp->bio_offset = real_off;
+ cbp->bio_length = real_len;
+ cbp->bio_done = gv_plex_done;
+ cbp->bio_caller2 = parity->consumer;
+ cbp->bio_driver1 = wp;
+
+ /* Remember that this is the BIO for the parity data. */
+ wp->parity = cbp;
}
break;
+
default:
return (EINVAL);
}
- wp->state = VALID;
return (0);
}
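Both the degraded-read reconstruction and the parity update above are plain XOR folds over equal-length buffers (the wp->data[i] ^= bp->bio_data[i] loops in geom_vinum_plex.c). A standalone demonstration of the property RAID5 depends on, with made-up four-byte stripes:

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* XOR src into dst: the core operation behind RAID5 parity. */
static void
xor_block(unsigned char *dst, const unsigned char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dst[i] ^= src[i];
}

int
main(void)
{
	unsigned char d0[4] = "abcd", d1[4] = "wxyz";
	unsigned char parity[4] = { 0 }, rebuilt[4] = { 0 };

	/* parity = d0 ^ d1, as on a healthy write. */
	xor_block(parity, d0, 4);
	xor_block(parity, d1, 4);

	/* Lose d1; rebuild it from the survivors: d1 = d0 ^ parity. */
	xor_block(rebuilt, d0, 4);
	xor_block(rebuilt, parity, 4);
	assert(memcmp(rebuilt, d1, 4) == 0);
	return (0);
}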
diff --git a/sys/geom/vinum/geom_vinum_raid5.h b/sys/geom/vinum/geom_vinum_raid5.h
index 454311f..8074f42 100644
--- a/sys/geom/vinum/geom_vinum_raid5.h
+++ b/sys/geom/vinum/geom_vinum_raid5.h
@@ -32,22 +32,23 @@
/*
* A single RAID5 request usually needs more than one I/O transaction,
* depending on the state of the associated subdisks and the direction of the
- * transaction (read or write). Every subrequest of a RAID5 request,
- * represented by a gv_raid_packet, is defined by a gv_raid5_bit.
+ * transaction (read or write).
*/
-/* A subrequest of a RAID5 read/write operation. */
-struct gv_raid5_bit {
- struct bio *bio; /* BIO of this subrequest. */
- caddr_t buf; /* Data buffer of this subrequest. */
- int malloc; /* Flag if data buffer was malloced. */
- struct g_consumer *consumer; /* Consumer to send the BIO to. */
- TAILQ_ENTRY(gv_raid5_bit) list; /* Entry in the list of this request. */
-};
+#define GV_ENQUEUE(bp, cbp, pbp) \
+ do { \
+ if (bp->bio_driver1 == NULL) { \
+ bp->bio_driver1 = cbp; \
+ } else { \
+ pbp = bp->bio_driver1; \
+ while (pbp->bio_caller1 != NULL) \
+ pbp = pbp->bio_caller1; \
+ pbp->bio_caller1 = cbp; \
+ } \
+ } while (0);
-/* Container for one or more gv_raid5_bits; represents a RAID5 I/O request. */
struct gv_raid5_packet {
- caddr_t buf; /* Data buffer of this RAID5 request. */
+ caddr_t data; /* Data buffer of this sub-request. */
off_t length; /* Size of data buffer. */
off_t lockbase; /* Deny access to our plex offset. */
off_t offset; /* The drive offset of the subdisk. */
@@ -56,39 +57,17 @@ struct gv_raid5_packet {
int rqcount; /* Count of subrequests. */
struct bio *bio; /* Pointer to the original bio. */
- caddr_t data; /* Pointer to the original data. */
-
- struct g_consumer *original; /* Consumer to the data stripe. */
- struct g_consumer *parity; /* Consumer to the parity stripe. */
-
- /* State of this RAID5 packet. */
- enum {
- SETUP, /* Newly created. */
- VALID, /* Ready for processing. */
- IO, /* Currently doing I/O. */
- FINISH /* Packet has finished. */
- } state;
-
- /* Type of this RAID5 transaction. */
- enum {
- JUNK, /* Newly created, not valid. */
- NORMAL, /* Normal read or write. */
- ISPARITY, /* Containing only parity data. */
- NOPARITY, /* Parity stripe not available. */
- DEGRADED, /* Data stripe not available. */
- COMBINED /* Data and parity stripes ok, others not. */
- } type;
+ struct bio *parity; /* The bio containing the parity data. */
+ struct bio *waiting; /* A bio that needs to wait for other bios. */
- TAILQ_HEAD(,gv_raid5_bit) bits; /* List of subrequests. */
- TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */
+ TAILQ_HEAD(,gv_bioq) bits; /* List of subrequests. */
+ TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */
};
-int gv_build_raid5_req(struct gv_raid5_packet *, struct bio *, caddr_t,
- long, off_t);
-void gv_free_raid5_packet(struct gv_raid5_packet *);
-void gv_raid5_done(struct bio *);
+int gv_stripe_active(struct gv_plex *, struct bio *);
+int gv_build_raid5_req(struct gv_plex *, struct gv_raid5_packet *,
+ struct bio *, caddr_t, off_t, off_t);
void gv_raid5_worker(void *);
-struct gv_raid5_packet *gv_new_raid5_packet(void);
-struct gv_raid5_bit *gv_new_raid5_bit(void);
+void gv_plex_done(struct bio *);
#endif /* !_GEOM_VINUM_RAID5_H_ */
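GV_ENQUEUE chains each cloned sub-request onto a singly linked list threaded through the parent: bio_driver1 holds the head and each clone's bio_caller1 points at the next one. An equivalent plain-function sketch of what the macro expands to; only bio_driver1 and bio_caller1 mirror struct bio, the rest is trimmed away:

#include <stddef.h>

struct bio_sketch {
	struct bio_sketch *bio_driver1;	/* head of the sub-request list */
	struct bio_sketch *bio_caller1;	/* next sub-request */
};

static void
gv_enqueue_sketch(struct bio_sketch *bp, struct bio_sketch *cbp)
{
	struct bio_sketch *pbp;

	if (bp->bio_driver1 == NULL) {
		bp->bio_driver1 = cbp;
	} else {
		pbp = bp->bio_driver1;
		while (pbp->bio_caller1 != NULL)
			pbp = pbp->bio_caller1;
		pbp->bio_caller1 = cbp;
	}
}

Appending at the tail keeps the clones in issue order, which is what lets gv_plex_normal_request() walk the list once to fire them off and hold back any sub-request that would touch a locked stripe.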
diff --git a/sys/geom/vinum/geom_vinum_rm.c b/sys/geom/vinum/geom_vinum_rm.c
index cb2af79..d328c50 100644
--- a/sys/geom/vinum/geom_vinum_rm.c
+++ b/sys/geom/vinum/geom_vinum_rm.c
@@ -166,6 +166,7 @@ gv_rm_vol(struct gv_softc *sc, struct gctl_req *req, struct gv_volume *v, int fl
/* Clean up and let our geom fade away. */
LIST_REMOVE(v, volume);
+ gv_kill_vol_thread(v);
g_free(v);
if (gp != NULL) {
gp->softc = NULL;
diff --git a/sys/geom/vinum/geom_vinum_subr.c b/sys/geom/vinum/geom_vinum_subr.c
index dedb6c3..8ebe135 100644
--- a/sys/geom/vinum/geom_vinum_subr.c
+++ b/sys/geom/vinum/geom_vinum_subr.c
@@ -832,12 +832,25 @@ gv_kill_drive_thread(struct gv_drive *d)
void
gv_kill_plex_thread(struct gv_plex *p)
{
- if ((p->org == GV_PLEX_RAID5) && (p->flags & GV_PLEX_THREAD_ACTIVE)) {
+ if (p->flags & GV_PLEX_THREAD_ACTIVE) {
p->flags |= GV_PLEX_THREAD_DIE;
wakeup(p);
while (!(p->flags & GV_PLEX_THREAD_DEAD))
tsleep(p, PRIBIO, "gv_die", hz);
p->flags &= ~GV_PLEX_THREAD_ACTIVE;
- mtx_destroy(&p->worklist_mtx);
+ mtx_destroy(&p->bqueue_mtx);
+ }
+}
+
+void
+gv_kill_vol_thread(struct gv_volume *v)
+{
+ if (v->flags & GV_VOL_THREAD_ACTIVE) {
+ v->flags |= GV_VOL_THREAD_DIE;
+ wakeup(v);
+ while (!(v->flags & GV_VOL_THREAD_DEAD))
+ tsleep(v, PRIBIO, "gv_die", hz);
+ v->flags &= ~GV_VOL_THREAD_ACTIVE;
+ mtx_destroy(&v->bqueue_mtx);
}
}
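gv_kill_plex_thread() and the new gv_kill_vol_thread() share a three-flag handshake: set THREAD_DIE, wake the worker, then sleep until it raises THREAD_DEAD. A userland sketch of the shape (pthreads stand in for wakeup()/tsleep(); the flag names follow the diff, everything else is illustrative):

#include <pthread.h>

#define THREAD_ACTIVE	0x01
#define THREAD_DIE	0x02
#define THREAD_DEAD	0x04

struct gv_obj {
	pthread_mutex_t mtx;
	pthread_cond_t cv;
	int flags;
};

/* The gv_kill_*_thread() side: ask the worker to die, wait for it. */
static void
kill_thread(struct gv_obj *o)
{
	pthread_mutex_lock(&o->mtx);
	if (o->flags & THREAD_ACTIVE) {
		o->flags |= THREAD_DIE;
		pthread_cond_broadcast(&o->cv);		/* wakeup() */
		while (!(o->flags & THREAD_DEAD))
			pthread_cond_wait(&o->cv, &o->mtx); /* tsleep() */
		o->flags &= ~THREAD_ACTIVE;
	}
	pthread_mutex_unlock(&o->mtx);
}

/* The worker side on its way out, cf. the end of gv_plex_worker(). */
static void
worker_exit(struct gv_obj *o)
{
	pthread_mutex_lock(&o->mtx);
	o->flags |= THREAD_DEAD;
	pthread_cond_broadcast(&o->cv);
	pthread_mutex_unlock(&o->mtx);
}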
diff --git a/sys/geom/vinum/geom_vinum_var.h b/sys/geom/vinum/geom_vinum_var.h
index 38b540f..99c1c37 100644
--- a/sys/geom/vinum/geom_vinum_var.h
+++ b/sys/geom/vinum/geom_vinum_var.h
@@ -111,6 +111,8 @@
#define GV_BIO_DONE 0x01
#define GV_BIO_MALLOC 0x02
#define GV_BIO_ONHOLD 0x04
+#define GV_BIO_SYNCREQ 0x08
+#define GV_BIO_SUCCEED 0x10
/*
* hostname is 256 bytes long, but we don't need to shlep multiple copies in
@@ -269,8 +271,9 @@ struct gv_plex {
off_t synced; /* Count of synced bytes. */
- struct mtx worklist_mtx; /* Mutex for RAID5 worklist. */
- TAILQ_HEAD(,gv_raid5_packet) worklist; /* List of RAID5 work packets. */
+ struct mtx bqueue_mtx; /* Lock for the BIO queue. */
+ TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */
+ TAILQ_HEAD(,gv_raid5_packet) packets; /* RAID5 sub-requests. */
LIST_HEAD(,gv_sd) subdisks; /* List of attached subdisks. */
LIST_ENTRY(gv_plex) in_volume; /* Plex list of associated volume. */
@@ -292,6 +295,14 @@ struct gv_volume {
#define GV_VOL_DOWN 0
#define GV_VOL_UP 1
+ int flags;
+#define GV_VOL_THREAD_ACTIVE 0x01 /* Volume has an active thread. */
+#define GV_VOL_THREAD_DIE 0x02 /* Signal the thread to die. */
+#define GV_VOL_THREAD_DEAD 0x04 /* The thread has died. */
+
+ struct mtx bqueue_mtx; /* Lock for the BIO queue. */
+ TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */
+
LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */
LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. */
diff --git a/sys/geom/vinum/geom_vinum_volume.c b/sys/geom/vinum/geom_vinum_volume.c
index a2f262d..4ace9d2 100644
--- a/sys/geom/vinum/geom_vinum_volume.c
+++ b/sys/geom/vinum/geom_vinum_volume.c
@@ -31,6 +31,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/kernel.h>
+#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -42,6 +43,9 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum.h>
+static void gv_vol_completed_request(struct gv_volume *, struct bio *);
+static void gv_vol_normal_request(struct gv_volume *, struct bio *);
+
static void
gv_volume_orphan(struct g_consumer *cp)
{
@@ -62,8 +66,10 @@ gv_volume_orphan(struct g_consumer *cp)
if (!LIST_EMPTY(&gp->consumer))
return;
v = gp->softc;
- if (v != NULL)
+ if (v != NULL) {
+ gv_kill_vol_thread(v);
v->geom = NULL;
+ }
gp->softc = NULL;
g_wither_geom(gp, error);
}
@@ -72,78 +78,185 @@ gv_volume_orphan(struct g_consumer *cp)
static void
gv_volume_done(struct bio *bp)
{
- struct g_consumer *cp;
-
- /* The next plex in this volume. */
- cp = LIST_NEXT(bp->bio_from, consumer);
-
- switch (bp->bio_cmd) {
- case BIO_READ:
- /*
- * If no error occured on this request, or if we have no plex
- * left, finish here...
- */
- if ((bp->bio_error == 0) || (cp == NULL)) {
- g_std_done(bp);
- return;
- }
+ struct gv_volume *v;
+ struct gv_bioq *bq;
+
+ v = bp->bio_from->geom->softc;
+ bp->bio_cflags |= GV_BIO_DONE;
+ bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
+ bq->bp = bp;
+ mtx_lock(&v->bqueue_mtx);
+ TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
+ wakeup(v);
+ mtx_unlock(&v->bqueue_mtx);
+}
- /* ... or try to read from the next plex. */
- g_io_request(bp, cp);
- return;
+static void
+gv_volume_start(struct bio *bp)
+{
+ struct gv_volume *v;
+ struct gv_bioq *bq;
+ switch(bp->bio_cmd) {
+ case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
- /* No more plexes left. */
- if (cp == NULL) {
- /*
- * Clear any errors if one of the previous writes
- * succeeded.
- */
- if (bp->bio_caller1 == (int *)1)
- bp->bio_error = 0;
- g_std_done(bp);
- return;
- }
-
- /* If this write request had no errors, remember that fact... */
- if (bp->bio_error == 0)
- bp->bio_caller1 = (int *)1;
+ break;
+ case BIO_GETATTR:
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ return;
+ }
- /* ... and write to the next plex. */
- g_io_request(bp, cp);
+ v = bp->bio_to->geom->softc;
+ if (v->state != GV_VOL_UP) {
+ g_io_deliver(bp, ENXIO);
return;
}
+
+ bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
+ bq->bp = bp;
+ mtx_lock(&v->bqueue_mtx);
+ TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
+ wakeup(v);
+ mtx_unlock(&v->bqueue_mtx);
}
static void
-gv_volume_start(struct bio *bp)
+gv_vol_worker(void *arg)
{
- struct g_geom *gp;
- struct bio *bp2;
+ struct bio *bp;
struct gv_volume *v;
+ struct gv_bioq *bq;
+
+ v = arg;
+ KASSERT(v != NULL, ("NULL v"));
+ mtx_lock(&v->bqueue_mtx);
+ for (;;) {
+ /* We were signaled to exit. */
+ if (v->flags & GV_VOL_THREAD_DIE)
+ break;
+
+ /* Take the first BIO from our queue. */
+ bq = TAILQ_FIRST(&v->bqueue);
+ if (bq == NULL) {
+ msleep(v, &v->bqueue_mtx, PRIBIO, "-", hz/10);
+ continue;
+ }
+ TAILQ_REMOVE(&v->bqueue, bq, queue);
+ mtx_unlock(&v->bqueue_mtx);
- gp = bp->bio_to->geom;
- v = gp->softc;
- if (v->state != GV_VOL_UP) {
- g_io_deliver(bp, ENXIO);
- return;
+ bp = bq->bp;
+ g_free(bq);
+
+ if (bp->bio_cflags & GV_BIO_DONE)
+ gv_vol_completed_request(v, bp);
+ else
+ gv_vol_normal_request(v, bp);
+
+ mtx_lock(&v->bqueue_mtx);
}
- switch(bp->bio_cmd) {
+ mtx_unlock(&v->bqueue_mtx);
+ v->flags |= GV_VOL_THREAD_DEAD;
+ wakeup(v);
+
+ kthread_exit(ENXIO);
+}
+
+static void
+gv_vol_completed_request(struct gv_volume *v, struct bio *bp)
+{
+ struct bio *pbp;
+ struct gv_bioq *bq;
+
+ pbp = bp->bio_parent;
+
+ if (pbp->bio_error == 0)
+ pbp->bio_error = bp->bio_error;
+
+ switch (pbp->bio_cmd) {
case BIO_READ:
+ if (bp->bio_error) {
+ g_destroy_bio(bp);
+ pbp->bio_children--;
+ bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+ bq->bp = pbp;
+ mtx_lock(&v->bqueue_mtx);
+ TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
+ mtx_unlock(&v->bqueue_mtx);
+ return;
+ }
+ break;
case BIO_WRITE:
case BIO_DELETE:
- bp2 = g_clone_bio(bp);
- if (bp2 == NULL) {
+ break;
+ }
+
+ /* When the original request is finished, we deliver it. */
+ pbp->bio_inbed++;
+ if (pbp->bio_inbed == pbp->bio_children) {
+ pbp->bio_completed = bp->bio_length;
+ g_io_deliver(pbp, pbp->bio_error);
+ }
+
+ g_destroy_bio(bp);
+}
+
+static void
+gv_vol_normal_request(struct gv_volume *v, struct bio *bp)
+{
+ struct g_geom *gp;
+ struct gv_plex *p;
+ struct bio *cbp, *pbp;
+
+ gp = v->geom;
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
- bp2->bio_done = gv_volume_done;
- g_io_request(bp2, LIST_FIRST(&gp->consumer));
- return;
- default:
- g_io_deliver(bp, EOPNOTSUPP);
- return;
+ cbp->bio_done = gv_volume_done;
+ LIST_FOREACH(p, &v->plexes, in_volume) {
+ if (p->state >= GV_PLEX_DEGRADED)
+ break;
+ }
+ g_io_request(cbp, p->consumer);
+
+ break;
+
+ case BIO_WRITE:
+ case BIO_DELETE:
+ LIST_FOREACH(p, &v->plexes, in_volume) {
+ if (p->state < GV_PLEX_DEGRADED)
+ continue;
+
+ cbp = g_clone_bio(bp);
+ if (cbp == NULL) /* XXX */
+ g_io_deliver(bp, ENOMEM);
+ cbp->bio_done = gv_volume_done;
+ cbp->bio_caller2 = p->consumer;
+
+ if (bp->bio_driver1 == NULL) {
+ bp->bio_driver1 = cbp;
+ } else {
+ pbp = bp->bio_driver1;
+ while (pbp->bio_caller1 != NULL)
+ pbp = pbp->bio_caller1;
+ pbp->bio_caller1 = cbp;
+ }
+ }
+
+ /* Fire off all sub-requests. */
+ pbp = bp->bio_driver1;
+ while (pbp != NULL) {
+ g_io_request(pbp, pbp->bio_caller2);
+ pbp = pbp->bio_caller1;
+ }
+
+ break;
}
}
@@ -211,6 +324,11 @@ gv_volume_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
gp->access = gv_volume_access;
gp->softc = v;
first++;
+ TAILQ_INIT(&v->bqueue);
+ mtx_init(&v->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
+ kthread_create(gv_vol_worker, v, NULL, 0, 0, "gv_v %s",
+ v->name);
+ v->flags |= GV_VOL_THREAD_ACTIVE;
} else
gp = v->geom;
@@ -261,9 +379,13 @@ static int
gv_volume_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp)
{
+ struct gv_volume *v;
+
g_trace(G_T_TOPOLOGY, "gv_volume_destroy_geom: %s", gp->name);
g_topology_assert();
+ v = gp->softc;
+ gv_kill_vol_thread(v);
g_wither_geom(gp, ENXIO);
return (0);
}
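On the read path, gv_vol_normal_request() above picks the first plex that is at least degraded and sends the clone there; on a read error, gv_vol_completed_request() requeues the parent so the worker can try again. A trimmed sketch of the selection loop (the list field names follow the diff; GV_PLEX_DEGRADED's value here is an illustrative stand-in for the real definition in geom_vinum_var.h):

#include <stddef.h>
#include <sys/queue.h>

#define GV_PLEX_DEGRADED	2	/* assumption: higher is healthier */

struct plex {
	int state;
	LIST_ENTRY(plex) in_volume;
};

struct volume {
	LIST_HEAD(, plex) plexes;
};

static struct plex *
pick_read_plex(struct volume *v)
{
	struct plex *p;

	LIST_FOREACH(p, &v->plexes, in_volume) {
		if (p->state >= GV_PLEX_DEGRADED)
			return (p);
	}
	/* The kernel code assumes at least one usable plex exists. */
	return (NULL);
}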