summaryrefslogtreecommitdiffstats
path: root/sys/geom/vinum
diff options
context:
space:
mode:
authorle <le@FreeBSD.org>2004-09-30 12:57:35 +0000
committerle <le@FreeBSD.org>2004-09-30 12:57:35 +0000
commitada7c51edc72a4c9783343b1af3a2335f448baed (patch)
tree6a80fd9dd1fc6bc23ae20575319c461f4e1c54ec /sys/geom/vinum
parentb0a3fc4cda4ac786b04578ff6fe1a009fd38f6e5 (diff)
downloadFreeBSD-src-ada7c51edc72a4c9783343b1af3a2335f448baed.zip
FreeBSD-src-ada7c51edc72a4c9783343b1af3a2335f448baed.tar.gz
Make it possible to rebuild degraded RAID5 plexes. Note that it is
currently not possible to do this while the volume is mounted. MFC in: 1 week
Diffstat (limited to 'sys/geom/vinum')
-rw-r--r--sys/geom/vinum/geom_vinum_init.c118
-rw-r--r--sys/geom/vinum/geom_vinum_list.c26
-rw-r--r--sys/geom/vinum/geom_vinum_plex.c16
-rw-r--r--sys/geom/vinum/geom_vinum_raid5.c114
-rw-r--r--sys/geom/vinum/geom_vinum_raid5.h2
-rw-r--r--sys/geom/vinum/geom_vinum_var.h1
6 files changed, 261 insertions, 16 deletions
diff --git a/sys/geom/vinum/geom_vinum_init.c b/sys/geom/vinum/geom_vinum_init.c
index 60c408c..46d9d51 100644
--- a/sys/geom/vinum/geom_vinum_init.c
+++ b/sys/geom/vinum/geom_vinum_init.c
@@ -43,6 +43,8 @@ __FBSDID("$FreeBSD$");
int gv_init_plex(struct gv_plex *);
int gv_init_sd(struct gv_sd *);
void gv_init_td(void *);
+void gv_rebuild_plex(struct gv_plex *);
+void gv_rebuild_td(void *);
void gv_start_plex(struct gv_plex *);
void gv_start_vol(struct gv_volume *);
void gv_sync(struct gv_volume *);
@@ -117,8 +119,12 @@ gv_start_plex(struct gv_plex *p)
v = p->vol_sc;
if ((v != NULL) && (v->plexcount > 1))
gv_sync(v);
- else if (p->org == GV_PLEX_RAID5)
- gv_init_plex(p);
+ else if (p->org == GV_PLEX_RAID5) {
+ if (p->state == GV_PLEX_DEGRADED)
+ gv_rebuild_plex(p);
+ else
+ gv_init_plex(p);
+ }
return;
}
@@ -142,7 +148,9 @@ gv_start_vol(struct gv_volume *v)
case GV_PLEX_DOWN:
gv_init_plex(p);
break;
- case GV_PLEX_DEGRADED: /* XXX not yet */
+ case GV_PLEX_DEGRADED:
+ gv_rebuild_plex(p);
+ break;
default:
return;
}
@@ -191,6 +199,22 @@ gv_sync(struct gv_volume *v)
}
}
+/*
+ * Start a background rebuild of a degraded RAID5 plex.
+ *
+ * Nothing is done when the plex is already flagged GV_PLEX_SYNCING or when
+ * gv_is_open() reports its geom as open.  Otherwise a gv_sync_args request
+ * naming the plex is allocated and passed to a new kernel thread running
+ * gv_rebuild_td().
+ */
+void
+gv_rebuild_plex(struct gv_plex *p)
+{
+	struct gv_sync_args *sync;
+	int error;
+
+	if ((p->flags & GV_PLEX_SYNCING) || gv_is_open(p->geom))
+		return;
+
+	sync = g_malloc(sizeof(*sync), M_WAITOK | M_ZERO);
+	sync->to = p;
+	sync->syncsize = GV_DFLT_SYNCSIZE;
+
+	error = kthread_create(gv_rebuild_td, sync, NULL, 0, 0,
+	    "gv_rebuild %s", p->name);
+	if (error) {
+		/* No thread was created to consume the request; free it. */
+		printf("GEOM_VINUM: rebuild of %s failed to create thread: "
+		    "%d\n", p->name, error);
+		g_free(sync);
+	}
+}
+
int
gv_init_plex(struct gv_plex *p)
{
@@ -225,6 +249,94 @@ gv_init_sd(struct gv_sd *s)
return (0);
}
+/*
+ * Kernel thread that rebuilds a degraded RAID5 plex.
+ *
+ * arg is the struct gv_sync_args request set up by gv_rebuild_plex(); it is
+ * freed here before the thread exits.  The plex carries GV_PLEX_SYNCING
+ * while the thread runs.  The plex address space is walked in steps of
+ * stripesize * (sdcount - 1) bytes; at each step one BIO_WRITE of
+ * stripesize bytes tagged GV_BIO_REBUILD is sent through the plex consumer
+ * and waited for.  The walk stops at the first failure.
+ *
+ * The thread exits with 0 on success, or with the error that stopped it.
+ */
+void
+gv_rebuild_td(void *arg)
+{
+	struct bio *bp;
+	struct gv_plex *p;
+	struct g_consumer *cp;
+	struct gv_sync_args *sync;
+	u_char *buf;
+	off_t i;
+	int error;
+
+	buf = NULL;
+	bp = NULL;
+
+	sync = arg;
+	p = sync->to;
+	p->synced = 0;
+	p->flags |= GV_PLEX_SYNCING;
+	cp = p->consumer;
+
+	g_topology_lock();
+	error = g_access(cp, 1, 1, 0);
+	g_topology_unlock();
+	if (error) {
+		printf("GEOM_VINUM: rebuild of %s failed to access consumer: "
+		    "%d\n", p->name, error);
+		/*
+		 * Undo what was set up above.  A GV_PLEX_SYNCING flag left
+		 * behind would make gv_rebuild_plex() refuse any later
+		 * rebuild, and the request would never be freed.
+		 */
+		p->flags &= ~GV_PLEX_SYNCING;
+		g_free(sync);
+		kthread_exit(error);
+	}
+
+	/*
+	 * NOTE(review): buf is sync->syncsize bytes, while each request below
+	 * declares bio_length = p->stripesize.  Confirm that the stripe size
+	 * can never exceed the buffer.
+	 */
+	buf = g_malloc(sync->syncsize, M_WAITOK);
+
+	printf("GEOM_VINUM: rebuild of %s started\n", p->name);
+	for (i = 0; i < p->size; i += (p->stripesize * (p->sdcount - 1))) {
+		bp = g_new_bio();
+		if (bp == NULL) {
+			printf("GEOM_VINUM: rebuild of %s failed creating bio: "
+			    "out of memory\n", p->name);
+			/* Do not report an aborted rebuild as finished. */
+			error = ENOMEM;
+			break;
+		}
+		bp->bio_cmd = BIO_WRITE;
+		bp->bio_done = NULL;
+		bp->bio_data = buf;
+		bp->bio_cflags |= GV_BIO_REBUILD;
+		bp->bio_offset = i;
+		bp->bio_length = p->stripesize;
+
+		/* Schedule it down ... */
+		g_io_request(bp, cp);
+
+		/* ... and wait for the result. */
+		error = biowait(bp, "gwrite");
+		if (error) {
+			printf("GEOM_VINUM: rebuild of %s failed at offset %jd "
+			    "errno: %d\n", p->name, (intmax_t)i, error);
+			break;
+		}
+		g_destroy_bio(bp);
+		bp = NULL;
+	}
+
+	/* A bio is still held here only when biowait() failed. */
+	if (bp != NULL)
+		g_destroy_bio(bp);
+	if (buf != NULL)
+		g_free(buf);
+
+	/* Drop the access counts taken above and save the configuration. */
+	g_topology_lock();
+	g_access(cp, -1, -1, 0);
+	gv_save_config_all(p->vinumconf);
+	g_topology_unlock();
+
+	p->flags &= ~GV_PLEX_SYNCING;
+	p->synced = 0;
+
+	/* Successful rebuild. */
+	if (!error)
+		printf("GEOM_VINUM: rebuild of %s finished\n", p->name);
+
+	g_free(sync);
+	kthread_exit(error);
+}
+
void
gv_sync_td(void *arg)
{
diff --git a/sys/geom/vinum/geom_vinum_list.c b/sys/geom/vinum/geom_vinum_list.c
index f70cffb..ca95ba7 100644
--- a/sys/geom/vinum/geom_vinum_list.c
+++ b/sys/geom/vinum/geom_vinum_list.c
@@ -365,9 +365,15 @@ gv_lsi(struct gv_sd *s, struct sbuf *sb, int flags)
(intmax_t)s->size, (intmax_t)s->size / MEGABYTE);
sbuf_printf(sb, "\t\tState: %s\n", gv_sdstate(s->state));
- if (s->state == GV_SD_INITIALIZING) {
- sbuf_printf(sb, "\t\tInitialized: %16jd bytes "
- "(%d%%)\n", (intmax_t)s->initialized,
+ if (s->state == GV_SD_INITIALIZING ||
+ s->state == GV_SD_REVIVING) {
+ if (s->state == GV_SD_INITIALIZING)
+ sbuf_printf(sb, "\t\tInitialized: ");
+ else
+ sbuf_printf(sb, "\t\tRevived: ");
+
+ sbuf_printf(sb, "%16jd bytes (%d%%)\n",
+ (intmax_t)s->initialized,
(int)((s->initialized * 100) / s->size));
}
@@ -377,20 +383,20 @@ gv_lsi(struct gv_sd *s, struct sbuf *sb, int flags)
gv_roughlength(s->plex_offset, 1));
}
- if (s->state == GV_SD_REVIVING) {
- /* XXX */
- }
-
sbuf_printf(sb, "\t\tDrive %s (%s) at offset %jd (%s)\n",
s->drive,
s->drive_sc == NULL ? "*missing*" : s->drive_sc->name,
(intmax_t)s->drive_offset,
gv_roughlength(s->drive_offset, 1));
} else {
- /* XXX reviving and initializing... */
sbuf_printf(sb, "S %-21s State: ", s->name);
- if (s->state == GV_SD_INITIALIZING) {
- sbuf_printf(sb, "I %d%%\t",
+ if (s->state == GV_SD_INITIALIZING ||
+ s->state == GV_SD_REVIVING) {
+ if (s->state == GV_SD_INITIALIZING)
+ sbuf_printf(sb, "I ");
+ else
+ sbuf_printf(sb, "R ");
+ sbuf_printf(sb, "%d%%\t",
(int)((s->initialized * 100) / s->size));
} else {
sbuf_printf(sb, "%s\t", gv_sdstate(s->state));
diff --git a/sys/geom/vinum/geom_vinum_plex.c b/sys/geom/vinum/geom_vinum_plex.c
index 494ec2c..7ce5b08 100644
--- a/sys/geom/vinum/geom_vinum_plex.c
+++ b/sys/geom/vinum/geom_vinum_plex.c
@@ -295,7 +295,9 @@ gv_plex_worker(void *arg)
/* A completed request. */
if (bp->bio_cflags & GV_BIO_DONE) {
g_free(bq);
- if (bp->bio_cflags & GV_BIO_SYNCREQ) {
+
+ if (bp->bio_cflags & GV_BIO_SYNCREQ ||
+ bp->bio_cflags & GV_BIO_REBUILD) {
s = bp->bio_to->private;
if (bp->bio_error == 0)
s->initialized += bp->bio_length;
@@ -306,8 +308,11 @@ gv_plex_worker(void *arg)
g_topology_unlock();
s->initialized = 0;
}
+ }
+
+ if (bp->bio_cflags & GV_BIO_SYNCREQ)
g_std_done(bp);
- } else
+ else
gv_plex_completed_request(p, bp);
/*
* A sub-request that was hold back because it interfered with
@@ -457,7 +462,12 @@ gv_plex_normal_request(struct gv_plex *p, struct bio *bp)
wp->bio = bp;
TAILQ_INIT(&wp->bits);
- err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount);
+ if (bp->bio_cflags & GV_BIO_REBUILD)
+ err = gv_rebuild_raid5(p, wp, bp, addr,
+ boff, bcount);
+ else
+ err = gv_build_raid5_req(p, wp, bp, addr,
+ boff, bcount);
/*
* Building the sub-request failed, we probably need to
diff --git a/sys/geom/vinum/geom_vinum_raid5.c b/sys/geom/vinum/geom_vinum_raid5.c
index 62fb246..9ba02e8 100644
--- a/sys/geom/vinum/geom_vinum_raid5.c
+++ b/sys/geom/vinum/geom_vinum_raid5.c
@@ -77,6 +77,117 @@ gv_stripe_active(struct gv_plex *p, struct bio *bp)
return (overlap);
}
+/*
+ * Build the request group that rebuilds part of one stripe on the broken
+ * subdisk of a degraded RAID5 plex.
+ *
+ * p is the plex, wp the packet being filled in for bp, addr the pointer
+ * stored in wp->data, and boff/bcount the offset and length of the request
+ * within the plex.  The broken subdisk is the last one in the plex's list
+ * whose state is not GV_SD_UP; a GV_SD_STALE subdisk is switched to
+ * GV_SD_REVIVING.  A BIO_READ clone of bp is created for every other
+ * subdisk, passed to GV_ENQUEUE and recorded in wp->bits.  One more clone,
+ * aimed at the broken subdisk with a zeroed buffer, is stored in
+ * wp->parity.  p->synced is set to boff.
+ *
+ * Returns 0 on success; ENXIO if the plex is NULL or has no subdisks, if
+ * every subdisk is up, or if the broken subdisk is in a state that cannot
+ * be revived; ENOMEM if a bio could not be cloned.
+ */
+int
+gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
+    caddr_t addr, off_t boff, off_t bcount)
+{
+	struct gv_sd *broken, *s;
+	struct gv_bioq *bq;
+	struct bio *cbp, *pbp;
+	off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;
+
+	if (p == NULL || LIST_EMPTY(&p->subdisks))
+		return (ENXIO);
+
+	/* Offset of the start address from the start of the stripe. */
+	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
+	KASSERT(stripeoff >= 0, ("gv_rebuild_raid5: stripeoff < 0"));
+
+	/* The offset of the stripe on this subdisk. */
+	stripestart = (boff - stripeoff) / (p->sdcount - 1);
+	KASSERT(stripestart >= 0, ("gv_rebuild_raid5: stripestart < 0"));
+
+	stripeoff %= p->stripesize;
+
+	/* The offset of the request on this subdisk. */
+	real_off = stripestart + stripeoff;
+
+	stripeend = stripestart + p->stripesize;
+	len_left = stripeend - real_off;
+	KASSERT(len_left >= 0, ("gv_rebuild_raid5: len_left < 0"));
+
+	/* Find the subdisk to rebuild: the last one that is not up. */
+	broken = NULL;
+	LIST_FOREACH(s, &p->subdisks, in_plex) {
+		if (s->state != GV_SD_UP)
+			broken = s;
+	}
+
+	/* Every subdisk is up, so there is nothing to rebuild. */
+	if (broken == NULL)
+		return (ENXIO);
+
+	switch (broken->state) {
+	case GV_SD_UP:
+		/* Excluded by the selection loop above; defensive only. */
+		return (EINVAL);
+
+	case GV_SD_STALE:
+		if (!(bp->bio_cflags & GV_BIO_REBUILD))
+			return (ENXIO);
+
+		printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
+		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
+		break;
+
+	case GV_SD_REVIVING:
+		break;
+
+	default:
+		/* All other subdisk states mean it's not accessible. */
+		return (ENXIO);
+	}
+
+	/* Clamp the request so that it does not run past this stripe. */
+	real_len = (bcount <= len_left) ? bcount : len_left;
+	wp->length = real_len;
+	wp->data = addr;
+	wp->lockbase = real_off;
+
+	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
+
+	/*
+	 * Read the same range from all intact subdisks.
+	 *
+	 * NOTE(review): when a clone fails part-way through, the clones made
+	 * so far stay in wp->bits and the subdisk state change above is not
+	 * reverted.  Presumably the caller tears the packet down; confirm.
+	 */
+	LIST_FOREACH(s, &p->subdisks, in_plex) {
+		/* Skip the broken subdisk. */
+		if (s == broken)
+			continue;
+
+		cbp = g_clone_bio(bp);
+		if (cbp == NULL)
+			return (ENOMEM);
+		cbp->bio_cmd = BIO_READ;
+		cbp->bio_data = g_malloc(real_len, M_WAITOK);
+		cbp->bio_cflags |= GV_BIO_MALLOC;
+		cbp->bio_offset = real_off;
+		cbp->bio_length = real_len;
+		cbp->bio_done = gv_plex_done;
+		cbp->bio_caller2 = s->consumer;
+		cbp->bio_driver1 = wp;
+
+		GV_ENQUEUE(bp, cbp, pbp);
+
+		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
+		bq->bp = cbp;
+		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
+	}
+
+	/*
+	 * Prepare the request for the broken subdisk.  It starts out with a
+	 * zeroed buffer and is kept in wp->parity rather than in wp->bits.
+	 */
+	cbp = g_clone_bio(bp);
+	if (cbp == NULL)
+		return (ENOMEM);
+	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
+	cbp->bio_cflags |= GV_BIO_MALLOC;
+	cbp->bio_offset = real_off;
+	cbp->bio_length = real_len;
+	cbp->bio_done = gv_plex_done;
+	cbp->bio_caller2 = broken->consumer;
+	cbp->bio_driver1 = wp;
+	cbp->bio_cflags |= GV_BIO_REBUILD;
+	wp->parity = cbp;
+
+	/* Record the plex offset this rebuild request covers. */
+	p->synced = boff;
+
+	return (0);
+}
+
/* Build a request group to perform (part of) a RAID5 request. */
int
gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
@@ -166,6 +277,9 @@ gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
+ if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
+ type = REQ_TYPE_NORMAL;
+
switch (bp->bio_cmd) {
case BIO_READ:
/*
diff --git a/sys/geom/vinum/geom_vinum_raid5.h b/sys/geom/vinum/geom_vinum_raid5.h
index 8074f42..212f6c6 100644
--- a/sys/geom/vinum/geom_vinum_raid5.h
+++ b/sys/geom/vinum/geom_vinum_raid5.h
@@ -67,6 +67,8 @@ struct gv_raid5_packet {
int gv_stripe_active(struct gv_plex *, struct bio *);
int gv_build_raid5_req(struct gv_plex *, struct gv_raid5_packet *,
struct bio *, caddr_t, off_t, off_t);
+int gv_rebuild_raid5(struct gv_plex *, struct gv_raid5_packet *,
+ struct bio *, caddr_t, off_t, off_t);
void gv_raid5_worker(void *);
void gv_plex_done(struct bio *);
diff --git a/sys/geom/vinum/geom_vinum_var.h b/sys/geom/vinum/geom_vinum_var.h
index 99c1c37..196f7f8 100644
--- a/sys/geom/vinum/geom_vinum_var.h
+++ b/sys/geom/vinum/geom_vinum_var.h
@@ -113,6 +113,7 @@
#define GV_BIO_ONHOLD 0x04
#define GV_BIO_SYNCREQ 0x08
#define GV_BIO_SUCCEED 0x10
+#define GV_BIO_REBUILD 0x20
/*
* hostname is 256 bytes long, but we don't need to shlep multiple copies in
OpenPOWER on IntegriCloud