summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commitbbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
parentd2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
downloadFreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip
FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows to use additional disks for cache. Huge performance improvements mostly for random read of mostly static content. - slog Allow to use additional disks for ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by him. Very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before if write requested failed, system paniced. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just quota and reservation properties, but don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c155
1 files changed, 70 insertions, 85 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
index 73d1a83..c4629ff 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
@@ -39,8 +37,9 @@ typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
- short mc_tried;
- short mc_skipped;
+ uint8_t mc_tried;
+ uint8_t mc_skipped;
+ uint8_t mc_speculative;
} mirror_child_t;
typedef struct mirror_map {
@@ -53,6 +52,14 @@ typedef struct mirror_map {
int vdev_mirror_shift = 21;
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+
+ kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+}
+
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
@@ -110,18 +117,10 @@ vdev_mirror_map_alloc(zio_t *zio)
}
zio->io_vsd = mm;
+ zio->io_vsd_free = vdev_mirror_map_free;
return (mm);
}
-static void
-vdev_mirror_map_free(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
-
- kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
- zio->io_vsd = NULL;
-}
-
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
@@ -195,13 +194,6 @@ vdev_mirror_scrub_done(zio_t *zio)
mc->mc_skipped = 0;
}
-static void
-vdev_mirror_repair_done(zio_t *zio)
-{
- ASSERT(zio->io_private == zio->io_parent);
- vdev_mirror_map_free(zio->io_private);
-}
-
/*
* Try to find a child whose DTL doesn't contain the block we want to read.
* If we can't, try the read on any vdev we haven't already tried.
@@ -219,7 +211,7 @@ vdev_mirror_child_select(zio_t *zio)
/*
* Try to find a child whose DTL doesn't contain the block to read.
* If a child is known to be completely inaccessible (indicated by
- * vdev_is_dead() returning B_TRUE), don't even try.
+ * vdev_readable() returning B_FALSE), don't even try.
*/
for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
if (c >= mm->mm_children)
@@ -227,7 +219,7 @@ vdev_mirror_child_select(zio_t *zio)
mc = &mm->mm_child[c];
if (mc->mc_tried || mc->mc_skipped)
continue;
- if (vdev_is_dead(mc->mc_vd)) {
+ if (!vdev_readable(mc->mc_vd)) {
mc->mc_error = ENXIO;
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
@@ -237,6 +229,7 @@ vdev_mirror_child_select(zio_t *zio)
return (c);
mc->mc_error = ESTALE;
mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
}
/*
@@ -253,7 +246,7 @@ vdev_mirror_child_select(zio_t *zio)
return (-1);
}
-static void
+static int
vdev_mirror_io_start(zio_t *zio)
{
mirror_map_t *mm;
@@ -275,12 +268,10 @@ vdev_mirror_io_start(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
zio_buf_alloc(zio->io_size), zio->io_size,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_CANFAIL,
+ zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
}
- zio_wait_children_done(zio);
- return;
+ return (ZIO_PIPELINE_CONTINUE);
}
/*
* For normal reads just pick one child.
@@ -310,13 +301,27 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) {
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset,
- zio->io_data, zio->io_size, zio->io_type, zio->io_priority,
- ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc));
+ mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_mirror_child_done, mc));
c++;
}
- zio_wait_children_done(zio);
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+vdev_mirror_worst_error(mirror_map_t *mm)
+{
+ int error[2] = { 0, 0 };
+
+ for (int c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc = &mm->mm_child[c];
+ int s = mc->mc_speculative;
+ error[s] = zio_worst_error(error[s], mc->mc_error);
+ }
+
+ return (error[0] ? error[0] : error[1]);
}
static void
@@ -328,41 +333,45 @@ vdev_mirror_io_done(zio_t *zio)
int good_copies = 0;
int unexpected_errors = 0;
- zio->io_error = 0;
- zio->io_numerrors = 0;
-
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
- if (mc->mc_tried && mc->mc_error == 0) {
- good_copies++;
- continue;
- }
-
- /*
- * We preserve any EIOs because those may be worth retrying;
- * whereas ECKSUM and ENXIO are more likely to be persistent.
- */
if (mc->mc_error) {
- if (zio->io_error != EIO)
- zio->io_error = mc->mc_error;
if (!mc->mc_skipped)
unexpected_errors++;
- zio->io_numerrors++;
+ } else if (mc->mc_tried) {
+ good_copies++;
}
}
if (zio->io_type == ZIO_TYPE_WRITE) {
/*
* XXX -- for now, treat partial writes as success.
- * XXX -- For a replacing vdev, we need to make sure the
- * new child succeeds.
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
*/
/* XXPOLICY */
- if (good_copies != 0)
- zio->io_error = 0;
- vdev_mirror_map_free(zio);
- zio_next_stage(zio);
+ if (good_copies != mm->mm_children) {
+ /*
+ * Always require at least one good copy.
+ *
+ * For ditto blocks (io_vd == NULL), require
+ * all copies to be good.
+ *
+ * XXX -- for replacing vdevs, there's no great answer.
+ * If the old device is really dead, we may not even
+ * be able to access it -- so we only want to
+ * require good writes to the new device. But if
+ * the new device turns out to be flaky, we want
+ * to be able to detach it -- which requires all
+ * writes to the old device to have succeeded.
+ */
+ if (good_copies == 0 || zio->io_vd == NULL)
+ zio->io_error = vdev_mirror_worst_error(mm);
+ }
return;
}
@@ -375,40 +384,27 @@ vdev_mirror_io_done(zio_t *zio)
if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
ASSERT(c >= 0 && c < mm->mm_children);
mc = &mm->mm_child[c];
- dprintf("retrying i/o (err=%d) on child %s\n",
- zio->io_error, vdev_description(mc->mc_vd));
- zio->io_error = 0;
zio_vdev_io_redone(zio);
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
- ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
+ ZIO_TYPE_READ, zio->io_priority, 0,
vdev_mirror_child_done, mc));
- zio_wait_children_done(zio);
return;
}
/* XXPOLICY */
- if (good_copies)
- zio->io_error = 0;
- else
+ if (good_copies == 0) {
+ zio->io_error = vdev_mirror_worst_error(mm);
ASSERT(zio->io_error != 0);
+ }
if (good_copies && (spa_mode & FWRITE) &&
(unexpected_errors ||
(zio->io_flags & ZIO_FLAG_RESILVER) ||
((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
- zio_t *rio;
-
/*
* Use the good data we have in hand to repair damaged children.
- *
- * We issue all repair I/Os as children of 'rio' to arrange
- * that vdev_mirror_map_free(zio) will be invoked after all
- * repairs complete, but before we advance to the next stage.
*/
- rio = zio_null(zio, zio->io_spa,
- vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
-
for (c = 0; c < mm->mm_children; c++) {
/*
* Don't rewrite known good children.
@@ -429,24 +425,13 @@ vdev_mirror_io_done(zio_t *zio)
mc->mc_error = ESTALE;
}
- dprintf("resilvered %s @ 0x%llx error %d\n",
- vdev_description(mc->mc_vd), mc->mc_offset,
- mc->mc_error);
-
- zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
- mc->mc_offset, zio->io_data, zio->io_size,
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ zio->io_data, zio->io_size,
ZIO_TYPE_WRITE, zio->io_priority,
- ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
+ ZIO_FLAG_IO_REPAIR, NULL, NULL));
}
-
- zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
}
-
- vdev_mirror_map_free(zio);
- zio_next_stage(zio);
}
static void
OpenPOWER on IntegriCloud