summary | refs | log | tree | commit | diff | stats
path: root/sys/cddl/contrib
diff options
context:
space:
mode:
author delphij <delphij@FreeBSD.org> 2013-09-21 00:17:26 +0000
committer delphij <delphij@FreeBSD.org> 2013-09-21 00:17:26 +0000
commit d04a7f01449dd0dc39fabc56e90a4580dab41263 (patch)
tree 3ed935dae0abcf797f99a2887f5679babe01d4c4 /sys/cddl/contrib
parent bce147746d1a5ea29f54f93b5f2f6add24307d01 (diff)
parent 715f00315e5f22ea18a2d0fe9139829934ae6b20 (diff)
download FreeBSD-src-d04a7f01449dd0dc39fabc56e90a4580dab41263.zip
FreeBSD-src-d04a7f01449dd0dc39fabc56e90a4580dab41263.tar.gz
MFV r254750:
Add support of Illumos dumps on zvol over RAID-Z.

Note that this only adds the features. FreeBSD would still need more work to
support dumping on zvols.

Illumos ZFS issues:
  2932 support crash dumps to raidz, etc. pools

MFC after: 1 month
Approved by: re (ZFS blanket)
Diffstat (limited to 'sys/cddl/contrib')
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h 2
-rw-r--r-- sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h 14
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h 50
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c 28
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c 192
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c 129
12 files changed, 388 insertions, 45 deletions
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
index c2539d2..b691989 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#ifdef _KERNEL
@@ -159,4 +160,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
"org.illumos:lz4_compress", "lz4_compress",
"LZ4 compression algorithm support.", B_FALSE, B_FALSE, NULL);
+ zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+ "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
+ "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, NULL);
}
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
index 3873317..a0b73c2 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#ifndef _ZFEATURE_COMMON_H
@@ -53,6 +54,7 @@ static enum spa_feature {
SPA_FEATURE_ASYNC_DESTROY,
SPA_FEATURE_EMPTY_BPOBJ,
SPA_FEATURE_LZ4_COMPRESS,
+ SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
SPA_FEATURES
} spa_feature_t;
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
index 4bdc946..2baede9 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -69,6 +70,7 @@ zfs_prop_init(void)
{ "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
{ "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
{ "sha256", ZIO_CHECKSUM_SHA256 },
+ { "noparity", ZIO_CHECKSUM_NOPARITY },
{ NULL }
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 92ee7c3..4d02a1d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -23,6 +23,7 @@
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -2755,7 +2756,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
- ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, NULL, db->db.db_size, &zp,
dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index 54decd6..9e661be 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -22,8 +22,8 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
-
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
+/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
@@ -1610,7 +1610,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
* pipeline.
*/
compress = ZIO_COMPRESS_OFF;
- checksum = ZIO_CHECKSUM_OFF;
+ checksum = ZIO_CHECKSUM_NOPARITY;
} else {
compress = zio_compress_select(dn->dn_compress, compress);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
index b748571..9055d0e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
@@ -21,13 +21,12 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_VDEV_DISK_H
#define _SYS_VDEV_DISK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/buf.h>
@@ -40,14 +39,23 @@
extern "C" {
#endif
+#ifdef _KERNEL
typedef struct vdev_disk {
ddi_devid_t vd_devid;
char *vd_minor;
ldi_handle_t vd_lh;
} vdev_disk_t;
+#endif
+extern int vdev_disk_physio(vdev_t *,
+ caddr_t, size_t, uint64_t, int, boolean_t);
+
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
#ifdef _KERNEL
-extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
#endif
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
new file mode 100644
index 0000000..e771e66
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define _SYS_VDEV_RAIDZ_H
+
+#include <sys/vdev.h>
+#ifdef illumos
+#include <sys/semaphore.h>
+#ifdef _KERNEL
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int vdev_raidz_physio(vdev_t *,
+ caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_RAIDZ_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 342e7c1..b001e11 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -23,6 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#ifndef _ZIO_H
@@ -79,6 +80,7 @@ enum zio_checksum {
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_NOPARITY,
ZIO_CHECKSUM_FUNCTIONS
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
index 00682c8..3d11966 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -431,8 +432,29 @@ vdev_disk_close(vdev_t *vd)
}
int
-vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
- uint64_t offset, int flags)
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+ size_t size, uint64_t offset, int flags, boolean_t isdump)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ ASSERT(vd->vdev_ops == &vdev_disk_ops);
+
+ /*
+ * If in the context of an active crash dump, use the ldi_dump(9F)
+ * call instead of ldi_strategy(9F) as usual.
+ */
+ if (isdump) {
+ ASSERT3P(dvd, !=, NULL);
+ return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
+ lbtodb(size)));
+ }
+
+ return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+ size_t size, uint64_t offset, int flags)
{
buf_t *bp;
int error = 0;
@@ -680,7 +702,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
/* read vdev label */
offset = vdev_label_offset(size, l, 0);
- if (vdev_disk_physio(vd_lh, (caddr_t)label,
+ if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
continue;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
index 5e3c6c7..f2fd29d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -22,15 +22,22 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
+#ifdef illumos
+#include <sys/vdev_disk.h>
+#endif
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
+#include <sys/bio.h>
/*
* Virtual device vector for RAID-Z.
@@ -154,6 +161,8 @@ typedef struct raidz_map {
VDEV_RAIDZ_64MUL_2((x), mask); \
}
+/*
+ * Skip the label area at the start of a child vdev: adds
+ * VDEV_LABEL_START_SIZE to the offset `x'.  The argument is parenthesized so
+ * that expression arguments expand with the intended precedence.
+ */
+#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
+
/*
* Force reconstruction to use the general purpose method.
*/
@@ -437,14 +446,14 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
* the number of children in the target vdev.
*/
static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
- uint64_t nparity)
+vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
+ uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
/* The starting RAIDZ (parent) vdev sector of the block. */
- uint64_t b = zio->io_offset >> unit_shift;
+ uint64_t b = offset >> unit_shift;
/* The zio's size in units of the vdev's minimum sector size. */
- uint64_t s = zio->io_size >> unit_shift;
+ uint64_t s = size >> unit_shift;
/* The first column for this stripe. */
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
@@ -532,13 +541,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
ASSERT3U(rm->rm_nskip, <=, nparity);
- if (zio->io_type != ZIO_TYPE_FREE) {
+ if (!dofree) {
for (c = 0; c < rm->rm_firstdatacol; c++) {
rm->rm_col[c].rc_data =
zio_buf_alloc(rm->rm_col[c].rc_size);
}
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_data = data;
for (c = c + 1; c < acols; c++) {
rm->rm_col[c].rc_data =
@@ -570,7 +579,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
- if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
devidx = rm->rm_col[0].rc_devidx;
o = rm->rm_col[0].rc_offset;
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -582,8 +591,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_skipstart = 1;
}
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
return (rm);
}
@@ -993,12 +1000,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
* ~~ ~~
* __ __
* | 1 1 1 1 1 1 1 1 |
- * | 128 64 32 16 8 4 2 1 |
* | 19 205 116 29 64 16 4 1 |
* | 1 0 0 0 0 0 0 0 |
- * | 0 1 0 0 0 0 0 0 |
- * (V|I)' = | 0 0 1 0 0 0 0 0 |
- * | 0 0 0 1 0 0 0 0 |
+ * (V|I)' = | 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 |
@@ -1532,6 +1536,154 @@ vdev_raidz_close(vdev_t *vd)
vdev_close(vd->vdev_child[c]);
}
+#ifdef illumos
+/*
+ * Handle a read or write I/O to a RAID-Z dump device.
+ *
+ * The dump device is in a unique situation compared to other ZFS datasets:
+ * writing to this device should be as simple and fast as possible. In
+ * addition, durability matters much less since the dump will be extracted
+ * once the machine reboots. For that reason, this function eschews parity for
+ * performance and simplicity. The dump device uses the checksum setting
+ * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
+ * dataset.
+ *
+ * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
+ * 128 KB will not fill an entire block; in addition, they may not be properly
+ * aligned. In that case, this function uses the preallocated 128 KB block and
+ * omits reading or writing any "empty" portions of that block, as opposed to
+ * allocating a fresh appropriately-sized block.
+ *
+ * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
+ *
+ *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
+ *
+ * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
+ * allocated which spans all five child vdevs. 8 KB of data would be written to
+ * each of four vdevs, with the fifth containing the parity bits.
+ *
+ *       parity    data     data     data     data
+ *     |   PP   |   XX   |   XX   |   XX   |   XX   |
+ *         ^        ^        ^        ^        ^
+ *         |        |        |        |        |
+ *    8 KB parity   ------8 KB data blocks------
+ *
+ * However, when writing to the dump device, the behavior is different:
+ *
+ *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
+ *
+ * Unlike the normal RAID-Z case in which the block is allocated based on the
+ * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
+ * I/O size is less than 128 KB, only the actual portions of data are written.
+ * In this example the data is written to the third data vdev since that vdev
+ * contains the offset [64 KB, 96 KB).
+ *
+ *       parity    data     data     data     data
+ *     |        |        |        |   XX   |        |
+ *                                    ^
+ *                                    |
+ *                             32 KB data block
+ *
+ * As a result, an individual I/O may not span all child vdevs; moreover, a
+ * small I/O may only operate on a single child vdev.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
+ * would look like:
+ *
+ *       parity   parity   parity    data     data     data     data
+ *     |        |        |        |        |        |   XX   |        |
+ *                                                      ^
+ *                                                      |
+ *                                               32 KB data block
+ *
+ * Arguments:
+ *   vd         - the RAID-Z vdev; the per-column I/Os go to its children
+ *   data       - buffer for the `size' bytes being read or written
+ *   size       - length of this I/O in bytes
+ *   offset     - offset at which this I/O starts
+ *   origoffset - offset of the start of the block containing `offset';
+ *                offset + size may not exceed origoffset + SPA_MAXBLOCKSIZE
+ *                (enforced with VERIFY3U)
+ *   doread     - non-zero selects BIO_READ, zero selects BIO_WRITE
+ *   isdump     - passed through unchanged to vdev_disk_physio()
+ *
+ * Returns 0 on success, or the first non-zero value returned by
+ * vdev_disk_physio(); the remaining columns are not attempted after an error.
+ * When _KERNEL is not defined the body is compiled out and 0 is returned.
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+ uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
+{
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c, err = 0;
+
+ uint64_t start, end, colstart, colend;
+ uint64_t coloffset, colsize, colskip;
+
+ int flags = doread ? BIO_READ : BIO_WRITE;
+
+#ifdef _KERNEL
+
+ /*
+ * Don't write past the end of the block
+ */
+ VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+
+ start = offset;
+ end = start + size;
+
+ /*
+ * Allocate a RAID-Z map for this block. Note that this block starts
+ * from the "original" offset, that is, the offset of the extent which
+ * contains the requisite offset of the data being read or written.
+ *
+ * Even if this I/O operation doesn't span the full block size, let's
+ * treat the on-disk format as if the only blocks are the complete 128
+ * KB size.
+ *
+ * The data pointer is moved back by (offset - origoffset) so that the
+ * map's data columns are laid out relative to the start of the block;
+ * only the columns overlapping [start, end) are touched below.
+ */
+ rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+ SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
+
+ coloffset = origoffset;
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+ c++, coloffset += rc->rc_size) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Find the start and end of this column in the RAID-Z map,
+ * keeping in mind that the stated size and offset of the
+ * operation may not fill the entire column for this vdev.
+ *
+ * If any portion of the data spans this column, issue the
+ * appropriate operation to the vdev.
+ */
+ if (coloffset + rc->rc_size <= start)
+ continue;
+ if (coloffset >= end)
+ continue;
+
+ colstart = MAX(coloffset, start);
+ colend = MIN(end, coloffset + rc->rc_size);
+ colsize = colend - colstart;
+ colskip = colstart - coloffset;
+
+ VERIFY3U(colsize, <=, rc->rc_size);
+ VERIFY3U(colskip, <=, rc->rc_size);
+
+ /*
+ * Note that the child vdev will have a vdev label at the start
+ * of its range of offsets, hence the need for
+ * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
+ * example of why this calculation is needed.
+ */
+ if ((err = vdev_disk_physio(cvd,
+ ((char *)rc->rc_data) + colskip, colsize,
+ VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+ flags, isdump)) != 0)
+ break;
+ }
+
+ vdev_raidz_map_free(rm);
+#endif /* _KERNEL */
+
+ return (err);
+}
+#endif
+
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
@@ -1584,9 +1736,14 @@ vdev_raidz_io_start(zio_t *zio)
raidz_col_t *rc;
int c, i;
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+ zio->io_type == ZIO_TYPE_FREE,
+ tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_FREE) {
@@ -1729,6 +1886,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
int c, ret = 0;
raidz_col_t *rc;
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+ if (checksum == ZIO_CHECKSUM_NOPARITY)
+ return (ret);
+
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
index bc73317..d1c60c3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -78,6 +79,7 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"},
};
enum zio_checksum
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index 60d7a49..72d4502 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -24,6 +24,7 @@
* Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
* All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -60,6 +61,7 @@
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
@@ -77,9 +79,14 @@
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+
#include <geom/geom.h>
#include "zfs_namecheck.h"
@@ -1158,27 +1165,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
#ifdef sun
static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
- boolean_t doread, boolean_t isdump)
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+ uint64_t size, boolean_t doread, boolean_t isdump)
{
vdev_disk_t *dvd;
int c;
int numerrors = 0;
- for (c = 0; c < vd->vdev_children; c++) {
- ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
- vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- int err = zvol_dumpio_vdev(vd->vdev_child[c],
- addr, offset, size, doread, isdump);
- if (err != 0) {
- numerrors++;
- } else if (doread) {
- break;
+ if (vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ int err = zvol_dumpio_vdev(vd->vdev_child[c],
+ addr, offset, origoffset, size, doread, isdump);
+ if (err != 0) {
+ numerrors++;
+ } else if (doread) {
+ break;
+ }
}
}
- if (!vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
return (numerrors < vd->vdev_children ? 0 : EIO);
if (doread && !vdev_readable(vd))
@@ -1186,19 +1194,26 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
else if (!doread && !vdev_writeable(vd))
return (SET_ERROR(EIO));
- dvd = vd->vdev_tsd;
- ASSERT3P(dvd, !=, NULL);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ return (vdev_raidz_physio(vd,
+ addr, size, offset, origoffset, doread, isdump));
+ }
+
offset += VDEV_LABEL_START_SIZE;
if (ddi_in_panic() || isdump) {
ASSERT(!doread);
if (doread)
return (SET_ERROR(EIO));
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
lbtodb(size)));
} else {
- return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
- doread ? B_READ : B_WRITE));
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+ return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+ offset, doread ? B_READ : B_WRITE));
}
}
@@ -1233,7 +1248,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
offset += DVA_GET_OFFSET(&ze->ze_dva);
- error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+ error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+ size, doread, isdump);
if (!ddi_in_panic())
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -1253,6 +1269,7 @@ zvol_strategy(struct bio *bp)
rl_t *rl;
int error = 0;
boolean_t doread = (bp->bio_cmd == BIO_READ);
+ boolean_t is_dumpified;
boolean_t sync;
if (zv == NULL) {
@@ -1279,7 +1296,13 @@ zvol_strategy(struct bio *bp)
return (0);
}
- sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+#ifdef illumos
+ is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
+#else
+ is_dumpified = B_FALSE;
+#endif
+ sync = !doread && !is_dumpified &&
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
/*
* There must be no buffer changes when doing a dmu_sync() because
@@ -1290,7 +1313,15 @@ zvol_strategy(struct bio *bp)
while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys);
+#ifdef illumos
+ if (is_dumpified) {
+ size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
+ error = zvol_dumpio(zv, addr, off, size,
+ doread, B_FALSE);
+ } else if (doread) {
+#else
if (doread) {
+#endif
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
DMU_READ_PREFETCH);
} else {
@@ -1824,21 +1855,67 @@ zvol_fini(void)
}
#ifdef sun
+/*
+ * Check callback handed to dsl_sync_task() by zvol_dump_init().
+ *
+ * Looks up the pool from the transaction and returns 1 when the
+ * multi_vdev_crash_dump feature is already active on it, 0 otherwise.
+ * `arg' is unused.
+ * NOTE(review): a non-zero return presumably keeps the paired sync function
+ * from running, so the feature count is bumped only once -- confirm against
+ * dsl_sync_task().
+ */
+/*ARGSUSED*/
+static int
+zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
+{
+	return (spa_feature_is_active(dmu_tx_pool(tx)->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]) ? 1 : 0);
+}
+
+/*
+ * Sync callback handed to dsl_sync_task() by zvol_dump_init().
+ *
+ * Increments the multi_vdev_crash_dump feature counter on the pool that
+ * owns `tx', within that transaction.  `arg' is unused.
+ */
+/*ARGSUSED*/
+static void
+zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
+{
+	spa_feature_incr(dmu_tx_pool(tx)->dp_spa,
+	    &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP], tx);
+}
+
static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
dmu_tx_t *tx;
- int error = 0;
+ int error;
objset_t *os = zv->zv_objset;
+ spa_t *spa = dmu_objset_spa(os);
+ vdev_t *vd = spa->spa_root_vdev;
nvlist_t *nv = NULL;
- uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
+ uint64_t version = spa_version(spa);
+ enum zio_checksum checksum;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(vd->vdev_ops == &vdev_root_ops);
+
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
DMU_OBJECT_END);
/* wait for dmu_free_long_range to actually free the blocks */
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+ /*
+ * If the pool on which the dump device is being initialized has more
+ * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
+ * enabled. If so, bump that feature's counter to indicate that the
+ * feature is active. We also check the vdev type to handle the
+ * following case:
+ * # zpool create test raidz disk1 disk2 disk3
+ * Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
+ * the raidz vdev itself has 3 children.
+ */
+ if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
+ if (!spa_feature_is_enabled(spa,
+ &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]))
+ return (SET_ERROR(ENOTSUP));
+ (void) dsl_sync_task(spa_name(spa),
+ zfs_mvdev_dump_feature_check,
+ zfs_mvdev_dump_activate_feature_sync, NULL, 2);
+ }
+
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
@@ -1849,6 +1926,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
}
/*
+ * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
+ * function. Otherwise, use the old default -- OFF.
+ */
+ checksum = spa_feature_is_active(spa,
+ &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]) ?
+ ZIO_CHECKSUM_NOPARITY : ZIO_CHECKSUM_OFF;
+
+ /*
* If we are resizing the dump device then we only need to
* update the refreservation to match the newly updated
* zvolsize. Otherwise, we save off the original state of the
@@ -1911,7 +1996,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
ZIO_COMPRESS_OFF) == 0);
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- ZIO_CHECKSUM_OFF) == 0);
+ checksum) == 0);
if (version >= SPA_VERSION_DEDUP) {
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_DEDUP),
OpenPOWER on IntegriCloud