author		pjd <pjd@FreeBSD.org>	2008-11-17 20:49:29 +0000
committer	pjd <pjd@FreeBSD.org>	2008-11-17 20:49:29 +0000
commit		bbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree		81b89fa4ac6467771d5aa291a97f4665981a6108	/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
parent		d2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This brings a huge number of changes; I'll enumerate only the user-visible ones:
- Delegated Administration: allows regular users to perform ZFS operations, such as file system creation, snapshot creation, etc.
- L2ARC: a level-2 cache for ZFS that allows additional disks to be used as cache. Huge performance improvements, mostly for random reads of largely static content.
- slog: allows additional disks to be used for the ZFS Intent Log, speeding up operations like fsync(2).
- vfs.zfs.super_owner: allows a regular user to perform privileged operations on files stored on ZFS file systems he owns. Be very careful with this one.
- chflags(2): not all flags are supported; this still needs work.
- ZFSBoot: support for booting off a ZFS pool. Not finished, AFAIK. Submitted by: dfr
- Snapshot properties.
- New failure modes: previously, if a write request failed, the system panicked. Now one can select one of three failure modes (see the sketch after this message): panic (panic on write error), wait (wait for the disk to reappear), or continue (serve read requests if possible, block write requests).
- Refquota and refreservation properties: like the quota and reservation properties, but they don't count space consumed by child file systems, clones, and snapshots.
- Sparse volumes: ZVOLs that don't reserve space in the pool.
- External attributes: compatible with extattr(2).
- NFSv4 ACLs: not sure about the status; it may not be complete yet. Submitted by: trasz
- Creation-time properties.
- Regression tests for the zpool(8) command.
Obtained from: OpenSolaris
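The new failure-mode behavior can be pictured as a small dispatch on the pool's failmode setting. The following is a minimal hypothetical sketch, not code from this commit: the helper name and its call site are invented, while spa_get_failmode() and the ZIO_FAILURE_MODE_* constants do appear in the diff below.

	#include <sys/spa.h>
	#include <sys/systm.h>
	#include <sys/zio.h>

	/* Hypothetical sketch of failure-mode dispatch on a failed write. */
	static int
	zfs_write_error_sketch(spa_t *spa, zio_t *zio)
	{
		switch (spa_get_failmode(spa)) {
		case ZIO_FAILURE_MODE_PANIC:
			/* Old behavior: any write failure panics the system. */
			panic("ZFS: write error on pool '%s'", spa_name(spa));
		case ZIO_FAILURE_MODE_WAIT:
			/* Suspend I/O until the disk reappears. */
			return (EAGAIN);
		case ZIO_FAILURE_MODE_CONTINUE:
		default:
			/* Serve reads if possible; block write requests. */
			return (zio->io_type == ZIO_TYPE_WRITE ? EIO : 0);
		}
	}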
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c')
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c	120
1 file changed, 77 insertions(+), 43 deletions(-)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
index e2385a0..17e4b0a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
@@ -53,7 +51,7 @@ extern void devctl_notify(const char *__system, const char *__subsystem,
* pool X
*
* If we are in a loading state, all errors are chained together by the same
- * SPA-wide ENA.
+ * SPA-wide ENA (Error Numeric Association).
*
* For isolated I/O requests, we get the ENA from the zio_t. The propagation
* gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
@@ -90,11 +88,10 @@ extern void devctl_notify(const char *__system, const char *__subsystem,
* We keep track of the ENA for a ZIO chain through the 'io_logical' member.
* When a new logical I/O is issued, we set this to point to itself. Child I/Os
* then inherit this pointer, so that when it is first set subsequent failures
- * will use the same ENA. If a physical I/O is issued (by passing the
- * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
- * unique ENA will be generated. For an aggregate I/O, this pointer is set to
- * NULL, and no ereport will be generated (since it doesn't actually correspond
- * to any particular device or piece of data).
+ * will use the same ENA. For vdev cache fill and queue aggregation I/O,
+ * this pointer is set to NULL, and no ereport will be generated (since it
+ * doesn't actually correspond to any particular device or piece of data,
+ * and the caller will always retry without caching or queueing anyway).
*/
void
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
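The comment above describes how the 'io_logical' pointer threads a single ENA through a chain of I/Os. As a minimal hypothetical sketch of that rule (the helper and its boolean parameter are invented; io_logical is the field named in the comment):

	#include <sys/zio.h>

	/* Hypothetical sketch of io_logical propagation; not committed code. */
	static void
	zio_set_logical_sketch(zio_t *zio, zio_t *pio, boolean_t aggregation)
	{
		if (aggregation)
			zio->io_logical = NULL;	/* cache-fill/aggregate I/O: no ereport */
		else if (pio != NULL)
			zio->io_logical = pio->io_logical; /* child inherits the chain's ENA source */
		else
			zio->io_logical = zio;	/* new logical I/O roots its own chain */
	}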
@@ -104,6 +101,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
char buf[1024];
struct sbuf sb;
struct timespec ts;
+ int state;
/*
* If we are doing a spa_tryimport(), ignore errors.
@@ -120,21 +118,33 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
spa->spa_last_open_failed)
return;
- /*
- * Ignore any errors from I/Os that we are going to retry anyway - we
- * only generate errors from the final failure.
- */
- if (zio && zio_should_retry(zio))
- return;
+ if (zio != NULL) {
+ /*
+ * If this is not a read or write zio, ignore the error. This
+ * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
+ */
+ if (zio->io_type != ZIO_TYPE_READ &&
+ zio->io_type != ZIO_TYPE_WRITE)
+ return;
- /*
- * If this is not a read or write zio, ignore the error. This can occur
- * if the DKIOCFLUSHWRITECACHE ioctl fails.
- */
- if (zio && zio->io_type != ZIO_TYPE_READ &&
- zio->io_type != ZIO_TYPE_WRITE)
- return;
+ /*
+ * Ignore any errors from speculative I/Os, as failure is an
+ * expected result.
+ */
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+ /*
+ * If the vdev has already been marked as failing due to a
+ * failed probe, then ignore any subsequent I/O errors, as the
+ * DE will automatically fault the vdev on the first such
+ * failure.
+ */
+ if (vd != NULL &&
+ (!vdev_readable(vd) || !vdev_writeable(vd)) &&
+ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
+ return;
+ }
nanotime(&ts);
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
@@ -187,22 +197,28 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
*/
/*
+ * If we are importing a faulted pool, then we treat it like an open,
+ * not an import. Otherwise, the DE will ignore all faults during
+ * import, since the default behavior is to mark the devices as
+ * persistently unavailable, not leave them in the faulted state.
+ */
+ state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state;
+
+ /*
* Generic payload members common to all ereports.
- *
- * The direct reference to spa_name is used rather than spa_name()
- * because of the asynchronous nature of the zio pipeline. spa_name()
- * asserts that the config lock is held in some form. This is always
- * the case in I/O context, but because the check for RW_WRITER compares
- * against 'curthread', we may be in an asynchronous context and blow
- * this assert. Rather than loosen this assert, we acknowledge that all
- * contexts in which this function is called (pool open, I/O) are safe,
- * and dereference the name directly.
*/
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name);
+ sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa));
sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
spa_guid(spa));
- sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
- spa->spa_load_state);
+ sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, state);
+
+ if (spa != NULL) {
+ sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+ spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
+ FM_EREPORT_FAILMODE_WAIT :
+ spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC);
+ }
if (vd != NULL) {
vdev_t *pvd = vd->vdev_parent;
@@ -290,7 +306,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
mutex_exit(&spa->spa_errlist_lock);
sbuf_finish(&sb);
- ZFS_LOG(1, "%s", sbuf_data(&sb));
devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb));
if (sbuf_overflowed(&sb))
printf("ZFS WARNING: sbuf overflowed\n");
@@ -298,13 +313,8 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
#endif
}
-/*
- * The 'resource.fs.zfs.ok' event is an internal signal that the associated
- * resource (pool or disk) has been identified by ZFS as healthy. This will
- * then trigger the DE to close the associated case, if any.
- */
-void
-zfs_post_ok(spa_t *spa, vdev_t *vd)
+static void
+zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
char buf[1024];
@@ -318,7 +328,7 @@ zfs_post_ok(spa_t *spa, vdev_t *vd)
sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
- ZFS_ERROR_CLASS, FM_RESOURCE_OK);
+ ZFS_ERROR_CLASS, name);
sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION);
sbuf_printf(&sb, " %s=%s", FM_CLASS, class);
sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
@@ -327,9 +337,33 @@ zfs_post_ok(spa_t *spa, vdev_t *vd)
sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
vd->vdev_guid);
sbuf_finish(&sb);
+ ZFS_LOG(1, "%s", sbuf_data(&sb));
devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
if (sbuf_overflowed(&sb))
printf("ZFS WARNING: sbuf overflowed\n");
sbuf_delete(&sb);
#endif
}
+
+/*
+ * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
+ * has been removed from the system. This will cause the DE to ignore any
+ * recent I/O errors, inferring that they are due to the asynchronous device
+ * removal.
+ */
+void
+zfs_post_remove(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
+}
+
+/*
+ * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
+ * has the 'autoreplace' property set, and therefore any broken vdevs will be
+ * handled by higher level logic, and no vdev fault should be generated.
+ */
+void
+zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
+}
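A hedged usage sketch for the removal event (the call site below is illustrative only and not part of this commit; vdev_spa and vdev_remove_wanted are assumed vdev_t fields):

	#include <sys/spa.h>
	#include <sys/vdev_impl.h>

	/* Illustrative caller; not part of this diff. */
	static void
	vdev_orphan_sketch(vdev_t *vd)
	{
		/*
		 * The underlying device disappeared: flag the vdev and tell
		 * the DE that trailing I/O errors stem from removal, not a
		 * device fault.
		 */
		vd->vdev_remove_wanted = B_TRUE;
		zfs_post_remove(vd->vdev_spa, vd);
	}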