summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commitbbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
parentd2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
downloadFreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip
FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This brings a huge amount of changes; I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows the use of additional disks for cache. Huge performance improvements, mostly for random reads of mostly static content. - slog Allows the use of additional disks for the ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by them. Be very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of a ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before, if a write request failed, the system panicked. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just like the quota and reservation properties, but they don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for the zpool(8) command. Obtained from: OpenSolaris
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c105
1 files changed, 74 insertions, 31 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
index 4e419b6..aa8f6f0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -19,16 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+#include <sys/kstat.h>
/*
* Virtual device read-ahead caching.
@@ -36,15 +35,16 @@
* This file implements a simple LRU read-ahead cache. When the DMU reads
* a given block, it will often want other, nearby blocks soon thereafter.
* We take advantage of this by reading a larger disk region and caching
- * the result. In the best case, this can turn 256 back-to-back 512-byte
- * reads into a single 128k read followed by 255 cache hits; this reduces
+ * the result. In the best case, this can turn 128 back-to-back 512-byte
+ * reads into a single 64k read followed by 127 cache hits; this reduces
* latency dramatically. In the worst case, it can turn an isolated 512-byte
- * read into a 128k read, which doesn't affect latency all that much but is
+ * read into a 64k read, which doesn't affect latency all that much but is
* terribly wasteful of bandwidth. A more intelligent version of the cache
* could keep track of access patterns and not do read-ahead unless it sees
- * at least two temporally close I/Os to the same region. It could also
- * take advantage of semantic information about the I/O. And it could use
- * something faster than an AVL tree; that was chosen solely for convenience.
+ * at least two temporally close I/Os to the same region. Currently, only
+ * metadata I/O is inflated. A further enhancement could take advantage of
+ * more semantic information about the I/O. And it could use something
+ * faster than an AVL tree; that was chosen solely for convenience.
*
* There are five cache operations: allocate, fill, read, write, evict.
*
@@ -69,13 +69,15 @@
/*
* All i/os smaller than zfs_vdev_cache_max will be turned into
* 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
- * track buffer. At most zfs_vdev_cache_size bytes will be kept in each
+ * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
* vdev's vdev_cache.
*/
-int zfs_vdev_cache_max = 1<<14;
-int zfs_vdev_cache_size = 10ULL << 20;
+int zfs_vdev_cache_max = 1<<14; /* 16KB */
+int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */
int zfs_vdev_cache_bshift = 16;
+#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
+
SYSCTL_DECL(_vfs_zfs_vdev);
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max);
@@ -84,8 +86,25 @@ SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size);
SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
&zfs_vdev_cache_size, 0, "Size of VDEV cache");
+TUNABLE_INT("vfs.zfs.vdev.cache.bshift", &zfs_vdev_cache_bshift);
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN,
+ &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value");
+
+kstat_t *vdc_ksp = NULL;
+
+typedef struct vdc_stats {
+ kstat_named_t vdc_stat_delegations;
+ kstat_named_t vdc_stat_hits;
+ kstat_named_t vdc_stat_misses;
+} vdc_stats_t;
+
+static vdc_stats_t vdc_stats = {
+ { "delegations", KSTAT_DATA_UINT64 },
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 }
+};
-#define VCBS (1 << zfs_vdev_cache_bshift)
+#define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1);
static int
vdev_cache_offset_compare(const void *a1, const void *a2)
@@ -127,10 +146,6 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
ASSERT(ve->ve_fill_io == NULL);
ASSERT(ve->ve_data != NULL);
- dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
- vc, ve->ve_offset, ve->ve_lastused, LBOLT - ve->ve_lastused,
- ve->ve_hits, ve->ve_missed_update);
-
avl_remove(&vc->vc_lastused_tree, ve);
avl_remove(&vc->vc_offset_tree, ve);
zio_buf_free(ve->ve_data, VCBS);
@@ -161,10 +176,8 @@ vdev_cache_allocate(zio_t *zio)
if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
zfs_vdev_cache_size) {
ve = avl_first(&vc->vc_lastused_tree);
- if (ve->ve_fill_io != NULL) {
- dprintf("can't evict in %p, still filling\n", vc);
+ if (ve->ve_fill_io != NULL)
return (NULL);
- }
ASSERT(ve->ve_hits != 0);
vdev_cache_evict(vc, ve);
}
@@ -239,7 +252,7 @@ vdev_cache_fill(zio_t *zio)
zio->io_delegate_list = dio->io_delegate_next;
dio->io_delegate_next = NULL;
dio->io_error = zio->io_error;
- zio_next_stage(dio);
+ zio_execute(dio);
}
}
@@ -287,6 +300,7 @@ vdev_cache_read(zio_t *zio)
fio->io_delegate_list = zio;
zio_vdev_io_bypass(zio);
mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_delegations);
return (0);
}
@@ -294,7 +308,8 @@ vdev_cache_read(zio_t *zio)
zio_vdev_io_bypass(zio);
mutex_exit(&vc->vc_lock);
- zio_next_stage(zio);
+ zio_execute(zio);
+ VDCSTAT_BUMP(vdc_stat_hits);
return (0);
}
@@ -305,11 +320,9 @@ vdev_cache_read(zio_t *zio)
return (ENOMEM);
}
- fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
+ fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
- vdev_cache_fill, ve);
+ ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio;
fio->io_delegate_list = zio;
@@ -317,6 +330,7 @@ vdev_cache_read(zio_t *zio)
mutex_exit(&vc->vc_lock);
zio_nowait(fio);
+ VDCSTAT_BUMP(vdc_stat_misses);
return (0);
}
@@ -361,6 +375,18 @@ vdev_cache_write(zio_t *zio)
}
void
+vdev_cache_purge(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+}
+
+void
vdev_cache_init(vdev_t *vd)
{
vdev_cache_t *vc = &vd->vdev_cache;
@@ -380,15 +406,32 @@ void
vdev_cache_fini(vdev_t *vd)
{
vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve;
- mutex_enter(&vc->vc_lock);
- while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
- vdev_cache_evict(vc, ve);
- mutex_exit(&vc->vc_lock);
+ vdev_cache_purge(vd);
avl_destroy(&vc->vc_offset_tree);
avl_destroy(&vc->vc_lastused_tree);
mutex_destroy(&vc->vc_lock);
}
+
+void
+vdev_cache_stat_init(void)
+{
+ vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (vdc_ksp != NULL) {
+ vdc_ksp->ks_data = &vdc_stats;
+ kstat_install(vdc_ksp);
+ }
+}
+
+void
+vdev_cache_stat_fini(void)
+{
+ if (vdc_ksp != NULL) {
+ kstat_delete(vdc_ksp);
+ vdc_ksp = NULL;
+ }
+}
OpenPOWER on IntegriCloud