summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commitbbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
parentd2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
downloadFreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip
FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This brings a huge amount of changes; I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows the use of additional disks for cache. Huge performance improvements, mostly for random reads of mostly static content. - slog Allows the use of additional disks for the ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by them. Be very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of a ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before, if a write request failed, the system panicked. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just like the quota and reservation properties, but they don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for the zpool(8) command. Obtained from: OpenSolaris
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c105
1 files changed, 74 insertions, 31 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
index 4e419b6..aa8f6f0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
@@ -19,16 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+#include <sys/kstat.h>
/*
* Virtual device read-ahead caching.
@@ -36,15 +35,16 @@
* This file implements a simple LRU read-ahead cache. When the DMU reads
* a given block, it will often want other, nearby blocks soon thereafter.
* We take advantage of this by reading a larger disk region and caching
- * the result. In the best case, this can turn 256 back-to-back 512-byte
- * reads into a single 128k read followed by 255 cache hits; this reduces
+ * the result. In the best case, this can turn 128 back-to-back 512-byte
+ * reads into a single 64k read followed by 127 cache hits; this reduces
* latency dramatically. In the worst case, it can turn an isolated 512-byte
- * read into a 128k read, which doesn't affect latency all that much but is
+ * read into a 64k read, which doesn't affect latency all that much but is
* terribly wasteful of bandwidth. A more intelligent version of the cache
* could keep track of access patterns and not do read-ahead unless it sees
- * at least two temporally close I/Os to the same region. It could also
- * take advantage of semantic information about the I/O. And it could use
- * something faster than an AVL tree; that was chosen solely for convenience.
+ * at least two temporally close I/Os to the same region. Currently, only
+ * metadata I/O is inflated. A further enhancement could take advantage of
+ * more semantic information about the I/O. And it could use something
+ * faster than an AVL tree; that was chosen solely for convenience.
*
* There are five cache operations: allocate, fill, read, write, evict.
*
@@ -69,13 +69,15 @@
/*
* All i/os smaller than zfs_vdev_cache_max will be turned into
* 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
- * track buffer. At most zfs_vdev_cache_size bytes will be kept in each
+ * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
* vdev's vdev_cache.
*/
-int zfs_vdev_cache_max = 1<<14;
-int zfs_vdev_cache_size = 10ULL << 20;
+int zfs_vdev_cache_max = 1<<14; /* 16KB */
+int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */
int zfs_vdev_cache_bshift = 16;
+#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
+
SYSCTL_DECL(_vfs_zfs_vdev);
SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max);
@@ -84,8 +86,25 @@ SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size);
SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
&zfs_vdev_cache_size, 0, "Size of VDEV cache");
+TUNABLE_INT("vfs.zfs.vdev.cache.bshift", &zfs_vdev_cache_bshift);
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN,
+ &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value");
+
+kstat_t *vdc_ksp = NULL;
+
+typedef struct vdc_stats {
+ kstat_named_t vdc_stat_delegations;
+ kstat_named_t vdc_stat_hits;
+ kstat_named_t vdc_stat_misses;
+} vdc_stats_t;
+
+static vdc_stats_t vdc_stats = {
+ { "delegations", KSTAT_DATA_UINT64 },
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 }
+};
-#define VCBS (1 << zfs_vdev_cache_bshift)
+#define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1);
static int
vdev_cache_offset_compare(const void *a1, const void *a2)
@@ -127,10 +146,6 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
ASSERT(ve->ve_fill_io == NULL);
ASSERT(ve->ve_data != NULL);
- dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
- vc, ve->ve_offset, ve->ve_lastused, LBOLT - ve->ve_lastused,
- ve->ve_hits, ve->ve_missed_update);
-
avl_remove(&vc->vc_lastused_tree, ve);
avl_remove(&vc->vc_offset_tree, ve);
zio_buf_free(ve->ve_data, VCBS);
@@ -161,10 +176,8 @@ vdev_cache_allocate(zio_t *zio)
if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
zfs_vdev_cache_size) {
ve = avl_first(&vc->vc_lastused_tree);
- if (ve->ve_fill_io != NULL) {
- dprintf("can't evict in %p, still filling\n", vc);
+ if (ve->ve_fill_io != NULL)
return (NULL);
- }
ASSERT(ve->ve_hits != 0);
vdev_cache_evict(vc, ve);
}
@@ -239,7 +252,7 @@ vdev_cache_fill(zio_t *zio)
zio->io_delegate_list = dio->io_delegate_next;
dio->io_delegate_next = NULL;
dio->io_error = zio->io_error;
- zio_next_stage(dio);
+ zio_execute(dio);
}
}
@@ -287,6 +300,7 @@ vdev_cache_read(zio_t *zio)
fio->io_delegate_list = zio;
zio_vdev_io_bypass(zio);
mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_delegations);
return (0);
}
@@ -294,7 +308,8 @@ vdev_cache_read(zio_t *zio)
zio_vdev_io_bypass(zio);
mutex_exit(&vc->vc_lock);
- zio_next_stage(zio);
+ zio_execute(zio);
+ VDCSTAT_BUMP(vdc_stat_hits);
return (0);
}
@@ -305,11 +320,9 @@ vdev_cache_read(zio_t *zio)
return (ENOMEM);
}
- fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
+ fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
- vdev_cache_fill, ve);
+ ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio;
fio->io_delegate_list = zio;
@@ -317,6 +330,7 @@ vdev_cache_read(zio_t *zio)
mutex_exit(&vc->vc_lock);
zio_nowait(fio);
+ VDCSTAT_BUMP(vdc_stat_misses);
return (0);
}
@@ -361,6 +375,18 @@ vdev_cache_write(zio_t *zio)
}
void
+vdev_cache_purge(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+}
+
+void
vdev_cache_init(vdev_t *vd)
{
vdev_cache_t *vc = &vd->vdev_cache;
@@ -380,15 +406,32 @@ void
vdev_cache_fini(vdev_t *vd)
{
vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve;
- mutex_enter(&vc->vc_lock);
- while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
- vdev_cache_evict(vc, ve);
- mutex_exit(&vc->vc_lock);
+ vdev_cache_purge(vd);
avl_destroy(&vc->vc_offset_tree);
avl_destroy(&vc->vc_lastused_tree);
mutex_destroy(&vc->vc_lock);
}
+
+void
+vdev_cache_stat_init(void)
+{
+ vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (vdc_ksp != NULL) {
+ vdc_ksp->ks_data = &vdc_stats;
+ kstat_install(vdc_ksp);
+ }
+}
+
+void
+vdev_cache_stat_fini(void)
+{
+ if (vdc_ksp != NULL) {
+ kstat_delete(vdc_ksp);
+ vdc_ksp = NULL;
+ }
+}
OpenPOWER on IntegriCloud