summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordelphij <delphij@FreeBSD.org>2014-08-02 03:56:06 +0000
committerdelphij <delphij@FreeBSD.org>2014-08-02 03:56:06 +0000
commit6a949e106dc93d9fa177f78a454fab1d92b5c5e7 (patch)
tree7404e6333ff3ecd4049cbff81119e02d6331e597
parent4b7aa9ea8a7bce8b0826c0fb5337074a32500239 (diff)
downloadFreeBSD-src-6a949e106dc93d9fa177f78a454fab1d92b5c5e7.zip
FreeBSD-src-6a949e106dc93d9fa177f78a454fab1d92b5c5e7.tar.gz
MFC r268855: MFV r268848:
Instead of asserting that all zios are properly aligned, only assert on the logical ones. Cap uberblocks at 8k; otherwise, with ashift=17, there would be only one uberblock. This fixes a problem where zdb would trip an assert on pools with ashift >= 0xe (16k). While here, also change the code so that it only attempts to condense a space map when the uncondensed size consumes more than zfs_metaslab_condense_block_threshold blocks. Illumos issue: 4958 zdb trips assert on pools with ashift >= 0xe
-rw-r--r--cddl/contrib/opensolaris/cmd/ztest/ztest.c43
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c39
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h7
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h45
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c7
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c30
8 files changed, 130 insertions, 45 deletions
diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
index 8f5d7ce..3327161 100644
--- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c
+++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
@@ -810,7 +810,7 @@ static uint64_t
ztest_get_ashift(void)
{
if (ztest_opts.zo_ashift == 0)
- return (SPA_MINBLOCKSHIFT + ztest_random(3));
+ return (SPA_MINBLOCKSHIFT + ztest_random(5));
return (ztest_opts.zo_ashift);
}
@@ -969,11 +969,28 @@ ztest_random_spa_version(uint64_t initial_version)
return (version);
}
+/*
+ * Find the largest ashift used by any child of the ztest pool's root vdev.
+ */
+static uint64_t
+ztest_spa_get_ashift() {
+ uint64_t i;
+ uint64_t ashift = SPA_MINBLOCKSHIFT;
+ vdev_t *rvd = ztest_spa->spa_root_vdev;
+
+ for (i = 0; i < rvd->vdev_children; i++) {
+ ashift = MAX(ashift, rvd->vdev_child[i]->vdev_ashift);
+ }
+ return (ashift);
+}
+
static int
ztest_random_blocksize(void)
{
- return (1 << (SPA_MINBLOCKSHIFT +
- ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
+ // Choose a block size >= the ashift.
+ uint64_t block_shift =
+ ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
+ return (1 << (SPA_MINBLOCKSHIFT + block_shift));
}
static int
@@ -5768,16 +5785,30 @@ ztest_freeze(void)
spa_freeze(spa);
/*
+ * Because it is hard to predict how much space a write will actually
+ * require beforehand, we leave ourselves some fudge space to write over
+ * capacity.
+ */
+ uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
+
+ /*
* Run tests that generate log records but don't alter the pool config
* or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
* We do a txg_wait_synced() after each iteration to force the txg
* to increase well beyond the last synced value in the uberblock.
* The ZIL should be OK with that.
+ *
+ * Run a random number of times less than zo_maxloops and ensure we do
+ * not run out of space on the pool.
*/
while (ztest_random(10) != 0 &&
- numloops++ < ztest_opts.zo_maxloops) {
- ztest_dmu_write_parallel(zd, 0);
- ztest_dmu_object_alloc_free(zd, 0);
+ numloops++ < ztest_opts.zo_maxloops &&
+ metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
+ ztest_od_t od;
+ ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+ VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
+ ztest_io(zd, od.od_object,
+ ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
txg_wait_synced(spa_get_dsl(spa), 0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
index 6a248b3..8dbdc96 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index d834e83..d6e3ce0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -74,6 +74,21 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
" of in-memory counterpart");
/*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
+
+/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
* a free space. Metaslab groups that have more free space than
@@ -1371,6 +1386,8 @@ metaslab_group_preload(metaslab_group_t *mg)
* times the size than the free space range tree representation
* (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB).
*
+ * 3. The on-disk size of the space map should actually decrease.
+ *
* Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we
* use the size-ordered range tree in the metaslab and calculate the
@@ -1381,13 +1398,21 @@ metaslab_group_preload(metaslab_group_t *mg)
* To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
*/
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
space_map_t *sm = msp->ms_sm;
range_seg_t *rs;
- uint64_t size, entries, segsz;
+ uint64_t size, entries, segsz, object_size, optimal_size, record_size;
+ dmu_object_info_t doi;
+ uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
@@ -1411,9 +1436,15 @@ metaslab_should_condense(metaslab_t *msp)
entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t);
- return (segsz <= space_map_length(msp->ms_sm) &&
- space_map_length(msp->ms_sm) >= (zfs_condense_pct *
- sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
+ optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+ object_size = space_map_length(msp->ms_sm);
+
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+ record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+ return (segsz <= object_size &&
+ object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
}
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index f216a7d..d09fb5b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
index 518ebc4..129f62d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -249,8 +249,11 @@ struct vdev {
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT (13)
#define VDEV_UBERBLOCK_SHIFT(vd) \
- MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
+ MAX_UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 002b067..9baded0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -165,19 +165,20 @@ enum zio_flag {
ZIO_FLAG_RESILVER = 1 << 3,
ZIO_FLAG_SCRUB = 1 << 4,
ZIO_FLAG_SCAN_THREAD = 1 << 5,
+ ZIO_FLAG_PHYSICAL = 1 << 6,
#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
/*
* Flags inherited by ddt, gang, and vdev children.
*/
- ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */
- ZIO_FLAG_SPECULATIVE = 1 << 7,
- ZIO_FLAG_CONFIG_WRITER = 1 << 8,
- ZIO_FLAG_DONT_RETRY = 1 << 9,
- ZIO_FLAG_DONT_CACHE = 1 << 10,
- ZIO_FLAG_NODATA = 1 << 11,
- ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,
+ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 8,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 9,
+ ZIO_FLAG_DONT_RETRY = 1 << 10,
+ ZIO_FLAG_DONT_CACHE = 1 << 11,
+ ZIO_FLAG_NODATA = 1 << 12,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
@@ -185,27 +186,27 @@ enum zio_flag {
/*
* Flags inherited by vdev children.
*/
- ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */
- ZIO_FLAG_PROBE = 1 << 14,
- ZIO_FLAG_TRYHARD = 1 << 15,
- ZIO_FLAG_OPTIONAL = 1 << 16,
+ ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 15,
+ ZIO_FLAG_TRYHARD = 1 << 16,
+ ZIO_FLAG_OPTIONAL = 1 << 17,
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
/*
* Flags not inherited by any children.
*/
- ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */
- ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
- ZIO_FLAG_IO_BYPASS = 1 << 19,
- ZIO_FLAG_IO_REWRITE = 1 << 20,
- ZIO_FLAG_RAW = 1 << 21,
- ZIO_FLAG_GANG_CHILD = 1 << 22,
- ZIO_FLAG_DDT_CHILD = 1 << 23,
- ZIO_FLAG_GODFATHER = 1 << 24,
- ZIO_FLAG_NOPWRITE = 1 << 25,
- ZIO_FLAG_REEXECUTED = 1 << 26,
- ZIO_FLAG_DELEGATED = 1 << 27,
+ ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
+ ZIO_FLAG_IO_BYPASS = 1 << 20,
+ ZIO_FLAG_IO_REWRITE = 1 << 21,
+ ZIO_FLAG_RAW = 1 << 22,
+ ZIO_FLAG_GANG_CHILD = 1 << 23,
+ ZIO_FLAG_DDT_CHILD = 1 << 24,
+ ZIO_FLAG_GODFATHER = 1 << 25,
+ ZIO_FLAG_NOPWRITE = 1 << 26,
+ ZIO_FLAG_REEXECUTED = 1 << 27,
+ ZIO_FLAG_DELEGATED = 1 << 28,
};
#define ZIO_FLAG_MUSTSUCCEED 0
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
index 85fa760..a9cbe4d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -57,7 +57,10 @@ zfs_dbgmsg_fini(void)
* echo ::zfs_dbgmsg | mdb -k
*
* Monitor these messages by running:
- * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ *
+ * When used with libzpool, monitor with:
+ * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
*/
void
zfs_dbgmsg(const char *fmt, ...)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index feb1a16..e5c5f51 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -886,8 +886,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
ASSERT3U(offset + size, <=, vd->vdev_psize);
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
- ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
- ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+ ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+ NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
@@ -907,8 +907,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
ASSERT3U(offset + size, <=, vd->vdev_psize);
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
- ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+ NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
@@ -2621,7 +2621,9 @@ zio_vdev_io_start(zio_t **ziop)
align = 1ULL << vd->vdev_top->vdev_ashift;
- if (P2PHASE(zio->io_size, align) != 0) {
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+ P2PHASE(zio->io_size, align) != 0) {
+ /* Transform logical writes to be a full physical block size. */
uint64_t asize = P2ROUNDUP(zio->io_size, align);
char *abuf = NULL;
if (zio->io_type == ZIO_TYPE_READ ||
@@ -2636,8 +2638,22 @@ zio_vdev_io_start(zio_t **ziop)
zio_subblock);
}
- ASSERT(P2PHASE(zio->io_offset, align) == 0);
- ASSERT(P2PHASE(zio->io_size, align) == 0);
+ /*
+ * If this is not a physical io, make sure that it is properly aligned
+ * before proceeding.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+ ASSERT0(P2PHASE(zio->io_offset, align));
+ ASSERT0(P2PHASE(zio->io_size, align));
+ } else {
+ /*
+ * For physical writes, we allow 512b aligned writes and assume
+ * the device will perform a read-modify-write as necessary.
+ */
+ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
+ ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
+ }
+
VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
/*
OpenPOWER on IntegriCloud