summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commitbbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
parentd2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
downloadFreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip
FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows to use additional disks for cache. Huge performance improvements mostly for random read of mostly static content. - slog Allow to use additional disks for ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by him. Very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before if write requested failed, system paniced. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just quota and reservation properties, but don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c156
1 files changed, 91 insertions, 65 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 0dba134..22b56d6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
@@ -35,6 +33,7 @@
#include <sys/zio.h>
uint64_t metaslab_aliquot = 512ULL << 10;
+uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* ==========================================================================
@@ -341,7 +340,7 @@ metaslab_fini(metaslab_t *msp)
int t;
vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc);
+ -msp->ms_smo.smo_alloc, B_TRUE);
metaslab_group_remove(mg, msp);
@@ -534,8 +533,8 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(smo, db->db_data, sizeof (*smo));
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
@@ -569,10 +568,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
space_map_create(&msp->ms_freemap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
}
- vdev_space_update(vd, sm->sm_size, 0);
+ vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
}
- vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
+ vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@ -714,11 +713,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* Allocate a block for the specified i/o.
*/
static int
-metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
- dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
+metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+ dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
metaslab_group_t *mg, *rotor;
- metaslab_class_t *mc;
vdev_t *vd;
int dshift = 3;
int all_zero;
@@ -728,7 +726,11 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
ASSERT(!DVA_IS_VALID(&dva[d]));
- mc = spa_metaslab_class_select(spa);
+ /*
+ * For testing, make some blocks above a certain size be gang blocks.
+ */
+ if (psize >= metaslab_gang_bang && (LBOLT & 3) == 0)
+ return (ENOSPC);
/*
* Start at the rotor and loop through all mgs until we find something.
@@ -754,7 +756,7 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (hintdva_avoid)
+ if (flags & METASLAB_HINTBP_AVOID)
mg = vd->vdev_mg->mg_next;
else
mg = vd->vdev_mg;
@@ -764,12 +766,34 @@ metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
} else {
mg = mc->mc_rotor;
}
- rotor = mg;
+ /*
+ * If the hint put us into the wrong class, just follow the rotor.
+ */
+ if (mg->mg_class != mc)
+ mg = mc->mc_rotor;
+
+ rotor = mg;
top:
all_zero = B_TRUE;
do {
vd = mg->mg_vd;
+ /*
+ * Don't allocate from faulted devices.
+ */
+ if (!vdev_writeable(vd))
+ goto next;
+ /*
+ * Avoid writing single-copy data to a failing vdev
+ */
+ if ((vd->vdev_stat.vs_write_errors > 0 ||
+ vd->vdev_state < VDEV_STATE_HEALTHY) &&
+ d == 0 && dshift == 3) {
+ all_zero = B_FALSE;
+ goto next;
+ }
+
+ ASSERT(mg->mg_class == mc);
distance = vd->vdev_asize >> dshift;
if (distance <= (1ULL << vd->vdev_ms_shift))
@@ -818,11 +842,12 @@ top:
DVA_SET_VDEV(&dva[d], vd->vdev_id);
DVA_SET_OFFSET(&dva[d], offset);
- DVA_SET_GANG(&dva[d], 0);
+ DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
DVA_SET_ASIZE(&dva[d], asize);
return (0);
}
+next:
mc->mc_rotor = mg->mg_next;
mc->mc_allocated = 0;
} while ((mg = mg->mg_next) != rotor);
@@ -879,38 +904,6 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
vdev_dirty(vd, VDD_METASLAB, msp, txg);
space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
-
- /*
- * verify that this region is actually allocated in
- * either a ms_allocmap or the ms_map
- */
- if (msp->ms_map.sm_loaded) {
- boolean_t allocd = B_FALSE;
- int i;
-
- if (!space_map_contains(&msp->ms_map, offset, size)) {
- allocd = B_TRUE;
- } else {
- for (i = 0; i < TXG_CONCURRENT_STATES; i++) {
- space_map_t *sm = &msp->ms_allocmap
- [(txg - i) & TXG_MASK];
- if (space_map_contains(sm,
- offset, size)) {
- allocd = B_TRUE;
- break;
- }
- }
- }
-
- if (!allocd) {
- zfs_panic_recover("freeing free segment "
- "(vdev=%llu offset=%llx size=%llx)",
- (longlong_t)vdev, (longlong_t)offset,
- (longlong_t)size);
- }
- }
-
-
}
mutex_exit(&msp->ms_lock);
@@ -946,16 +939,18 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
- if (error) {
+ if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock);
return (error);
}
- if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
-
space_map_claim(&msp->ms_map, offset, size);
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+
+ if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */
+ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+ }
mutex_exit(&msp->ms_lock);
@@ -963,32 +958,45 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
}
int
-metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
- uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
+metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
+ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
- int d;
int error = 0;
+ ASSERT(bp->blk_birth == 0);
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ if (mc->mc_rotor == NULL) { /* no vdevs in this class */
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+ return (ENOSPC);
+ }
+
ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
ASSERT(BP_GET_NDVAS(bp) == 0);
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
- for (d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
- txg, hintbp_avoid);
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
+ txg, flags);
if (error) {
for (d--; d >= 0; d--) {
metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
bzero(&dva[d], sizeof (dva_t));
}
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
return (error);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ bp->blk_birth = txg;
+
return (0);
}
@@ -997,12 +1005,16 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
- int d;
ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+
+ spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
- for (d = 0; d < ndvas; d++)
+ for (int d = 0; d < ndvas; d++)
metaslab_free_dva(spa, &dva[d], txg, now);
+
+ spa_config_exit(spa, SCL_FREE, FTAG);
}
int
@@ -1010,14 +1022,28 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
- int d, error;
- int last_error = 0;
+ int error = 0;
ASSERT(!BP_IS_HOLE(bp));
- for (d = 0; d < ndvas; d++)
+ if (txg != 0) {
+ /*
+ * First do a dry run to make sure all DVAs are claimable,
+ * so we don't have to unwind from partial failures below.
+ */
+ if ((error = metaslab_claim(spa, bp, 0)) != 0)
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ for (int d = 0; d < ndvas; d++)
if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
- last_error = error;
+ break;
+
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ ASSERT(error == 0 || txg == 0);
- return (last_error);
+ return (error);
}
OpenPOWER on IntegriCloud