diff options
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts')
84 files changed, 6036 insertions, 2750 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 860b33c..2813924 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -462,6 +462,7 @@ static arc_state_t *arc_l2c_only; static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_loaned_bytes; static uint64_t arc_meta_used; static uint64_t arc_meta_limit; static uint64_t arc_meta_max = 0; @@ -511,7 +512,7 @@ struct arc_buf_hdr { /* immutable */ arc_buf_contents_t b_type; uint64_t b_size; - spa_t *b_spa; + uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -533,9 +534,9 @@ static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); -static boolean_t l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab); +static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ @@ -761,9 +762,8 @@ static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); static uint64_t -buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { - uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; @@ -773,7 +773,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; - crc ^= (spav>>8) ^ birth; + crc ^= (spa>>8) ^ birth; return (crc); } @@ -789,7 +789,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -1345,7 +1345,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; - hdr->b_spa = spa; + hdr->b_spa = spa_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -1364,6 +1364,41 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) return (buf); } +static char *arc_onloan_tag = "onloan"; + +/* + * Loan out an anonymous arc buffer. Loaned buffers are not counted as in + * flight data by arc_tempreserve_space() until they are "returned". Loaned + * buffers must be returned to the arc before they can be used by the DMU or + * freed. + */ +arc_buf_t * +arc_loan_buf(spa_t *spa, int size) +{ + arc_buf_t *buf; + + buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + + atomic_add_64(&arc_loaned_bytes, size); + return (buf); +} + +/* + * Return a loaned arc buffer to the arc. + */ +void +arc_return_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(hdr->b_state == arc_anon); + ASSERT(buf->b_data != NULL); + VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0); + VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1); + + atomic_add_64(&arc_loaned_bytes, -hdr->b_size); +} + static arc_buf_t * arc_buf_clone(arc_buf_t *from) { @@ -1661,7 +1696,7 @@ arc_buf_size(arc_buf_t *buf) * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; @@ -1830,12 +1865,12 @@ evict_start: if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mru_ghost->arcs_lsize[type], mru_over); - arc_evict_ghost(arc_mru_ghost, NULL, todelete); + arc_evict_ghost(arc_mru_ghost, 0, todelete); } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c); - arc_evict_ghost(arc_mfu_ghost, NULL, todelete); + arc_evict_ghost(arc_mfu_ghost, 0, todelete); } } if (stolen) @@ -1849,7 +1884,7 @@ evict_start: * bytes. Destroy the buffers that are removed. */ static void -arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; list_t *list, *list_start; @@ -1955,13 +1990,13 @@ arc_adjust(void) if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); - (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); - (void) arc_evict(arc_mru, NULL, delta, FALSE, + (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); } @@ -1973,14 +2008,14 @@ arc_adjust(void) if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); - (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { int64_t delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); - (void) arc_evict(arc_mfu, NULL, delta, FALSE, + (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); } @@ -1992,7 +2027,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, NULL, delta); + arc_evict_ghost(arc_mru_ghost, 0, delta); } adjustment = @@ -2000,7 +2035,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, NULL, delta); + arc_evict_ghost(arc_mfu_ghost, 0, delta); } } @@ -2044,29 +2079,34 @@ restart: void arc_flush(spa_t *spa) { + uint64_t guid = 0; + + if (spa) + guid = spa_guid(spa); + while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } - arc_evict_ghost(arc_mru_ghost, spa, -1); - arc_evict_ghost(arc_mfu_ghost, spa, -1); + arc_evict_ghost(arc_mru_ghost, guid, -1); + arc_evict_ghost(arc_mfu_ghost, guid, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); @@ -2463,7 +2503,7 @@ arc_get_data_buf(arc_buf_t *buf) state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { + if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); @@ -2673,7 +2713,7 @@ arc_read_done(zio_t *zio) * reason for it not to be found is if we were freed during the * read. */ - found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || @@ -2817,9 +2857,10 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *buf; kmutex_t *hash_lock; zio_t *rzio; + uint64_t guid = spa_guid(spa); top: - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); if (hdr && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; @@ -2842,7 +2883,7 @@ top: acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, zio_flags); + spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -3084,9 +3125,10 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data) { arc_buf_hdr_t *hdr; kmutex_t *hash_mtx; + uint64_t guid = spa_guid(spa); int rc = 0; - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); + hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { arc_buf_t *buf = hdr->b_buf; @@ -3254,7 +3296,7 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; - spa_t *spa = hdr->b_spa; + uint64_t spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; uint32_t flags = hdr->b_flags; @@ -3539,12 +3581,13 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_hdr_t *ab; kmutex_t *hash_lock; zio_t *zio; + uint64_t guid = spa_guid(spa); /* * If this buffer is in the cache, release it, so it * can be re-used. */ - ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); if (ab != NULL) { /* * The checksum of blocks to free is not always @@ -3607,10 +3650,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } static int -arc_memory_throttle(uint64_t reserve, uint64_t txg) +arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL - uint64_t inflight_data = arc_anon->arcs_size; uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count); static uint64_t page_load = 0; static uint64_t last_txg = 0; @@ -3674,6 +3716,7 @@ int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; + uint64_t anon_size; #ifdef ZFS_DEBUG /* @@ -3690,11 +3733,18 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) return (ENOMEM); /* + * Don't count loaned bufs as in flight dirty data to prevent long + * network delays from blocking transactions that are ready to be + * assigned to a txg. + */ + anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + + /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefor need to * make sure that there is sufficient available memory for this. */ - if (error = arc_memory_throttle(reserve, txg)) + if (error = arc_memory_throttle(reserve, anon_size, txg)) return (error); /* @@ -3704,8 +3754,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ - if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && - arc_anon->arcs_size > arc_c / 4) { + + if (reserve + arc_tempreserve + anon_size > arc_c / 2 && + anon_size > arc_c / 4) { dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve>>10, @@ -3959,6 +4010,8 @@ arc_fini(void) buf_fini(); + ASSERT(arc_loaned_bytes == 0); + mutex_destroy(&arc_lowmem_lock); #ifdef _KERNEL if (arc_event_lowmem != NULL) @@ -4103,7 +4156,7 @@ arc_fini(void) */ static boolean_t -l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab) +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) { /* * A buffer is *not* eligible for the L2ARC if it: @@ -4112,7 +4165,7 @@ l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab) * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ - if (ab->b_spa != spa) { + if (ab->b_spa != spa_guid) { ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); return (B_FALSE); } @@ -4399,11 +4452,15 @@ l2arc_read_done(zio_t *zio) * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ - if (zio->io_waiter == NULL) - zio_nowait(zio_read(zio->io_parent, - cb->l2rcb_spa, &cb->l2rcb_bp, + if (zio->io_waiter == NULL) { + zio_t *pio = zio_unique_parent(zio); + + ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + + zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, buf->b_data, zio->io_size, arc_read_done, buf, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } } kmem_free(cb, sizeof (l2arc_read_callback_t)); @@ -4600,6 +4657,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; + uint64_t guid = spa_guid(spa); int try; ASSERT(dev->l2ad_vdev != NULL); @@ -4661,7 +4719,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (!l2arc_write_eligible(spa, ab)) { + if (!l2arc_write_eligible(guid, ab)) { mutex_exit(hash_lock); continue; } @@ -5001,7 +5059,7 @@ l2arc_fini(void) void l2arc_start(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, @@ -5011,7 +5069,7 @@ l2arc_start(void) void l2arc_stop(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index fe50ecf..cf983e2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -327,7 +327,7 @@ dbuf_verify(dmu_buf_impl_t *db) if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); @@ -899,15 +899,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. - * XXX We may want to prohibit dirtying in syncing context even - * if they did pre-dirty. */ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || - dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_objset->os_dsl_dataset == NULL || - dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); - + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty @@ -965,7 +961,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * Only valid if not already dirty. */ - ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + ASSERT(dn->dn_object == 0 || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); @@ -977,15 +974,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * We should only be dirtying in syncing context if it's the - * mos, a spa os, or we're initializing the os. However, we are - * allowed to dirty in syncing context provided we already - * dirtied it in open context. Hence we must make this - * assertion only if we're not already dirty. + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. */ - ASSERT(!dmu_tx_is_syncing(tx) || - os->os_dsl_dataset == NULL || - !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || - !BP_IS_HOLE(os->os_rootbp)); + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -1285,6 +1280,68 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) } /* + * Directly assign a provided arc buf to a given dbuf if it's not referenced + * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. + */ +void +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_level == 0); + ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); + ASSERT(buf != NULL); + ASSERT(arc_buf_size(buf) == db->db.db_size); + ASSERT(tx->tx_txg != 0); + + arc_return_buf(buf, db); + ASSERT(arc_released(buf)); + + mutex_enter(&db->db_mtx); + + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + + if (db->db_state == DB_CACHED && + refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + bcopy(buf->b_data, db->db.db_data, db->db.db_size); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + return; + } + + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(db->db_buf != NULL); + if (dr != NULL && dr->dr_txg == tx->tx_txg) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = buf; + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + arc_release(db->db_buf, db); + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } + db->db_buf = NULL; + } + ASSERT(db->db_buf == NULL); + dbuf_set_data(db, buf); + db->db_state = DB_FILL; + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + dbuf_fill_done(db, tx); +} + +/* * "Clear" the contents of this dbuf. This will mark the dbuf * EVICTING and clear *most* of its references. Unfortunetely, * when we are not holding the dn_dbufs_mtx, we can't clear the @@ -1827,6 +1884,19 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user_ptr); } +boolean_t +dmu_buf_freeable(dmu_buf_t *dbuf) +{ + boolean_t res = B_FALSE; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + + if (db->db_blkptr) + res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, + db->db_blkptr->blk_birth); + + return (res); +} + static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 1152781..133343b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -82,6 +82,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint64_array, TRUE, "FUID table size" }, { zap_byteswap, TRUE, "DSL dataset next clones"}, { zap_byteswap, TRUE, "scrub work queue" }, + { zap_byteswap, TRUE, "ZFS user/group used" }, + { zap_byteswap, TRUE, "ZFS user/group quota" }, }; int @@ -177,22 +179,22 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) * whose dnodes are in the same block. */ static int -dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; - uint32_t flags; + uint32_t dbuf_flags; int err; zio_t *zio; hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); - flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; - if (length > zfetch_array_rd_sz) - flags |= DB_RF_NOPREFETCH; + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; + if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { @@ -230,7 +232,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, /* initiate async i/o */ if (read) { rw_exit(&dn->dn_struct_rwlock); - (void) dbuf_read(db, zio, flags); + (void) dbuf_read(db, zio, dbuf_flags); rw_enter(&dn->dn_struct_rwlock, RW_READER); } dbp[i] = &db->db; @@ -282,7 +284,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); + numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); @@ -297,7 +299,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, int err; err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); + numbufsp, dbpp, DMU_READ_PREFETCH); return (err); } @@ -434,7 +436,8 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, object_size = align == 1 ? dn->dn_datablksz : (dn->dn_maxblkid + 1) << dn->dn_datablkshift; - if (trunc || (end = offset + length) > object_size) + end = offset + length; + if (trunc || end > object_size) end = object_size; if (end <= offset) return (0); @@ -442,6 +445,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, while (length) { start = end; + /* assert(offset <= start) */ err = get_next_chunk(dn, &start, offset); if (err) return (err); @@ -532,7 +536,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) + void *buf, uint32_t flags) { dnode_t *dn; dmu_buf_t **dbp; @@ -562,7 +566,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp); + TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; @@ -771,9 +775,6 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); - if (err) - break; - offset += tocpy; size -= tocpy; } @@ -783,6 +784,58 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, #endif /* !__FreeBSD__ */ #endif /* _KERNEL */ +/* + * Allocate a loaned anonymous arc buffer. + */ +arc_buf_t * +dmu_request_arcbuf(dmu_buf_t *handle, int size) +{ + dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + + return (arc_loan_buf(dn->dn_objset->os_spa, size)); +} + +/* + * Free a loaned arc buffer. + */ +void +dmu_return_arcbuf(arc_buf_t *buf) +{ + arc_return_buf(buf, FTAG); + VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); +} + +/* + * When possible directly assign passed loaned arc buffer to a dbuf. + * If this is not possible copy the contents of passed arc buf via + * dmu_write(). + */ +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *db; + uint32_t blksz = (uint32_t)arc_buf_size(buf); + uint64_t blkid; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, offset); + VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); + rw_exit(&dn->dn_struct_rwlock); + + if (offset == db->db.db_offset && blksz == db->db.db_size) { + dbuf_assign_arcbuf(db, buf, tx); + dbuf_rele(db, FTAG); + } else { + dbuf_rele(db, FTAG); + ASSERT(dn->dn_objset->os.os == dn->dn_objset); + dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz, + buf->b_data, tx); + dmu_return_arcbuf(buf); + } +} + typedef struct { dbuf_dirty_record_t *dr; dmu_sync_cb_t *done; @@ -794,14 +847,20 @@ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { blkptr_t *bp = zio->io_bp; + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; if (!BP_IS_HOLE(bp)) { - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; - dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type); ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; + } else { + /* + * dmu_sync() can compress a block of zeros to a null blkptr + * but the block size still needs to be passed through to replay + */ + BP_SET_LSIZE(bp, db->db.db_size); } } @@ -817,6 +876,8 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ + if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) + BP_ZERO(&dr->dt.dl.dr_overridden_by); dr->dt.dl.dr_override_state = DR_OVERRIDDEN; cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index c9e00d5..2678b83 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -164,10 +164,15 @@ dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; - ASSERT(size == sizeof (objset_phys_t)); + ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); + osp->os_flags = BSWAP_64(osp->os_flags); + if (size == sizeof (objset_phys_t)) { + dnode_byteswap(&osp->os_userused_dnode); + dnode_byteswap(&osp->os_groupused_dnode); + } } int @@ -210,12 +215,30 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, err = EIO; return (err); } + + /* Increase the blocksize if we are permitted. */ + if (spa_version(spa) >= SPA_VERSION_USERSPACE && + arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) { + arc_buf_t *buf = arc_buf_alloc(spa, + sizeof (objset_phys_t), &osi->os_phys_buf, + ARC_BUFC_METADATA); + bzero(buf->b_data, sizeof (objset_phys_t)); + bcopy(osi->os_phys_buf->b_data, buf->b_data, + arc_buf_size(osi->os_phys_buf)); + (void) arc_buf_remove_ref(osi->os_phys_buf, + &osi->os_phys_buf); + osi->os_phys_buf = buf; + } + osi->os_phys = osi->os_phys_buf->b_data; + osi->os_flags = osi->os_phys->os_flags; } else { - osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), + int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? + sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; + osi->os_phys_buf = arc_buf_alloc(spa, size, &osi->os_phys_buf, ARC_BUFC_METADATA); osi->os_phys = osi->os_phys_buf->b_data; - bzero(osi->os_phys, sizeof (objset_phys_t)); + bzero(osi->os_phys, size); } /* @@ -276,6 +299,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, osi->os_meta_dnode = dnode_special_open(osi, &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) { + osi->os_userused_dnode = dnode_special_open(osi, + &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT); + osi->os_groupused_dnode = dnode_special_open(osi, + &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT); + } /* * We should be the only thread trying to do this because we @@ -456,13 +485,15 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) os.os = osi; (void) dmu_objset_evict_dbufs(&os); - ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); - dnode_special_close(osi->os_meta_dnode); + if (osi->os_userused_dnode) { + dnode_special_close(osi->os_userused_dnode); + dnode_special_close(osi->os_groupused_dnode); + } zil_free(osi->os_zil); + ASSERT3P(list_head(&osi->os_dnodes), ==, NULL); + VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); mutex_destroy(&osi->os_lock); mutex_destroy(&osi->os_obj_lock); @@ -520,6 +551,10 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); osi->os_phys->os_type = type; + if (dmu_objset_userused_enabled(osi)) { + osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + osi->os_flags = osi->os_phys->os_flags; + } dsl_dataset_dirty(ds, tx); @@ -704,13 +739,33 @@ struct snaparg { char *snapname; char failed[MAXPATHLEN]; boolean_t checkperms; - list_t objsets; + nvlist_t *props; }; -struct osnode { - list_node_t node; - objset_t *os; -}; +static int +snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + struct snaparg *sn = arg2; + + /* The props have already been checked by zfs_check_userprops(). */ + + return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset, + sn->snapname, tx)); +} + +static void +snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + objset_t *os = arg1; + dsl_dataset_t *ds = os->os->os_dsl_dataset; + struct snaparg *sn = arg2; + + dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx); + + if (sn->props) + dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx); +} static int dmu_objset_snapshot_one(char *name, void *arg) @@ -747,13 +802,8 @@ dmu_objset_snapshot_one(char *name, void *arg) */ err = zil_suspend(dmu_objset_zil(os)); if (err == 0) { - struct osnode *osn; - dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, - sn->snapname, 3); - osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); - osn->os = os; - list_insert_tail(&sn->objsets, osn); + dsl_sync_task_create(sn->dstg, snapshot_check, + snapshot_sync, os, sn, 3); } else { dmu_objset_close(os); } @@ -762,11 +812,11 @@ dmu_objset_snapshot_one(char *name, void *arg) } int -dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) +dmu_objset_snapshot(char *fsname, char *snapname, + nvlist_t *props, boolean_t recursive) { dsl_sync_task_t *dst; - struct osnode *osn; - struct snaparg sn = { 0 }; + struct snaparg sn; spa_t *spa; int err; @@ -778,8 +828,7 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; - list_create(&sn.objsets, sizeof (struct osnode), - offsetof(struct osnode, node)); + sn.props = props; if (recursive) { sn.checkperms = B_TRUE; @@ -790,27 +839,19 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) err = dmu_objset_snapshot_one(fsname, &sn); } - if (err) - goto out; - - err = dsl_sync_task_group_wait(sn.dstg); + if (err == 0) + err = dsl_sync_task_group_wait(sn.dstg); for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = os->os->os_dsl_dataset; if (dst->dst_err) dsl_dataset_name(ds, sn.failed); + zil_resume(dmu_objset_zil(os)); + dmu_objset_close(os); } -out: - while (osn = list_head(&sn.objsets)) { - list_remove(&sn.objsets, osn); - zil_resume(dmu_objset_zil(osn->os)); - dmu_objset_close(osn->os); - kmem_free(osn, sizeof (struct osnode)); - } - list_destroy(&sn.objsets); - if (err) (void) strcpy(fsname, sn.failed); dsl_sync_task_group_destroy(sn.dstg); @@ -819,7 +860,7 @@ out: } static void -dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) +dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; @@ -827,14 +868,20 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* - * Initialize dn_zio outside dnode_sync() - * to accomodate meta-dnode + * Initialize dn_zio outside dnode_sync() because the + * meta-dnode needs to set it ouside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); + + if (newlist) { + (void) dnode_add_ref(dn, newlist); + list_insert_tail(newlist, dn); + } + dnode_sync(dn, tx); } } @@ -853,9 +900,12 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg) ASSERT(BP_GET_LEVEL(bp) == 0); /* - * Update rootbp fill count. + * Update rootbp fill count: it should be the number of objects + * allocated in the object set (not counting the "special" + * objects that are stored in the objset_phys_t -- the meta + * dnode and user/group accounting objects). */ - bp->blk_fill = 1; /* count the meta-dnode */ + bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; @@ -878,6 +928,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) writeprops_t wp = { 0 }; zio_t *zio; list_t *list; + list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -915,20 +966,41 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) } arc_release(os->os_phys_buf, &os->os_phys_buf); + zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* - * Sync meta-dnode - the parent IO for the sync is the root block + * Sync special dnodes - the parent IO for the sync is the root block */ os->os_meta_dnode->dn_zio = zio; dnode_sync(os->os_meta_dnode, tx); + os->os_phys->os_flags = os->os_flags; + + if (os->os_userused_dnode && + os->os_userused_dnode->dn_type != DMU_OT_NONE) { + os->os_userused_dnode->dn_zio = zio; + dnode_sync(os->os_userused_dnode, tx); + os->os_groupused_dnode->dn_zio = zio; + dnode_sync(os->os_groupused_dnode, tx); + } + txgoff = tx->tx_txg & TXG_MASK; - dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); - dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); + if (dmu_objset_userused_enabled(os)) { + newlist = &os->os_synced_dnodes; + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. + */ + list_create(newlist, sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff])); + } + + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); list = &os->os_meta_dnode->dn_dirty_records[txgoff]; while (dr = list_head(list)) { @@ -945,6 +1017,146 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) zio_nowait(zio); } +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; + +void +dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +{ + used_cbs[ost] = cb; +} + +boolean_t +dmu_objset_userused_enabled(objset_impl_t *os) +{ + return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && + used_cbs[os->os_phys->os_type] && + os->os_userused_dnode); +} + +void +dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx) +{ + dnode_t *dn; + list_t *list = &os->os_synced_dnodes; + static const char zerobuf[DN_MAX_BONUSLEN] = {0}; + + ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); + + while (dn = list_head(list)) { + dmu_object_type_t bonustype; + + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + ASSERT(dn->dn_oldphys); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || + dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED); + + /* Allocate the user/groupused objects if necessary. */ + if (os->os_userused_dnode->dn_type == DMU_OT_NONE) { + VERIFY(0 == zap_create_claim(&os->os, + DMU_USERUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + VERIFY(0 == zap_create_claim(&os->os, + DMU_GROUPUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + /* + * If the object was not previously + * accounted, pretend that it was free. + */ + if (!(dn->dn_oldphys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)) { + bzero(dn->dn_oldphys, sizeof (dnode_phys_t)); + } + + /* + * If the object was freed, use the previous bonustype. + */ + bonustype = dn->dn_phys->dn_bonustype ? + dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype; + ASSERT(dn->dn_phys->dn_type != 0 || + (bcmp(DN_BONUS(dn->dn_phys), zerobuf, + DN_MAX_BONUSLEN) == 0 && + DN_USED_BYTES(dn->dn_phys) == 0)); + ASSERT(dn->dn_oldphys->dn_type != 0 || + (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf, + DN_MAX_BONUSLEN) == 0 && + DN_USED_BYTES(dn->dn_oldphys) == 0)); + used_cbs[os->os_phys->os_type](&os->os, bonustype, + DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys), + DN_USED_BYTES(dn->dn_oldphys), + DN_USED_BYTES(dn->dn_phys), tx); + + /* + * The mutex is needed here for interlock with dnode_allocate. + */ + mutex_enter(&dn->dn_mtx); + zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t)); + dn->dn_oldphys = NULL; + mutex_exit(&dn->dn_mtx); + + list_remove(list, dn); + dnode_rele(dn, list); + } +} + +boolean_t +dmu_objset_userspace_present(objset_t *os) +{ + return (os->os->os_phys->os_flags & + OBJSET_FLAG_USERACCOUNTING_COMPLETE); +} + +int +dmu_objset_userspace_upgrade(objset_t *os) +{ + uint64_t obj; + int err = 0; + + if (dmu_objset_userspace_present(os)) + return (0); + if (!dmu_objset_userused_enabled(os->os)) + return (ENOTSUP); + if (dmu_objset_is_snapshot(os)) + return (EINVAL); + + /* + * We simply need to mark every object dirty, so that it will be + * synced out and now accounted. If this is called + * concurrently, or if we already did some work before crashing, + * that's fine, since we track each object's accounted state + * independently. + */ + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + dmu_buf_t *db; + int objerr; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + objerr = dmu_bonus_hold(os, obj, FTAG, &db); + if (objerr) + continue; + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + objerr = dmu_tx_assign(tx, TXG_WAIT); + if (objerr) { + dmu_tx_abort(tx); + continue; + } + dmu_buf_will_dirty(db, tx); + dmu_buf_rele(db, FTAG); + dmu_tx_commit(tx); + } + + os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) @@ -978,6 +1190,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os->os_phys->os_type); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, + dmu_objset_userspace_present(os)); } int diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 6effae8..ed5afb4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -180,7 +180,9 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); - if (bp == NULL && zb->zb_object == 0) { + if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { + return (0); + } else if (bp == NULL && zb->zb_object == 0) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c index ef0284d..89cbfad 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -64,6 +64,9 @@ struct traverse_data { void *td_arg; }; +static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object); + /* ARGSUSED */ static void traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) @@ -119,7 +122,7 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh) * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ - if (claim_txg == 0 && (spa_mode & FWRITE)) + if (claim_txg == 0 && spa_writeable(td->td_spa)) return; zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); @@ -189,7 +192,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; err = arc_read(NULL, td->td_spa, bp, pbuf, @@ -201,20 +204,15 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, /* recursively visitbp() blocks below this */ dnp = buf->b_data; for (i = 0; i < epb && err == 0; i++, dnp++) { - for (j = 0; j < dnp->dn_nblkptr; j++) { - SET_BOOKMARK(&czb, zb->zb_objset, - zb->zb_blkid * epb + i, - dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_blkptr[j], &czb); - if (err) - break; - } + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); + if (err) + break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; - int j; + dnode_phys_t *dnp; err = arc_read_nolock(NULL, td->td_spa, bp, arc_getbuf_func, &buf, @@ -225,14 +223,17 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, osp = buf->b_data; traverse_zil(td, &osp->os_zil_header); - for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { - SET_BOOKMARK(&czb, zb->zb_objset, 0, - osp->os_meta_dnode.dn_nlevels - 1, j); - err = traverse_visitbp(td, &osp->os_meta_dnode, buf, - (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j], - &czb); - if (err) - break; + dnp = &osp->os_meta_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0); + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_userused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_groupused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_GROUPUSED_OBJECT); } } @@ -245,6 +246,23 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, return (err); } +static int +traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object) +{ + int j, err = 0; + zbookmark_t czb; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_blkptr[j], &czb); + if (err) + break; + } + return (err); +} + /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index bfa5699..b6a5cdb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -160,6 +160,41 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) return (err); } +static void +dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db, + boolean_t freeable, dmu_buf_impl_t **history) +{ + int i = db->db_level + 1; + dnode_t *dn = db->db_dnode; + + if (i >= dn->dn_nlevels) + return; + + db = db->db_parent; + if (db == NULL) { + uint64_t lvls = dn->dn_nlevels - i; + + txh->txh_space_towrite += lvls << dn->dn_indblkshift; + return; + } + + if (db != history[i]) { + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + uint64_t space = 1ULL << dn->dn_indblkshift; + + freeable = (db->db_blkptr && (freeable || + dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth))); + if (freeable) + txh->txh_space_tooverwrite += space; + else + txh->txh_space_towrite += space; + if (db->db_blkptr) + txh->txh_space_tounref += space; + history[i] = db; + dmu_tx_count_indirects(txh, db, freeable, history); + } +} + /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) @@ -177,17 +212,26 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; + if (dn) { + dmu_buf_impl_t *last[DN_MAX_LEVELS]; + int nlvls = dn->dn_nlevels; + int delta; - /* - * For i/o error checking, read the first and last level-0 - * blocks (if they are not aligned), and all the level-1 blocks. - */ + /* + * For i/o error checking, read the first and last level-0 + * blocks (if they are not aligned), and all the level-1 blocks. + */ - if (dn) { if (dn->dn_maxblkid == 0) { - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err) - goto out; + delta = dn->dn_datablksz; + start = (off < dn->dn_datablksz) ? 0 : 1; + end = (off+len <= dn->dn_datablksz) ? 0 : 1; + if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) + goto out; + delta -= off; + } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); @@ -211,10 +255,9 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } /* level-1 blocks */ - if (dn->dn_nlevels > 1) { - start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (i = start+1; i < end; i++) { + if (nlvls > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = (start>>shft)+1; i < end>>shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; @@ -224,20 +267,70 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) err = zio_wait(zio); if (err) goto out; + delta = P2NPHASE(off, dn->dn_datablksz); } - } - /* - * If there's more than one block, the blocksize can't change, - * so we can make a more precise estimate. Alternatively, - * if the dnode's ibs is larger than max_ibs, always use that. - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on existing pools. - */ - if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { - min_ibs = max_ibs = dn->dn_indblkshift; - if (dn->dn_datablkshift != 0) + if (dn->dn_maxblkid > 0) { + /* + * The blocksize can't change, + * so we can make a more precise estimate. + */ + ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; + min_ibs = max_ibs = dn->dn_indblkshift; + } else if (dn->dn_indblkshift > max_ibs) { + /* + * This ensures that if we reduce DN_MAX_INDBLKSHIFT, + * the code will still work correctly on older pools. + */ + min_ibs = max_ibs = dn->dn_indblkshift; + } + + /* + * If this write is not off the end of the file + * we need to account for overwrites/unref. + */ + if (start <= dn->dn_maxblkid) + bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS); + while (start <= dn->dn_maxblkid) { + spa_t *spa = txh->txh_tx->tx_pool->dp_spa; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, 0, start, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db->db_blkptr && dsl_dataset_block_freeable(ds, + db->db_blkptr->blk_birth)) { + dprintf_bp(db->db_blkptr, "can free old%s", ""); + txh->txh_space_tooverwrite += dn->dn_datablksz; + txh->txh_space_tounref += dn->dn_datablksz; + dmu_tx_count_indirects(txh, db, TRUE, last); + } else { + txh->txh_space_towrite += dn->dn_datablksz; + if (db->db_blkptr) + txh->txh_space_tounref += + bp_get_dasize(spa, db->db_blkptr); + dmu_tx_count_indirects(txh, db, FALSE, last); + } + dbuf_rele(db, FTAG); + if (++start > end) { + /* + * Account for new indirects appearing + * before this IO gets assigned into a txg. + */ + bits = 64 - min_bs; + epbs = min_ibs - SPA_BLKPTRSHIFT; + for (bits -= epbs * (nlvls - 1); + bits >= 0; bits -= epbs) + txh->txh_fudge += 1ULL << max_ibs; + goto out; + } + off += delta; + if (len >= delta) + len -= delta; + delta = dn->dn_datablksz; + } } /* @@ -260,20 +353,22 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { start >>= epbs; end >>= epbs; - /* - * If we increase the number of levels of indirection, - * we'll need new blkid=0 indirect blocks. If start == 0, - * we're already accounting for that blocks; and if end == 0, - * we can't increase the number of levels beyond that. - */ - if (start != 0 && end != 0) - txh->txh_space_towrite += 1ULL << max_ibs; + ASSERT3U(end, >=, start); txh->txh_space_towrite += (end - start + 1) << max_ibs; + if (start != 0) { + /* + * We also need a new blkid=0 indirect block + * to reference any existing file data. + */ + txh->txh_space_towrite += 1ULL << max_ibs; + } } - ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); - out: + if (txh->txh_space_towrite + txh->txh_space_tooverwrite > + 2 * DMU_MAX_ACCESS) + err = EFBIG; + if (err) txh->txh_tx->tx_err = err; } @@ -290,6 +385,7 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh) dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, dn->dn_dbuf->db_blkptr->blk_birth)) { txh->txh_space_tooverwrite += space; + txh->txh_space_tounref += space; } else { txh->txh_space_towrite += space; if (dn && dn->dn_dbuf->db_blkptr) @@ -533,7 +629,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; dnode_t *dn; @@ -601,12 +697,8 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) } } - /* - * 3 blocks overwritten: target leaf, ptrtbl block, header block - * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks - */ - dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + (add ? 3 : 0)) << dn->dn_datablkshift); + err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add, + &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, @@ -614,7 +706,10 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - txh->txh_space_towrite += 3 << dn->dn_indblkshift; + if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) + txh->txh_space_towrite += 3 << dn->dn_indblkshift; + else + txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index f0b4080..f9661d6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -156,7 +156,7 @@ dnode_verify(dnode_t *dn) } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); + ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + @@ -320,6 +320,7 @@ dnode_destroy(dnode_t *dn) } ASSERT(NULL == list_head(&dn->dn_dbufs)); #endif + ASSERT(dn->dn_oldphys == NULL); mutex_enter(&os->os_lock); list_remove(&os->os_dnodes, dn); @@ -550,6 +551,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { + dn = (object == DMU_USERUSED_OBJECT) ? + os->os_userused_dnode : os->os_groupused_dnode; + if (dn == NULL) + return (ENOENT); + type = dn->dn_type; + if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) + return (ENOENT); + if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) + return (EEXIST); + DNODE_VERIFY(dn); + (void) refcount_add(&dn->dn_holds, tag); + *dnp = dn; + return (0); + } + if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); @@ -608,7 +625,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { + ((flag & DNODE_MUST_BE_FREE) && + (type != DMU_OT_NONE || dn->dn_oldphys))) { mutex_exit(&dn->dn_mtx); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); @@ -673,8 +691,10 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - if (dn->dn_object == DMU_META_DNODE_OBJECT) + if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + dsl_dataset_dirty(os->os_dsl_dataset, tx); return; + } DNODE_VERIFY(dn); @@ -1270,7 +1290,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); - hole = flags & DNODE_FIND_HOLE; + hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 1b729e3..3bf0c81 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -506,9 +506,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. - * - * NOTE: The dnode is kept in memory by being dirty. Once the - * dirty bit is cleared, it may be evicted. Beware of this! */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) @@ -517,20 +514,33 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; + static const dnode_phys_t zerodn = { 0 }; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + ASSERT(dnp->dn_type != DMU_OT_NONE || + bcmp(dnp, &zerodn, DNODE_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + if (dmu_objset_userused_enabled(dn->dn_objset) && + !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + ASSERT(dn->dn_oldphys == NULL); + dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t)); + *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */ + dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + } else { + /* Once we account for it, we should always account for it. */ + ASSERT(!(dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)); + } + mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { /* The dnode is newly allocated or reallocated */ if (dnp->dn_type == DMU_OT_NONE) { /* this is a first alloc, not a realloc */ - /* XXX shouldn't the phys already be zeroed? */ - bzero(dnp, DNODE_CORE_SIZE); dnp->dn_nlevels = 1; dnp->dn_nblkptr = dn->dn_nblkptr; } @@ -628,7 +638,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dbuf_sync_list(list, tx); - if (dn->dn_object != DMU_META_DNODE_OBJECT) { + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index 622fa5d..ac9d67f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -229,7 +229,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); } -int +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) { return (blk_birth > dsl_dataset_prev_snap_txg(ds)); @@ -525,7 +525,15 @@ dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) rw_enter(&dp->dp_config_rwlock, RW_READER); return (ENOENT); } + /* + * The dp_config_rwlock lives above the ds_lock. And + * we need to check DSL_DATASET_IS_DESTROYED() while + * holding the ds_lock, so we have to drop and reacquire + * the ds_lock here. + */ + mutex_exit(&ds->ds_lock); rw_enter(&dp->dp_config_rwlock, RW_READER); + mutex_enter(&ds->ds_lock); } mutex_exit(&ds->ds_lock); return (0); @@ -981,6 +989,27 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) (void) dmu_free_object(os, obj); } + /* + * We need to sync out all in-flight IO before we try to evict + * (the dataset evict func is trying to clear the cached entries + * for this dataset in the ARC). + */ + txg_wait_synced(dd->dd_pool, 0); + + /* + * If we managed to free all the objects in open + * context, the user space accounting should be zero. + */ + if (ds->ds_phys->ds_bp.blk_fill == 0 && + dmu_objset_userused_enabled(os->os)) { + uint64_t count; + + ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || + count == 0); + ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || + count == 0); + } + dmu_objset_close(os); if (err != ESRCH) goto out; @@ -1065,7 +1094,6 @@ dsl_dataset_get_user_ptr(dsl_dataset_t *ds) return (ds->ds_user_ptr); } - blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { @@ -1445,6 +1473,33 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) cv_destroy(&arg.cv); } +static void +remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; + + ASSERT(ds->ds_phys->ds_num_children >= 2); + err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); + /* + * The err should not be ENOENT, but a bug in a previous version + * of the code could cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a missing entry. + * If we knew that the pool was created after + * SPA_VERSION_NEXT_CLONES, we could assert that it isn't + * ENOENT. However, at least we can check that we don't have + * too many entries in the next_clones_obj even after failing to + * remove this one. + */ + if (err != ENOENT) { + VERIFY3U(err, ==, 0); + } + ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + &count)); + ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); +} + void dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { @@ -1495,8 +1550,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && ds_prev->ds_phys->ds_next_clones_obj != 0) { - VERIFY(0 == zap_remove_int(mos, - ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); + remove_from_next_clones(ds_prev, obj, tx); if (ds->ds_phys->ds_next_snap_obj != 0) { VERIFY(0 == zap_add_int(mos, ds_prev->ds_phys->ds_next_clones_obj, @@ -1852,8 +1906,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { - VERIFY3U(0, ==, zap_remove_int(mos, - next_clones_obj, dsphys->ds_next_snap_obj, tx)); + remove_from_next_clones(ds->ds_prev, + dsphys->ds_next_snap_obj, tx); VERIFY3U(0, ==, zap_add_int(mos, next_clones_obj, dsobj, tx)); } @@ -1962,6 +2016,9 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) if (ds->ds_phys->ds_next_snap_obj) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; + } else { + stat->dds_is_snapshot = B_FALSE; + stat->dds_num_clones = 0; } /* clone origin is really a dsl_dir thing... */ @@ -1973,6 +2030,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_drop_ref(ods, FTAG); + } else { + stat->dds_origin[0] = '\0'; } rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } @@ -2439,9 +2498,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* change the origin's next clone */ if (origin_ds->ds_phys->ds_next_clones_obj) { - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - origin_ds->ds_phys->ds_next_clones_obj, - origin_ds->ds_phys->ds_next_snap_obj, tx)); + remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, oldnext_obj, tx)); @@ -3039,12 +3096,8 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dataset_t *ds = arg1; uint64_t *reservationp = arg2; uint64_t new_reservation = *reservationp; - int64_t delta; uint64_t unique; - if (new_reservation > INT64_MAX) - return (EOVERFLOW); - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFRESERVATION) return (ENOTSUP); @@ -3061,15 +3114,18 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) mutex_enter(&ds->ds_lock); unique = dsl_dataset_unique(ds); - delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); mutex_exit(&ds->ds_lock); - if (delta > 0 && - delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - if (delta > 0 && ds->ds_quota > 0 && - new_reservation > ds->ds_quota) - return (ENOSPC); + if (MAX(unique, new_reservation) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, new_reservation) - + MAX(unique, ds->ds_reserved); + + if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (ds->ds_quota > 0 && + new_reservation > ds->ds_quota) + return (ENOSPC); + } return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 96b5005..2f312ae 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -226,24 +226,11 @@ dsl_dir_namelen(dsl_dir_t *dd) return (result); } -int -dsl_dir_is_private(dsl_dir_t *dd) -{ - int rv = FALSE; - - if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) - rv = TRUE; - if (dataset_name_hidden(dd->dd_myname)) - rv = TRUE; - return (rv); -} - - static int getcomponent(const char *path, char *component, const char **nextp) { char *p; - if (path == NULL) + if ((path == NULL) || (path[0] == '\0')) return (ENOENT); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); @@ -1076,10 +1063,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) uint64_t *reservationp = arg2; uint64_t new_reservation = *reservationp; uint64_t used, avail; - int64_t delta; - - if (new_reservation > INT64_MAX) - return (EOVERFLOW); /* * If we are doing the preliminary check in open context, the @@ -1090,8 +1073,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, new_reservation) - - MAX(used, dd->dd_phys->dd_reserved); mutex_exit(&dd->dd_lock); if (dd->dd_parent) { @@ -1101,11 +1082,17 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (delta > 0 && delta > avail) - return (ENOSPC); - if (delta > 0 && dd->dd_phys->dd_quota > 0 && - new_reservation > dd->dd_phys->dd_quota) - return (ENOSPC); + if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) { + uint64_t delta = MAX(used, new_reservation) - + MAX(used, dd->dd_phys->dd_reserved); + + if (delta > avail) + return (ENOSPC); + if (dd->dd_phys->dd_quota > 0 && + new_reservation > dd->dd_phys->dd_quota) + return (ENOSPC); + } + return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c index e5823c5..0f00bc96 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,14 +133,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, FTAG, &ds); + if (err == 0) { + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, dp, + &dp->dp_origin_snap); + dsl_dataset_rele(ds, FTAG); + } + dsl_dir_close(dd, dp); if (err) goto out; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - dp, &dp->dp_origin_snap); - if (err) - goto out; - dsl_dataset_rele(ds, FTAG); - dsl_dir_close(dd, dp); } /* get scrub status */ @@ -303,23 +304,51 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dp->dp_read_overhead = 0; start = gethrtime(); + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { - if (!list_link_active(&ds->ds_synced_link)) - list_insert_tail(&dp->dp_synced_datasets, ds); - else - dmu_buf_rele(ds->ds_dbuf, ds); + /* + * We must not sync any non-MOS datasets twice, because + * we may have taken a snapshot of them. However, we + * may sync newly-created datasets on pass 2. + */ + ASSERT(!list_link_active(&ds->ds_synced_link)); + list_insert_tail(&dp->dp_synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } DTRACE_PROBE(pool_sync__1setup); - err = zio_wait(zio); + write_time = gethrtime() - start; ASSERT(err == 0); DTRACE_PROBE(pool_sync__2rootzio); - while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) + for (ds = list_head(&dp->dp_synced_datasets); ds; + ds = list_next(&dp->dp_synced_datasets, ds)) + dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx); + + /* + * Sync the datasets again to push out the changes due to + * userquota updates. This must be done before we process the + * sync tasks, because that could cause a snapshot of a dataset + * whose ds_bp will be rewritten when we do this 2nd sync. + */ + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + ASSERT(list_link_active(&ds->ds_synced_link)); + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, zio, tx); + } + err = zio_wait(zio); + + while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { + /* + * No more sync tasks should have been added while we + * were syncing. + */ + ASSERT(spa_sync_pass(dp->dp_spa) == 1); dsl_sync_task_group_sync(dstg, tx); + } DTRACE_PROBE(pool_sync__3task); start = gethrtime(); @@ -574,6 +603,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); if (prev->ds_phys->ds_next_clones_obj == 0) { + dmu_buf_will_dirty(prev->ds_dbuf, tx); prev->ds_phys->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); @@ -593,8 +623,8 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); - (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, - tx, DS_FIND_CHILDREN); + VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, + tx, DS_FIND_CHILDREN)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c index 212acbb..d064932 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dmu_objset.h> #include <sys/dmu_tx.h> @@ -416,6 +414,34 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } void +dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + nvlist_t *nvl = arg2; + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + struct prop_set_arg psa; + + psa.name = nvpair_name(elem); + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + VERIFY(nvpair_value_string(elem, + (char **)&psa.buf) == 0); + psa.intsz = 1; + psa.numints = strlen(psa.buf) + 1; + } else { + uint64_t intval; + VERIFY(nvpair_value_uint64(elem, &intval) == 0); + psa.intsz = sizeof (intval); + psa.numints = 1; + psa.buf = &intval; + } + dsl_prop_set_sync(ds, &psa, cr, tx); + } +} + +void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx) { @@ -471,6 +497,43 @@ dsl_prop_set(const char *dsname, const char *propname, return (err); } +int +dsl_props_set(const char *dsname, nvlist_t *nvl) +{ + dsl_dataset_t *ds; + nvpair_t *elem = NULL; + int err; + + /* + * Do these checks before the syncfunc, since it can't fail. + */ + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + if (nvpair_type(elem) == DATA_TYPE_STRING) { + char *valstr; + VERIFY(nvpair_value_string(elem, &valstr) == 0); + if (strlen(valstr) >= ZAP_MAXVALUELEN) + return (E2BIG); + } + } + + if (err = dsl_dataset_hold(dsname, FTAG, &ds)) + return (err); + + if (dsl_dataset_is_snapshot(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + NULL, dsl_props_set_sync, ds, nvl, 2); + + dsl_dataset_rele(ds, FTAG); + return (err); +} + /* * Iterate over all properties for this dataset and return them in an nvlist. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c index 84561ab..d11f106 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scrub.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,6 +45,8 @@ typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); static scrub_cb_t dsl_pool_scrub_clean_cb; static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; +static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t objset, uint64_t object); int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ @@ -95,6 +97,9 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ESC_ZFS_RESILVER_START); dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, tx->tx_txg); + } else { + spa_event_notify(dp->dp_spa, NULL, + ESC_ZFS_SCRUB_START); } /* zero out the scrub stats in all vdev_stat_t's */ @@ -212,8 +217,9 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) */ vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); - if (dp->dp_scrub_min_txg && *completep) - spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); + if (*completep) + spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); spa_errlog_rotate(dp->dp_spa); /* @@ -402,7 +408,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh) * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ - if (claim_txg == 0 && (spa_mode & FWRITE)) + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); @@ -420,9 +426,6 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, int err; arc_buf_t *buf = NULL; - if (bp->blk_birth == 0) - return; - if (bp->blk_birth <= dp->dp_scrub_min_txg) return; @@ -482,7 +485,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; dnode_phys_t *child_dnp; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; err = arc_read(NULL, dp->dp_spa, bp, pbuf, @@ -497,20 +500,12 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, child_dnp = buf->b_data; for (i = 0; i < epb; i++, child_dnp++) { - for (j = 0; j < child_dnp->dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, - zb->zb_blkid * epb + i, - child_dnp->dn_nlevels - 1, j); - scrub_visitbp(dp, child_dnp, buf, - &child_dnp->dn_blkptr[j], &czb); - } + scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; - int j; err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, @@ -526,13 +521,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, traverse_zil(dp, &osp->os_zil_header); - for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, 0, - osp->os_meta_dnode.dn_nlevels - 1, j); - scrub_visitbp(dp, &osp->os_meta_dnode, buf, - &osp->os_meta_dnode.dn_blkptr[j], &czb); + scrub_visitdnode(dp, &osp->os_meta_dnode, + buf, zb->zb_objset, 0); + if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { + scrub_visitdnode(dp, &osp->os_userused_dnode, + buf, zb->zb_objset, 0); + scrub_visitdnode(dp, &osp->os_groupused_dnode, + buf, zb->zb_objset, 0); } } @@ -542,6 +537,21 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } static void +scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t objset, uint64_t object) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); + } + +} + +static void scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) { zbookmark_t zb; @@ -688,17 +698,34 @@ scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) ds->ds_phys->ds_next_snap_obj, tx) == 0); } if (ds->ds_phys->ds_num_children > 1) { - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + boolean_t usenext = B_FALSE; + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct. + */ + int err = zap_count(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, &count); + if (err == 0 && + count == ds->ds_phys->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + VERIFY(zap_join(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + dp->dp_scrub_queue_obj, tx) == 0); + } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); - } else { - VERIFY(zap_join(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - dp->dp_scrub_queue_obj, tx) == 0); } } @@ -751,6 +778,7 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) { + spa_t *spa = dp->dp_spa; zap_cursor_t zc; zap_attribute_t za; boolean_t complete = B_TRUE; @@ -758,8 +786,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dp->dp_scrub_func == SCRUB_FUNC_NONE) return; - /* If the spa is not fully loaded, don't bother. */ - if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) + /* + * If the pool is not loaded, or is trying to unload, leave it alone. + */ + if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa)) return; if (dp->dp_scrub_restart) { @@ -768,13 +798,13 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); } - if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { + if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { /* * We must have resumed after rebooting; reset the vdev * stats to know that we're doing a scrub (although it * will think we're just starting now). */ - vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, + vdev_scrub_stat_update(spa->spa_root_vdev, dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : POOL_SCRUB_EVERYTHING, B_FALSE); } @@ -782,7 +812,7 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dp->dp_scrub_pausing = B_FALSE; dp->dp_scrub_start_time = lbolt64; dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); - dp->dp_spa->spa_scrub_active = B_TRUE; + spa->spa_scrub_active = B_TRUE; if (dp->dp_scrub_bookmark.zb_objset == 0) { /* First do the MOS & ORIGIN */ @@ -790,8 +820,8 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dp->dp_scrub_pausing) goto out; - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(spa, NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); } else { scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); @@ -841,15 +871,13 @@ out: VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &dp->dp_spa->spa_scrub_errors, tx)); + &spa->spa_scrub_errors, tx)); /* XXX this is scrub-clean specific */ - mutex_enter(&dp->dp_spa->spa_scrub_lock); - while (dp->dp_spa->spa_scrub_inflight > 0) { - cv_wait(&dp->dp_spa->spa_scrub_io_cv, - &dp->dp_spa->spa_scrub_lock); - } - mutex_exit(&dp->dp_spa->spa_scrub_lock); + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + mutex_exit(&spa->spa_scrub_lock); } void @@ -931,13 +959,17 @@ static int dsl_pool_scrub_clean_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_t *zb) { - size_t size = BP_GET_LSIZE(bp); - int d; + size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int zio_priority; + ASSERT(bp->blk_birth > dp->dp_scrub_min_txg); + + if (bp->blk_birth >= dp->dp_scrub_max_txg) + return (0); + count_block(dp->dp_blkstats, bp); if (dp->dp_scrub_isresilver == 0) { @@ -956,7 +988,7 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_flags |= ZIO_FLAG_SPECULATIVE; - for (d = 0; d < BP_GET_NDVAS(bp); d++) { + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); @@ -974,16 +1006,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, if (DVA_GET_GANG(&bp->blk_dva[d])) { /* * Gang members may be spread across multiple - * vdevs, so the best we can do is look at the - * pool-wide DTL. + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. * XXX -- it would be better to change our - * allocation policy to ensure that this can't - * happen. + * allocation policy to ensure that all + * gang members reside on the same vdev. */ - vd = spa->spa_root_vdev; + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + bp->blk_birth, 1); } - needs_io = vdev_dtl_contains(&vd->vdev_dtl_map, - bp->blk_birth, 1); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 47f8f5f..d216154 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,18 +36,35 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* + * Minimum size which forces the dynamic allocator to change + * it's allocation strategy. Once the space map cannot satisfy + * an allocation of this size then it switches to using more + * aggressive strategy (i.e search by size rather than offset). + */ +uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space_map's free space drops below this level we dynamically + * switch to using best-fit allocations. + */ +int metaslab_df_free_pct = 30; + +/* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * -metaslab_class_create(void) +metaslab_class_create(space_map_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); mc->mc_rotor = NULL; + mc->mc_ops = ops; return (mc); } @@ -202,30 +219,14 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) } /* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== + * This is a helper function that can be used by the allocator to find + * a suitable block to allocate. This will search the specified AVL + * tree looking for a block that matches the specified criteria. */ -static void -metaslab_ff_load(space_map_t *sm) -{ - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); -} - -static void -metaslab_ff_unload(space_map_t *sm) -{ - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; -} - static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, + uint64_t align) { - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; space_seg_t *ss, ssearch; avl_index_t where; @@ -254,7 +255,37 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size) return (-1ULL); *cursor = 0; - return (metaslab_ff_alloc(sm, size)); + return (metaslab_block_picker(t, cursor, size, align)); +} + +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static void +metaslab_ff_load(space_map_t *sm) +{ + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); + sm->sm_pp_root = NULL; +} + +static void +metaslab_ff_unload(space_map_t *sm) +{ + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; +} + +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + + return (metaslab_block_picker(t, cursor, size, align)); } /* ARGSUSED */ @@ -276,9 +307,136 @@ static space_map_ops_t metaslab_ff_ops = { metaslab_ff_unload, metaslab_ff_alloc, metaslab_ff_claim, - metaslab_ff_free + metaslab_ff_free, + NULL /* maxsize */ +}; + +/* + * Dynamic block allocator - + * Uses the first fit allocation scheme until space get low and then + * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold + * and metaslab_df_free_pct to determine when to switch the allocation scheme. + */ + +uint64_t +metaslab_df_maxsize(space_map_t *sm) +{ + avl_tree_t *t = sm->sm_pp_root; + space_seg_t *ss; + + if (t == NULL || (ss = avl_last(t)) == NULL) + return (0ULL); + + return (ss->ss_end - ss->ss_start); +} + +static int +metaslab_df_seg_compare(const void *x1, const void *x2) +{ + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + uint64_t ss_size1 = s1->ss_end - s1->ss_start; + uint64_t ss_size2 = s2->ss_end - s2->ss_start; + + if (ss_size1 < ss_size2) + return (-1); + if (ss_size1 > ss_size2) + return (1); + + if (s1->ss_start < s2->ss_start) + return (-1); + if (s1->ss_start > s2->ss_start) + return (1); + + return (0); +} + +static void +metaslab_df_load(space_map_t *sm) +{ + space_seg_t *ss; + + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); + + sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(sm->sm_pp_root, metaslab_df_seg_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + avl_add(sm->sm_pp_root, ss); +} + +static void +metaslab_df_unload(space_map_t *sm) +{ + void *cookie = NULL; + + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; + + while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { + /* tear down the tree */ + } + + avl_destroy(sm->sm_pp_root); + kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); + sm->sm_pp_root = NULL; +} + +static uint64_t +metaslab_df_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + uint64_t max_size = metaslab_df_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if (max_size < metaslab_df_alloc_threshold || + free_pct < metaslab_df_free_pct) { + t = sm->sm_pp_root; + *cursor = 0; + } + + return (metaslab_block_picker(t, cursor, size, 1ULL)); +} + +/* ARGSUSED */ +static void +metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +/* ARGSUSED */ +static void +metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size) +{ + /* No need to update cursor */ +} + +static space_map_ops_t metaslab_df_ops = { + metaslab_df_load, + metaslab_df_unload, + metaslab_df_alloc, + metaslab_df_claim, + metaslab_df_free, + metaslab_df_maxsize }; +space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; + /* * ========================================================================== * Metaslabs @@ -414,20 +572,28 @@ metaslab_weight(metaslab_t *msp) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) { space_map_t *sm = &msp->ms_map; + space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, &metaslab_ff_ops, - SM_FREE, &msp->ms_smo, + int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo, msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); if (error) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } + + /* + * If we were able to load the map then make sure + * that this map is still able to satisfy our request. + */ + if (msp->ms_weight < size) + return (ENOSPC); + metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -636,11 +802,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, int i; activation_weight = METASLAB_WEIGHT_PRIMARY; - for (i = 0; i < d; i++) - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) + for (i = 0; i < d; i++) { + if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; + break; + } + } for (;;) { + boolean_t was_active; + mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { if (msp->ms_weight < size) { @@ -648,6 +819,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, return (-1ULL); } + was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -673,7 +845,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * another thread may have changed the weight while we * were blocked on the metaslab lock. */ - if (msp->ms_weight < size) { + if (msp->ms_weight < size || (was_active && + !(msp->ms_weight & METASLAB_ACTIVE_MASK) && + activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); continue; } @@ -686,7 +860,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if (metaslab_activate(msp, activation_weight) != 0) { + if (metaslab_activate(msp, activation_weight, size) != 0) { mutex_exit(&msp->ms_lock); continue; } @@ -720,6 +894,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, vdev_t *vd; int dshift = 3; int all_zero; + int zio_lock = B_FALSE; + boolean_t allocatable; uint64_t offset = -1ULL; uint64_t asize; uint64_t distance; @@ -778,11 +954,20 @@ top: all_zero = B_TRUE; do { vd = mg->mg_vd; + /* * Don't allocate from faulted devices. */ - if (!vdev_allocatable(vd)) + if (zio_lock) { + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + allocatable = vdev_allocatable(vd); + spa_config_exit(spa, SCL_ZIO, FTAG); + } else { + allocatable = vdev_allocatable(vd); + } + if (!allocatable) goto next; + /* * Avoid writing single-copy data to a failing vdev */ @@ -858,6 +1043,12 @@ next: goto top; } + if (!allocatable && !zio_lock) { + dshift = 3; + zio_lock = B_TRUE; + goto top; + } + bzero(&dva[d], sizeof (dva_t)); return (ENOSPC); @@ -938,7 +1129,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); @@ -946,7 +1137,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) space_map_claim(&msp->ms_map, offset, size); - if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */ + if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index b8925e3..cb6f413 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -70,16 +70,44 @@ TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, "Check hostid on import?"); -int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { - /* ISSUE INTR */ - { 1, 1 }, /* ZIO_TYPE_NULL */ - { 1, 8 }, /* ZIO_TYPE_READ */ - { 8, 1 }, /* ZIO_TYPE_WRITE */ - { 1, 1 }, /* ZIO_TYPE_FREE */ - { 1, 1 }, /* ZIO_TYPE_CLAIM */ - { 1, 1 }, /* ZIO_TYPE_IOCTL */ +enum zti_modes { + zti_mode_fixed, /* value is # of threads (min 1) */ + zti_mode_online_percent, /* value is % of online CPUs */ + zti_mode_tune, /* fill from zio_taskq_tune_* */ + zti_nmodes }; +#define ZTI_THREAD_FIX(n) { zti_mode_fixed, (n) } +#define ZTI_THREAD_PCT(n) { zti_mode_online_percent, (n) } +#define ZTI_THREAD_TUNE { zti_mode_tune, 0 } + +#define ZTI_THREAD_ONE ZTI_THREAD_FIX(1) + +typedef struct zio_taskq_info { + const char *zti_name; + struct { + enum zti_modes zti_mode; + uint_t zti_value; + } zti_nthreads[ZIO_TASKQ_TYPES]; +} zio_taskq_info_t; + +static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { + "issue", "intr" +}; + +const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = { + /* ISSUE INTR */ + { "spa_zio_null", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, + { "spa_zio_read", { ZTI_THREAD_FIX(8), ZTI_THREAD_TUNE } }, + { "spa_zio_write", { ZTI_THREAD_TUNE, ZTI_THREAD_FIX(8) } }, + { "spa_zio_free", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, + { "spa_zio_claim", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, + { "spa_zio_ioctl", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, +}; + +enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent; +uint_t zio_taskq_tune_value = 80; /* #threads = 80% of # online CPUs */ + static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); @@ -117,38 +145,38 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { - uint64_t size = spa_get_space(spa); - uint64_t used = spa_get_alloc(spa); + uint64_t size; + uint64_t used; uint64_t cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa->spa_props_lock)); - /* - * readonly properties - */ - spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); - - cap = (size == 0) ? 0 : (used * 100 / size); - spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + if (spa->spa_root_vdev != NULL) { + size = spa_get_space(spa); + used = spa_get_alloc(spa); + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, + size - used, src); + + cap = (size == 0) ? 0 : (used * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa->spa_root_vdev->vdev_state, src); + + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, - spa->spa_root_vdev->vdev_state, src); - - /* - * settable properties that are not stored in the pool property object. - */ - version = spa_version(spa); - if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) - src = ZPROP_SRC_DEFAULT; - else - src = ZPROP_SRC_LOCAL; - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, @@ -313,6 +341,11 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) break; case ZPOOL_PROP_BOOTFS: + /* + * If the pool version is less than SPA_VERSION_BOOTFS, + * or the pool is still being created (version == 0), + * the bootfs property cannot be set. + */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = ENOTSUP; break; @@ -419,16 +452,60 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) return (error); } +void +spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) +{ + char *cachefile; + spa_config_dirent_t *dp; + + if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), + &cachefile) != 0) + return; + + dp = kmem_alloc(sizeof (spa_config_dirent_t), + KM_SLEEP); + + if (cachefile[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(cachefile, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(cachefile); + + list_insert_head(&spa->spa_config_list, dp); + if (need_sync) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; + nvpair_t *elem; + boolean_t need_sync = B_FALSE; + zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); + elem = NULL; + while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { + if ((prop = zpool_name_to_prop( + nvpair_name(elem))) == ZPROP_INVAL) + return (EINVAL); + + if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) + continue; + + need_sync = B_TRUE; + break; + } + + if (need_sync) + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, + spa, nvp, 3)); + else + return (0); } /* @@ -493,21 +570,57 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) * Activate an uninitialized pool. */ static void -spa_activate(spa_t *spa) +spa_activate(spa_t *spa, int mode) { - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_mode = mode; - spa->spa_normal_class = metaslab_class_create(); - spa->spa_log_class = metaslab_class_create(); + spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops); + spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops); for (int t = 0; t < ZIO_TYPES; t++) { + const zio_taskq_info_t *ztip = &zio_taskqs[t]; for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - spa->spa_zio_taskq[t][q] = taskq_create("spa_zio", - zio_taskq_threads[t][q], maxclsyspri, 50, - INT_MAX, TASKQ_PREPOPULATE); + enum zti_modes mode = ztip->zti_nthreads[q].zti_mode; + uint_t value = ztip->zti_nthreads[q].zti_value; + char name[32]; + + (void) snprintf(name, sizeof (name), + "%s_%s", ztip->zti_name, zio_taskq_types[q]); + + if (mode == zti_mode_tune) { + mode = zio_taskq_tune_mode; + value = zio_taskq_tune_value; + if (mode == zti_mode_tune) + mode = zti_mode_online_percent; + } + + switch (mode) { + case zti_mode_fixed: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + + spa->spa_zio_taskq[t][q] = taskq_create(name, + value, maxclsyspri, 50, INT_MAX, + TASKQ_PREPOPULATE); + break; + + case zti_mode_online_percent: + spa->spa_zio_taskq[t][q] = taskq_create(name, + value, maxclsyspri, 50, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); + break; + + case zti_mode_tune: + default: + panic("unrecognized mode for " + "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " + "in spa_activate()", + t, q, mode, value); + break; + } } } @@ -536,7 +649,7 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); - + ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); txg_list_destroy(&spa->spa_vdev_txg_list); @@ -642,15 +755,10 @@ spa_unload(spa_t *spa) /* * Wait for any outstanding async I/O to complete. */ - mutex_enter(&spa->spa_async_root_lock); - while (spa->spa_async_root_count != 0) - cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); - mutex_exit(&spa->spa_async_root_lock); - - /* - * Drop and purge level 2 cache - */ - spa_l2cache_drop(spa); + if (spa->spa_async_zio_root != NULL) { + (void) zio_wait(spa->spa_async_zio_root); + spa->spa_async_zio_root = NULL; + } /* * Close the dsl pool. @@ -660,6 +768,13 @@ spa_unload(spa_t *spa) spa->spa_dsl_pool = NULL; } + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Drop and purge level 2 cache + */ + spa_l2cache_drop(spa); + /* * Close all vdevs. */ @@ -694,6 +809,8 @@ spa_unload(spa_t *spa) spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; + + spa_config_exit(spa, SCL_ALL, FTAG); } /* @@ -783,6 +900,7 @@ spa_load_spares(spa_t *spa) } vd->vdev_top = vd; + vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; @@ -905,12 +1023,9 @@ spa_load_l2cache(spa_t *spa) vd = oldvdevs[i]; if (vd != NULL) { - if ((spa_mode & FWRITE) && - spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && - l2arc_vdev_present(vd)) { + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - } (void) vdev_close(vd); spa_l2cache_remove(vd); } @@ -959,7 +1074,8 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); + error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, + DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); @@ -1026,8 +1142,16 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) uint64_t pool_guid; uint64_t version; uint64_t autoreplace = 0; + int orig_mode = spa->spa_mode; char *ereport = FM_EREPORT_ZFS_POOL; + /* + * If this is an untrusted config, access the pool in read-only mode. + * This prevents things like resilvering recently removed devices. + */ + if (!mosconfig) + spa->spa_mode = FREAD; + ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa->spa_load_state = state; @@ -1057,6 +1181,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa->spa_load_guid = pool_guid; /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + + /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. @@ -1082,13 +1212,17 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) goto out; /* - * Validate the labels for all leaf vdevs. We need to grab the config - * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. + * We need to validate the vdev labels against the configuration that + * we have in hand, which is dependent on the setting of mosconfig. If + * mosconfig is true then we're validating the vdev labels based on + * that config. Otherwise, we're validating against the cached config + * (zpool.cache) that was read when we loaded the zfs module, and then + * later we will recursively call spa_load() and validate against + * the vdev config. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) goto out; @@ -1192,7 +1326,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa_config_set(spa, newconfig); spa_unload(spa); spa_deactivate(spa); - spa_activate(spa); + spa_activate(spa, orig_mode); return (spa_load(spa, newconfig, state, B_TRUE)); } @@ -1384,10 +1518,11 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) goto out; } - if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { + if (spa_writeable(spa)) { dmu_tx_t *tx; int need_update = B_FALSE; - int c; + + ASSERT(state != SPA_LOAD_TRYIMPORT); /* * Claim log blocks that haven't been committed yet. @@ -1410,12 +1545,15 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. + * + * If spa_load_verbatim is true, trust the current + * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT) + state == SPA_LOAD_IMPORT || spa->spa_load_verbatim) need_update = B_TRUE; - for (c = 0; c < rvd->vdev_children; c++) + for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; @@ -1483,7 +1621,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - spa_activate(spa); + spa_activate(spa, spa_mode_global); error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); @@ -1586,6 +1724,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config) uint_t vsc; uint64_t pool; + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + if (spa->spa_spares.sav_count == 0) return; @@ -1633,11 +1773,11 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) vdev_stat_t *vs; uint_t vsc; + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + if (spa->spa_l2cache.sav_count == 0) return; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, @@ -1671,8 +1811,6 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) vdev_get_stats(vd, vs); } } - - spa_config_exit(spa, SCL_CONFIG, FTAG); } int @@ -1684,16 +1822,27 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) *config = NULL; error = spa_open_common(name, &spa, FTAG, config); - if (spa && *config != NULL) { - VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)) == 0); + if (spa != NULL) { + /* + * This still leaves a window of inconsistency where the spares + * or l2cache devices could change and the config would be + * self-inconsistent. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - if (spa_suspended(spa)) + if (*config != NULL) { VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); + ZPOOL_CONFIG_ERRCOUNT, + spa_get_errlog_size(spa)) == 0); - spa_add_spares(spa, *config); - spa_add_l2cache(spa, *config); + if (spa_suspended(spa)) + VERIFY(nvlist_add_uint64(*config, + ZPOOL_CONFIG_SUSPENDED, + spa->spa_failmode) == 0); + + spa_add_spares(spa, *config); + spa_add_l2cache(spa, *config); + } } /* @@ -1715,8 +1864,10 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) } } - if (spa != NULL) + if (spa != NULL) { + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); + } return (error); } @@ -1887,11 +2038,9 @@ spa_l2cache_drop(spa_t *spa) vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); - if ((spa_mode & FWRITE) && - spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && - l2arc_vdev_present(vd)) { + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - } if (vd->vdev_isl2cache) spa_l2cache_remove(vd); vdev_clear_stats(vd); @@ -1932,12 +2081,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, altroot); - spa_activate(spa); + spa_activate(spa, spa_mode_global); spa->spa_uberblock.ub_txg = txg - 1; if (props && (error = spa_prop_validate(spa, props))) { - spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); @@ -1952,6 +2100,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_ubsync = spa->spa_uberblock; /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + + /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -2069,8 +2223,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); - if (props) + if (props != NULL) { + spa_configfile_set(spa, props, B_FALSE); spa_sync_props(spa, props, CRED(), tx); + } dmu_tx_commit(tx); @@ -2095,148 +2251,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, return (0); } -/* - * Import the given pool into the system. We set up the necessary spa_t and - * then call spa_load() to do the dirty work. - */ -static int -spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, - boolean_t isroot, boolean_t allowfaulted) -{ - spa_t *spa; - char *altroot = NULL; - int error, loaderr; - nvlist_t *nvroot; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; - - /* - * If a pool with this name exists, return failure. - */ - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } - - /* - * Create and initialize the spa structure. - */ - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); - spa_activate(spa); - - if (allowfaulted) - spa->spa_import_faulted = B_TRUE; - spa->spa_is_root = isroot; - - /* - * Pass off the heavy lifting to spa_load(). - * Pass TRUE for mosconfig (unless this is a root pool) because - * the user-supplied config is actually the one to trust when - * doing an import. - */ - loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - /* - * Toss any existing sparelist, as it doesn't have any validity anymore, - * and conflicts with spa_has_spare(). - */ - if (!isroot && spa->spa_spares.sav_config) { - nvlist_free(spa->spa_spares.sav_config); - spa->spa_spares.sav_config = NULL; - spa_load_spares(spa); - } - if (!isroot && spa->spa_l2cache.sav_config) { - nvlist_free(spa->spa_l2cache.sav_config); - spa->spa_l2cache.sav_config = NULL; - spa_load_l2cache(spa); - } - - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_L2CACHE); - spa_config_exit(spa, SCL_ALL, FTAG); - - if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { - if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { - /* - * If we failed to load the pool, but 'allowfaulted' is - * set, then manually set the config as if the config - * passed in was specified in the cache file. - */ - error = 0; - spa->spa_import_faulted = B_FALSE; - if (spa->spa_config == NULL) - spa->spa_config = spa_config_generate(spa, - NULL, -1ULL, B_TRUE); - spa_unload(spa); - spa_deactivate(spa); - spa_config_sync(spa, B_FALSE, B_TRUE); - } else { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); - } - mutex_exit(&spa_namespace_lock); - return (error); - } - - /* - * Override any spares and level 2 cache devices as specified by - * the user, as these may have correct device names/devids, etc. - */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - if (spa->spa_spares.sav_config) - VERIFY(nvlist_remove(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_spares.sav_sync = B_TRUE; - } - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - if (spa->spa_l2cache.sav_config) - VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_l2cache.sav_sync = B_TRUE; - } - - if (spa_mode & FWRITE) { - /* - * Update the config cache to include the newly-imported pool. - */ - spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); - } - - spa->spa_import_faulted = B_FALSE; - mutex_exit(&spa_namespace_lock); - - return (0); -} - -#if defined(sun) +#ifdef sun #ifdef _KERNEL /* * Build a "root" vdev for a top level vdev read in from a rootpool @@ -2372,11 +2387,11 @@ spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) char *cdevid, *cpath; uint64_t tmptxg; + cpath = NULL; + cdevid = NULL; if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, - &cpath) != 0) - return (EINVAL); - if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, - &cdevid) != 0) + &cpath) != 0 && nvlist_lookup_string(child[c], + ZPOOL_CONFIG_DEVID, &cdevid) != 0) return (EINVAL); if ((spa_check_rootconf(cpath, cdevid, NULL, &tmptxg) == 0) && (tmptxg > txg)) { @@ -2414,6 +2429,7 @@ spa_import_rootpool(char *devpath, char *devid) nvlist_t *conf = NULL; char *pname; int error; + spa_t *spa; /* * Get the vdev pathname and configuation from the most @@ -2429,18 +2445,24 @@ spa_import_rootpool(char *devpath, char *devid) VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); - /* - * We specify 'allowfaulted' for this to be treated like spa_open() - * instead of spa_import(). This prevents us from marking vdevs as - * persistently unavailable, and generates FMA ereports as if it were a - * pool open, not import. - */ - error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); - if (error == EEXIST) - error = 0; + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pname)) != NULL) { + /* + * Remove the existing root pool from the namespace so that we + * can replace it with the correct config we just read in. + */ + spa_remove(spa); + } + + spa = spa_add(pname, NULL); + spa->spa_is_root = B_TRUE; + spa->spa_load_verbatim = B_TRUE; + + VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0); + mutex_exit(&spa_namespace_lock); nvlist_free(conf); - return (error); + return (0); msg_out: cmn_err(CE_NOTE, "\n" @@ -2453,23 +2475,170 @@ msg_out: return (error); } #endif -#endif +#endif /* sun */ /* - * Import a non-root pool into the system. + * Take a pool and insert it into the namespace as if it had been loaded at + * boot. */ int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) { - return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); + spa_t *spa; + char *altroot = NULL; + + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, altroot); + + spa->spa_load_verbatim = B_TRUE; + + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_config_sync(spa, B_FALSE, B_TRUE); + + mutex_exit(&spa_namespace_lock); + + return (0); } +/* + * Import a non-root pool into the system. + */ int -spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) +spa_import(const char *pool, nvlist_t *config, nvlist_t *props) { - return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); -} + spa_t *spa; + char *altroot = NULL; + int error; + nvlist_t *nvroot; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + + /* + * If a pool with this name exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pool)) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } + + /* + * Create and initialize the spa structure. + */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, altroot); + spa_activate(spa, spa_mode_global); + + /* + * Don't start async tasks until we know everything is healthy. + */ + spa_async_suspend(spa); + + /* + * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig + * because the user-supplied config is actually the one to trust when + * doing an import. + */ + error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + /* + * Toss any existing sparelist, as it doesn't have any validity + * anymore, and conflicts with spa_has_spare(). + */ + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + spa_load_spares(spa); + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); + } + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_SPARE); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_L2CACHE); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + if (error != 0 || (props && spa_writeable(spa) && + (error = spa_prop_set(spa, props)))) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + spa_async_resume(spa); + + /* + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + if (spa_writeable(spa)) { + /* + * Update the config cache to include the newly-imported pool. + */ + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + } + + mutex_exit(&spa_namespace_lock); + + return (0); +} /* * This (illegal) pool name is used when temporarily importing a spa_t in order @@ -2497,7 +2666,7 @@ spa_tryimport(nvlist_t *tryconfig) */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, NULL); - spa_activate(spa); + spa_activate(spa, FREAD); /* * Pass off the heavy lifting to spa_load(). @@ -2553,8 +2722,10 @@ spa_tryimport(nvlist_t *tryconfig) /* * Add the list of hot spares and level 2 cache devices. */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); + spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); @@ -2583,7 +2754,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, if (oldconfig) *oldconfig = NULL; - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return (EROFS); mutex_enter(&spa_namespace_lock); @@ -2718,7 +2889,7 @@ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg; - int c, error; + int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; @@ -2757,7 +2928,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) /* * Transfer each new top-level vdev from vd to rvd. */ - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = rvd->vdev_children; @@ -2965,13 +3136,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ open_txg = txg + TXG_CONCURRENT_STATES - 1; - mutex_enter(&newvd->vdev_dtl_lock); - space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, - open_txg - TXG_INITIAL + 1); - mutex_exit(&newvd->vdev_dtl_lock); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, open_txg - TXG_INITIAL + 1); - if (newvd->vdev_isspare) + if (newvd->vdev_isspare) { spa_spare_activate(newvd); + spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); + } + oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; @@ -3012,10 +3184,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * is a replacing vdev. */ int -spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) +spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; - int c, t, error; + int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; @@ -3035,6 +3207,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) pvd = vd->vdev_parent; /* + * If the parent/child relationship is not as expected, don't do it. + * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing + * vdev that's replacing B with C. The user's intent in replacing + * is to go from M(A,B) to M(A,C). If the user decides to cancel + * the replace by detaching C, the expected behavior is to end up + * M(A,B). But suppose that right after deciding to detach C, + * the replacement of B completes. We would have M(A,C), and then + * ask to detach C, which would leave us with just A -- not what + * the user wanted. To prevent this, we make sure that the + * parent/child relationship hasn't changed -- in this example, + * that C's parent is still the replacing vdev R. + */ + if (pvd->vdev_guid != pguid && pguid != 0) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + /* * If replace_done is specified, only remove this device if it's * the first child of a replacing vdev. For the 'spare' vdev, either * disk can be removed. @@ -3060,36 +3248,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* - * If there's only one replica, you can't detach it. + * If this device has the only valid copy of some data, + * we cannot safely detach it. */ - if (pvd->vdev_children <= 1) + if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - /* - * If all siblings have non-empty DTLs, this device may have the only - * valid copy of the data, which means we cannot safely detach it. - * - * XXX -- as in the vdev_offline() case, we really want a more - * precise DTL check. - */ - for (c = 0; c < pvd->vdev_children; c++) { - uint64_t dirty; - - cvd = pvd->vdev_child[c]; - if (cvd == vd) - continue; - if (vdev_is_dead(cvd)) - continue; - mutex_enter(&cvd->vdev_dtl_lock); - dirty = cvd->vdev_dtl_map.sm_space | - cvd->vdev_dtl_scrub.sm_space; - mutex_exit(&cvd->vdev_dtl_lock); - if (!dirty) - break; - } - - if (c == pvd->vdev_children) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then @@ -3115,7 +3280,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0) + vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) unspare = B_TRUE; /* @@ -3141,14 +3306,18 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * If we need to remove the remaining child from the list of hot spares, - * do it now, marking the vdev as no longer a spare in the process. We - * must do this before vdev_remove_parent(), because that can change the - * GUID if it creates a new toplevel GUID. + * do it now, marking the vdev as no longer a spare in the process. + * We must do this before vdev_remove_parent(), because that can + * change the GUID if it creates a new toplevel GUID. For a similar + * reason, we must remove the spare now, in the same txg as the detach; + * otherwise someone could attach a new sibling, change the GUID, and + * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); } /* @@ -3186,7 +3355,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ - for (t = 0; t < TXG_SIZE; t++) + for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); @@ -3201,11 +3370,14 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * list of every other pool. */ if (unspare) { + spa_t *myspa = spa; spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa->spa_state != POOL_STATE_ACTIVE) continue; + if (spa == myspa) + continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); @@ -3269,10 +3441,12 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) vdev_t *vd; nvlist_t **spares, **l2cache, *nv; uint_t nspares, nl2cache; - uint64_t txg; + uint64_t txg = 0; int error = 0; + boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - txg = spa_vdev_enter(spa); + if (!locked) + txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3315,7 +3489,10 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) error = ENOENT; } - return (spa_vdev_exit(spa, NULL, txg, error)); + if (!locked) + return (spa_vdev_exit(spa, NULL, txg, error)); + + return (error); } /* @@ -3341,13 +3518,9 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; - mutex_enter(&newvd->vdev_dtl_lock); - if (newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { - mutex_exit(&newvd->vdev_dtl_lock); + if (vdev_dtl_empty(newvd, DTL_MISSING) && + !vdev_dtl_required(oldvd)) return (oldvd); - } - mutex_exit(&newvd->vdev_dtl_lock); } /* @@ -3357,15 +3530,12 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) newvd = vd->vdev_child[0]; oldvd = vd->vdev_child[1]; - mutex_enter(&newvd->vdev_dtl_lock); if (newvd->vdev_unspare && - newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { + vdev_dtl_empty(newvd, DTL_MISSING) && + !vdev_dtl_required(oldvd)) { newvd->vdev_unspare = 0; - mutex_exit(&newvd->vdev_dtl_lock); return (oldvd); } - mutex_exit(&newvd->vdev_dtl_lock); } return (NULL); @@ -3374,92 +3544,84 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) static void spa_vdev_resilver_done(spa_t *spa) { - vdev_t *vd; - vdev_t *pvd; - uint64_t guid; - uint64_t pguid = 0; + vdev_t *vd, *pvd, *ppvd; + uint64_t guid, sguid, pguid, ppguid; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { + pvd = vd->vdev_parent; + ppvd = pvd->vdev_parent; guid = vd->vdev_guid; + pguid = pvd->vdev_guid; + ppguid = ppvd->vdev_guid; + sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ - pvd = vd->vdev_parent; - if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && - pvd->vdev_id == 0) { + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(pvd->vdev_parent->vdev_children == 2); - pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; + ASSERT(ppvd->vdev_children == 2); + sguid = ppvd->vdev_child[1]->vdev_guid; } - spa_config_exit(spa, SCL_CONFIG, FTAG); - if (spa_vdev_detach(spa, guid, B_TRUE) != 0) + spa_config_exit(spa, SCL_ALL, FTAG); + if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; - if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) + if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } - spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); } /* - * Update the stored path for this vdev. Dirty the vdev configuration, relying - * on spa_vdev_enter/exit() to synchronize the labels and cache. + * Update the stored path or FRU for this vdev. Dirty the vdev configuration, + * relying on spa_vdev_enter/exit() to synchronize the labels and cache. */ int -spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, + boolean_t ispath) { vdev_t *vd; uint64_t txg; txg = spa_vdev_enter(spa); - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { - /* - * Determine if this is a reference to a hot spare device. If - * it is, update the path manually as there is no associated - * vdev_t that can be synced to disk. - */ - nvlist_t **spares; - uint_t i, nspares; - - if (spa->spa_spares.sav_config != NULL) { - VERIFY(nvlist_lookup_nvlist_array( - spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0); - for (i = 0; i < nspares; i++) { - uint64_t theguid; - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) { - VERIFY(nvlist_add_string(spares[i], - ZPOOL_CONFIG_PATH, newpath) == 0); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - return (spa_vdev_exit(spa, NULL, txg, - 0)); - } - } - } - + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - } if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(newpath); + if (ispath) { + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(value); + } else { + if (vd->vdev_fru != NULL) + spa_strfree(vd->vdev_fru); + vd->vdev_fru = spa_strdup(value); + } vdev_config_dirty(vd->vdev_top); return (spa_vdev_exit(spa, NULL, txg, 0)); } +int +spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +{ + return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); +} + +int +spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) +{ + return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); +} + /* * ========================================================================== * SPA Scrubbing @@ -3510,7 +3672,17 @@ spa_async_remove(spa_t *spa, vdev_t *vd) if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = 0; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); - vdev_clear(spa, vd); + + /* + * We want to clear the stats, but we don't want to do a full + * vdev_clear() as that will cause us to throw away + * degraded/faulted state as well as attempt to reopen the + * device, all of which is a waste. + */ + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + vdev_state_dirty(vd->vdev_top); } @@ -3789,7 +3961,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) zpool_prop_t prop; const char *propname; zprop_type_t proptype; - spa_config_dirent_t *dp; mutex_enter(&spa->spa_props_lock); @@ -3822,23 +3993,8 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) case ZPOOL_PROP_CACHEFILE: /* - * 'cachefile' is a non-persistent property, but note - * an async request that the config cache needs to be - * udpated. + * 'cachefile' is also a non-persisitent property. */ - VERIFY(nvpair_value_string(elem, &strval) == 0); - - dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); - - if (strval[0] == '\0') - dp->scd_path = spa_strdup(spa_config_path); - else if (strcmp(strval, "none") == 0) - dp->scd_path = NULL; - else - dp->scd_path = spa_strdup(strval); - - list_insert_head(&spa->spa_config_list, dp); - spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); break; default: /* @@ -3939,9 +4095,22 @@ spa_sync(spa_t *spa, uint64_t txg) * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { - vdev_state_clean(vd); - vdev_config_dirty(vd); + while (list_head(&spa->spa_state_dirty_list) != NULL) { + /* + * We need the write lock here because, for aux vdevs, + * calling vdev_config_dirty() modifies sav_config. + * This is ugly and will become unnecessary when we + * eliminate the aux vdev wart by integrating all vdevs + * into the root vdev tree. + */ + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); @@ -4175,7 +4344,7 @@ spa_evict_all(void) } vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; @@ -4183,12 +4352,18 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); - if (l2cache) { + if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } + + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } } return (NULL); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index 51770fc..34050ef 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -212,6 +212,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (rootdir == NULL || !(spa_mode_global & FWRITE)) + return; + /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing @@ -386,23 +389,12 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) } /* - * For a pool that's not currently a booting rootpool, update all disk labels, - * generate a fresh config based on the current in-core state, and sync the - * global config cache. - */ -void -spa_config_update(spa_t *spa, int what) -{ - spa_config_update_common(spa, what, FALSE); -} - -/* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void -spa_config_update_common(spa_t *spa, int what, boolean_t isroot) +spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; @@ -440,9 +432,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot) /* * Update the global config cache to reflect the new mosconfig. */ - if (!isroot) + if (!spa->spa_is_root) spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot); + spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c index e5c395f..e1ae491 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Routines to manage the on-disk persistent error log. * @@ -61,8 +59,8 @@ * lowercase hexidecimal numbers that don't overflow. */ #ifdef _KERNEL -static uint64_t -_strtonum(char *str, char **nptr) +uint64_t +_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; @@ -82,7 +80,8 @@ _strtonum(char *str, char **nptr) str++; } - *nptr = str; + if (nptr) + *nptr = (char *)str; return (val); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c index de520d3..b403ccb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/zap.h> @@ -127,12 +125,12 @@ spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf)) != 0) + buf, DMU_READ_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread)) != 0) + buf + firstread, DMU_READ_PREFETCH)) != 0) return (err); } @@ -381,10 +379,11 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (0); } - err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf); + err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, + DMU_READ_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len); + leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 5735d31..89e0301 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -230,7 +230,7 @@ static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; -int spa_mode; +int spa_mode_global; #ifdef ZFS_DEBUG /* Everything except dprintf is on by default in debug builds */ @@ -429,7 +429,6 @@ spa_add(const char *name, const char *altroot) spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); @@ -438,7 +437,6 @@ spa_add(const char *name, const char *altroot) mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); @@ -512,12 +510,10 @@ spa_remove(spa_t *spa) spa_config_lock_destroy(spa); cv_destroy(&spa->spa_async_cv); - cv_destroy(&spa->spa_async_root_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_async_root_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_errlist_lock); @@ -884,8 +880,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { - ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); + ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -916,6 +914,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) spa_config_exit(spa, SCL_STATE_ALL, spa); + /* + * If anything changed, wait for it to sync. This ensures that, + * from the system administrator's perspective, zpool(1M) commands + * are synchronous. This is important for things like zpool offline: + * when the command completes, you expect no further I/O from ZFS. + */ + if (vd != NULL) + txg_wait_synced(spa->spa_dsl_pool, 0); + return (error); } @@ -1118,6 +1125,37 @@ zfs_panic_recover(const char *fmt, ...) } /* + * This is a stripped-down version of strtoull, suitable only for converting + * lowercase hexidecimal numbers that don't overflow. + */ +uint64_t +zfs_strtonum(const char *str, char **nptr) +{ + uint64_t val = 0; + char c; + int digit; + + while ((c = *str) != '\0') { + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (c >= 'a' && c <= 'f') + digit = 10 + c - 'a'; + else + break; + + val *= 16; + val += digit; + + str++; + } + + if (nptr) + *nptr = (char *)str; + + return (val); +} + +/* * ========================================================================== * Accessor functions * ========================================================================== @@ -1355,7 +1393,7 @@ spa_init(int mode) avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); - spa_mode = mode; + spa_mode_global = mode; refcount_sysinit(); unique_init(); @@ -1412,3 +1450,15 @@ spa_is_root(spa_t *spa) { return (spa->spa_is_root); } + +boolean_t +spa_writeable(spa_t *spa) +{ + return (!!(spa->spa_mode & FWRITE)); +} + +int +spa_mode(spa_t *spa) +{ + return (spa->spa_mode); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index 0f247c0..75b55d5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -116,12 +116,23 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) if (merge_before && merge_after) { avl_remove(&sm->sm_root, ss_before); + if (sm->sm_pp_root) { + avl_remove(sm->sm_pp_root, ss_before); + avl_remove(sm->sm_pp_root, ss_after); + } ss_after->ss_start = ss_before->ss_start; kmem_free(ss_before, sizeof (*ss_before)); + ss = ss_after; } else if (merge_before) { ss_before->ss_end = end; + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss_before); + ss = ss_before; } else if (merge_after) { ss_after->ss_start = start; + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss_after); + ss = ss_after; } else { ss = kmem_alloc(sizeof (*ss), KM_SLEEP); ss->ss_start = start; @@ -129,6 +140,9 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) avl_insert(&sm->sm_root, ss, where); } + if (sm->sm_pp_root) + avl_add(sm->sm_pp_root, ss); + sm->sm_space += size; } @@ -163,12 +177,17 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) left_over = (ss->ss_start != start); right_over = (ss->ss_end != end); + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss); + if (left_over && right_over) { newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); newseg->ss_start = end; newseg->ss_end = ss->ss_end; ss->ss_end = start; avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); + if (sm->sm_pp_root) + avl_add(sm->sm_pp_root, newseg); } else if (left_over) { ss->ss_end = start; } else if (right_over) { @@ -176,12 +195,16 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) } else { avl_remove(&sm->sm_root, ss); kmem_free(ss, sizeof (*ss)); + ss = NULL; } + if (sm->sm_pp_root && ss != NULL) + avl_add(sm->sm_pp_root, ss); + sm->sm_space -= size; } -int +boolean_t space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) { avl_index_t where; @@ -221,59 +244,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) { space_seg_t *ss; - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); -} - -void -space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - avl_index_t where; - space_seg_t *ss, search; - uint64_t end = start + size; - uint64_t rm_start, rm_end; - ASSERT(MUTEX_HELD(sm->sm_lock)); - search.ss_start = start; - search.ss_end = start; - - for (;;) { - ss = avl_find(t, &search, &where); - - if (ss == NULL) - ss = avl_nearest(t, where, AVL_AFTER); - - if (ss == NULL || ss->ss_start >= end) - break; - - rm_start = MAX(ss->ss_start, start); - rm_end = MIN(ss->ss_end, end); - - space_map_remove(sm, rm_start, rm_end - rm_start); - } -} - -/* - * Replace smd with the union of smd and sms. - */ -void -space_map_union(space_map_t *smd, space_map_t *sms) -{ - avl_tree_t *t = &sms->sm_root; - space_seg_t *ss; - - ASSERT(MUTEX_HELD(smd->sm_lock)); - - /* - * For each source segment, remove any intersections with the - * destination, then add the source segment to the destination. - */ - for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { - space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start); - space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start); - } + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); } /* @@ -337,7 +311,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, smo->smo_object, offset, size); mutex_exit(sm->sm_lock); - error = dmu_read(os, smo->smo_object, offset, size, entry_map); + error = dmu_read(os, smo->smo_object, offset, size, entry_map, + DMU_READ_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; @@ -391,6 +366,15 @@ space_map_unload(space_map_t *sm) } uint64_t +space_map_maxsize(space_map_t *sm) +{ + if (sm->sm_loaded && sm->sm_ops != NULL) + return (sm->sm_ops->smop_max(sm)); + else + return (-1ULL); +} + +uint64_t space_map_alloc(space_map_t *sm, uint64_t size) { uint64_t start; @@ -505,3 +489,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) smo->smo_objsize = 0; smo->smo_alloc = 0; } + +/* + * Space map reference trees. + * + * A space map is a collection of integers. Every integer is either + * in the map, or it's not. A space map reference tree generalizes + * the idea: it allows its members to have arbitrary reference counts, + * as opposed to the implicit reference count of 0 or 1 in a space map. + * This representation comes in handy when computing the union or + * intersection of multiple space maps. For example, the union of + * N space maps is the subset of the reference tree with refcnt >= 1. + * The intersection of N space maps is the subset with refcnt >= N. + * + * [It's very much like a Fourier transform. Unions and intersections + * are hard to perform in the 'space map domain', so we convert the maps + * into the 'reference count domain', where it's trivial, then invert.] + * + * vdev_dtl_reassess() uses computations of this form to determine + * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev + * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev + * has an outage wherever refcnt >= vdev_children. + */ +static int +space_map_ref_compare(const void *x1, const void *x2) +{ + const space_ref_t *sr1 = x1; + const space_ref_t *sr2 = x2; + + if (sr1->sr_offset < sr2->sr_offset) + return (-1); + if (sr1->sr_offset > sr2->sr_offset) + return (1); + + if (sr1 < sr2) + return (-1); + if (sr1 > sr2) + return (1); + + return (0); +} + +void +space_map_ref_create(avl_tree_t *t) +{ + avl_create(t, space_map_ref_compare, + sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); +} + +void +space_map_ref_destroy(avl_tree_t *t) +{ + space_ref_t *sr; + void *cookie = NULL; + + while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(sr, sizeof (*sr)); + + avl_destroy(t); +} + +static void +space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) +{ + space_ref_t *sr; + + sr = kmem_alloc(sizeof (*sr), KM_SLEEP); + sr->sr_offset = offset; + sr->sr_refcnt = refcnt; + + avl_add(t, sr); +} + +void +space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt) +{ + space_map_ref_add_node(t, start, refcnt); + space_map_ref_add_node(t, end, -refcnt); +} + +/* + * Convert (or add) a space map into a reference tree. + */ +void +space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt) +{ + space_seg_t *ss; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt); +} + +/* + * Convert a reference tree into a space map. The space map will contain + * all members of the reference tree for which refcnt >= minref. + */ +void +space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref) +{ + uint64_t start = -1ULL; + int64_t refcnt = 0; + space_ref_t *sr; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + space_map_vacate(sm, NULL, NULL); + + for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { + refcnt += sr->sr_refcnt; + if (refcnt >= minref) { + if (start == -1ULL) { + start = sr->sr_offset; + } + } else { + if (start != -1ULL) { + uint64_t end = sr->sr_offset; + ASSERT(start <= end); + if (end > start) + space_map_add(sm, start, end - start); + start = -1ULL; + } + } + } + ASSERT(refcnt == 0); + ASSERT(start == -1ULL); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index 0a39d19..f52851d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -85,6 +85,8 @@ void *arc_data_buf_alloc(uint64_t space); void arc_data_buf_free(void *buf, uint64_t space); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); +arc_buf_t *arc_loan_buf(spa_t *spa, int size); +void arc_return_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index b27d89f..7e2754d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -262,6 +262,7 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_clear(dmu_buf_impl_t *db); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 7befe96..08c30c8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -61,6 +61,7 @@ struct zbookmark; struct spa; struct nvlist; struct objset_impl; +struct arc_buf; struct file; typedef struct objset objset_t; @@ -116,6 +117,8 @@ typedef enum dmu_object_type { DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_USERGROUP_USED, /* ZAP */ + DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -158,6 +161,9 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DMU_MAX_ACCESS (10<<20) /* 10MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ +#define DMU_USERUSED_OBJECT (-1ULL) +#define DMU_GROUPUSED_OBJECT (-2ULL) + /* * Public routines to create, destroy, open, and close objsets. */ @@ -173,7 +179,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type, int dmu_objset_destroy(const char *name); int dmu_snapshots_destroy(char *fsname, char *snapname); int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, + boolean_t recursive); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); int dmu_objset_find(char *name, int func(char *, void *), void *arg, @@ -400,6 +407,11 @@ void *dmu_buf_get_user(dmu_buf_t *db); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* + * Tells if the given dbuf is freeable. + */ +boolean_t dmu_buf_freeable(dmu_buf_t *); + +/* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has @@ -424,7 +436,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); @@ -447,8 +459,10 @@ int dmu_free_object(objset_t *os, uint64_t object); * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ +#define DMU_READ_PREFETCH 0 /* prefetch */ +#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf); + void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); @@ -456,6 +470,10 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); +struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); +void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, + dmu_tx_t *tx); extern int zfs_prefetch_disable; @@ -562,6 +580,12 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); + +typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype, + void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused, + dmu_tx_t *tx); +extern void dmu_objset_register_type(dmu_objset_type_t ost, + objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h index 1d65727..a8022d2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h @@ -42,12 +42,20 @@ struct dsl_dataset; struct dmu_tx; struct objset_impl; +#define OBJSET_PHYS_SIZE 2048 +#define OBJSET_OLD_PHYS_SIZE 1024 + +#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) + typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; uint64_t os_type; - char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - - sizeof (uint64_t)]; + uint64_t os_flags; + char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - + sizeof (zil_header_t) - sizeof (uint64_t)*2]; + dnode_phys_t os_userused_dnode; + dnode_phys_t os_groupused_dnode; } objset_phys_t; struct objset { @@ -62,6 +70,8 @@ typedef struct objset_impl { arc_buf_t *os_phys_buf; objset_phys_t *os_phys; dnode_t *os_meta_dnode; + dnode_t *os_userused_dnode; + dnode_t *os_groupused_dnode; zilog_t *os_zil; objset_t os; uint8_t os_checksum; /* can change, under dsl_dir's locks */ @@ -74,6 +84,8 @@ typedef struct objset_impl { struct dmu_tx *os_synctx; /* XXX sketchy */ blkptr_t *os_rootbp; zil_header_t os_zil_header; + list_t os_synced_dnodes; + uint64_t os_flags; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; @@ -92,6 +104,7 @@ typedef struct objset_impl { } objset_impl_t; #define DMU_META_DNODE_OBJECT 0 +#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ @@ -106,7 +119,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_objset_destroy(const char *name); int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, + boolean_t recursive); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, @@ -127,6 +141,10 @@ objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_impl_t **osip); void dmu_objset_evict(struct dsl_dataset *ds, void *arg); +void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx); +boolean_t dmu_objset_userused_enabled(objset_impl_t *os); +int dmu_objset_userspace_upgrade(objset_t *os); +boolean_t dmu_objset_userspace_present(objset_t *os); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index be9e569..48e4da8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -98,7 +98,8 @@ enum dnode_dirtycontext { }; /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ -#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ @@ -131,10 +132,7 @@ typedef struct dnode { */ krwlock_t dn_struct_rwlock; - /* - * Our link on dataset's dd_dnodes list. - * Protected by dd_accounting_mtx. - */ + /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ list_node_t dn_link; /* immutable: */ @@ -191,6 +189,9 @@ typedef struct dnode { /* parent IO for current sync write */ zio_t *dn_zio; + /* used in syncing context */ + dnode_phys_t *dn_oldphys; + /* holds prefetch structure */ struct zfetch dn_zfetch; } dnode_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h index 8665aec..a1c2896 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -195,7 +195,7 @@ void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_tx_t *tx); -int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h index a29e44e..b064c92 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DSL_DELEG_H #define _SYS_DSL_DELEG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dsl_pool.h> #include <sys/zfs_context.h> @@ -51,6 +49,10 @@ extern "C" { #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" /* * Note: the names of properties that are marked delegatable are also diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h index 86b9636..56d0638 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -107,7 +107,6 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **); void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_namelen(dsl_dir_t *dd); -int dsl_dir_is_private(dsl_dir_t *dd); uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx); dsl_checkfunc_t dsl_dir_destroy_check; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h index ef1b904..d8da295 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h index d66caa8..26018a4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h @@ -19,18 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DSL_PROP_H #define _SYS_DSL_PROP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dmu.h> #include <sys/dsl_pool.h> #include <sys/zfs_context.h> +#include <sys/dsl_synctask.h> #ifdef __cplusplus extern "C" { @@ -66,8 +65,10 @@ int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint); +dsl_syncfunc_t dsl_props_set_sync; int dsl_prop_set(const char *ddname, const char *propname, int intsz, int numints, const void *buf); +int dsl_props_set(const char *dsname, nvlist_t *nvl); void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index 1c9d89e..5d3e11c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,6 +39,8 @@ extern "C" { typedef struct metaslab_class metaslab_class_t; typedef struct metaslab_group metaslab_group_t; +extern space_map_ops_t *zfs_metaslab_ops; + extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, uint64_t start, uint64_t size, uint64_t txg); extern void metaslab_fini(metaslab_t *msp); @@ -55,7 +57,7 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -extern metaslab_class_t *metaslab_class_create(void); +extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops); extern void metaslab_class_destroy(metaslab_class_t *mc); extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 5980cbc..d67dea7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/metaslab.h> #include <sys/space_map.h> #include <sys/vdev.h> @@ -41,6 +39,7 @@ extern "C" { struct metaslab_class { metaslab_group_t *mc_rotor; uint64_t mc_allocated; + space_map_ops_t *mc_ops; }; struct metaslab_group { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index 1cfa7ec..f54a5dc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -208,8 +208,8 @@ typedef struct blkptr { #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ - (BP_IS_HOLE(bp) ? 0 : \ - BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) + #define BP_SET_LSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) @@ -329,7 +329,7 @@ extern int spa_check_rootconf(char *devpath, char *devid, extern boolean_t spa_rootdev_validate(nvlist_t *nv); extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); -extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); +extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, @@ -352,9 +352,11 @@ extern void spa_inject_delref(spa_t *spa); extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); -extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, + int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); +extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); @@ -476,6 +478,10 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); +extern boolean_t spa_writeable(spa_t *spa); +extern int spa_mode(spa_t *spa); +extern uint64_t zfs_strtonum(const char *str, char **nptr); +#define strtonum(str, nptr) zfs_strtonum((str), (nptr)) /* history logging */ typedef enum history_log_type { @@ -529,6 +535,7 @@ extern void spa_boot_init(); extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); +extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); @@ -546,7 +553,7 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif -extern int spa_mode; /* mode, e.g. FREAD | FWRITE */ +extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 8aeb414..f3124b1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -105,6 +105,7 @@ struct spa { int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ + boolean_t spa_load_verbatim; /* load the given config? */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ @@ -141,9 +142,6 @@ struct spa { int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ - kmutex_t spa_async_root_lock; /* protects async root count */ - uint64_t spa_async_root_count; /* number of async root zios */ - kcondvar_t spa_async_root_cv; /* notify when count == 0 */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ boolean_t spa_last_open_failed; /* true if last open faled */ @@ -163,13 +161,14 @@ struct spa { uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ + zio_t *spa_async_zio_root; /* root of all async I/O */ zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ - boolean_t spa_import_faulted; /* allow faulted vdevs */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ + int spa_mode; /* FREAD | FWRITE */ spa_log_state_t spa_log_state; /* log state */ /* * spa_refcnt & spa_config_lock must be the last elements diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h index db9daef..a682bbd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPACE_MAP_H #define _SYS_SPACE_MAP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/avl.h> #include <sys/dmu.h> @@ -48,16 +46,24 @@ typedef struct space_map { uint8_t sm_loading; /* map loading? */ kcondvar_t sm_load_cv; /* map load completion */ space_map_ops_t *sm_ops; /* space map block picker ops vector */ + avl_tree_t *sm_pp_root; /* picker-private AVL tree */ void *sm_ppd; /* picker-private data */ kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; typedef struct space_seg { avl_node_t ss_node; /* AVL node */ + avl_node_t ss_pp_node; /* AVL picker-private node */ uint64_t ss_start; /* starting offset of this segment */ uint64_t ss_end; /* ending offset (non-inclusive) */ } space_seg_t; +typedef struct space_ref { + avl_node_t sr_node; /* AVL node */ + uint64_t sr_offset; /* offset (start or end) */ + int64_t sr_refcnt; /* associated reference count */ +} space_ref_t; + typedef struct space_map_obj { uint64_t smo_object; /* on-disk space map object */ uint64_t smo_objsize; /* size of the object */ @@ -70,6 +76,7 @@ struct space_map_ops { uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); + uint64_t (*smop_max)(space_map_t *sm); }; /* @@ -133,13 +140,12 @@ extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, extern void space_map_destroy(space_map_t *sm); extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); -extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern boolean_t space_map_contains(space_map_t *sm, + uint64_t start, uint64_t size); extern void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); extern void space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); -extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); -extern void space_map_union(space_map_t *smd, space_map_t *sms); extern void space_map_load_wait(space_map_t *sm); extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, @@ -149,12 +155,22 @@ extern void space_map_unload(space_map_t *sm); extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); +extern uint64_t space_map_maxsize(space_map_t *sm); extern void space_map_sync(space_map_t *sm, uint8_t maptype, space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); extern void space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); +extern void space_map_ref_create(avl_tree_t *t); +extern void space_map_ref_destroy(avl_tree_t *t); +extern void space_map_ref_add_seg(avl_tree_t *t, + uint64_t start, uint64_t end, int64_t refcnt); +extern void space_map_ref_add_map(avl_tree_t *t, + space_map_t *sm, int64_t refcnt); +extern void space_map_ref_generate_map(avl_tree_t *t, + space_map_t *sm, int64_t minref); + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h index 55a0dd5..b49df8a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_UBERBLOCK_IMPL_H #define _SYS_UBERBLOCK_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/uberblock.h> #ifdef __cplusplus @@ -35,6 +33,11 @@ extern "C" { #endif /* + * For zdb use and debugging purposes only + */ +extern uint64_t ub_max_txg; + +/* * The uberblock version is incremented whenever an incompatible on-disk * format change is made to the SPA, DMU, or ZAP. * diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index c070d6f..b8313a9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -36,6 +36,14 @@ extern "C" { #endif +typedef enum vdev_dtl_type { + DTL_MISSING, /* 0% replication: no copies of the data */ + DTL_PARTIAL, /* less than 100% replication: some copies missing */ + DTL_SCRUB, /* unable to fully repair during scrub/resilver */ + DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ + DTL_TYPES +} vdev_dtl_type_t; + extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); @@ -50,10 +58,14 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); -extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); -extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); +extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); +extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 7e24ede..1406d15 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -123,8 +123,7 @@ struct vdev { vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ - space_map_t vdev_dtl_map; /* dirty time log in-core state */ - space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ + space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ /* @@ -149,7 +148,7 @@ struct vdev { * Leaf vdev state. */ uint64_t vdev_psize; /* physical device capacity */ - space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ + space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ @@ -160,6 +159,7 @@ struct vdev { char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ + char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ hrtime_t vdev_last_try; /* last reopen time */ @@ -189,8 +189,9 @@ struct vdev { kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; -#define VDEV_SKIP_SIZE (8 << 10) -#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_PAD_SIZE (8 << 10) +/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) @@ -202,26 +203,14 @@ struct vdev { offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) -/* ZFS boot block */ -#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL -#define VDEV_BOOT_VERSION 1 /* version number */ - -typedef struct vdev_boot_header { - uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ - uint64_t vb_version; /* VDEV_BOOT_VERSION */ - uint64_t vb_offset; /* start offset (bytes) */ - uint64_t vb_size; /* size (bytes) */ - char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; -} vdev_boot_header_t; - typedef struct vdev_phys { char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; zio_block_tail_t vp_zbt; } vdev_phys_t; typedef struct vdev_label { - char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ - vdev_boot_header_t vl_boot_header; /* 8K */ + char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ + char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h index f88cc06..ea3a0f6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h @@ -186,6 +186,9 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); +int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, + int add, uint64_t *towrite, uint64_t *tooverwrite); + /* * Create an attribute with the given name and value. * diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h index 0dc02ab..c86bb16 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_IMPL_H #define _SYS_ZAP_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zap.h> #include <sys/zfs_context.h> #include <sys/avl.h> @@ -195,6 +193,8 @@ int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); +int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, + uint64_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h index f87823c..3607e1f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -113,8 +113,6 @@ typedef struct zfs_acl_phys { uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ } zfs_acl_phys_t; - - typedef struct acl_ops { uint32_t (*ace_mask_get) (void *acep); /* get access mask */ void (*ace_mask_set) (void *acep, @@ -160,12 +158,21 @@ typedef struct zfs_acl { zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ list_t z_acl; /* chunks of ACE data */ acl_ops_t z_ops; /* ACL operations */ - boolean_t z_has_fuids; /* FUIDs present in ACL? */ } zfs_acl_t; #define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) +struct zfs_fuid_info; + +typedef struct zfs_acl_ids { + uint64_t z_fuid; /* file owner fuid */ + uint64_t z_fgid; /* file group owner fuid */ + uint64_t z_mode; /* mode to set on create */ + zfs_acl_t *z_aclp; /* ACL to create with file */ + struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ +} zfs_acl_ids_t; + /* * Property values for acl_mode and acl_inherit. * @@ -182,11 +189,12 @@ typedef struct zfs_acl { struct znode; struct zfsvfs; -struct zfs_fuid_info; #ifdef _KERNEL -void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, - dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); +int zfs_acl_ids_create(struct znode *, int, vattr_t *, + cred_t *, vsecattr_t *, zfs_acl_ids_t *); +void zfs_acl_ids_free(zfs_acl_ids_t *); +boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *); int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); void zfs_acl_rele(void *); @@ -201,9 +209,9 @@ int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); void zfs_acl_free(zfs_acl_t *); -int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); -int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, - struct zfs_fuid_info **, dmu_tx_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, + struct zfs_fuid_info **, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h index 0dd8f4f..952bb24 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -134,4 +134,6 @@ extern struct mtx zfs_debug_mtx; } \ } while (0) +#define sys_shutdown rebooting + #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h index 905e8dd..25348d6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_CTLDIR_H #define _ZFS_CTLDIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/vnode.h> #include <sys/zfs_vfsops.h> #include <sys/zfs_znode.h> @@ -63,6 +61,7 @@ int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); #define ZFSCTL_INO_ROOT 0x1 #define ZFSCTL_INO_SNAPDIR 0x2 +#define ZFSCTL_INO_SHARES 0x3 #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h index 0dbb3c5..bd2c938 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h @@ -49,7 +49,6 @@ extern "C" { /* mknode flags */ #define IS_ROOT_NODE 0x01 /* create a root node */ #define IS_XATTR 0x02 /* create an extended attribute node */ -#define IS_REPLAY 0x04 /* we are replaying intent log */ extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, int, int *, pathname_t *); @@ -60,7 +59,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, pathname_t *); extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); + uint_t, znode_t **, int, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h index 8d73b41..c035707 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_FUID_H #define _SYS_FS_ZFS_FUID_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #ifdef _KERNEL #include <sys/kidmap.h> @@ -51,11 +49,11 @@ typedef enum { * Estimate space needed for one more fuid table entry. * for now assume its current size + 1K */ -#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) +#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) -#define FUID_INDEX(x) (x >> 32) -#define FUID_RID(x) (x & 0xffffffff) -#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) +#define FUID_INDEX(x) ((x) >> 32) +#define FUID_RID(x) ((x) & 0xffffffff) +#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid)) /* * FUIDs cause problems for the intent log * we need to replay the creation of the FUID, @@ -104,17 +102,23 @@ struct znode; extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); extern void zfs_fuid_destroy(zfsvfs_t *); extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, - dmu_tx_t *, cred_t *, zfs_fuid_info_t **); + cred_t *, zfs_fuid_info_t **); extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, - dmu_tx_t *, zfs_fuid_info_t **); -extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, - uid_t *gid); + zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, + uid_t *uid, uid_t *gid); extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); -extern void zfs_fuid_info_free(); +extern void zfs_fuid_info_free(zfs_fuid_info_t *); extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *); +extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, + char **retdomain, boolean_t addok); +extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); +extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); #endif char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *); uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h index 05a21c8..15a4a76 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -169,6 +169,13 @@ typedef struct zfs_cmd { zinject_record_t zc_inject_record; } zfs_cmd_t; +typedef struct zfs_useracct { + char zu_domain[256]; + uid_t zu_rid; + uint32_t zu_pad; + uint64_t zu_space; +} zfs_useracct_t; + #define ZVOL_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h index 8d53c02..163a800 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/list.h> #include <sys/vfs.h> #include <sys/zil.h> @@ -47,13 +45,13 @@ struct zfsvfs { uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ - uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ @@ -72,8 +70,13 @@ struct zfsvfs { boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ - kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ + boolean_t z_replay; /* set during ZIL replay */ + kmutex_t z_online_recv_lock; /* held while recv in progress */ uint64_t z_version; /* ZPL version */ + uint64_t z_shares_dir; /* hidden shares dir */ + kmutex_t z_lock; + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; @@ -131,6 +134,17 @@ extern int zfs_super_owner; extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); +extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valuep); +extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); +extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota); +extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs, + boolean_t isgroup, uint64_t fuid); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h index f91bc90..47072fb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -100,6 +100,7 @@ extern "C" { #define ZFS_ROOT_OBJ "ROOT" #define ZPL_VERSION_STR "VERSION" #define ZFS_FUID_TABLES "FUID" +#define ZFS_SHARES_DIR "SHARES" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -186,7 +187,6 @@ typedef struct znode { vnode_t *z_vnode; uint64_t z_id; /* object ID for this znode */ kmutex_t z_lock; /* znode modification lock */ - krwlock_t z_map_lock; /* page map lock */ krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ @@ -338,7 +338,6 @@ extern void zfs_remove_op_tables(); extern int zfs_create_op_tables(); extern dev_t zfs_cmpldev(uint64_t); extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); -extern int zfs_set_version(const char *name, uint64_t newvers); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern void zfs_znode_dmu_fini(znode_t *); @@ -367,6 +366,7 @@ extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, #endif extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h index 5212aaf..e992f6a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h @@ -139,7 +139,8 @@ typedef enum zil_create { #define TX_MKDIR_ACL 17 /* mkdir with ACL */ #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ -#define TX_MAX_TYPE 20 /* Max transaction type */ +#define TX_WRITE2 20 /* dmu_sync EALREADY write */ +#define TX_MAX_TYPE 21 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -341,7 +342,6 @@ typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); -typedef void zil_replay_cleaner_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, @@ -356,9 +356,8 @@ extern void zil_free(zilog_t *zilog); extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); extern void zil_close(zilog_t *zilog); -extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner); +extern void zil_replay(objset_t *os, void *arg, + zil_replay_func_t *replay_func[TX_MAX_TYPE]); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); @@ -378,6 +377,7 @@ extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); extern void zil_add_block(zilog_t *zilog, blkptr_t *bp); +extern void zil_get_replay_data(zilog_t *zilog, lr_write_t *lr); extern int zil_disable; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h index 0fc800b..3f25829 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIL_IMPL_H #define _SYS_ZIL_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zil.h> #include <sys/dmu_objset.h> @@ -74,13 +72,14 @@ struct zilog { uint64_t zl_commit_seq; /* committed upto this number */ uint64_t zl_lr_seq; /* log record sequence number */ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ - uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ + uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ + uint64_t zl_replaying_seq; /* current replay seq number */ uint32_t zl_suspend; /* log suspend count */ kcondvar_t zl_cv_writer; /* log writer thread completion */ kcondvar_t zl_cv_suspend; /* log suspend completion */ uint8_t zl_suspending; /* log is currently suspending */ uint8_t zl_keep_first; /* keep first log block in destroy */ - uint8_t zl_stop_replay; /* don't replay any further */ + uint8_t zl_replay; /* replaying records while set */ uint8_t zl_stop_sync; /* for debugging */ uint8_t zl_writer; /* boolean: write setup in progress */ uint8_t zl_log_error; /* boolean: log write error */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 8c8efcd..d7c0feb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -132,12 +132,15 @@ enum zio_compress { #define ZIO_FLAG_IO_RETRY 0x00400 #define ZIO_FLAG_IO_REWRITE 0x00800 -#define ZIO_FLAG_PROBE 0x01000 +#define ZIO_FLAG_SELF_HEAL 0x01000 #define ZIO_FLAG_RESILVER 0x02000 #define ZIO_FLAG_SCRUB 0x04000 #define ZIO_FLAG_SCRUB_THREAD 0x08000 -#define ZIO_FLAG_GANG_CHILD 0x10000 +#define ZIO_FLAG_PROBE 0x10000 +#define ZIO_FLAG_GANG_CHILD 0x20000 +#define ZIO_FLAG_RAW 0x40000 +#define ZIO_FLAG_GODFATHER 0x80000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -146,6 +149,7 @@ enum zio_compress { ZIO_FLAG_DONT_RETRY | \ ZIO_FLAG_DONT_CACHE | \ ZIO_FLAG_DONT_AGGREGATE | \ + ZIO_FLAG_SELF_HEAL | \ ZIO_FLAG_RESILVER | \ ZIO_FLAG_SCRUB | \ ZIO_FLAG_SCRUB_THREAD) @@ -156,6 +160,14 @@ enum zio_compress { ZIO_FLAG_IO_RETRY | \ ZIO_FLAG_PROBE) +#define ZIO_FLAG_AGG_INHERIT \ + (ZIO_FLAG_DONT_AGGREGATE | \ + ZIO_FLAG_IO_REPAIR | \ + ZIO_FLAG_SELF_HEAL | \ + ZIO_FLAG_RESILVER | \ + ZIO_FLAG_SCRUB | \ + ZIO_FLAG_SCRUB_THREAD) + #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 @@ -254,6 +266,13 @@ typedef int zio_pipe_stage_t(zio_t *zio); #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +typedef struct zio_link { + zio_t *zl_parent; + zio_t *zl_child; + list_node_t zl_parent_node; + list_node_t zl_child_node; +} zio_link_t; + struct zio { /* Core information about this I/O */ zbookmark_t io_bookmark; @@ -263,15 +282,14 @@ struct zio { int io_cmd; uint8_t io_priority; uint8_t io_reexecute; - uint8_t io_async_root; + uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t io_bp_copy; - zio_t *io_parent; - zio_t *io_child; - zio_t *io_sibling_prev; - zio_t *io_sibling_next; + list_t io_parent_list; + list_t io_child_list; + zio_link_t *io_walk_link; zio_t *io_logical; zio_transform_t *io_transform_stack; @@ -294,8 +312,6 @@ struct zio { avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; - zio_t *io_delegate_list; - zio_t *io_delegate_next; /* Internal pipeline state */ int io_flags; @@ -308,6 +324,7 @@ struct zio { int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t *io_stall; + zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; @@ -323,7 +340,7 @@ struct zio { #endif }; -extern zio_t *zio_null(zio_t *pio, spa_t *spa, +extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, int flags); extern zio_t *zio_root(spa_t *spa, @@ -371,6 +388,11 @@ extern void zio_nowait(zio_t *zio); extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); +extern zio_t *zio_walk_parents(zio_t *cio); +extern zio_t *zio_walk_children(zio_t *pio); +extern zio_t *zio_unique_parent(zio_t *cio); +extern void zio_add_child(zio_t *pio, zio_t *cio); + extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); @@ -397,7 +419,7 @@ extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); extern void zio_suspend(spa_t *spa, zio_t *zio); -extern void zio_resume(spa_t *spa); +extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index daab409..befc8b3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -327,8 +327,10 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); - space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, + &vd->vdev_dtl_lock); + } txg_list_create(&vd->vdev_ms_list, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, @@ -444,6 +446,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) + vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value @@ -457,9 +461,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ - if (!spa->spa_import_faulted) - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); /* * Get the alignment requirement. @@ -485,7 +488,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl.smo_object); + &vd->vdev_dtl_smo.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } @@ -569,6 +572,8 @@ vdev_free(vdev_t *vd) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); + if (vd->vdev_fru) + spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); @@ -577,12 +582,14 @@ vdev_free(vdev_t *vd) txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_unload(&vd->vdev_dtl[t]); + space_map_destroy(&vd->vdev_dtl[t]); + } mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -720,14 +727,18 @@ vdev_remove_parent(vdev_t *cvd) vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); + /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ - if (mvd->vdev_top == mvd) - cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid; + if (mvd->vdev_top == mvd) { + uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_guid += guid_delta; + cvd->vdev_guid_sum += guid_delta; + } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); @@ -779,7 +790,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) { uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, - m * sizeof (uint64_t), sizeof (uint64_t), &object); + m * sizeof (uint64_t), sizeof (uint64_t), &object, + DMU_READ_PREFETCH); if (error) return (error); if (object != 0) { @@ -819,22 +831,22 @@ typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; - zio_t *vps_root; - vdev_t *vps_vd; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { + spa_t *spa = zio->io_spa; + vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; - vdev_t *vd = vps->vps_vd; + + ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_readable = 1; - if (zio->io_error == 0 && (spa_mode & FWRITE)) { - zio_nowait(zio_write_phys(vps->vps_root, vd, + if (zio->io_error == 0 && spa_writeable(spa)) { + zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_data, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); @@ -842,26 +854,34 @@ vdev_probe_done(zio_t *zio) zio_buf_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_writeable = 1; zio_buf_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { - ASSERT(zio->io_vd == NULL); - ASSERT(zio == vps->vps_root); + zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && - (vdev_writeable(vd) || !(spa_mode & FWRITE))) { + (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - zio->io_spa, vd, NULL, 0, 0); + spa, vd, NULL, 0, 0); zio->io_error = ENXIO; } + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + while ((pio = zio_walk_parents(zio)) != NULL) + if (!vdev_accessible(vd, pio)) + pio->io_error = ENXIO; + kmem_free(vps, sizeof (*vps)); } } @@ -872,53 +892,90 @@ vdev_probe_done(zio_t *zio) * but the first (which we leave alone in case it contains a VTOC). */ zio_t * -vdev_probe(vdev_t *vd, zio_t *pio) +vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; - vdev_probe_stats_t *vps; - zio_t *zio; + vdev_probe_stats_t *vps = NULL; + zio_t *pio; - vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + ASSERT(vd->vdev_ops->vdev_op_leaf); - vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; + /* + * Don't probe the probe. + */ + if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) + return (NULL); - if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { - /* - * vdev_cant_read and vdev_cant_write can only transition - * from TRUE to FALSE when we have the SCL_ZIO lock as writer; - * otherwise they can only transition from FALSE to TRUE. - * This ensures that any zio looking at these values can - * assume that failures persist for the life of the I/O. - * That's important because when a device has intermittent - * connectivity problems, we want to ensure that they're - * ascribed to the device (ENXIO) and not the zio (EIO). - * - * Since we hold SCL_ZIO as writer here, clear both values - * so the probe can reevaluate from first principles. - */ - vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. All zios that want to probe + * this vdev will become parents of the probe io. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_DONT_RETRY; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only + * transition from TRUE to FALSE when we have the + * SCL_ZIO lock as writer; otherwise they can only + * transition from FALSE to TRUE. This ensures that + * any zio looking at these values can assume that + * failures persist for the life of the I/O. That's + * important because when a device has intermittent + * connectivity problems, we want to ensure that + * they're ascribed to the device (ENXIO) and not + * the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both + * values so the probe can reevaluate from first + * principles. + */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, + vdev_probe_done, vps, + vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + + if (zio != NULL) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_PROBE); + } } - ASSERT(vd->vdev_ops->vdev_op_leaf); + if (zio != NULL) + zio_add_child(zio, pio); - zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); + mutex_exit(&vd->vdev_probe_lock); - vps->vps_root = zio; - vps->vps_vd = vd; + if (vps == NULL) { + ASSERT(zio != NULL); + return (NULL); + } for (int l = 1; l < VDEV_LABELS; l++) { - zio_nowait(zio_read_phys(zio, vd, + zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad)), - VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), + offsetof(vdev_label_t, vl_pad2)), + VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } - return (zio); + if (zio == NULL) + return (pio); + + zio_nowait(pio); + return (NULL); } /* @@ -927,12 +984,15 @@ vdev_probe(vdev_t *vd, zio_t *pio) int vdev_open(vdev_t *vd) { + spa_t *spa = vd->vdev_spa; int error; int c; uint64_t osize = 0; uint64_t asize, psize; uint64_t ashift = 0; + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); @@ -1066,16 +1126,12 @@ vdev_open(vdev_t *vd) /* * If a leaf vdev has a DTL, and seems healthy, then kick off a - * resilver. But don't do this if we are doing a reopen for a - * scrub, since this would just restart the scrub we are already - * doing. + * resilver. But don't do this if we are doing a reopen for a scrub, + * since this would just restart the scrub we are already doing. */ - if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { - mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) - spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER); - mutex_exit(&vd->vdev_dtl_lock); - } + if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && + vdev_resilver_needed(vd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } @@ -1154,7 +1210,12 @@ vdev_validate(vdev_t *vd) nvlist_free(label); - if (spa->spa_load_state == SPA_LOAD_OPEN && + /* + * If spa->spa_load_verbatim is true, no need to check the + * state of the pool. + */ + if (!spa->spa_load_verbatim && + spa->spa_load_state == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -1176,6 +1237,10 @@ vdev_validate(vdev_t *vd) void vdev_close(vdev_t *vd) { + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); @@ -1212,6 +1277,7 @@ vdev_reopen(vdev_t *vd) if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && + vd->vdev_aux == &spa->spa_l2cache && !l2arc_vdev_present(vd)) { uint64_t size = vdev_get_rsize(vd); l2arc_add_vdev(spa, vd, @@ -1294,34 +1360,88 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } +/* + * DTLs. + * + * A vdev's DTL (dirty time log) is the set of transaction groups for which + * the vdev has less than perfect replication. There are three kinds of DTL: + * + * DTL_MISSING: txgs for which the vdev has no valid copies of the data + * + * DTL_PARTIAL: txgs for which data is available, but not fully replicated + * + * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon + * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of + * txgs that was scrubbed. + * + * DTL_OUTAGE: txgs which cannot currently be read, whether due to + * persistent errors or just some device being offline. + * Unlike the other three, the DTL_OUTAGE map is not generally + * maintained; it's only computed when needed, typically to + * determine whether a device can be detached. + * + * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device + * either has the data or it doesn't. + * + * For interior vdevs such as mirror and RAID-Z the picture is more complex. + * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because + * if any child is less than fully replicated, then so is its parent. + * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, + * comprising only those txgs which appear in 'maxfaults' or more children; + * those are the txgs we don't have enough replication to read. For example, + * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); + * thus, its DTL_MISSING consists of the set of txgs that appear in more than + * two child DTL_MISSING maps. + * + * It should be clear from the above that to compute the DTLs and outage maps + * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. + * Therefore, that is all we keep on disk. When loading the pool, or after + * a configuration change, we generate all other DTLs from first principles. + */ void -vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) +vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { + space_map_t *sm = &vd->vdev_dtl[t]; + + ASSERT(t < DTL_TYPES); + ASSERT(vd != vd->vdev_spa->spa_root_vdev); + mutex_enter(sm->sm_lock); if (!space_map_contains(sm, txg, size)) space_map_add(sm, txg, size); mutex_exit(sm->sm_lock); } -int -vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) +boolean_t +vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { - int dirty; + space_map_t *sm = &vd->vdev_dtl[t]; + boolean_t dirty = B_FALSE; - /* - * Quick test without the lock -- covers the common case that - * there are no dirty time segments. - */ - if (sm->sm_space == 0) - return (0); + ASSERT(t < DTL_TYPES); + ASSERT(vd != vd->vdev_spa->spa_root_vdev); mutex_enter(sm->sm_lock); - dirty = space_map_contains(sm, txg, size); + if (sm->sm_space != 0) + dirty = space_map_contains(sm, txg, size); mutex_exit(sm->sm_lock); return (dirty); } +boolean_t +vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) +{ + space_map_t *sm = &vd->vdev_dtl[t]; + boolean_t empty; + + mutex_enter(sm->sm_lock); + empty = (sm->sm_space == 0); + mutex_exit(sm->sm_lock); + + return (empty); +} + /* * Reassess DTLs after a config change or scrub completion. */ @@ -1329,11 +1449,19 @@ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; - int c; + avl_tree_t reftree; + int minref; - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - if (vd->vdev_children == 0) { + for (int c = 0; c < vd->vdev_children; c++) + vdev_dtl_reassess(vd->vdev_child[c], txg, + scrub_txg, scrub_done); + + if (vd == spa->spa_root_vdev) + return; + + if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { @@ -1344,12 +1472,38 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. + * + * There's little trick here: to excise the beginning + * of the DTL_MISSING map, we put it into a reference + * tree and then add a segment with refcnt -1 that + * covers the range [0, scrub_txg). This means + * that each txg in that range has refcnt -1 or 0. + * We then add DTL_SCRUB with a refcnt of 2, so that + * entries in the range [0, scrub_txg) will have a + * positive refcnt -- either 1 or 2. We then convert + * the reference tree into the new DTL_MISSING map. */ - space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); - space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); + space_map_ref_create(&reftree); + space_map_ref_add_map(&reftree, + &vd->vdev_dtl[DTL_MISSING], 1); + space_map_ref_add_seg(&reftree, 0, scrub_txg, -1); + space_map_ref_add_map(&reftree, + &vd->vdev_dtl[DTL_SCRUB], 2); + space_map_ref_generate_map(&reftree, + &vd->vdev_dtl[DTL_MISSING], 1); + space_map_ref_destroy(&reftree); } + space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); + space_map_walk(&vd->vdev_dtl[DTL_MISSING], + space_map_add, &vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL); + space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); + if (!vdev_readable(vd)) + space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); + else + space_map_walk(&vd->vdev_dtl[DTL_MISSING], + space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) @@ -1357,35 +1511,36 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) return; } - /* - * Make sure the DTLs are always correct under the scrub lock. - */ - if (vd == spa->spa_root_vdev) - mutex_enter(&spa->spa_scrub_lock); - mutex_enter(&vd->vdev_dtl_lock); - space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - mutex_exit(&vd->vdev_dtl_lock); - - for (c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); - mutex_enter(&vd->vdev_dtl_lock); - space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); - space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) + continue; /* leaf vdevs only */ + if (t == DTL_PARTIAL) + minref = 1; /* i.e. non-zero */ + else if (vd->vdev_nparity != 0) + minref = vd->vdev_nparity + 1; /* RAID-Z */ + else + minref = vd->vdev_children; /* any kind of mirror */ + space_map_ref_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); + space_map_ref_destroy(&reftree); } - - if (vd == spa->spa_root_vdev) - mutex_exit(&spa->spa_scrub_lock); + mutex_exit(&vd->vdev_dtl_lock); } static int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; + space_map_obj_t *smo = &vd->vdev_dtl_smo; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *db; int error; @@ -1403,7 +1558,8 @@ vdev_dtl_load(vdev_t *vd) dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); + error = space_map_load(&vd->vdev_dtl[DTL_MISSING], + NULL, SM_ALLOC, smo, mos); mutex_exit(&vd->vdev_dtl_lock); return (error); @@ -1413,8 +1569,8 @@ void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; - space_map_t *sm = &vd->vdev_dtl_map; + space_map_obj_t *smo = &vd->vdev_dtl_smo; + space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; space_map_t smsync; kmutex_t smlock; @@ -1472,6 +1628,37 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) } /* + * Determine whether the specified vdev can be offlined/detached/removed + * without losing data. + */ +boolean_t +vdev_dtl_required(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint8_t cant_read = vd->vdev_cant_read; + boolean_t required; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == spa->spa_root_vdev || vd == tvd) + return (B_TRUE); + + /* + * Temporarily mark the device as unreadable, and then determine + * whether this results in any DTL outages in the top-level vdev. + * If not, we can safely offline/detach/remove the device. + */ + vd->vdev_cant_read = B_TRUE; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + required = !vdev_dtl_empty(tvd, DTL_OUTAGE); + vd->vdev_cant_read = cant_read; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + + return (required); +} + +/* * Determine if resilver is needed, and if so the txg range. */ boolean_t @@ -1483,19 +1670,19 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { + if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && + vdev_writeable(vd)) { space_seg_t *ss; - ss = avl_first(&vd->vdev_dtl_map.sm_root); + ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); thismin = ss->ss_start - 1; - ss = avl_last(&vd->vdev_dtl_map.sm_root); + ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); thismax = ss->ss_end; needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { - int c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; @@ -1517,12 +1704,10 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) void vdev_load(vdev_t *vd) { - int c; - /* * Recursively load all children. */ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_load(vd->vdev_child[c]); /* @@ -1742,11 +1927,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; - (void) spa_vdev_state_exit(spa, vd, 0); - - VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); - - return (0); + return (spa_vdev_state_exit(spa, vd, 0)); } int @@ -1767,13 +1948,10 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) */ if (!vd->vdev_offline) { /* - * If this device's top-level vdev has a non-empty DTL, - * don't allow the device to be offlined. - * - * XXX -- make this more precise by allowing the offline - * as long as the remaining devices don't have any DTL holes. + * If this device has the only valid copy of some data, + * don't allow it to be offlined. */ - if (vd->vdev_top->vdev_dtl_map.sm_space != 0) + if (vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* @@ -1783,7 +1961,7 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) */ vd->vdev_offline = B_TRUE; vdev_reopen(vd->vdev_top); - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) { vd->vdev_offline = B_FALSE; vdev_reopen(vd->vdev_top); return (spa_vdev_state_exit(spa, NULL, EBUSY)); @@ -1863,13 +2041,17 @@ vdev_writeable(vdev_t *vd) boolean_t vdev_allocatable(vdev_t *vd) { + uint64_t state = vd->vdev_state; + /* - * We currently allow allocations from vdevs which maybe in the + * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding - * the proper locks. + * the proper locks. Note that we have to get the vdev state + * in a local variable because although it changes atomically, + * we're asking two separate questions about it. */ - return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) && + return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write); } @@ -1939,7 +2121,8 @@ vdev_clear_stats(vdev_t *vd) void vdev_stat_update(zio_t *zio, uint64_t psize) { - vdev_t *rvd = zio->io_spa->spa_root_vdev; + spa_t *spa = zio->io_spa; + vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; @@ -1972,21 +2155,23 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; ASSERT(vd == zio->io_vd); - if (!(flags & ZIO_FLAG_IO_BYPASS)) { - mutex_enter(&vd->vdev_stat_lock); - vs->vs_ops[type]++; - vs->vs_bytes[type] += psize; - mutex_exit(&vd->vdev_stat_lock); - } + + if (flags & ZIO_FLAG_IO_BYPASS) + return; + + mutex_enter(&vd->vdev_stat_lock); + if (flags & ZIO_FLAG_IO_REPAIR) { - ASSERT(zio->io_delegate_list == NULL); - mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_SCRUB_THREAD) vs->vs_scrub_repaired += psize; - else + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; - mutex_exit(&vd->vdev_stat_lock); } + + vs->vs_ops[type]++; + vs->vs_bytes[type] += psize; + + mutex_exit(&vd->vdev_stat_lock); return; } @@ -1994,29 +2179,49 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ) { + if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) { - if (flags & ZIO_FLAG_SCRUB_THREAD) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); - } - if (!(flags & ZIO_FLAG_IO_REPAIR)) { - if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) + if (type == ZIO_TYPE_WRITE && txg != 0 && + (!(flags & ZIO_FLAG_IO_REPAIR) || + (flags & ZIO_FLAG_SCRUB_THREAD))) { + /* + * This is either a normal write (not a repair), or it's a + * repair induced by the scrub thread. In the normal case, + * we commit the DTL change in the same txg as the block + * was born. In the scrub-induced repair case, we know that + * scrubs run in first-pass syncing context, so we commit + * the DTL change in spa->spa_syncing_txg. + * + * We currently do not make DTL entries for failed spontaneous + * self-healing writes triggered by normal (non-scrubbing) + * reads, because we have no transactional context in which to + * do so -- and it's not clear that it'd be desirable anyway. + */ + if (vd->vdev_ops->vdev_op_leaf) { + uint64_t commit_txg = txg; + if (flags & ZIO_FLAG_SCRUB_THREAD) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + ASSERT(spa_sync_pass(spa) == 1); + vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); + commit_txg = spa->spa_syncing_txg; + } + ASSERT(commit_txg >= spa->spa_syncing_txg); + if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; - vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); + vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } + if (vd != rvd) + vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } @@ -2111,8 +2316,8 @@ vdev_config_dirty(vdev_t *vd) int c; /* - * If this is an aux vdev (as with l2cache devices), then we update the - * vdev config manually and set the sync flag. + * If this is an aux vdev (as with l2cache and spare devices), then we + * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; @@ -2134,8 +2339,11 @@ vdev_config_dirty(vdev_t *vd) sav->sav_sync = B_TRUE; - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); + if (nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); + } ASSERT(c < naux); @@ -2229,7 +2437,8 @@ vdev_state_clean(vdev_t *vd) void vdev_propagate_state(vdev_t *vd) { - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; int c; @@ -2240,7 +2449,7 @@ vdev_propagate_state(vdev_t *vd) child = vd->vdev_child[c]; if (!vdev_readable(child) || - (!vdev_writeable(child) && (spa_mode & FWRITE))) { + (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were @@ -2340,7 +2549,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * an error. */ if (spa->spa_load_state == SPA_LOAD_IMPORT && - !spa->spa_import_faulted && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2399,8 +2607,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_removed = B_FALSE; } - if (!isopen) - vdev_propagate_state(vd); + if (!isopen && vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c index 88c15b7..8fc3738 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -215,23 +215,23 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) * Fill a previously allocated cache entry with data. */ static void -vdev_cache_fill(zio_t *zio) +vdev_cache_fill(zio_t *fio) { - vdev_t *vd = zio->io_vd; + vdev_t *vd = fio->io_vd; vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = zio->io_private; - zio_t *dio; + vdev_cache_entry_t *ve = fio->io_private; + zio_t *pio; - ASSERT(zio->io_size == VCBS); + ASSERT(fio->io_size == VCBS); /* * Add data to the cache. */ mutex_enter(&vc->vc_lock); - ASSERT(ve->ve_fill_io == zio); - ASSERT(ve->ve_offset == zio->io_offset); - ASSERT(ve->ve_data == zio->io_data); + ASSERT(ve->ve_fill_io == fio); + ASSERT(ve->ve_offset == fio->io_offset); + ASSERT(ve->ve_data == fio->io_data); ve->ve_fill_io = NULL; @@ -240,20 +240,13 @@ vdev_cache_fill(zio_t *zio) * any reads that were queued up before the missed update are still * valid, so we can satisfy them from this line before we evict it. */ - for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) - vdev_cache_hit(vc, ve, dio); + while ((pio = zio_walk_parents(fio)) != NULL) + vdev_cache_hit(vc, ve, pio); - if (zio->io_error || ve->ve_missed_update) + if (fio->io_error || ve->ve_missed_update) vdev_cache_evict(vc, ve); mutex_exit(&vc->vc_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = zio->io_error; - zio_execute(dio); - } } /* @@ -296,9 +289,8 @@ vdev_cache_read(zio_t *zio) } if ((fio = ve->ve_fill_io) != NULL) { - zio->io_delegate_next = fio->io_delegate_list; - fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); VDCSTAT_BUMP(vdc_stat_delegations); return (0); @@ -308,7 +300,6 @@ vdev_cache_read(zio_t *zio) zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); - zio_execute(zio); VDCSTAT_BUMP(vdc_stat_hits); return (0); } @@ -325,8 +316,8 @@ vdev_cache_read(zio_t *zio) ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; - fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); zio_nowait(fio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index 35d4e2a..e6d5743 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -47,6 +47,7 @@ typedef struct vdev_disk_buf { static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { + spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd; struct dk_minfo dkm; int error; @@ -95,7 +96,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = EINVAL; /* presume failure */ - if (vd->vdev_path != NULL && !spa_is_root(vd->vdev_spa)) { + if (vd->vdev_path != NULL && !spa_is_root(spa)) { ddi_devid_t devid; if (vd->vdev_wholedisk == -1ULL) { @@ -105,18 +106,18 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) (void) snprintf(buf, len, "%ss0", vd->vdev_path); - if (ldi_open_by_name(buf, spa_mode, kcred, + if (ldi_open_by_name(buf, spa_mode(spa), kcred, &lh, zfs_li) == 0) { spa_strfree(vd->vdev_path); vd->vdev_path = buf; vd->vdev_wholedisk = 1ULL; - (void) ldi_close(lh, spa_mode, kcred); + (void) ldi_close(lh, spa_mode(spa), kcred); } else { kmem_free(buf, len); } } - error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* @@ -126,7 +127,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { error = EINVAL; - (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + (void) ldi_close(dvd->vd_lh, spa_mode(spa), + kcred); dvd->vd_lh = NULL; } ddi_devid_free(devid); @@ -146,7 +148,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ if (error != 0 && vd->vdev_devid != NULL) error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, - spa_mode, kcred, &dvd->vd_lh, zfs_li); + spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* * If all else fails, then try opening by physical path (if available) @@ -156,8 +158,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ if (error) { if (vd->vdev_physpath != NULL && - (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) - error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* @@ -165,10 +167,9 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ - if (error && vd->vdev_path != NULL && - !spa_is_root(vd->vdev_spa)) - error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, - &dvd->vd_lh, zfs_li); + if (error && vd->vdev_path != NULL && !spa_is_root(spa)) + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); } if (error) { @@ -253,7 +254,7 @@ vdev_disk_close(vdev_t *vd) ddi_devid_free(dvd->vd_devid); if (dvd->vd_lh != NULL) - (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); kmem_free(dvd, sizeof (vdev_disk_t)); vd->vdev_tsd = NULL; @@ -469,7 +470,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, &minor_name) == 0) { error = ldi_open_by_devid(tmpdevid, minor_name, - spa_mode, kcred, &vd_lh, zfs_li); + FREAD, kcred, &vd_lh, zfs_li); ddi_devid_free(tmpdevid); ddi_devid_str_free(minor_name); } @@ -492,8 +493,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) /* read vdev label */ offset = vdev_label_offset(size, l, 0); if (vdev_disk_physio(vd_lh, (caddr_t)label, - VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + - VDEV_PHYS_SIZE, offset, B_READ) != 0) + VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c index 30b3f35..67bd110 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -61,7 +61,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, - spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); + spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; @@ -75,7 +75,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * Make sure it's a regular file. */ if (vp->v_type != VREG) { - (void) VOP_CLOSE(vp, spa_mode, 1, 0, kcred, NULL); + (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (ENODEV); } @@ -90,7 +90,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) VOP_UNLOCK(vp, 0); VFS_UNLOCK_GIANT(vfslocked); if (error) { - (void) VOP_CLOSE(vp, spa_mode, 1, 0, kcred, NULL); + (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } @@ -110,7 +110,8 @@ vdev_file_close(vdev_t *vd) return; if (vf->vf_vnode != NULL) - (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); + (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, + kcred, NULL); kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index 9c6ec4c..00817bf 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -457,7 +457,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) if (cp == NULL) { ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); error = ENOENT; - } else if (cp->acw == 0 && (spa_mode & FWRITE) != 0) { + } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) { int i; g_topology_lock(); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index bf93046..f1f3bb0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -233,6 +233,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath) == 0); + if (vd->vdev_fru != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU, + vd->vdev_fru) == 0); + if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -277,9 +281,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_islog) == 0); } - if (vd->vdev_dtl.smo_object != 0) + if (vd->vdev_dtl_smo.smo_object != 0) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, - vd->vdev_dtl.smo_object) == 0); + vd->vdev_dtl_smo.smo_object) == 0); if (getstats) { vdev_stat_t vs; @@ -488,7 +492,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - vdev_boot_header_t *vb; + char *pad2; uberblock_t *ub; zio_t *zio; char *buf; @@ -520,9 +524,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); - ASSERT(reason != VDEV_LABEL_REMOVE || - vdev_inuse(vd, crtxg, reason, NULL, NULL)); - /* * If this is a request to add or replace a spare or l2cache device * that is in use elsewhere on the system, then we must update the @@ -633,16 +634,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) } /* - * Initialize boot block header. - */ - vb = zio_buf_alloc(sizeof (vdev_boot_header_t)); - bzero(vb, sizeof (vdev_boot_header_t)); - vb->vb_magic = VDEV_BOOT_MAGIC; - vb->vb_version = VDEV_BOOT_VERSION; - vb->vb_offset = VDEV_BOOT_OFFSET; - vb->vb_size = VDEV_BOOT_SIZE; - - /* * Initialize uberblock template. */ ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); @@ -650,6 +641,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) *ub = spa->spa_uberblock; ub->ub_txg = 0; + /* Initialize the 2nd padding area. */ + pad2 = zio_buf_alloc(VDEV_PAD_SIZE); + bzero(pad2, VDEV_PAD_SIZE); + /* * Write everything in parallel. */ @@ -661,9 +656,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); - vdev_label_write(zio, vd, l, vb, - offsetof(vdev_label_t, vl_boot_header), - sizeof (vdev_boot_header_t), NULL, NULL, flags); + /* + * Skip the 1st padding area. + * Zero out the 2nd padding area where it might have + * left over data from previous filesystem format. + */ + vdev_label_write(zio, vd, l, pad2, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_write(zio, vd, l, ub, @@ -675,8 +675,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = zio_wait(zio); nvlist_free(label); + zio_buf_free(pad2, VDEV_PAD_SIZE); zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); - zio_buf_free(vb, sizeof (vdev_boot_header_t)); zio_buf_free(vp, sizeof (vdev_phys_t)); /* @@ -705,6 +705,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) */ /* + * For use by zdb and debugging purposes only + */ +uint64_t ub_max_txg = UINT64_MAX; + +/* * Consider the following situation: txg is safely synced to disk. We've * written the first uberblock for txg + 1, and then we lose power. When we * come back up, we fail to see the uberblock for txg + 1 because, say, @@ -741,7 +746,8 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); - if (vdev_uberblock_compare(ub, ubbest) > 0) + if (ub->ub_txg <= ub_max_txg && + vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; mutex_exit(&rio->io_lock); } @@ -958,7 +964,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - zio_t *vio = zio_null(zio, spa, + zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index c4629ff..fff7e08 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -180,11 +180,16 @@ vdev_mirror_scrub_done(zio_t *zio) mirror_child_t *mc = zio->io_private; if (zio->io_error == 0) { - zio_t *pio = zio->io_parent; - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); - mutex_exit(&pio->io_lock); + zio_t *pio; + + mutex_enter(&zio->io_lock); + while ((pio = zio_walk_parents(zio)) != NULL) { + mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); + bcopy(zio->io_data, pio->io_data, pio->io_size); + mutex_exit(&pio->io_lock); + } + mutex_exit(&zio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); @@ -225,7 +230,7 @@ vdev_mirror_child_select(zio_t *zio) mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) + if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) return (c); mc->mc_error = ESTALE; mc->mc_skipped = 1; @@ -282,20 +287,10 @@ vdev_mirror_io_start(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); /* - * If this is a resilvering I/O to a replacing vdev, - * only the last child should be written -- unless the - * first child happens to have a DTL entry here as well. - * All other writes go to all children. + * Writes go to all children. */ - if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && - !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, - zio->io_txg, 1)) { - c = mm->mm_children - 1; - children = 1; - } else { - c = 0; - children = mm->mm_children; - } + c = 0; + children = mm->mm_children; } while (children--) { @@ -398,7 +393,7 @@ vdev_mirror_io_done(zio_t *zio) ASSERT(zio->io_error != 0); } - if (good_copies && (spa_mode & FWRITE) && + if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { @@ -419,7 +414,7 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_tried) continue; if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, + !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = ESTALE; @@ -429,7 +424,8 @@ vdev_mirror_io_done(zio_t *zio) mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index cd4d5ae..45cc829 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,10 +48,11 @@ int zfs_vdev_time_shift = 6; int zfs_vdev_ramp_rate = 2; /* - * i/os will be aggregated into a single large i/o up to - * zfs_vdev_aggregation_limit bytes long. + * To reduce IOPs, we aggregate small adjacent i/os into one large i/o. + * For read i/os, we also aggregate across small adjacency gaps. */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; +int zfs_vdev_read_gap_limit = 32 << 10; SYSCTL_DECL(_vfs_zfs_vdev); TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending); @@ -168,33 +169,33 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) static void vdev_queue_agg_io_done(zio_t *aio) { - zio_t *dio; - uint64_t offset = 0; + zio_t *pio; - while ((dio = aio->io_delegate_list) != NULL) { + while ((pio = zio_walk_parents(aio)) != NULL) if (aio->io_type == ZIO_TYPE_READ) - bcopy((char *)aio->io_data + offset, dio->io_data, - dio->io_size); - offset += dio->io_size; - aio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = aio->io_error; - zio_execute(dio); - } - ASSERT3U(offset, ==, aio->io_size); + bcopy((char *)aio->io_data + (pio->io_offset - + aio->io_offset), pio->io_data, pio->io_size); zio_buf_free(aio->io_data, aio->io_size); } -#define IS_ADJACENT(io, nio) \ - ((io)->io_offset + (io)->io_size == (nio)->io_offset) +/* + * Compute the range spanned by two i/os, which is the endpoint of the last + * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). + * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); + * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. + */ +#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) +#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { - zio_t *fio, *lio, *aio, *dio; - avl_tree_t *tree; - uint64_t size; + zio_t *fio, *lio, *aio, *dio, *nio; + avl_tree_t *t; + int flags; + uint64_t maxspan = zfs_vdev_aggregation_limit; + uint64_t maxgap; ASSERT(MUTEX_HELD(&vq->vq_lock)); @@ -204,56 +205,62 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) fio = lio = avl_first(&vq->vq_deadline_tree); - tree = fio->io_vdev_tree; - size = fio->io_size; - - while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && - !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - dio->io_delegate_next = fio; - fio = dio; - size += dio->io_size; - } - - while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && - !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - lio->io_delegate_next = dio; - lio = dio; - size += dio->io_size; + t = fio->io_vdev_tree; + flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; + maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; + + if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { + /* + * We can aggregate I/Os that are adjacent and of the + * same flavor, as expressed by the AGG_INHERIT flags. + * The latter is necessary so that certain attributes + * of the I/O, such as whether it's a normal I/O or a + * scrub/resilver, can be preserved in the aggregate. + */ + while ((dio = AVL_PREV(t, fio)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap) + fio = dio; + + while ((dio = AVL_NEXT(t, lio)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap) + lio = dio; } if (fio != lio) { - char *buf = zio_buf_alloc(size); - uint64_t offset = 0; - + uint64_t size = IO_SPAN(fio, lio); ASSERT(size <= zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - buf, size, fio->io_type, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW, + flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); - aio->io_delegate_list = fio; - - for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { + nio = fio; + do { + dio = nio; + nio = AVL_NEXT(t, dio); ASSERT(dio->io_type == aio->io_type); - ASSERT(dio->io_vdev_tree == tree); + ASSERT(dio->io_vdev_tree == t); + if (dio->io_type == ZIO_TYPE_WRITE) - bcopy(dio->io_data, buf + offset, dio->io_size); - offset += dio->io_size; + bcopy(dio->io_data, (char *)aio->io_data + + (dio->io_offset - aio->io_offset), + dio->io_size); + + zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); - } - - ASSERT(offset == size); + zio_execute(dio); + } while (dio != lio); avl_add(&vq->vq_pending_tree, aio); return (aio); } - ASSERT(fio->io_vdev_tree == tree); + ASSERT(fio->io_vdev_tree == t); vdev_queue_io_remove(vq, fio); avl_add(&vq->vq_pending_tree, fio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index 0a06190..92753d8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -687,7 +687,7 @@ vdev_raidz_io_start(zio_t *zio) rc->rc_skipped = 1; continue; } - if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { + if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -1165,7 +1165,7 @@ vdev_raidz_io_done(zio_t *zio) done: zio_checksum_verified(zio); - if (zio->io_error == 0 && (spa_mode & FWRITE) && + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. @@ -1180,7 +1180,8 @@ done: zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index 90fe3d0..7abe63a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -1135,3 +1135,58 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } } + +int +fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, + uint64_t *tooverwrite) +{ + zap_t *zap = zn->zn_zap; + zap_leaf_t *l; + int err; + + /* + * Account for the header block of the fatzap. + */ + if (!add && dmu_buf_freeable(zap->zap_dbuf)) { + *tooverwrite += zap->zap_dbuf->db_size; + } else { + *towrite += zap->zap_dbuf->db_size; + } + + /* + * Account for the pointer table blocks. + * If we are adding we need to account for the following cases : + * - If the pointer table is embedded, this operation could force an + * external pointer table. + * - If this already has an external pointer table this operation + * could extend the table. + */ + if (add) { + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) + *towrite += zap->zap_dbuf->db_size; + else + *towrite += (zap->zap_dbuf->db_size * 3); + } + + /* + * Now, check if the block containing leaf is freeable + * and account accordingly. + */ + err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) { + return (err); + } + + if (!add && dmu_buf_freeable(l->l_dbuf)) { + *tooverwrite += l->l_dbuf->db_size; + } else { + /* + * If this an add operation, the leaf block could split. + * Hence, we need to account for an additional leaf block. + */ + *towrite += (add ? 2 : 1) * l->l_dbuf->db_size; + } + + zap_put_leaf(l); + return (0); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index 10d7386..9453fd2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -1079,3 +1079,79 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) zap_unlockdir(zap); return (0); } + +int +zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, + uint64_t *towrite, uint64_t *tooverwrite) +{ + zap_t *zap; + int err = 0; + + + /* + * Since, we don't have a name, we cannot figure out which blocks will + * be affected in this operation. So, account for the worst case : + * - 3 blocks overwritten: target leaf, ptrtbl block, header block + * - 4 new blocks written if adding: + * - 2 blocks for possibly split leaves, + * - 2 grown ptrtbl blocks + * + * This also accomodates the case where an add operation to a fairly + * large microzap results in a promotion to fatzap. + */ + if (name == NULL) { + *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + return (err); + } + + /* + * We lock the zap with adding == FALSE. Because, if we pass + * the actual value of add, it could trigger a mzap_upgrade(). + * At present we are just evaluating the possibility of this operation + * and hence we donot want to trigger an upgrade. + */ + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + + if (!zap->zap_ismicro) { + zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn) { + err = fzap_count_write(zn, add, towrite, + tooverwrite); + zap_name_free(zn); + } else { + /* + * We treat this case as similar to (name == NULL) + */ + *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + } + } else { + /* + * We are here if (name != NULL) and this is a micro-zap. + * We account for the header block depending on whether it + * is freeable. + * + * Incase of an add-operation it is hard to find out + * if this add will promote this microzap to fatzap. + * Hence, we consider the worst case and account for the + * blocks assuming this microzap would be promoted to a + * fatzap. + * + * 1 block overwritten : header block + * 4 new blocks written : 2 new split leaf, 2 grown + * ptrtbl blocks + */ + if (dmu_buf_freeable(zap->zap_dbuf)) + *tooverwrite += SPA_MAXBLOCKSIZE; + else + *towrite += SPA_MAXBLOCKSIZE; + + if (add) { + *towrite += 4 * SPA_MAXBLOCKSIZE; + } + } + + zap_unlockdir(zap); + return (err); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c index 01007d7..c42f094 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -62,13 +62,15 @@ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) #define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) @@ -535,8 +537,9 @@ zfs_acl_curr_node(zfs_acl_t *aclp) * ACE FUIDs will be created later. */ int -zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, - zfs_ace_t *z_acl, int aclcnt, size_t *size) +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) { int i; uint16_t entry_type; @@ -552,9 +555,9 @@ zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE) { - if (!aclp->z_has_fuids) - aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); - aceptr->z_fuid = (uint64_t)acep->a_who; + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? + ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); } /* @@ -679,7 +682,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, * convert old ACL format to new */ void -zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) { zfs_oldace_t *oldaclp; int i; @@ -711,9 +714,9 @@ zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * sizeof (zfs_object_ace_t)); aclp->z_ops = zfs_acl_fuid_ops; - VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, - newaclnode->z_acldata, aclp->z_acl_count, - &newaclnode->z_size) == 0); + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); newaclnode->z_ace_count = aclp->z_acl_count; aclp->z_version = ZFS_ACL_VERSION; kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); @@ -767,8 +770,7 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, * Also, create FUIDs for any User/Group ACEs */ static uint64_t -zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) { int entry_type; mode_t mode; @@ -902,15 +904,6 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, } } } - /* - * Now handle FUID create for user/group ACEs - */ - if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { - aclp->z_ops.ace_who_set(acep, - zfs_fuid_create(zp->z_zfsvfs, who, cr, - (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, - tx, fuidp)); - } } return (mode); } @@ -986,7 +979,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) aclnode = zfs_acl_node_alloc(aclsize); list_insert_head(&aclp->z_acl, aclnode); error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - aclsize, aclnode->z_acldata); + aclsize, aclnode->z_acldata, DMU_READ_PREFETCH); aclnode->z_ace_count = acl_count; aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; @@ -1011,8 +1004,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) * already checked the acl and knows whether to inherit. */ int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { int error; znode_phys_t *zphys = zp->z_phys; @@ -1023,12 +1015,9 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_object_type_t otype; zfs_acl_node_t *aclnode; - ASSERT(MUTEX_HELD(&zp->z_lock)); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); + zphys->zp_mode = zfs_mode_compute(zp, aclp); /* * Decide which opbject type to use. If we are forced to @@ -1040,7 +1029,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, } else { if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && (zfsvfs->z_version >= ZPL_VERSION_FUID)) - zfs_acl_xform(zp, aclp); + zfs_acl_xform(zp, aclp, cr); ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); otype = DMU_OT_ACL; } @@ -1122,7 +1111,6 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); return (0); } @@ -1333,7 +1321,7 @@ zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) * Prepend deny ACE */ static void * -zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, +zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep, mode_t mode) { zfs_acl_node_t *aclnode; @@ -1346,7 +1334,7 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, fuid = aclp->z_ops.ace_who_get(acep); flags = aclp->z_ops.ace_flags_get(acep); zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); + zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid); return (newacep); } @@ -1470,9 +1458,9 @@ zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, * in PSARC/2002/240 */ static void -zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) +zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid, + uint64_t mode, zfs_acl_t *aclp) { - zfsvfs_t *zfsvfs = zp->z_zfsvfs; void *acep = NULL, *prevacep = NULL; uint64_t who; int i; @@ -1482,11 +1470,6 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) uint16_t iflags, type; uint32_t access_mask; - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT(MUTEX_HELD(&zp->z_lock)); - - aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); - /* * If discard then just discard all ACL nodes which * represent the ACEs. @@ -1551,17 +1534,15 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) if (!reuse_deny) { prevacep = - zfs_acl_prepend_deny(zp, + zfs_acl_prepend_deny(uid, aclp, acep, mode); } else { zfs_acl_prepend_fixup( aclp, prevacep, - acep, mode, - zp->z_phys->zp_uid); + acep, mode, uid); } zfs_fixup_group_entries(aclp, acep, prevacep, mode); - } } } @@ -1620,8 +1601,10 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) mutex_enter(&zp->z_acl_lock); *aclp = NULL; error = zfs_acl_node_read(zp, aclp, B_TRUE); - if (error == 0) - zfs_acl_chmod(zp, mode, *aclp); + if (error == 0) { + (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp); + } mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); return (error); @@ -1646,9 +1629,8 @@ zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) * Should ACE be inherited? */ static int -zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) { - int vtype = ZTOV(zp)->v_type; int iflags = (acep_flags & 0xf); if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) @@ -1663,10 +1645,9 @@ zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) * inherit inheritable ACEs from parent */ static zfs_acl_t * -zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, - boolean_t *need_chmod) +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) { - zfsvfs_t *zfsvfs = zp->z_zfsvfs; void *pacep; void *acep, *acep2; zfs_acl_node_t *aclnode, *aclnode2; @@ -1677,8 +1658,8 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; - boolean_t vdir = ZTOV(zp)->v_type == VDIR; - boolean_t vreg = ZTOV(zp)->v_type == VREG; + boolean_t vdir = vtype == VDIR; + boolean_t vreg = vtype == VREG; boolean_t passthrough, passthrough_x, noallow; passthrough_x = @@ -1707,7 +1688,7 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, ace_size = aclp->z_ops.ace_size(pacep); - if (!zfs_ace_can_use(zp, iflags)) + if (!zfs_ace_can_use(vtype, iflags)) continue; /* @@ -1803,57 +1784,60 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, * Create file system object initial permissions * including inheritable ACEs. */ -void -zfs_perm_init(znode_t *zp, znode_t *parent, int flag, - vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp) +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) { - uint64_t mode, fuid, fgid; int error; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zfs_acl_t *aclp = NULL; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_acl_t *paclp; - xvattr_t *xvap = (xvattr_t *)vap; gid_t gid; boolean_t need_chmod = B_TRUE; - if (setaclp) - aclp = setaclp; + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); - mode = MAKEIMODE(vap->va_type, vap->va_mode); + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); /* * Determine uid and gid. */ - if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, - ZFS_OWNER, tx, fuidp); - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { - fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); - fgid = 0; + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; if (vap->va_mask & AT_GID) { - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; - if (fgid != parent->z_phys->zp_gid && + if (acl_ids->z_fgid != dzp->z_phys->zp_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) - fgid = 0; + acl_ids->z_fgid = 0; } - if (fgid == 0) { - if (parent->z_phys->zp_mode & S_ISGID) { - fgid = parent->z_phys->zp_gid; - gid = zfs_fuid_map_id(zfsvfs, fgid, + if (acl_ids->z_fgid == 0) { + if (dzp->z_phys->zp_mode & S_ISGID) { + acl_ids->z_fgid = dzp->z_phys->zp_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); } else { - fgid = zfs_fuid_create_cred(zfsvfs, - ZFS_GROUP, tx, cr, fuidp); + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); #ifdef __FreeBSD__ - gid = fgid = parent->z_phys->zp_gid; + gid = acl_ids->z_fgid = dzp->z_phys->zp_gid; #else gid = crgetgid(cr); #endif @@ -1868,57 +1852,61 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, * file's new group, clear the file's set-GID bit. */ - if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { - mode |= S_ISGID; + if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; } else { - if ((mode & S_ISGID) && - secpolicy_vnode_setids_setgids(ZTOV(zp), cr, gid) != 0) - mode &= ~S_ISGID; - } - - zp->z_phys->zp_uid = fuid; - zp->z_phys->zp_gid = fgid; - zp->z_phys->zp_mode = mode; - - if (aclp == NULL) { - mutex_enter(&parent->z_lock); - if ((ZTOV(parent)->v_type == VDIR && - (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) && - !(zp->z_phys->zp_flags & ZFS_XATTR)) { - mutex_enter(&parent->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE)); - mutex_exit(&parent->z_acl_lock); - aclp = zfs_acl_inherit(zp, paclp, mode, &need_chmod); + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && + (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) && + !(dzp->z_phys->zp_flags & ZFS_XATTR)) { + mutex_enter(&dzp->z_acl_lock); + VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); + mutex_exit(&dzp->z_acl_lock); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); zfs_acl_free(paclp); } else { - aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + } + mutex_exit(&dzp->z_lock); + if (need_chmod) { + acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ? + ZFS_ACL_AUTO_INHERIT : 0; + zfs_acl_chmod(zfsvfs, acl_ids->z_fuid, + acl_ids->z_mode, acl_ids->z_aclp); } - mutex_exit(&parent->z_lock); - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - if (need_chmod) - zfs_acl_chmod(zp, mode, aclp); - } else { - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); } - /* Force auto_inherit on all new directory objects */ - if (vap->va_type == VDIR) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - - error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); - - /* Set optional attributes if any */ - if (vap->va_mask & AT_XVATTR) - zfs_xvattr_set(zp, xvap); + return (0); +} - mutex_exit(&zp->z_lock); - mutex_exit(&zp->z_acl_lock); - ASSERT3U(error, ==, 0); +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} - if (aclp != setaclp) - zfs_acl_free(aclp); +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) +{ + return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); } /* @@ -1984,6 +1972,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (mask & VSA_ACE) { size_t aclsz; + zfs_acl_node_t *aclnode = list_head(&aclp->z_acl); + aclsz = count * sizeof (ace_t) + sizeof (ace_object_t) * largeace; @@ -1994,17 +1984,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); else { - zfs_acl_node_t *aclnode; - void *start = vsecp->vsa_aclentp; - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - bcopy(aclnode->z_acldata, start, - aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == - aclp->z_acl_bytes); + bcopy(aclnode->z_acldata, vsecp->vsa_aclentp, + count * sizeof (ace_t)); } } if (mask & VSA_ACE_ACLFLAGS) { @@ -2026,7 +2007,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) int zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, - vsecattr_t *vsecp, zfs_acl_t **zaclp) + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) { zfs_acl_t *aclp; zfs_acl_node_t *aclnode; @@ -2049,9 +2030,9 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, return (error); } } else { - if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, - &aclnode->z_size)) != 0) { + &aclnode->z_size, fuidp, cr)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); @@ -2092,6 +2073,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) int error; zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; if (mask == 0) return (ENOSYS); @@ -2102,7 +2084,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) return (error); - error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); if (error) return (error); @@ -2143,25 +2126,16 @@ top: } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } - if (aclp->z_has_fuids) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -2171,9 +2145,13 @@ top: return (error); } - error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) @@ -2216,7 +2194,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, *check_privs = B_TRUE; - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + if (zfsvfs->z_replay) { *working_mode = 0; return (0); } @@ -2225,7 +2203,8 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, if ((v4_mode & WRITE_MASK) && (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && - (!IS_DEVVP(ZTOV(zp)))) { + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { *check_privs = B_FALSE; return (EROFS); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c index 7820293..361b17d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZFS control directory (a.k.a. ".zfs") * @@ -116,16 +114,21 @@ snapentry_compare(const void *a, const void *b) static struct vop_vector zfsctl_ops_root; static struct vop_vector zfsctl_ops_snapdir; static struct vop_vector zfsctl_ops_snapshot; +static struct vop_vector zfsctl_ops_shares; +static struct vop_vector zfsctl_ops_shares_dir; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); +static vnode_t *zfsctl_mknode_shares(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); /* - * Root directory elements. We have only a single static entry, 'snapshot'. + * Root directory elements. We only have two entries + * snapshot and shares. */ static gfs_dirent_t zfsctl_root_entries[] = { { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, + { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE }, { NULL } }; @@ -150,14 +153,21 @@ zfsctl_fini(void) } /* - * Return the inode number associated with the 'snapshot' directory. + * Return the inode number associated with the 'snapshot' or + * 'shares' directory. */ /* ARGSUSED */ static ino64_t zfsctl_root_inode_cb(vnode_t *vp, int index) { - ASSERT(index == 0); - return (ZFSCTL_INO_SNAPDIR); + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + + ASSERT(index <= 2); + + if (index == 0) + return (ZFSCTL_INO_SNAPDIR); + + return (zfsvfs->z_shares_dir); } /* @@ -260,8 +270,17 @@ zfsctl_common_access(ap) { int mode = ap->a_accmode; +#ifdef TODO + if (flags & V_ACE_MASK) { + if (accmode & ACE_ALL_WRITE_PERMS) + return (EACCES); + } else { +#endif if (mode & VWRITE) return (EACCES); +#ifdef TODO + } +#endif return (0); } @@ -334,6 +353,36 @@ zfsctl_common_fid(ap) return (0); } +/*ARGSUSED*/ +static int +zfsctl_shares_fid(ap) + struct vop_fid_args /* { + struct vnode *a_vp; + struct fid *a_fid; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + fid_t *fidp = (void *)ap->a_fid; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_FID(ZTOV(dzp), fidp); + VN_RELE(ZTOV(dzp)); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + static int zfsctl_common_reclaim(ap) struct vop_reclaim_args /* { @@ -394,6 +443,41 @@ zfsctl_root_getattr(ap) return (0); } +#ifdef sun +static int +zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + /* + * We only care about ACL_ENABLED so that libsec can + * display ACL correctly and not default to POSIX draft. + */ + if (cmd == _PC_ACL_ENABLED) { + *valp = _ACL_ACE_ENABLED; + return (0); + } + + return (fs_pathconf(vp, cmd, valp, cr, ct)); +} +#endif /* sun */ + +#ifdef sun +static const fs_operation_def_t zfsctl_tops_root[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } }, + { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, + { NULL } +}; +#endif /* sun */ + /* * Special case the handling of "..". */ @@ -712,7 +796,7 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, return (err); if (err == 0) { - err = dmu_objset_snapshot(name, dirname, B_FALSE); + err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE); if (err) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); @@ -777,9 +861,6 @@ zfsctl_snapdir_lookup(ap) ASSERT(dvp->v_type == VDIR); - if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) - return (0); - *vpp = NULL; /* @@ -793,6 +874,11 @@ zfsctl_snapdir_lookup(ap) ZFS_ENTER(zfsvfs); + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + if (flags & FIGNORECASE) { boolean_t conflict = B_FALSE; @@ -904,6 +990,46 @@ domount: } /* ARGSUSED */ +int +zfsctl_shares_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + char nm[NAME_MAX + 1]; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + ASSERT(cnp->cn_namelen < sizeof(nm)); + strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); + + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) + error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp); + + VN_RELE(ZTOV(dzp)); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* ARGSUSED */ static int zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, offset_t *offp, offset_t *nextp, void *data, int flags) @@ -947,6 +1073,44 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, return (0); } +/* ARGSUSED */ +static int +zfsctl_shares_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *a_ncookies; + u_long **a_cookies; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + uio_t *uiop = ap->a_uio; + cred_t *cr = ap->a_cred; + int *eofp = ap->a_eofflag; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ap->a_ncookies, ap->a_cookies); + VN_RELE(ZTOV(dzp)); + } else { + *eofp = 1; + error = ENOENT; + } + + ZFS_EXIT(zfsvfs); + return (error); +} + /* * pvp is the '.zfs' directory (zfsctl_node_t). * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). @@ -973,6 +1137,51 @@ zfsctl_mknode_snapdir(vnode_t *pvp) return (vp); } +vnode_t * +zfsctl_mknode_shares(vnode_t *pvp) +{ + vnode_t *vp; + zfsctl_node_t *sdp; + + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, + &zfsctl_ops_shares, NULL, NULL, MAXNAMELEN, + NULL, NULL); + sdp = vp->v_data; + sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; + return (vp); + +} + +/* ARGSUSED */ +static int +zfsctl_shares_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + cred_t *cr = ap->a_cred; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_GETATTR(ZTOV(dzp), vap, cr); + VN_RELE(ZTOV(dzp)); + } + ZFS_EXIT(zfsvfs); + return (error); +} + /* ARGSUSED */ static int zfsctl_snapdir_getattr(ap) @@ -1061,7 +1270,6 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) VN_HOLD(vp); zcp = vp->v_data; zcp->zc_id = objset; - VFS_HOLD(vp->v_vfsp); VOP_UNLOCK(vp, 0); return (vp); @@ -1112,7 +1320,6 @@ zfsctl_snapshot_inactive(ap) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); end: - VFS_RELE(vp->v_vfsp); /* * Dispose of the vnode for the snapshot mount point. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c index 34b17e4..3ac4741 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -581,24 +581,6 @@ zfs_rmnode(znode_t *zp) ASSERT(zp->z_phys->zp_links == 0); /* - * If this is a ZIL replay then leave the object in the unlinked set. - * Otherwise we can get a deadlock, because the delete can be - * quite large and span multiple tx's and txgs, but each replay - * creates a tx to atomically run the replay function and mark the - * replay record as complete. We deadlock trying to start a tx in - * a new txg to further the deletion but can't because the replay - * tx hasn't finished. - * - * We actually delete the object if we get a failure to create an - * object in zil_replay_log_record(), or after calling zil_replay(). - */ - if (zfsvfs->z_assign >= TXG_INITIAL) { - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - - /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && @@ -842,7 +824,8 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) znode_t *xzp; dmu_tx_t *tx; int error; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; *xvpp = NULL; @@ -855,37 +838,41 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) return (error); #endif + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + return (EDQUOT); + } + tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + zfs_acl_ids_free(&acl_ids); + if (error == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + ASSERT(xzp->z_phys->zp_parent == zp->z_id); dmu_buf_will_dirty(zp->z_dbuf, tx); zp->z_phys->zp_xattr = xzp->z_id; (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); *xvpp = ZTOV(xzp); @@ -959,7 +946,7 @@ top: error = zfs_make_xattrdir(zp, &va, xvpp, cr); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } @@ -990,7 +977,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) uid_t fowner; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + if (zdp->z_zfsvfs->z_replay) return (0); if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c index 17e4b0a..63ae13a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -101,7 +101,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, char buf[1024]; struct sbuf sb; struct timespec ts; - int state; /* * If we are doing a spa_tryimport(), ignore errors. @@ -134,16 +133,31 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; - /* - * If the vdev has already been marked as failing due to a - * failed probe, then ignore any subsequent I/O errors, as the - * DE will automatically fault the vdev on the first such - * failure. - */ - if (vd != NULL && - (!vdev_readable(vd) || !vdev_writeable(vd)) && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) - return; + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. + */ + if (zio->io_vd == vd && + !vdev_accessible(vd, zio) && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + return; + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return; + } } nanotime(&ts); @@ -197,20 +211,13 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, */ /* - * If we are importing a faulted pool, then we treat it like an open, - * not an import. Otherwise, the DE will ignore all faults during - * import, since the default behavior is to mark the devices as - * persistently unavailable, not leave them in the faulted state. - */ - state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; - - /* * Generic payload members common to all ereports. */ sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)); sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)); - sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, state); + sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, + spa->spa_load_state); if (spa != NULL) { sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, @@ -227,12 +234,15 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, vd->vdev_guid); sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, vd->vdev_ops->vdev_op_type); - if (vd->vdev_path) + if (vd->vdev_path != NULL) sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path); - if (vd->vdev_devid) + if (vd->vdev_devid != NULL) sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid); + if (vd->vdev_fru != NULL) + sbuf_printf(&sb, " %s=%s", + FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru); if (pvd != NULL) { sbuf_printf(&sb, " %s=%ju", diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c index dfec3ed..4d5b194 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -47,8 +47,10 @@ * During file system initialization the nvlist(s) are read and * two AVL trees are created. One tree is keyed by the index number * and the other by the domain string. Nodes are never removed from - * trees, but new entries may be added. If a new entry is added then the - * on-disk packed nvlist will also be updated. + * trees, but new entries may be added. If a new entry is added then + * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then + * be responsible for calling zfs_fuid_sync() to sync the changes to disk. + * */ #define FUID_IDX "fuid_idx" @@ -97,6 +99,15 @@ domain_compare(const void *arg1, const void *arg2) return (val > 0 ? 1 : -1); } +void +zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); +} + /* * load initial fuid domain and idx trees. This function is used by * both the kernel and zdb. @@ -108,12 +119,9 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, dmu_buf_t *db; uint64_t fuid_size; - avl_create(idx_tree, idx_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); - avl_create(domain_tree, domain_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); - - VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); + ASSERT(fuid_obj != 0); + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, + FTAG, &db)); fuid_size = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); @@ -125,7 +133,8 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, int i; packed = kmem_alloc(fuid_size, KM_SLEEP); - VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); + VERIFY(dmu_read(os, fuid_obj, 0, + fuid_size, packed, DMU_READ_PREFETCH) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, @@ -189,10 +198,8 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) * Load the fuid table(s) into memory. */ static void -zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +zfs_fuid_init(zfsvfs_t *zfsvfs) { - int error = 0; - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); if (zfsvfs->z_fuid_loaded) { @@ -200,41 +207,101 @@ zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) return; } - if (zfsvfs->z_fuid_obj == 0) { - - /* first make sure we need to allocate object */ - - error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); - if (error == ENOENT && tx != NULL) { - zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, sizeof (uint64_t), 1, - &zfsvfs->z_fuid_obj, tx) == 0); - } - } + zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (zfsvfs->z_fuid_obj != 0) { zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - zfsvfs->z_fuid_loaded = B_TRUE; } + zfsvfs->z_fuid_loaded = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * sync out AVL trees to persistent storage. + */ +void +zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + nvlist_t *nvp; + nvlist_t **fuids; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + fuid_domain_t *domnode; + int numnodes; + int i; + + if (!zfsvfs->z_fuid_dirty) { + return; + } + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + /* + * First see if table needs to be created? + */ + if (zfsvfs->z_fuid_obj == 0) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); + fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); + for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { + VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, numnodes) == 0); + for (i = 0; i != numnodes; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, numnodes * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + zfsvfs->z_fuid_dirty = B_FALSE; rw_exit(&zfsvfs->z_fuid_lock); } /* * Query domain table for a given domain. * - * If domain isn't found it is added to AVL trees and - * the results are pushed out to disk. + * If domain isn't found and addok is set, it is added to AVL trees and + * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be + * necessary for the caller or another thread to detect the dirty table + * and sync out the changes. */ int -zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, - dmu_tx_t *tx) +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, + char **retdomain, boolean_t addok) { fuid_domain_t searchnode, *findnode; avl_index_t loc; @@ -246,16 +313,16 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, * for the user nobody. */ if (domain[0] == '\0') { - *retdomain = nulldomain; + if (retdomain) + *retdomain = nulldomain; return (0); } searchnode.f_ksid = ksid_lookupdomain(domain); - if (retdomain) { + if (retdomain) *retdomain = searchnode.f_ksid->kd_name; - } if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, tx); + zfs_fuid_init(zfsvfs); retry: rw_enter(&zfsvfs->z_fuid_lock, rw); @@ -265,15 +332,9 @@ retry: rw_exit(&zfsvfs->z_fuid_lock); ksiddomain_rele(searchnode.f_ksid); return (findnode->f_idx); - } else { + } else if (addok) { fuid_domain_t *domnode; - nvlist_t *nvp; - nvlist_t **fuids; uint64_t retidx; - size_t nvsize = 0; - char *packed; - dmu_buf_t *db; - int i = 0; if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { rw_exit(&zfsvfs->z_fuid_lock); @@ -288,46 +349,12 @@ retry: avl_add(&zfsvfs->z_fuid_domain, domnode); avl_add(&zfsvfs->z_fuid_idx, domnode); - /* - * Now resync the on-disk nvlist. - */ - VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - domnode = avl_first(&zfsvfs->z_fuid_domain); - fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); - while (domnode) { - VERIFY(nvlist_alloc(&fuids[i], - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, - domnode->f_idx) == 0); - VERIFY(nvlist_add_uint64(fuids[i], - FUID_OFFSET, 0) == 0); - VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, - domnode->f_ksid->kd_name) == 0); - domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); - } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, retidx) == 0); - for (i = 0; i != retidx; i++) - nvlist_free(fuids[i]); - kmem_free(fuids, retidx * sizeof (void *)); - VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(nvp, &packed, &nvsize, - NV_ENCODE_XDR, KM_SLEEP) == 0); - nvlist_free(nvp); - zfsvfs->z_fuid_size = nvsize; - dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); - kmem_free(packed, zfsvfs->z_fuid_size); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, - FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; - dmu_buf_rele(db, FTAG); - + zfsvfs->z_fuid_dirty = B_TRUE; rw_exit(&zfsvfs->z_fuid_lock); return (retidx); + } else { + rw_exit(&zfsvfs->z_fuid_lock); + return (-1); } } @@ -337,7 +364,7 @@ retry: * Returns a pointer from an avl node of the domain string. * */ -static char * +const char * zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) { char *domain; @@ -346,7 +373,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) return (NULL); if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, NULL); + zfs_fuid_init(zfsvfs); rw_enter(&zfsvfs->z_fuid_lock, RW_READER); @@ -374,7 +401,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, cred_t *cr, zfs_fuid_type_t type) { uint32_t index = FUID_INDEX(fuid); - char *domain; + const char *domain; uid_t id; if (index == 0) @@ -443,6 +470,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, } if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + /* * Now allocate fuid entry and add it on the end of the list */ @@ -467,7 +495,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, */ uint64_t zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, - dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) + cred_t *cr, zfs_fuid_info_t **fuidp) { uint64_t idx; ksid_t *ksid; @@ -493,7 +521,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, rid = ksid_getrid(ksid); domain = ksid_getdomain(ksid); - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); @@ -517,7 +545,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, */ uint64_t zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, - zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) + zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) { const char *domain; char *kdomain; @@ -525,7 +553,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, uint32_t rid; idmap_stat status; uint64_t idx; - boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); zfs_fuid_t *zfuid = NULL; zfs_fuid_info_t *fuidp; @@ -540,7 +567,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) return (id); - if (is_replay) { + if (zfsvfs->z_replay) { fuidp = zfsvfs->z_fuid_replay; /* @@ -592,10 +619,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, #endif } - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); - if (!is_replay) - zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); + if (!zfsvfs->z_replay) + zfs_fuid_node_add(fuidpp, kdomain, + rid, idx, id, type); else if (zfuid != NULL) { list_remove(&fuidp->z_fuids, zfuid); kmem_free(zfuid, sizeof (zfs_fuid_t)); @@ -668,11 +696,14 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp) boolean_t zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) { +#ifdef sun ksid_t *ksid = crgetsid(cr, KSID_GROUP); + ksidlist_t *ksidlist = crgetsidlist(cr); +#endif /* sun */ uid_t gid; -#ifdef TODO - if (ksid) { +#ifdef sun + if (ksid && ksidlist) { int i; ksid_t *ksid_groups; ksidlist_t *ksidlist = crgetsidlist(cr); @@ -689,7 +720,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) return (B_TRUE); } } else { - char *domain; + const char *domain; domain = zfs_fuid_find_by_idx(zfsvfs, idx); ASSERT(domain != NULL); @@ -705,7 +736,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) } } } -#endif +#endif /* sun */ /* * Not found in ksidlist, check posix groups @@ -713,4 +744,19 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); return (groupmember(gid, cr)); } + +void +zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } +} #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 75f1ad0..6b6fc75 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,6 +41,7 @@ #include <sys/cmn_err.h> #include <sys/stat.h> #include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> #include <sys/zfs_znode.h> #include <sys/zap.h> #include <sys/spa.h> @@ -81,17 +82,29 @@ extern void zfs_fini(void); typedef int zfs_ioc_func_t(zfs_cmd_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); +typedef enum { + NO_NAME, + POOL_NAME, + DATASET_NAME +} zfs_ioc_namecheck_t; + typedef struct zfs_ioc_vec { zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; - enum { - NO_NAME, - POOL_NAME, - DATASET_NAME - } zvec_namecheck; + zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_his_log; + boolean_t zvec_pool_check; } zfs_ioc_vec_t; +/* This array is indexed by zfs_userquota_prop_t */ +static const char *userquota_perms[] = { + ZFS_DELEG_PERM_USERUSED, + ZFS_DELEG_PERM_USERQUOTA, + ZFS_DELEG_PERM_GROUPUSED, + ZFS_DELEG_PERM_GROUPQUOTA, +}; + +static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static void clear_props(char *dataset, nvlist_t *props, nvlist_t *newprops); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); @@ -391,6 +404,30 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) ZFS_DELEG_PERM_SEND, cr)); } +static int +zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) +{ + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EPERM); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); +} + int zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) { @@ -400,25 +437,20 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) if (secpolicy_nfs(cr) == 0) { return (0); } else { - vnode_t *vp; - int error; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ + return (zfs_secpolicy_deleg_share(zc, cr)); + } +} - if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (EPERM); - } +int +zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) +{ + if (!INGLOBALZONE(curthread)) + return (EPERM); - VN_RELE(vp); - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_SHARE, cr)); + if (secpolicy_smb(cr) == 0) { + return (0); + } else { + return (zfs_secpolicy_deleg_share(zc, cr)); } } @@ -699,6 +731,55 @@ zfs_secpolicy_operator(const char *dataset, cred_t *cr) return (0); } +static int +zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + if (zc->zc_value[0] == 0) { + /* + * They are asking about a posix uid/gid. If it's + * themself, allow it. + */ + if (zc->zc_objset_type == ZFS_PROP_USERUSED || + zc->zc_objset_type == ZFS_PROP_USERQUOTA) { + if (zc->zc_guid == crgetuid(cr)) + return (0); + } else { + if (groupmember(zc->zc_guid, cr)) + return (0); + } + } + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, cr)); +} + /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ @@ -766,6 +847,69 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) } static int +getzfsvfs(const char *dsname, zfsvfs_t **zvp) +{ + objset_t *os; + int error; + + error = dmu_objset_open(dsname, DMU_OST_ZFS, + DS_MODE_USER | DS_MODE_READONLY, &os); + if (error) + return (error); + + mutex_enter(&os->os->os_user_ptr_lock); + *zvp = dmu_objset_get_user(os); + if (*zvp) { + VFS_HOLD((*zvp)->z_vfs); + } else { + error = ESRCH; + } + mutex_exit(&os->os->os_user_ptr_lock); + dmu_objset_close(os); + return (error); +} + +/* + * Find a zfsvfs_t for a mounted filesystem, or create our own, in which + * case its z_vfs will be NULL, and it will be opened as the owner. + */ +static int +zfsvfs_hold(const char *name, boolean_t readonly, void *tag, zfsvfs_t **zvp) +{ + int error = 0; + int mode = DS_MODE_OWNER | (readonly ? DS_MODE_READONLY : 0); + + if (getzfsvfs(name, zvp) != 0) + error = zfsvfs_create(name, mode, zvp); + if (error == 0) { + rrw_enter(&(*zvp)->z_teardown_lock, RW_READER, tag); + if ((*zvp)->z_unmounted) { + /* + * XXX we could probably try again, since the unmounting + * thread should be just about to disassociate the + * objset from the zfsvfs. + */ + rrw_exit(&(*zvp)->z_teardown_lock, tag); + return (EBUSY); + } + } + return (error); +} + +static void +zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) +{ + rrw_exit(&zfsvfs->z_teardown_lock, tag); + + if (zfsvfs->z_vfs) { + VFS_RELE(zfsvfs->z_vfs); + } else { + dmu_objset_close(zfsvfs->z_os); + zfsvfs_free(zfsvfs); + } +} + +static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; @@ -864,7 +1008,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) guid != zc->zc_guid) error = EINVAL; else if (zc->zc_cookie) - error = spa_import_faulted(zc->zc_name, config, + error = spa_import_verbatim(zc->zc_name, config, props); else error = spa_import(zc->zc_name, config, props); @@ -1189,7 +1333,7 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); + error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); @@ -1212,6 +1356,23 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc) return (error); } +static int +zfs_ioc_vdev_setfru(zfs_cmd_t *zc) +{ + spa_t *spa; + char *fru = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setfru(spa, guid, fru); + spa_close(spa, FTAG); + return (error); +} + /* * inputs: * zc_name name of filesystem @@ -1319,6 +1480,23 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) return (err); } +static boolean_t +dataset_name_hidden(const char *name) +{ + /* + * Skip over datasets that are not visible in this zone, + * internal datasets (which have a $ in their name), and + * temporary datasets (which have a % in their name). + */ + if (strchr(name, '$') != NULL) + return (B_TRUE); + if (strchr(name, '%') != NULL) + return (B_TRUE); + if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL)) + return (B_TRUE); + return (B_FALSE); +} + /* * inputs: * zc_name name of filesystem @@ -1327,6 +1505,7 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) * * outputs: * zc_name name of next filesystem + * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist @@ -1350,12 +1529,16 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); + /* + * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 + * but is not declared void because its called by dmu_objset_find(). + */ if (zc->zc_cookie == 0) { uint64_t cookie = 0; int len = sizeof (zc->zc_name) - (p - zc->zc_name); while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) - dmu_objset_prefetch(p, NULL); + (void) dmu_objset_prefetch(p, NULL); } do { @@ -1364,15 +1547,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && !INGLOBALZONE(curthread) && - !zone_dataset_visible(zc->zc_name, NULL)); + } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_close(os); - /* - * If it's a hidden dataset (ie. with a '$' in its name), don't - * try to get stats for it. Userland will skip over it. - */ - if (error == 0 && strchr(zc->zc_name, '$') == NULL) + if (error == 0) error = zfs_ioc_objset_stats(zc); /* fill in the stats */ return (error); @@ -1396,14 +1574,15 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *os; int error; - if (zc->zc_cookie == 0) - dmu_objset_find(zc->zc_name, dmu_objset_prefetch, - NULL, DS_FIND_SNAPSHOTS); error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os); if (error) return (error == ENOENT ? ESRCH : error); + if (zc->zc_cookie == 0) { + (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, + NULL, DS_FIND_SNAPSHOTS); + } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. @@ -1432,13 +1611,16 @@ int zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) { nvpair_t *elem; - int error; + int error = 0; uint64_t intval; char *strval; + nvlist_t *genericnvl; + boolean_t issnap = (strchr(name, '@') != NULL); /* * First validate permission to set all of the properties */ + VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); @@ -1449,16 +1631,35 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) * If this is a user-defined property, it must be a * string, and there is no further validation to do. */ - if (!zfs_prop_user(propname) || - nvpair_type(elem) != DATA_TYPE_STRING) - return (EINVAL); + if (zfs_prop_user(propname) && + nvpair_type(elem) == DATA_TYPE_STRING) { + if (error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + continue; + } - if (error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_USERPROP, CRED())) - return (error); - continue; + if (!issnap && zfs_prop_userquota(propname) && + nvpair_type(elem) == DATA_TYPE_UINT64_ARRAY) { + const char *perm; + const char *up = zfs_userquota_prop_prefixes + [ZFS_PROP_USERQUOTA]; + if (strncmp(propname, up, strlen(up)) == 0) + perm = ZFS_DELEG_PERM_USERQUOTA; + else + perm = ZFS_DELEG_PERM_GROUPQUOTA; + if (error = zfs_secpolicy_write_perms(name, + perm, CRED())) + return (error); + continue; + } + + return (EINVAL); } + if (issnap) + return (EINVAL); + if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) return (error); @@ -1494,8 +1695,7 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) break; case ZFS_PROP_COPIES: - if (zfs_earlier_version(name, - SPA_VERSION_DITTO_BLOCKS)) + if (zfs_earlier_version(name, SPA_VERSION_DITTO_BLOCKS)) return (ENOTSUP); break; @@ -1520,71 +1720,115 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) zfs_prop_t prop = zfs_name_to_prop(propname); if (prop == ZPROP_INVAL) { - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else - return (error); + if (zfs_prop_userquota(propname)) { + uint64_t *valary; + unsigned int vallen; + const char *domain; + zfs_userquota_prop_t type; + uint64_t rid; + uint64_t quota; + zfsvfs_t *zfsvfs; + + VERIFY(nvpair_value_uint64_array(elem, + &valary, &vallen) == 0); + VERIFY(vallen == 3); + type = valary[0]; + rid = valary[1]; + quota = valary[2]; + domain = propname + + strlen(zfs_userquota_prop_prefixes[type]); + + error = zfsvfs_hold(name, B_FALSE, FTAG, + &zfsvfs); + if (error == 0) { + error = zfs_set_userquota(zfsvfs, + type, domain, rid, quota); + zfsvfs_rele(zfsvfs, FTAG); + } + if (error == 0) + continue; + else + goto out; + } else if (zfs_prop_user(propname)) { + VERIFY(nvpair_value_string(elem, &strval) == 0); + error = dsl_prop_set(name, propname, 1, + strlen(strval) + 1, strval); + if (error == 0) + continue; + else + goto out; + } } switch (prop) { case ZFS_PROP_QUOTA: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dir_set_quota(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_REFQUOTA: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dataset_set_quota(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_RESERVATION: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dir_set_reservation(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_REFRESERVATION: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dataset_set_reservation(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_VOLSIZE: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = zvol_set_volsize(name, ddi_driver_major(zfs_dip), intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_VOLBLOCKSIZE: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = zvol_set_volblocksize(name, intval)) != 0) - return (error); + goto out; break; case ZFS_PROP_VERSION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zfs_set_version(name, intval)) != 0) - return (error); + { + zfsvfs_t *zfsvfs; + + if ((error = nvpair_value_uint64(elem, &intval)) != 0) + goto out; + if ((error = zfsvfs_hold(name, B_FALSE, FTAG, + &zfsvfs)) != 0) + goto out; + error = zfs_set_version(zfsvfs, intval); + zfsvfs_rele(zfsvfs, FTAG); + + if (error == 0 && intval >= ZPL_VERSION_USERSPACE) { + zfs_cmd_t zc = { 0 }; + (void) strcpy(zc.zc_name, name); + (void) zfs_ioc_userspace_upgrade(&zc); + } + if (error) + goto out; break; + } default: if (nvpair_type(elem) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != - PROP_TYPE_STRING) - return (EINVAL); - VERIFY(nvpair_value_string(elem, &strval) == 0); - if ((error = dsl_prop_set(name, - nvpair_name(elem), 1, strlen(strval) + 1, - strval)) != 0) - return (error); + PROP_TYPE_STRING) { + error = EINVAL; + goto out; + } } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { const char *unused; @@ -1594,35 +1838,72 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: - return (EINVAL); + error = EINVAL; + goto out; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, - intval, &unused) != 0) - return (EINVAL); + intval, &unused) != 0) { + error = EINVAL; + goto out; + } break; default: cmn_err(CE_PANIC, "unknown property type"); break; } - - if ((error = dsl_prop_set(name, propname, - 8, 1, &intval)) != 0) - return (error); } else { - return (EINVAL); + error = EINVAL; + goto out; } - break; + if ((error = nvlist_add_nvpair(genericnvl, elem)) != 0) + goto out; } } + if (nvlist_next_nvpair(genericnvl, NULL) != NULL) { + error = dsl_props_set(name, genericnvl); + } +out: + nvlist_free(genericnvl); + return (error); +} + +/* + * Check that all the properties are valid user properties. + */ +static int +zfs_check_userprops(char *fsname, nvlist_t *nvl) +{ + nvpair_t *elem = NULL; + int error = 0; + + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + char *valstr; + + if (!zfs_prop_user(propname) || + nvpair_type(elem) != DATA_TYPE_STRING) + return (EINVAL); + + if (error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + + if (strlen(propname) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + VERIFY(nvpair_value_string(elem, &valstr) == 0); + if (strlen(valstr) >= ZAP_MAXVALUELEN) + return (E2BIG); + } return (0); } /* * inputs: * zc_name name of filesystem - * zc_value name of property to inherit + * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie clear existing local props? * @@ -1679,11 +1960,30 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) nvlist_t *props; spa_t *spa; int error; + nvpair_t *elem; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) return (error); + /* + * If the only property is the configfile, then just do a spa_lookup() + * to handle the faulted case. + */ + elem = nvlist_next_nvpair(props, NULL); + if (elem != NULL && strcmp(nvpair_name(elem), + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && + nvlist_next_nvpair(props, elem) == NULL) { + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_config_sync(spa, B_FALSE, B_TRUE); + } + mutex_exit(&spa_namespace_lock); + if (spa != NULL) + return (0); + } + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); @@ -1704,27 +2004,34 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) int error; nvlist_t *nvp = NULL; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_prop_get(spa, &nvp); + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + /* + * If the pool is faulted, there may be properties we can still + * get (such as altroot and cachefile), so attempt to get them + * anyway. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) + error = spa_prop_get(spa, &nvp); + mutex_exit(&spa_namespace_lock); + } else { + error = spa_prop_get(spa, &nvp); + spa_close(spa, FTAG); + } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = EFAULT; - spa_close(spa, FTAG); - - if (nvp) - nvlist_free(nvp); + nvlist_free(nvp); return (error); } static int zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) { -#ifdef TODO +#ifdef sun nvlist_t *nvp; int error; uint32_t uid; @@ -1767,9 +2074,9 @@ zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); crfree(usercred); return (error); -#else +#else /* sun */ return (EPERM); -#endif +#endif /* sun */ } /* @@ -1920,11 +2227,10 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) * processing. */ static int -zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver, +zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - uint64_t zplver = default_zplver; uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; @@ -2012,6 +2318,8 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, ASSERT(cp != NULL); cp[0] = '\0'; + if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE)) + zplver = ZPL_VERSION_USERSPACE - 1; if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { zplver = ZPL_VERSION_FUID - 1; fuids_ok = B_FALSE; @@ -2190,32 +2498,12 @@ zfs_ioc_create(zfs_cmd_t *zc) return (error); } -struct snap_prop_arg { - nvlist_t *nvprops; - const char *snapname; -}; - -static int -set_snap_props(char *name, void *arg) -{ - struct snap_prop_arg *snpa = arg; - int len = strlen(name) + strlen(snpa->snapname) + 2; - char *buf = kmem_alloc(len, KM_SLEEP); - int err; - - (void) snprintf(buf, len, "%s@%s", name, snpa->snapname); - err = zfs_set_prop_nvlist(buf, snpa->nvprops); - if (err) - (void) dmu_objset_destroy(buf); - kmem_free(buf, len); - return (err); -} - /* * inputs: * zc_name name of filesystem * zc_value short name of snapshot * zc_cookie recursive flag + * zc_nvlist_src[_size] property list * * outputs: none */ @@ -2234,26 +2522,20 @@ zfs_ioc_snapshot(zfs_cmd_t *zc) &nvprops)) != 0) return (error); - error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive); + error = zfs_check_userprops(zc->zc_name, nvprops); + if (error) + goto out; - /* - * It would be nice to do this atomically. - */ - if (error == 0) { - struct snap_prop_arg snpa; - snpa.nvprops = nvprops; - snpa.snapname = zc->zc_value; - if (recursive) { - error = dmu_objset_find(zc->zc_name, - set_snap_props, &snpa, DS_FIND_CHILDREN); - if (error) { - (void) dmu_snapshots_destroy(zc->zc_name, - zc->zc_value); - } - } else { - error = set_snap_props(zc->zc_name, &snpa); - } + if (nvprops != NULL && nvlist_next_nvpair(nvprops, NULL) != NULL && + zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { + error = ENOTSUP; + goto out; } + + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, + nvprops, recursive); + +out: nvlist_free(nvprops); return (error); } @@ -2358,31 +2640,19 @@ zfs_ioc_rollback(zfs_cmd_t *zc) if (error) return (error); - if (dmu_objset_type(os) == DMU_OST_ZFS) { - mutex_enter(&os->os->os_user_ptr_lock); - zfsvfs = dmu_objset_get_user(os); - if (zfsvfs != NULL) - VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&os->os->os_user_ptr_lock); - } - - if (zfsvfs != NULL) { - char *osname; + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { int mode; - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); + error = zfs_suspend_fs(zfsvfs, NULL, &mode); if (error == 0) { int resume_err; - ASSERT(strcmp(osname, zc->zc_name) == 0); error = dmu_objset_rollback(os); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); + resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode); error = error ? error : resume_err; } else { dmu_objset_close(os); } - kmem_free(osname, MAXNAMELEN); VFS_RELE(zfsvfs->z_vfs); } else { error = dmu_objset_rollback(os); @@ -2497,32 +2767,26 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (EBADF); } - if (dmu_objset_open(tofs, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - /* - * Try to get the zfsvfs for the receiving objset. - * There won't be one if we're operating on a zvol, - * if the objset doesn't exist yet, or is not mounted. - */ - mutex_enter(&os->os->os_user_ptr_lock); - if (zfsvfs = dmu_objset_get_user(os)) { - if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { - mutex_exit(&os->os->os_user_ptr_lock); - dmu_objset_close(os); - zfsvfs = NULL; - error = EBUSY; - goto out; - } - VFS_HOLD(zfsvfs->z_vfs); + if (getzfsvfs(tofs, &zfsvfs) == 0) { + if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { + VFS_RELE(zfsvfs->z_vfs); + zfsvfs = NULL; + error = EBUSY; + goto out; } - mutex_exit(&os->os->os_user_ptr_lock); - /* * If new properties are supplied, they are to completely * replace the existing ones, so stash away the existing ones. */ if (props) - (void) dsl_prop_get_all(os, &origprops, TRUE); + (void) dsl_prop_get_all(zfsvfs->z_os, &origprops, TRUE); + } else if (props && dmu_objset_open(tofs, DMU_OST_ANY, + DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + /* + * Get the props even if there was no zfsvfs (zvol or + * unmounted zpl). + */ + (void) dsl_prop_get_all(os, &origprops, TRUE); dmu_objset_close(os); } @@ -2762,11 +3026,12 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * Resume any suspended I/Os. */ - zio_resume(spa); + if (zio_resume(spa) != 0) + error = EIO; spa_close(spa, FTAG); - return (0); + return (error); } /* @@ -2793,7 +3058,121 @@ zfs_ioc_promote(zfs_cmd_t *zc) return (dsl_dataset_promote(zc->zc_name)); } -#ifdef TODO +/* + * Retrieve a single {user|group}{used|quota}@... property. + * + * inputs: + * zc_name name of filesystem + * zc_objset_type zfs_userquota_prop_t + * zc_value domain name (eg. "S-1-234-567-89") + * zc_guid RID/UID/GID + * + * outputs: + * zc_cookie property value + */ +static int +zfs_ioc_userspace_one(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs); + if (error) + return (error); + + error = zfs_userspace_one(zfsvfs, + zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_objset_type zfs_userquota_prop_t + * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) + * + * outputs: + * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) + * zc_cookie zap cursor + */ +static int +zfs_ioc_userspace_many(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + error = zfsvfs_hold(zc->zc_name, B_TRUE, FTAG, &zfsvfs); + if (error) + return (error); + + int bufsize = zc->zc_nvlist_dst_size; + void *buf = kmem_alloc(bufsize, KM_SLEEP); + + error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, + buf, &zc->zc_nvlist_dst_size); + + if (error == 0) { + error = xcopyout(buf, + (void *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size); + } + kmem_free(buf, bufsize); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * none + */ +static int +zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + zfsvfs_t *zfsvfs; + + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + if (!dmu_objset_userused_enabled(zfsvfs->z_os->os)) { + /* + * If userused is not enabled, it may be because the + * objset needs to be closed & reopened (to grow the + * objset_phys_t). Suspend/resume the fs will do that. + */ + int mode; + error = zfs_suspend_fs(zfsvfs, NULL, &mode); + if (error == 0) { + error = zfs_resume_fs(zfsvfs, + zc->zc_name, mode); + } + } + if (error == 0) + error = dmu_objset_userspace_upgrade(zfsvfs->z_os); + VFS_RELE(zfsvfs->z_vfs); + } else { + error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, + DS_MODE_USER, &os); + if (error) + return (error); + + error = dmu_objset_userspace_upgrade(os); + dmu_objset_close(os); + } + + return (error); +} + +#ifdef sun /* * We don't want to have a hard dependency * against some special symbols in sharefs @@ -2811,10 +3190,10 @@ int zfs_smbshare_inited; ddi_modhandle_t nfs_mod; ddi_modhandle_t sharefs_mod; ddi_modhandle_t smbsrv_mod; -#endif +#endif /* sun */ kmutex_t zfs_share_lock; -#ifdef TODO +#ifdef sun static int zfs_init_sharefs() { @@ -2834,12 +3213,12 @@ zfs_init_sharefs() } return (0); } -#endif +#endif /* sun */ static int zfs_ioc_share(zfs_cmd_t *zc) { -#ifdef TODO +#ifdef sun int error; int opcode; @@ -2911,7 +3290,7 @@ zfs_ioc_share(zfs_cmd_t *zc) if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? - B_TRUE : B_FALSE)) { + B_TRUE: B_FALSE)) { return (error); } break; @@ -2929,9 +3308,168 @@ zfs_ioc_share(zfs_cmd_t *zc) zc->zc_share.z_sharemax); return (error); -#else +#else /* sun */ return (ENOSYS); -#endif +#endif /* sun */ +} + +ace_t full_access[] = { + {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} +}; + +#ifdef sun +/* + * Remove all ACL files in shares dir + */ +static int +zfs_smb_acl_purge(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, + NULL, 0)) != 0) + break; + } + zap_cursor_fini(&zc); + return (error); +} +#endif /* sun */ + +static int +zfs_ioc_smb_acl(zfs_cmd_t *zc) +{ +#ifdef sun + vnode_t *vp; + znode_t *dzp; + vnode_t *resourcevp = NULL; + znode_t *sharedir; + zfsvfs_t *zfsvfs; + nvlist_t *nvlist; + char *src, *target; + vattr_t vattr; + vsecattr_t vsec; + int error = 0; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EINVAL); + } + + dzp = VTOZ(vp); + zfsvfs = dzp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + + /* + * Create share dir if its missing. + */ + mutex_enter(&zfsvfs->z_lock); + if (zfsvfs->z_shares_dir == 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, + ZFS_SHARES_DIR); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = zfs_create_share_dir(zfsvfs, tx); + dmu_tx_commit(tx); + } + if (error) { + mutex_exit(&zfsvfs->z_lock); + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + } + mutex_exit(&zfsvfs->z_lock); + + ASSERT(zfsvfs->z_shares_dir); + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + + switch (zc->zc_cookie) { + case ZFS_SMB_ACL_ADD: + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VREG; + vattr.va_mode = S_IFREG|0777; + vattr.va_uid = 0; + vattr.va_gid = 0; + + vsec.vsa_mask = VSA_ACE; + vsec.vsa_aclentp = &full_access; + vsec.vsa_aclentsz = sizeof (full_access); + vsec.vsa_aclcnt = 1; + + error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, + &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); + if (resourcevp) + VN_RELE(resourcevp); + break; + + case ZFS_SMB_ACL_REMOVE: + error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, + NULL, 0); + break; + + case ZFS_SMB_ACL_RENAME: + if ((error = get_nvlist(zc->zc_nvlist_src, + zc->zc_nvlist_src_size, &nvlist)) != 0) { + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || + nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, + &target)) { + VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); + ZFS_EXIT(zfsvfs); + return (error); + } + error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, + kcred, NULL, 0); + nvlist_free(nvlist); + break; + + case ZFS_SMB_ACL_PURGE: + error = zfs_smb_acl_purge(sharedir); + break; + + default: + error = EINVAL; + break; + } + + VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); + + ZFS_EXIT(zfsvfs); + + return (error); +#else /* !sun */ + return (EOPNOTSUPP); +#endif /* !sun */ } /* @@ -2956,60 +3494,125 @@ zfs_ioc_unjail(zfs_cmd_t *zc) } static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, - { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, - { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, - { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, - { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, - { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, - DATASET_NAME, B_FALSE }, - { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, - { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, - { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE }, - { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE } + { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_FALSE }, + { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_FALSE }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, + B_TRUE}, + { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_jail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_unjail, zfs_secpolicy_config, DATASET_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, + DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, + DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, + DATASET_NAME, B_FALSE, B_TRUE }, + { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE } }; +int +pool_status_check(const char *name, zfs_ioc_namecheck_t type) +{ + spa_t *spa; + char pool[ZFS_MAXNAMELEN]; + int error; + + ASSERT(type == POOL_NAME || type == DATASET_NAME); + + error = spa_open(name, &spa, FTAG); + if (error == 0) { + if (spa_suspended(spa)) + error = EAGAIN; + spa_close(spa, FTAG); + } + return (error); +} + static int zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) @@ -3035,11 +3638,17 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; + if (zfs_ioc_vec[vec].zvec_pool_check) + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; + if (zfs_ioc_vec[vec].zvec_pool_check) + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck); break; case NO_NAME: @@ -3051,7 +3660,7 @@ zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, error = zfs_ioc_vec[vec].zvec_func(zc); if (error == 0) { - if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) + if (zfs_ioc_vec[vec].zvec_his_log) zfs_log_history(zc); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c index 180196b..3105088 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c @@ -42,13 +42,33 @@ #include <sys/dmu.h> #include <sys/spa.h> #include <sys/zfs_fuid.h> +#include <sys/dsl_dataset.h> + +#define ZFS_HANDLE_REPLAY(zilog, tx) \ + if (zilog->zl_replay) { \ + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \ + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \ + zilog->zl_replaying_seq; \ + return; \ + } /* - * All the functions in this file are used to construct the log entries - * to record transactions. They allocate * an intent log transaction - * structure (itx_t) and save within it all the information necessary to - * possibly replay the transaction. The itx is then assigned a sequence - * number and inserted in the in-memory list anchored in the zilog. + * These zfs_log_* functions must be called within a dmu tx, in one + * of 2 contexts depending on zilog->z_replay: + * + * Non replay mode + * --------------- + * We need to record the transaction so that if it is committed to + * the Intent Log then it can be replayed. An intent log transaction + * structure (itx_t) is allocated and all the information necessary to + * possibly replay the transaction is saved in it. The itx is then assigned + * a sequence number and inserted in the in-memory list anchored in the zilog. + * + * Replay mode + * ----------- + * We need to mark the intent log record as replayed in the log header. + * This is done in the same transaction as the replay so that they + * commit atomically. */ int @@ -236,6 +256,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + /* * If we have FUIDs present then add in space for * domains and ACE fuid's if any. @@ -339,6 +361,8 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_remove_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; @@ -363,6 +387,8 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_link_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; @@ -390,6 +416,8 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; @@ -424,6 +452,8 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zilog == NULL) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); lr = (lr_rename_t *)&itx->itx_lr; lr->lr_sdoid = sdzp->z_id; @@ -456,6 +486,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, if (zilog == NULL || zp->z_unlinked) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + /* * Writes are handled in three different ways: * @@ -508,7 +540,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1) != 0) { + zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); itx = zil_itx_create(txtype, sizeof (*lr)); @@ -554,6 +586,8 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, if (zilog == NULL || zp->z_unlinked) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; @@ -583,6 +617,8 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, if (zilog == NULL || zp->z_unlinked) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + /* * If XVATTR set, then log record size needs to allow * for lr_attr_t + xvattr mask, mapsize and create time @@ -649,6 +685,8 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, if (zilog == NULL || zp->z_unlinked) return; + ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ + txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? TX_ACL_V0 : TX_ACL; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c index 658e539..c965247 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c @@ -278,9 +278,9 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, uint64_t txtype; int error; + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); - txtype = (int)lr->lr_common.lrc_txtype; if (txtype == TX_CREATE_ACL_ATTR || txtype == TX_MKDIR_ACL_ATTR) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -321,7 +321,7 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - switch ((int)lr->lr_common.lrc_txtype) { + switch (txtype) { case TX_CREATE_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + @@ -402,7 +402,8 @@ bail: VN_RELE(ZTOV(dzp)); - zfs_fuid_info_free(zfsvfs->z_fuid_replay); + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); @@ -425,9 +426,9 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) struct componentname cn; int error; + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); - txtype = (int)lr->lr_common.lrc_txtype; if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } @@ -477,7 +478,7 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) cn.cn_flags = SAVENAME; vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); - switch ((int)lr->lr_common.lrc_txtype) { + switch (txtype) { case TX_CREATE_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); @@ -709,6 +710,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) znode_t *zp; int error; ssize_t resid; + uint64_t orig_eof, eod; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -723,10 +725,65 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) error = 0; return (error); } + orig_eof = zp->z_phys->zp_size; + eod = lr->lr_offset + lr->lr_length; /* end of data for this write */ + + /* If it's a dmu_sync() block get the data and write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) + zil_get_replay_data(zfsvfs->z_log, lr); error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. + */ + if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */ + zp->z_phys->zp_size = eod; + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * TX_WRITE2 are only generated when dmu_sync() returns EALREADY + * meaning the pool block is already being synced. So now that we always write + * out full blocks, all we have to do is expand the eof if + * the file is grown. + */ +static int +zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + znode_t *zp; + int error; + uint64_t end; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log writes out of order, it's possible the + * file has been removed. In this case just drop the write + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + end = lr->lr_offset + lr->lr_length; + if (end > zp->z_phys->zp_size) { + ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz); + zp->z_phys->zp_size = end; + } + VN_RELE(ZTOV(zp)); return (error); @@ -944,4 +1001,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create_acl, /* TX_MKDIR_ACL */ zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ + zfs_replay_write2, /* TX_WRITE2 */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c index f0a75b5..4de8d8a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains the code to implement file range locking in * ZFS, although there isn't much specific to ZFS (all that comes to mind @@ -431,6 +429,8 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) new = kmem_alloc(sizeof (rl_t), KM_SLEEP); new->r_zp = zp; new->r_off = off; + if (len + off < off) /* overflow */ + len = UINT64_MAX - off; new->r_len = len; new->r_cnt = 1; /* assume it's going to be in the tree */ new->r_type = type; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index ce2c1e3..beb6401 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,7 +55,6 @@ #include <sys/dnlc.h> #include <sys/dmu_objset.h> #include <sys/spa_boot.h> -#include <sys/vdev_impl.h> /* VDEV_BOOT_VERSION */ struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); @@ -84,9 +83,6 @@ SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD, static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); -static int zfs_version_vdev_boot = VDEV_BOOT_VERSION; -SYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD, - &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); @@ -140,6 +136,7 @@ zfs_sync(vfs_t *vfsp, int waitfor) * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); @@ -147,10 +144,21 @@ zfs_sync(vfs_t *vfsp, int waitfor) return (error); ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (sys_shutdown && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, UINT64_MAX, 0); else - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + txg_wait_synced(dp, 0); ZFS_EXIT(zfsvfs); } else { /* @@ -483,6 +491,392 @@ unregister: } +static void +uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, + int64_t delta, dmu_tx_t *tx) +{ + uint64_t used = 0; + char buf[32]; + int err; + uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + + if (delta == 0) + return; + + (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); + err = zap_lookup(os, obj, buf, 8, 1, &used); + ASSERT(err == 0 || err == ENOENT); + /* no underflow/overflow */ + ASSERT(delta > 0 || used >= -delta); + ASSERT(delta < 0 || used + delta > used); + used += delta; + if (used == 0) + err = zap_remove(os, obj, buf, tx); + else + err = zap_update(os, obj, buf, 8, 1, &used, tx); + ASSERT(err == 0); +} + +static void +zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype, + void *oldbonus, void *newbonus, + uint64_t oldused, uint64_t newused, dmu_tx_t *tx) +{ + znode_phys_t *oldznp = oldbonus; + znode_phys_t *newznp = newbonus; + + if (bonustype != DMU_OT_ZNODE) + return; + + /* We charge 512 for the dnode (if it's allocated). */ + if (oldznp->zp_gen != 0) + oldused += DNODE_SIZE; + if (newznp->zp_gen != 0) + newused += DNODE_SIZE; + + if (oldznp->zp_uid == newznp->zp_uid) { + uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx); + } else { + uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx); + uidacct(os, B_FALSE, newznp->zp_uid, newused, tx); + } + + if (oldznp->zp_gid == newznp->zp_gid) { + uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx); + } else { + uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx); + uidacct(os, B_TRUE, newznp->zp_gid, newused, tx); + } +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + } + return (0); +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) { + *bufsizep = 0; + return (0); + } + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + fuidstr_to_sid(zfsvfs, za.za_name, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +/* + * buf must be big enough (eg, 32 bytes) + */ +static int +id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); + if (domainid == -1) + return (ENOENT); + } + fuid = FUID_ENCODE(domainid, rid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); + return (0); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[32]; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) + return (0); + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) + return (EINVAL); + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (ENOTSUP); + + objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : + &zfsvfs->z_groupquota_obj; + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); + if (err) + return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} + +boolean_t +zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +{ + char buf[32]; + uint64_t used, quota, usedobj, quotaobj; + int err; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) sprintf(buf, "%llx", (longlong_t)fuid); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +int +zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + uint64_t zval; + int i, error; + + if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL)) + return (error); + if (zval) + mode |= DS_MODE_READONLY; + + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os); + if (error == EROFS) { + mode |= DS_MODE_READONLY; + error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os); + } + if (error) + return (error); + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero up to z_hold_mtx[]. + */ + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error) { + goto out; + } else if (zfsvfs->z_version > ZPL_VERSION) { + (void) printf("Mismatched versions: File system " + "is version %llu on-disk format, which is " + "incompatible with this software version %lld!", + (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); + error = ENOTSUP; + goto out; + } + + if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) + goto out; + zfsvfs->z_norm = (int)zval; + + if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) + goto out; + zfsvfs->z_utf8 = (zval != 0); + + if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) + goto out; + zfsvfs->z_case = (uint_t)zval; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error) + goto out; + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error && error != ENOENT) + goto out; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + *zvp = zfsvfs; + return (0); + +out: + dmu_objset_close(os); + *zvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); +} + static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { @@ -551,8 +945,9 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector, zfs_unlinked_drain); + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } @@ -560,49 +955,52 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) return (0); } -static void -zfs_freezfsvfs(zfsvfs_t *zfsvfs) +void +zfsvfs_free(zfsvfs_t *zfsvfs) { + int i; + + zfs_fuid_destroy(zfsvfs); + mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_online_recv_lock); + mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + } +} + static int zfs_domount(vfs_t *vfsp, char *osname) { - uint64_t recordsize, readonly; + uint64_t recordsize, fsid_guid; int error = 0; - int mode; zfsvfs_t *zfsvfs; - znode_t *zp = NULL; + vnode_t *vp; ASSERT(vfsp); ASSERT(osname); - /* - * Initialize the zfs-specific filesystem structure. - * Should probably make this a kmem cache, shuffle fields, - * and just bzero up to z_hold_mtx[]. - */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs); + if (error) + return (error); zfsvfs->z_vfs = vfsp; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_assign = TXG_NOWAIT; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - rrw_init(&zfsvfs->z_teardown_lock); - rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) @@ -616,36 +1014,25 @@ zfs_domount(vfs_t *vfsp, char *osname) vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; - if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) - goto out; - - mode = DS_MODE_OWNER; - if (readonly) - mode |= DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); - if (error == EROFS) { - mode = DS_MODE_OWNER | DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, - &zfsvfs->z_os); - } - - if (error) - goto out; - - if (error = zfs_init_fs(zfsvfs, &zp)) - goto out; + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + vfsp->mnt_vfc->vfc_typenum & 0xFF; /* * Set features for file system. */ - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids) { - vfs_set_feature(vfsp, VFSFT_XVATTR); - vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); - vfs_set_feature(vfsp, VFSFT_ACLONCREATE); - } + zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); @@ -658,26 +1045,31 @@ zfs_domount(vfs_t *vfsp, char *osname) if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; - ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + + mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } vfs_mountedfrom(vfsp, osname); + /* Grab extra reference. */ + VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0); + VOP_UNLOCK(vp, 0); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { - if (zfsvfs->z_os) - dmu_objset_close(zfsvfs->z_os); - zfs_freezfsvfs(zfsvfs); + dmu_objset_close(zfsvfs->z_os); + zfsvfs_free(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); } @@ -779,24 +1171,12 @@ zfs_mount(vfs_t *vfsp) goto out; } -#if 0 /* CHECK THIS! Is probably needed for zfs_suser. */ if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { - error = EPERM; - goto out; - } -#else - if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) { - VOP_UNLOCK(mvp, 0); - goto out; - } - - if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) { VOP_UNLOCK(mvp, 0); goto out; } VOP_UNLOCK(mvp, 0); -#endif } secpolicy_fs_mount_clearopts(cr, vfsp); @@ -826,6 +1206,21 @@ zfs_mount(vfs_t *vfsp) DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); + + /* + * Add an extra VFS_HOLD on our parent vfs so that it can't + * disappear due to a forced unmount. + */ + if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) + VFS_HOLD(mvp->v_vfsp); + + /* + * Add an extra VFS_HOLD on our parent vfs so that it can't + * disappear due to a forced unmount. + */ + if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) + VFS_HOLD(mvp->v_vfsp); + out: return (error); } @@ -1025,9 +1420,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) static int zfs_umount(vfs_t *vfsp, int fflag) { + kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; - cred_t *cr = curthread->td_ucred; + cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); @@ -1052,7 +1448,7 @@ zfs_umount(vfs_t *vfsp, int fflag) if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); - ret = vflush(vfsp, 0, 0, curthread); + ret = vflush(vfsp, 0, 0, td); ASSERT(ret == EBUSY); if (!(fflag & MS_FORCE)) { if (zfsvfs->z_ctldir->v_count > 1) @@ -1077,7 +1473,7 @@ zfs_umount(vfs_t *vfsp, int fflag) /* * Flush all the files. */ - ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, curthread); + ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) { if (!zfsvfs->z_issnap) { zfsctl_create(zfsvfs); @@ -1304,15 +1700,16 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) * 'z_teardown_inactive_lock' write held. */ int -zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) +zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); - *mode = zfsvfs->z_os->os_mode; - dmu_objset_name(zfsvfs->z_os, name); + *modep = zfsvfs->z_os->os_mode; + if (name) + dmu_objset_name(zfsvfs->z_os, name); dmu_objset_close(zfsvfs->z_os); return (0); @@ -1371,13 +1768,15 @@ static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; - int i; - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); + /* + * If this is a snapshot, we have an extra VFS_HOLD on our parent + * from zfs_mount(). Release it here. + */ + if (zfsvfs->z_issnap) + VFS_RELE(zfsvfs->z_parent->z_vfs); - zfs_fuid_destroy(zfsvfs); - zfs_freezfsvfs(zfsvfs); + zfsvfs_free(zfsvfs); atomic_add_32(&zfs_active_fs_count, -1); } @@ -1438,6 +1837,8 @@ zfs_init(void) * ZFS/i386. */ zfs_vnodes_adjust(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void @@ -1455,54 +1856,46 @@ zfs_busy(void) } int -zfs_set_version(const char *name, uint64_t newvers) +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; - objset_t *os; + objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; - uint64_t curvers; - - /* - * XXX for now, require that the filesystem be unmounted. Would - * be nice to find the zfsvfs_t and just update that if - * possible. - */ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (EINVAL); - error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); - if (error) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &curvers); - if (error) - goto out; - if (newvers < curvers) { - error = EINVAL; - goto out; - } + if (newvers < zfsvfs->z_version) + return (EINVAL); tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - goto out; + return (error); + } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); } - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, - &newvers, tx); spa_history_internal_log(LOG_DS_UPGRADE, dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, - dmu_objset_id(os)); + "oldver=%llu newver=%llu dataset = %llu", + zfsvfs->z_version, newvers, dmu_objset_id(os)); + dmu_tx_commit(tx); -out: - dmu_objset_close(os); - return (error); + zfsvfs->z_version = newvers; + + if (zfsvfs->z_version >= ZPL_VERSION_FUID) + zfs_set_fuid_feature(zfsvfs); + + return (0); } /* * Read a property stored within the master node. diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index adeabfb..9292880 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -56,6 +56,7 @@ #include <sys/policy.h> #include <sys/sunddi.h> #include <sys/filio.h> +#include <sys/sid.h> #include <sys/zfs_ctldir.h> #include <sys/zfs_fuid.h> #include <sys/dnlc.h> @@ -98,9 +99,7 @@ * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * - * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). - * In normal operation, this will be TXG_NOWAIT. During ZIL replay, - * it will be a specific txg. Either way, dmu_tx_assign() never blocks. + * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). * This is critical because we don't want to block while holding locks. * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing to @@ -117,6 +116,8 @@ * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. @@ -132,12 +133,12 @@ * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign + * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes - * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + * if (error == ERESTART) { * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; @@ -163,23 +164,32 @@ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { + ZFS_EXIT(zfsvfs); return (EPERM); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) - if (fs_vscan(*vpp, cr, 0) != 0) + zp->z_phys->zp_size > 0) { + if (fs_vscan(*vpp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); return (EACCES); + } + } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); + ZFS_EXIT(zfsvfs); return (0); } @@ -189,6 +199,10 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) @@ -206,6 +220,7 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, zp->z_phys->zp_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); + ZFS_EXIT(zfsvfs); return (0); } @@ -296,98 +311,108 @@ zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, return (ENOTTY); } +static vm_page_t +page_lookup(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) +{ + vm_object_t obj; + vm_page_t pp; + + obj = vp->v_object; + VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED); + + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + vm_page_is_valid(pp, (vm_offset_t)off, nbytes)) { + if (vm_page_sleep_if_busy(pp, FALSE, "zfsmwb")) + continue; + vm_page_busy(pp); + vm_page_lock_queues(); + vm_page_undirty(pp); + vm_page_unlock_queues(); + } else { + if (__predict_false(obj->cache != NULL)) { + vm_page_cache_free(obj, OFF_TO_IDX(start), + OFF_TO_IDX(start) + 1); + } + pp = NULL; + } + break; + } + return (pp); +} + +static void +page_unlock(vm_page_t pp) +{ + + vm_page_wakeup(pp); +} + +static caddr_t +zfs_map_page(vm_page_t pp, struct sf_buf **sfp) +{ + + sched_pin(); + *sfp = sf_buf_alloc(pp, SFB_CPUPRIVATE); + return ((caddr_t)sf_buf_kva(*sfp)); +} + +static void +zfs_unmap_page(struct sf_buf *sf) +{ + + sf_buf_free(sf); + sched_unpin(); +} + + /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. */ -static int -mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) + +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, + int segflg, dmu_tx_t *tx) { - znode_t *zp = VTOZ(vp); - objset_t *os = zp->z_zfsvfs->z_os; vm_object_t obj; - vm_page_t m; struct sf_buf *sf; - int64_t start, off; - int len = nbytes; - int error = 0; - uint64_t dirbytes; + int64_t off; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); - start = uio->uio_loffset; off = start & PAGEOFFSET; - dirbytes = 0; VM_OBJECT_LOCK(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - uint64_t bytes = MIN(PAGESIZE - off, len); - uint64_t fsize; + vm_page_t pp; + uint64_t nbytes = MIN(PAGESIZE - off, len); -again: - if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && - vm_page_is_valid(m, (vm_offset_t)off, bytes)) { - uint64_t woff; + if ((pp = page_lookup(vp, start, off, nbytes)) != NULL) { caddr_t va; - if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb")) - goto again; - fsize = obj->un_pager.vnp.vnp_size; - vm_page_busy(m); - vm_page_lock_queues(); - vm_page_undirty(m); - vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); - if (dirbytes > 0) { - error = dmu_write_uio(os, zp->z_id, uio, - dirbytes, tx); - dirbytes = 0; - } - if (error == 0) { - sched_pin(); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - va = (caddr_t)sf_buf_kva(sf); - woff = uio->uio_loffset - off; - error = uiomove(va + off, bytes, UIO_WRITE, uio); - /* - * The uiomove() above could have been partially - * successful, that's why we call dmu_write() - * below unconditionally. The page was marked - * non-dirty above and we would lose the changes - * without doing so. If the uiomove() failed - * entirely, well, we just write what we got - * before one more time. - */ - dmu_write(os, zp->z_id, woff, - MIN(PAGESIZE, fsize - woff), va, tx); - sf_buf_free(sf); - sched_unpin(); + va = zfs_map_page(pp, &sf); + if (segflg == UIO_NOCOPY) { + (void) dmu_write(os, oid, start+off, nbytes, + va+off, tx); + } else { + (void) dmu_read(os, oid, start+off, nbytes, + va+off, DMU_READ_PREFETCH);; } + zfs_unmap_page(sf); VM_OBJECT_LOCK(obj); - vm_page_wakeup(m); - } else { - if (__predict_false(obj->cache != NULL)) { - vm_page_cache_free(obj, OFF_TO_IDX(start), - OFF_TO_IDX(start) + 1); - } - dirbytes += bytes; + page_unlock(pp); + } - len -= bytes; + len -= nbytes; off = 0; - if (error) - break; } VM_OBJECT_UNLOCK(obj); - if (error == 0 && dirbytes > 0) - error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); - return (error); } /* @@ -469,7 +494,8 @@ again: sf = sf_buf_alloc(m, SFB_CPUPRIVATE); va = (caddr_t)sf_buf_kva(sf); error = dmu_read(os, zp->z_id, start + off, - bytes, (void *)(va + off)); + bytes, (void *)(va + off), + DMU_READ_PREFETCH); sf_buf_free(sf); sched_unpin(); } @@ -690,6 +716,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) int max_blksz = zfsvfs->z_max_blksz; uint64_t pflags; int error; + arc_buf_t *abuf; /* * Fasttrack empty write @@ -786,22 +813,59 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * and allows us to do more fine-grained space accounting. */ while (n > 0) { + abuf = NULL; + woff = uio->uio_loffset; + +again: + if (zfs_usergroup_overquota(zfsvfs, + B_FALSE, zp->z_phys->zp_uid) || + zfs_usergroup_overquota(zfsvfs, + B_TRUE, zp->z_phys->zp_gid)) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + error = EDQUOT; + break; + } + + /* + * If dmu_assign_arcbuf() is expected to execute with minimum + * overhead loan an arc buffer and copy user data to it before + * we enter a txg. This avoids holding a txg forever while we + * pagefault on a hanging NFS server mapping. + */ + if (abuf == NULL && n >= max_blksz && + woff >= zp->z_phys->zp_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + size_t cbytes; + + abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if (error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes)) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + /* * Start a transaction. */ - woff = uio->uio_loffset; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); - continue; + goto again; } dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); break; } @@ -833,18 +897,33 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (woff + nbytes > zp->z_phys->zp_size) vnode_pager_setsize(vp, woff + nbytes); - rw_enter(&zp->z_map_lock, RW_READER); - - tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, nbytes, uio, tx); + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, + nbytes, tx); + tx_bytes -= uio->uio_resid; } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, nbytes, tx); - rw_exit(&zp->z_map_lock); + tx_bytes = nbytes; + ASSERT(tx_bytes == max_blksz); + dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + ASSERT(tx_bytes <= uio->uio_resid); + uioskip(uio, tx_bytes); + } + + /* + * XXXPJD: There are some cases (triggered by fsx) where + * vn_has_cached_data(vp) returns false when it should + * return true. This should be investigated. + */ +#if 0 + if (tx_bytes && vn_has_cached_data(vp)) +#else + if (tx_bytes && vp->v_object != NULL) +#endif + { + update_pages(vp, woff, tx_bytes, zfsvfs->z_os, + zp->z_id, uio->uio_segflg, tx); } - tx_bytes -= uio->uio_resid; /* * If we made no progress, we're done. If we made even @@ -906,7 +985,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ - if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } @@ -988,7 +1067,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) error = ENOENT; goto out; } - VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); + VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf, + DMU_READ_NO_PREFETCH)); } else { /* indirect write */ uint64_t boff; /* block starting offset */ @@ -1027,16 +1107,28 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT((error && error != EINPROGRESS) || lr->lr_length <= zp->z_blksz); - if (error == 0) + if (error == 0) { + /* + * dmu_sync() can compress a block of zeros to a null + * blkptr but the block size still needs to be passed + * through to replay. + */ + BP_SET_LSIZE(&lr->lr_blkptr, db->db_size); zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); + } + /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before * we can release this dbuf. We will finish everything * up in the zfs_get_done() callback. */ - if (error == EINPROGRESS) + if (error == EINPROGRESS) { return (0); + } else if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + error = 0; + } dmu_buf_rele(db, zgd); kmem_free(zgd, sizeof (zgd_t)); } @@ -1279,8 +1371,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; @@ -1289,6 +1384,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, * make sure file system is at proper version */ + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) @@ -1339,21 +1439,9 @@ top: if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (error); - } - } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - ZFS_EXIT(zfsvfs); - if (dl) - zfs_dirent_unlock(dl); return (error); } } - if (zp == NULL) { uint64_t txtype; @@ -1375,52 +1463,52 @@ top: goto out; } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) + goto out; + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + error = EDQUOT; + goto out; + } + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || - IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + (void) zfs_link_create(dl, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; @@ -1490,8 +1578,6 @@ out: *vpp = svp; } } - if (aclp) - zfs_acl_free(aclp); ZFS_EXIT(zfsvfs); return (error); @@ -1610,11 +1696,11 @@ top: /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1724,9 +1810,12 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, uint64_t txtype; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; int zf = ZNEW; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); @@ -1735,6 +1824,11 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, * make sure file system is at proper version */ + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| IS_EPHEMERAL(crgetgid(cr)))) @@ -1782,59 +1876,51 @@ top: return (error); } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); } + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); + } + /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) || - IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } /* * Create new node. */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); - - if (aclp) - zfs_acl_free(aclp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ @@ -1845,10 +1931,10 @@ top: txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); @@ -1942,13 +2028,13 @@ top: dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -2534,11 +2620,13 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; + xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; + uint64_t new_uid, new_gid; znode_t *attrzp; int need_policy = FALSE; int err; @@ -2547,6 +2635,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, xoptattr_t *xoap; zfs_acl_t *aclp = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; if (mask == 0) return (0); @@ -2589,6 +2678,8 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, */ xoap = xva_getxoptattr(xvap); + xva_init(&tmpxvattr); + /* * Immutable files can only alter immutable bit and atime */ @@ -2711,28 +2802,78 @@ top: oldva.va_mode = pzp->zp_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { - if ((need_policy == FALSE) && - (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && - xoap->xoa_appendonly != - ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && - xoap->xoa_nounlink != - ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && - xoap->xoa_immutable != - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NODUMP) && - xoap->xoa_nodump != - ((pzp->zp_flags & ZFS_NODUMP) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && - xoap->xoa_av_modified != - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || - ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && - ((vp->v_type != VREG && xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || - (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. + */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((pzp->zp_flags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } @@ -2800,30 +2941,14 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) + goto out; if (pzp->zp_acl.z_acl_extern_obj) { /* Are we upgrading ACL from old V0 format to new V1 */ if (zfsvfs->z_version <= ZPL_VERSION_FUID && @@ -2845,36 +2970,53 @@ top: } } - if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { - err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); - if (err) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (err); + if (mask & (AT_UID | AT_GID)) { + if (pzp->zp_xattr) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if (err) + goto out; + dmu_tx_hold_bonus(tx, attrzp->z_id); + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != pzp->zp_uid && + zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) { + err = EDQUOT; + goto out; + } } - dmu_tx_hold_bonus(tx, attrzp->z_id); - } - - err = dmu_tx_assign(tx, zfsvfs->z_assign); - if (err) { - if (attrzp) - VN_RELE(ZTOV(attrzp)); - if (aclp) { - zfs_acl_free(aclp); - aclp = NULL; + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != pzp->zp_gid && + zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) { + err = EDQUOT; + goto out; + } } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + } - if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + err = dmu_tx_assign(tx, TXG_NOWAIT); + if (err) { + if (err == ERESTART) dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); + goto out; } dmu_buf_will_dirty(zp->z_dbuf, tx); @@ -2892,7 +3034,7 @@ top: if (mask & AT_MODE) { mutex_enter(&zp->z_acl_lock); zp->z_phys->zp_mode = new_mode; - err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); mutex_exit(&zp->z_acl_lock); } @@ -2901,25 +3043,17 @@ top: mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { - pzp->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - if (attrzp) { - attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - } + pzp->zp_uid = new_uid; + if (attrzp) + attrzp->z_phys->zp_uid = new_uid; } if (mask & AT_GID) { - pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, - cr, ZFS_GROUP, tx, &fuidp); + pzp->zp_gid = new_gid; if (attrzp) - attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, - vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); + attrzp->z_phys->zp_gid = new_gid; } - if (aclp) - zfs_acl_free(aclp); - if (attrzp) mutex_exit(&attrzp->z_lock); @@ -2940,6 +3074,31 @@ top: */ if (xoap && (mask & AT_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. + */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { size_t len; dmu_object_info_t doi; @@ -2956,17 +3115,35 @@ top: zfs_xvattr_set(zp, xvap); } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - if (fuidp) - zfs_fuid_info_free(fuidp); mutex_exit(&zp->z_lock); +out: if (attrzp) VN_RELE(ZTOV(attrzp)); - dmu_tx_commit(tx); + if (aclp) { + zfs_acl_free(aclp); + aclp = NULL; + } + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) + dmu_tx_abort(tx); + else + dmu_tx_commit(tx); + + if (err == ERESTART) + goto top; ZFS_EXIT(zfsvfs); return (err); @@ -3329,7 +3506,7 @@ top: if (tzp) dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); @@ -3342,7 +3519,7 @@ top: VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3428,7 +3605,8 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, int len = strlen(link); int error; int zflg = ZNEW; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; int flags = 0; ASSERT(vap->va_type == VLNK); @@ -3464,28 +3642,27 @@ top: return (error); } + VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); + } tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3503,13 +3680,16 @@ top: * otherwise, store it just like any other file data. */ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. @@ -3530,7 +3710,6 @@ top: * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); -out: if (error == 0) { uint64_t txtype = TX_SYMLINK; if (flags & FIGNORECASE) @@ -3538,8 +3717,8 @@ out: zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); } - if (fuidp) - zfs_fuid_info_free(fuidp); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -3701,10 +3880,10 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -4994,6 +5173,7 @@ zfs_freebsd_aclcheck(ap) struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; +struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, @@ -5052,3 +5232,15 @@ struct vop_vector zfs_fifoops = { .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; + +/* + * special share hidden files vnode operations template + */ +struct vop_vector zfs_shareops = { + .vop_default = &default_vnodeops, + .vop_access = zfs_freebsd_access, + .vop_inactive = zfs_freebsd_inactive, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_fid = zfs_freebsd_fid, + .vop_pathconf = zfs_freebsd_pathconf, +}; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index 947f9dd..740302a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -126,6 +126,7 @@ znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; +extern struct vop_vector zfs_shareops; /* * XXX: We cannot use this function as a cache constructor, because @@ -160,7 +161,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); @@ -185,7 +185,6 @@ zfs_znode_cache_destructor(void *buf, void *arg) vn_free(ZTOV(zp)); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); - rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); @@ -252,17 +251,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) POINTER_INVALIDATE(&ozp->z_zfsvfs); } -/* - * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise - * returns a non-zero error code. - */ -static int -zfs_enter(zfsvfs_t *zfsvfs) -{ - ZFS_ENTER(zfsvfs); - return (0); -} - /*ARGSUSED*/ static kmem_cbrc_t zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) @@ -287,8 +275,11 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) /* * Ensure that the filesystem is not unmounted during the move. + * This is the equivalent to ZFS_ENTER(). */ - if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ + rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); + if (zfsvfs->z_unmounted) { + ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); return (KMEM_CBRC_DONT_KNOW); } @@ -378,97 +369,55 @@ zfs_znode_fini(void) znode_cache = NULL; } -/* - * zfs_init_fs - Initialize the zfsvfs struct and the file system - * incore "master" object. Verify version compatibility. - */ int -zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { - objset_t *os = zfsvfs->z_os; - int i, error; - uint64_t fsid_guid; - uint64_t zval; - - *zpp = NULL; - - error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); - if (error) { - return (error); - } else if (zfsvfs->z_version > ZPL_VERSION) { - (void) printf("Mismatched versions: File system " - "is version %llu on-disk format, which is " - "incompatible with this software version %lld!", - (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); - return (ENOTSUP); - } - - if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) - return (error); - zfsvfs->z_norm = (int)zval; - if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) - return (error); - zfsvfs->z_utf8 = (zval != 0); - if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) - return (error); - zfsvfs->z_case = (uint_t)zval; - /* - * Fold case on file systems that are always or sometimes case - * insensitive. - */ - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - zfsvfs->z_case == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; - - /* - * The fsid is 64 bits, composed of an 8-bit fs type, which - * separates our fsid from any other filesystem types, and a - * 56-bit objset unique ID. The objset unique ID is unique to - * all objsets open on this system, provided by unique_create(). - * The 8-bit fs type must be put in the low bits of fsid[1] - * because that's where other Solaris filesystems put it. - */ - fsid_guid = dmu_objset_fsid_guid(os); - ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); - zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; - zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | - zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &zfsvfs->z_root); - if (error) - return (error); - ASSERT(zfsvfs->z_root != 0); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error) - return (error); - - /* - * Initialize zget mutex's - */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + vnode_t *vp, vnode; + znode_t *zp; + int error; - error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); - if (error) { - /* - * On error, we destroy the mutexes here since it's not - * possible for the caller to determine if the mutexes were - * initialized properly. - */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - return (error); - } - ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, - &zfsvfs->z_fuid_obj); - if (error == ENOENT) - error = 0; + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); + + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0); + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; + + sharezp->z_vnode = &vnode; + vnode.v_data = sharezp; + + vp = ZTOV(sharezp); + vp->v_type = VDIR; + + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, + &zp, 0, &acl_ids); + ASSERT3P(zp, ==, sharezp); + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + ZTOV(sharezp)->v_data = NULL; + ZTOV(sharezp)->v_count = 0; + ZTOV(sharezp)->v_holdcnt = 0; + zp->z_vnode = NULL; + sharezp->z_vnode = NULL; + dmu_buf_rele(sharezp->z_dbuf, NULL); + sharezp->z_dbuf = NULL; + kmem_cache_free(znode_cache, sharezp); - return (0); + return (error); } /* @@ -611,6 +560,11 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) case VFIFO: vp->v_op = &zfs_fifoops; break; + case VREG: + if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) { + vp->v_op = &zfs_shareops; + } + break; } if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); @@ -639,7 +593,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute - * IS_REPLAY - intent log replay * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. @@ -649,8 +602,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, - zfs_fuid_info_t **fuidp) + uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids) { dmu_buf_t *db; znode_phys_t *pzp; @@ -661,9 +613,8 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + if (zfsvfs->z_replay) { obj = vap->va_nodeid; - flag |= IS_REPLAY; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ } else { @@ -682,7 +633,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, * assertions below. */ if (vap->va_type == VDIR) { - if (flag & IS_REPLAY) { + if (zfsvfs->z_replay) { err = zap_create_claim_norm(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); @@ -693,7 +644,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } else { - if (flag & IS_REPLAY) { + if (zfsvfs->z_replay) { err = dmu_object_claim(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); @@ -775,7 +726,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ *zpp = dzp; } - zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); + pzp->zp_uid = acl_ids->z_fuid; + pzp->zp_gid = acl_ids->z_fgid; + pzp->zp_mode = acl_ids->z_mode; + VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); if (!(flag & IS_ROOT_NODE)) { vnode_t *vp; @@ -1225,9 +1181,9 @@ top: newblksz = 0; } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1247,11 +1203,7 @@ top: dmu_tx_commit(tx); - rw_enter(&zp->z_map_lock, RW_WRITER); - error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); - ASSERT(error == 0); vnode_pager_setsize(ZTOV(zp), end); - rw_exit(&zp->z_map_lock); return (0); } @@ -1296,11 +1248,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) * In FreeBSD we cannot free block in the middle of a file, * but only at the end of a file. */ - rw_enter(&zp->z_map_lock, RW_WRITER); - error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); - ASSERT(error == 0); vnode_pager_setsize(ZTOV(zp), off); - rw_exit(&zp->z_map_lock); } zfs_range_unlock(rl); @@ -1347,9 +1295,9 @@ zfs_trunc(znode_t *zp, uint64_t end) top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1364,23 +1312,15 @@ top: dmu_tx_commit(tx); - zfs_range_unlock(rl); - /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ - rw_enter(&zp->z_map_lock, RW_WRITER); -#if 0 - error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); -#else - error = vinvalbuf(vp, V_SAVE, 0, 0); - ASSERT(error == 0); vnode_pager_setsize(vp, end); -#endif - rw_exit(&zp->z_map_lock); + + zfs_range_unlock(rl); return (0); } @@ -1426,9 +1366,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto log; @@ -1448,7 +1388,7 @@ void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { zfsvfs_t zfsvfs; - uint64_t moid, doid, version; + uint64_t moid, obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; @@ -1458,6 +1398,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) vnode_t vnode; vattr_t vattr; znode_t *zp; + zfs_acl_ids_t acl_ids; /* * First attempt to create master node. @@ -1474,12 +1415,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) /* * Set starting attributes. */ - if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) version = ZPL_VERSION; + else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + version = ZPL_VERSION_USERSPACE - 1; else version = ZPL_VERSION_FUID - 1; - error = zap_update(os, moid, ZPL_VERSION_STR, - 8, 1, &version, tx); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ @@ -1490,9 +1431,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { - version = val; - error = zap_update(os, moid, ZPL_VERSION_STR, - 8, 1, &version, tx); + if (val < version) + version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } @@ -1503,13 +1443,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) sense = val; } ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* * Create a delete queue. */ - doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); - error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* @@ -1535,7 +1476,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) bzero(&zfsvfs, sizeof (zfsvfs_t)); zfsvfs.z_os = os; - zfsvfs.z_assign = TXG_NOWAIT; zfsvfs.z_parent = &zfsvfs; zfsvfs.z_version = version; zfsvfs.z_use_fuids = USE_FUIDS(version, os); @@ -1556,19 +1496,30 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_zfsvfs = &zfsvfs; - zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); dmu_buf_rele(rootzp->z_dbuf, NULL); rootzp->z_dbuf = NULL; - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs.z_hold_mtx[i]); - mutex_destroy(&zfsvfs.z_znodes_lock); rootzp->z_vnode = NULL; kmem_cache_free(znode_cache, rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(&zfsvfs, tx); + + ASSERT(error == 0); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs.z_hold_mtx[i]); } #endif /* _KERNEL */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 4a0e8d5..7839713 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -729,17 +729,26 @@ zil_lwb_write_done(zio_t *zio) ASSERT(zio->io_bp->blk_fill == 0); /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. + * Ensure the lwb buffer pointer is cleared before releasing + * the txg. If we have had an allocation failure and + * the txg is waiting to sync then we want want zil_sync() + * to remove the lwb so that it's not picked up as the next new + * one in zil_commit_writer(). zil_sync() will only remove + * the lwb if lwb_buf is null. */ - txg_rele_to_sync(&lwb->lwb_txgh); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; if (zio->io_error) zilog->zl_log_error = B_TRUE; + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. We still have the + * zl_lock to ensure zil_sync doesn't kmem free the lwb. + */ + txg_rele_to_sync(&lwb->lwb_txgh); mutex_exit(&zilog->zl_lock); } @@ -1226,20 +1235,26 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) spa_t *spa = zilog->zl_spa; lwb_t *lwb; + /* + * We don't zero out zl_destroy_txg, so make sure we don't try + * to destroy it twice. + */ + if (spa_sync_pass(spa) != 1) + return; + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); - zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK]; if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); - ASSERT(spa_sync_pass(spa) == 1); bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); + bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* @@ -1454,12 +1469,57 @@ zil_resume(zilog_t *zilog) mutex_exit(&zilog->zl_lock); } +/* + * Read in the data for the dmu_sync()ed block, and change the log + * record to write this whole block. + */ +void +zil_get_replay_data(zilog_t *zilog, lr_write_t *lr) +{ + blkptr_t *wbp = &lr->lr_blkptr; + char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */ + uint64_t blksz; + + if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ + blksz = BP_GET_LSIZE(&lr->lr_blkptr); + /* + * If the blksz is zero then we must be replaying a log + * from an version prior to setting the blksize of null blocks. + * So we just zero the actual write size reqeusted. + */ + if (blksz == 0) { + bzero(wbuf, lr->lr_length); + return; + } + bzero(wbuf, blksz); + } else { + /* + * A subsequent write may have overwritten this block, in which + * case wbp may have been been freed and reallocated, and our + * read of wbp may fail with a checksum error. We can safely + * ignore this because the later write will provide the + * correct data. + */ + zbookmark_t zb; + + zb.zb_objset = dmu_objset_id(zilog->zl_os); + zb.zb_object = lr->lr_foid; + zb.zb_level = 0; + zb.zb_blkid = -1; /* unknown */ + + blksz = BP_GET_LSIZE(&lr->lr_blkptr); + (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz, + NULL, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); + } + lr->lr_offset -= lr->lr_offset % blksz; + lr->lr_length = blksz; +} + typedef struct zil_replay_arg { objset_t *zr_os; zil_replay_func_t **zr_replay; - zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; - uint64_t *zr_txgp; boolean_t zr_byteswap; char *zr_lrbuf; } zil_replay_arg_t; @@ -1472,9 +1532,9 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; char *name; - int pass, error, sunk; + int pass, error; - if (zilog->zl_stop_replay) + if (!zilog->zl_replay) /* giving up */ return; if (lr->lrc_txg < claim_txg) /* already committed */ @@ -1486,6 +1546,11 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; + if (txtype == 0 || txtype >= TX_MAX_TYPE) { + error = EINVAL; + goto bad; + } + /* * Make a copy of the data so we can revise and extend it. */ @@ -1502,103 +1567,16 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) byteswap_uint64_array(zr->zr_lrbuf, reclen); /* - * If this is a TX_WRITE with a blkptr, suck in the data. - */ - if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { - lr_write_t *lrw = (lr_write_t *)lr; - blkptr_t *wbp = &lrw->lr_blkptr; - uint64_t wlen = lrw->lr_length; - char *wbuf = zr->zr_lrbuf + reclen; - - if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ - bzero(wbuf, wlen); - } else { - /* - * A subsequent write may have overwritten this block, - * in which case wbp may have been been freed and - * reallocated, and our read of wbp may fail with a - * checksum error. We can safely ignore this because - * the later write will provide the correct data. - */ - zbookmark_t zb; - - zb.zb_objset = dmu_objset_id(zilog->zl_os); - zb.zb_object = lrw->lr_foid; - zb.zb_level = -1; - zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); - - (void) zio_wait(zio_read(NULL, zilog->zl_spa, - wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); - (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); - } - } - - /* - * Replay of large truncates can end up needing additional txs - * and a different txg. If they are nested within the replay tx - * as below then a hang is possible. So we do the truncate here - * and redo the truncate later (a no-op) and update the sequence - * number whilst in the replay tx. Fortunately, it's safe to repeat - * a truncate if we crash and the truncate commits. A create over - * an existing file will also come in as a TX_TRUNCATE record. - * - * Note, remove of large files and renames over large files is - * handled by putting the deleted object on a stable list - * and if necessary force deleting the object outside of the replay - * transaction using the zr_replay_cleaner. - */ - if (txtype == TX_TRUNCATE) { - *zr->zr_txgp = TXG_NOWAIT; - error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap); - if (error) - goto bad; - zr->zr_byteswap = 0; /* only byteswap once */ - } - - /* * We must now do two things atomically: replay this log record, - * and update the log header to reflect the fact that we did so. - * We use the DMU's ability to assign into a specific txg to do this. + * and update the log header sequence number to reflect the fact that + * we did so. At the end of each replay function the sequence number + * is updated if we are in replay mode. */ - for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { - uint64_t replay_txg; - dmu_tx_t *replay_tx; - - replay_tx = dmu_tx_create(zr->zr_os); - error = dmu_tx_assign(replay_tx, TXG_WAIT); - if (error) { - dmu_tx_abort(replay_tx); - break; - } - - replay_txg = dmu_tx_get_txg(replay_tx); - - if (txtype == 0 || txtype >= TX_MAX_TYPE) { - error = EINVAL; - } else { - /* - * On the first pass, arrange for the replay vector - * to fail its dmu_tx_assign(). That's the only way - * to ensure that those code paths remain well tested. - * - * Only byteswap (if needed) on the 1st pass. - */ - *zr->zr_txgp = replay_txg - (pass == 1); - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap && pass == 1); - *zr->zr_txgp = TXG_NOWAIT; - } - - if (error == 0) { - dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); - zilog->zl_replay_seq[replay_txg & TXG_MASK] = - lr->lrc_seq; - } - - dmu_tx_commit(replay_tx); + for (pass = 1; pass <= 2; pass++) { + zilog->zl_replaying_seq = lr->lrc_seq; + /* Only byteswap (if needed) on the 1st pass. */ + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, + zr->zr_byteswap && pass == 1); if (!error) return; @@ -1606,37 +1584,22 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with - * EEXIST. So if we receive any error other than ERESTART - * we try syncing out any removes then retrying the - * transaction. + * EEXIST. So if we receive any error we try syncing out + * any removes then retry the transaction. */ - if (error != ERESTART && !sunk) { - if (zr->zr_replay_cleaner) - zr->zr_replay_cleaner(zr->zr_arg); + if (pass == 1) txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); - sunk = B_TRUE; - continue; /* retry */ - } - - if (error != ERESTART) - break; - - if (pass != 1) - txg_wait_open(spa_get_dsl(zilog->zl_spa), - replay_txg + 1); - - dprintf("pass %d, retrying\n", pass); } bad: - ASSERT(error && error != ERESTART); + ASSERT(error); name = kmem_alloc(MAXNAMELEN, KM_SLEEP); dmu_objset_name(zr->zr_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, (lr->lrc_txtype & TX_CI) ? "CI" : ""); - zilog->zl_stop_replay = 1; + zilog->zl_replay = B_FALSE; kmem_free(name, MAXNAMELEN); } @@ -1651,9 +1614,7 @@ zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) * If this dataset has a non-empty intent log, replay it and destroy it. */ void -zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner) +zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; @@ -1667,9 +1628,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, zr.zr_os = os; zr.zr_replay = replay_func; - zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; - zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); @@ -1678,7 +1637,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, */ txg_wait_synced(zilog->zl_dmu_pool, 0); - zilog->zl_stop_replay = 0; + zilog->zl_replay = B_TRUE; zilog->zl_replay_time = LBOLT; ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, @@ -1687,6 +1646,7 @@ zil_replay(objset_t *os, void *arg, uint64_t *txgp, zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + zilog->zl_replay = B_FALSE; //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 8879742..75b7617 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +76,7 @@ char *zio_type_name[ZIO_TYPES] = { * ========================================================================== */ kmem_cache_t *zio_cache; +kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; @@ -94,8 +95,10 @@ void zio_init(void) { size_t c; - zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, - NULL, NULL, NULL, NULL, NULL, 0); + zio_cache = kmem_cache_create("zio_cache", + sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_link_cache = kmem_cache_create("zio_link_cache", + sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of @@ -165,6 +168,7 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); @@ -311,41 +315,102 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ +/* + * NOTE - Callers to zio_walk_parents() and zio_walk_children must + * continue calling these functions until they return NULL. + * Otherwise, the next caller will pick up the list walk in + * some indeterminate state. (Otherwise every caller would + * have to pass in a cookie to keep the state represented by + * io_walk_link, which gets annoying.) + */ +zio_t * +zio_walk_parents(zio_t *cio) +{ + zio_link_t *zl = cio->io_walk_link; + list_t *pl = &cio->io_parent_list; -static void -zio_add_child(zio_t *pio, zio_t *zio) + zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); + cio->io_walk_link = zl; + + if (zl == NULL) + return (NULL); + + ASSERT(zl->zl_child == cio); + return (zl->zl_parent); +} + +zio_t * +zio_walk_children(zio_t *pio) +{ + zio_link_t *zl = pio->io_walk_link; + list_t *cl = &pio->io_child_list; + + zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); + pio->io_walk_link = zl; + + if (zl == NULL) + return (NULL); + + ASSERT(zl->zl_parent == pio); + return (zl->zl_child); +} + +zio_t * +zio_unique_parent(zio_t *cio) { + zio_t *pio = zio_walk_parents(cio); + + VERIFY(zio_walk_parents(cio) == NULL); + return (pio); +} + +void +zio_add_child(zio_t *pio, zio_t *cio) +{ + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. + */ + ASSERT(cio->io_child_type <= pio->io_child_type); + + zl->zl_parent = pio; + zl->zl_child = cio; + + mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); - if (zio->io_stage < ZIO_STAGE_READY) - pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; - if (zio->io_stage < ZIO_STAGE_DONE) - pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; - zio->io_sibling_prev = NULL; - zio->io_sibling_next = pio->io_child; - if (pio->io_child != NULL) - pio->io_child->io_sibling_prev = zio; - pio->io_child = zio; - zio->io_parent = pio; + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + list_insert_head(&cio->io_parent_list, zl); + mutex_exit(&pio->io_lock); + mutex_exit(&cio->io_lock); } static void -zio_remove_child(zio_t *pio, zio_t *zio) +zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { - zio_t *next, *prev; - - ASSERT(zio->io_parent == pio); + ASSERT(zl->zl_parent == pio); + ASSERT(zl->zl_child == cio); + mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); - next = zio->io_sibling_next; - prev = zio->io_sibling_prev; - if (next != NULL) - next->io_sibling_prev = prev; - if (prev != NULL) - prev->io_sibling_next = next; - if (pio->io_child == zio) - pio->io_child = next; + + list_remove(&pio->io_child_list, zl); + list_remove(&cio->io_parent_list, zl); + mutex_exit(&pio->io_lock); + mutex_exit(&cio->io_lock); + + kmem_cache_free(zio_link_cache, zl); } static boolean_t @@ -420,6 +485,11 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + list_create(&zio->io_parent_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_parent_node)); + list_create(&zio->io_child_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_child_node)); + if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) @@ -433,11 +503,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ - if (zio->io_child_type == ZIO_CHILD_LOGICAL) { - if (BP_IS_GANG(bp)) - pipeline |= ZIO_GANG_STAGES; + if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; - } + if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; @@ -454,19 +523,17 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); + if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { - /* - * Logical I/Os can have logical, gang, or vdev children. - * Gang I/Os can have gang or vdev children. - * Vdev I/Os can only have vdev children. - * The following ASSERT captures all of these constraints. - */ - ASSERT(zio->io_child_type <= pio->io_child_type); if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; + if (zio->io_child_type == ZIO_CHILD_GANG) + zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } @@ -476,29 +543,21 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, static void zio_destroy(zio_t *zio) { - spa_t *spa = zio->io_spa; - uint8_t async_root = zio->io_async_root; - + list_destroy(&zio->io_parent_list); + list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); - - if (async_root) { - mutex_enter(&spa->spa_async_root_lock); - if (--spa->spa_async_root_count == 0) - cv_broadcast(&spa->spa_async_root_cv); - mutex_exit(&spa->spa_async_root_lock); - } } zio_t * -zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, - int flags) +zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, + void *private, int flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); @@ -507,7 +566,7 @@ zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) { - return (zio_null(NULL, spa, done, private, flags)); + return (zio_null(NULL, spa, NULL, done, private, flags)); } zio_t * @@ -576,12 +635,12 @@ zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ASSERT(!BP_IS_HOLE(bp)); if (bp->blk_fill == BLK_FILL_ALREADY_FREED) - return (zio_null(pio, spa, NULL, NULL, flags)); + return (zio_null(pio, spa, NULL, NULL, NULL, flags)); if (txg == spa->spa_syncing_txg && spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return (zio_null(pio, spa, NULL, NULL, flags)); + return (zio_null(pio, spa, NULL, NULL, NULL, flags)); } zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), @@ -632,7 +691,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio->io_cmd = cmd; } else { - zio = zio_null(pio, spa, NULL, NULL, flags); + zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, @@ -770,7 +829,9 @@ zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && + zio->io_child_type == ZIO_CHILD_LOGICAL && + !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t csize = BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(csize); @@ -819,16 +880,10 @@ zio_write_bp_init(zio_t *zio) * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(zio->io_spa); - ASSERT(pass > 1); if (pass > SYNC_PASS_DONT_COMPRESS) compress = ZIO_COMPRESS_OFF; - /* - * Only MOS (objset 0) data should need to be rewritten. - */ - ASSERT(zio->io_logical->io_bookmark.zb_objset == 0); - /* Make sure someone doesn't change their mind on overwrites */ ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp), spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp)); @@ -1022,17 +1077,16 @@ zio_nowait(zio_t *zio) { ASSERT(zio->io_executor == NULL); - if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (zio->io_child_type == ZIO_CHILD_LOGICAL && + zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. - * Attach it to the pool's global async root zio so that - * spa_unload() has a way of waiting for async I/O to finish. + * We add it to the spa_async_root_zio "Godfather" I/O which + * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; - zio->io_async_root = B_TRUE; - mutex_enter(&spa->spa_async_root_lock); - spa->spa_async_root_count++; - mutex_exit(&spa->spa_async_root_lock); + + zio_add_child(spa->spa_async_zio_root, zio); } zio_execute(zio); @@ -1047,13 +1101,20 @@ zio_nowait(zio_t *zio) static void zio_reexecute(zio_t *pio) { - zio_t *zio, *zio_next; + zio_t *cio, *cio_next; + + ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); + ASSERT(pio->io_gang_leader == NULL); + ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_error = 0; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; @@ -1073,24 +1134,27 @@ zio_reexecute(zio_t *pio) /* * As we reexecute pio's children, new children could be created. - * New children go to the head of the io_child list, however, + * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that - * the remainder of the io_child list, from 'zio_next' onward, - * cannot be affected by any side effects of reexecuting 'zio'. + * the remainder of pio's io_child_list, from 'cio_next' onward, + * cannot be affected by any side effects of reexecuting 'cio'. */ - for (zio = pio->io_child; zio != NULL; zio = zio_next) { - zio_next = zio->io_sibling_next; + for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio); mutex_enter(&pio->io_lock); - pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; - pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); - zio_reexecute(zio); + zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. + * We don't reexecute "The Godfather" I/O here as it's the + * responsibility of the caller to wait on him. */ - zio_execute(pio); + if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) + zio_execute(pio); } void @@ -1106,14 +1170,17 @@ zio_suspend(spa_t *spa, zio_t *zio) mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) - spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0); + spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio->io_parent == NULL); + ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } @@ -1121,10 +1188,10 @@ zio_suspend(spa_t *spa, zio_t *zio) mutex_exit(&spa->spa_suspend_lock); } -void +int zio_resume(spa_t *spa) { - zio_t *pio, *zio; + zio_t *pio; /* * Reexecute all previously suspended i/o. @@ -1137,17 +1204,10 @@ zio_resume(spa_t *spa) mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) - return; + return (0); - while ((zio = pio->io_child) != NULL) { - zio_remove_child(pio, zio); - zio->io_parent = NULL; - zio_reexecute(zio); - } - - ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); - - (void) zio_wait(pio); + zio_reexecute(pio); + return (zio_wait(pio)); } void @@ -1254,7 +1314,7 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ - if (gn != pio->io_logical->io_gang_tree) { + if (gn != pio->io_gang_leader->io_gang_tree) { zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), data, BP_GET_PSIZE(bp)); } @@ -1336,27 +1396,27 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) } static void -zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp) +zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - ASSERT(lio->io_logical == lio); + ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh, + zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark)); + gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { - zio_t *lio = zio->io_logical; + zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; - ASSERT(zio->io_parent == lio); - ASSERT(zio->io_child == NULL); + ASSERT(gio == zio_unique_parent(zio)); + ASSERT(zio_walk_children(zio) == NULL); if (zio->io_error) return; @@ -1372,25 +1432,25 @@ zio_gang_tree_assemble_done(zio_t *zio) blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; - zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); + zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { - zio_t *lio = pio->io_logical; + zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); - ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); - ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ - zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); @@ -1404,8 +1464,8 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) } } - if (gn == lio->io_gang_tree) - ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); + if (gn == gio->io_gang_tree) + ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) zio_nowait(zio); @@ -1416,7 +1476,10 @@ zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; - ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); @@ -1426,18 +1489,18 @@ zio_gang_assemble(zio_t *zio) static int zio_gang_issue(zio_t *zio) { - zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); - ASSERT(BP_IS_GANG(bp) && zio == lio); + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); else - zio_gang_tree_free(&lio->io_gang_tree); + zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -1447,8 +1510,8 @@ zio_gang_issue(zio_t *zio) static void zio_write_gang_member_ready(zio_t *zio) { - zio_t *pio = zio->io_parent; - zio_t *lio = zio->io_logical; + zio_t *pio = zio_unique_parent(zio); + zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; @@ -1459,7 +1522,7 @@ zio_write_gang_member_ready(zio_t *zio) ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); - ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas); + ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas); ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); @@ -1479,28 +1542,28 @@ zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; - zio_t *lio = pio->io_logical; + zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; - int ndvas = lio->io_prop.zp_ndvas; + int ndvas = gio->io_prop.zp_ndvas; int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); zio_prop_t zp; int error; error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, - bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp, + bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp, METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); if (error) { pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } - if (pio == lio) { - gnpp = &lio->io_gang_tree; + if (pio == gio) { + gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); @@ -1524,11 +1587,11 @@ zio_write_gang_block(zio_t *pio) SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); - zp.zp_checksum = lio->io_prop.zp_checksum; + zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; - zp.zp_ndvas = lio->io_prop.zp_ndvas; + zp.zp_ndvas = gio->io_prop.zp_ndvas; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, @@ -1561,6 +1624,11 @@ zio_dva_allocate(zio_t *zio) blkptr_t *bp = zio->io_bp; int error; + if (zio->io_gang_leader == NULL) { + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + zio->io_gang_leader = zio; + } + ASSERT(BP_IS_HOLE(bp)); ASSERT3U(BP_GET_NDVAS(bp), ==, 0); ASSERT3U(zio->io_prop.zp_ndvas, >, 0); @@ -1692,72 +1760,6 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) * Read and write to physical devices * ========================================================================== */ - -static void -zio_vdev_io_probe_done(zio_t *zio) -{ - zio_t *dio; - vdev_t *vd = zio->io_private; - - mutex_enter(&vd->vdev_probe_lock); - ASSERT(vd->vdev_probe_zio == zio); - vd->vdev_probe_zio = NULL; - mutex_exit(&vd->vdev_probe_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - if (!vdev_accessible(vd, dio)) - dio->io_error = ENXIO; - zio_execute(dio); - } -} - -/* - * Probe the device to determine whether I/O failure is specific to this - * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged). - */ -static int -zio_vdev_io_probe(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - zio_t *pio = NULL; - boolean_t created_pio = B_FALSE; - - /* - * Don't probe the probe. - */ - if (zio->io_flags & ZIO_FLAG_PROBE) - return (ZIO_PIPELINE_CONTINUE); - - /* - * To prevent 'probe storms' when a device fails, we create - * just one probe i/o at a time. All zios that want to probe - * this vdev will join the probe zio's io_delegate_list. - */ - mutex_enter(&vd->vdev_probe_lock); - - if ((pio = vd->vdev_probe_zio) == NULL) { - vd->vdev_probe_zio = pio = zio_root(zio->io_spa, - zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL); - created_pio = B_TRUE; - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(zio->io_spa, SPA_ASYNC_PROBE); - } - - zio->io_delegate_next = pio->io_delegate_list; - pio->io_delegate_list = zio; - - mutex_exit(&vd->vdev_probe_lock); - - if (created_pio) { - zio_nowait(vdev_probe(vd, pio)); - zio_nowait(pio); - } - - return (ZIO_PIPELINE_STOP); -} - static int zio_vdev_io_start(zio_t *zio) { @@ -1793,13 +1795,35 @@ zio_vdev_io_start(zio_t *zio) ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); + ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + + /* + * If this is a repair I/O, and there's no self-healing involved -- + * that is, we're just resilvering what we expect to resilver -- + * then don't do the I/O unless zio's txg is actually in vd's DTL. + * This prevents spurious resilvering with nested replication. + * For example, given a mirror of mirrors, (A+B)+(C+D), if only + * A is out of date, we'll read from C+D, then use the data to + * resilver A+B -- but we don't actually want to resilver B, just A. + * The top-level mirror has no way to know this, so instead we just + * discard unnecessary repairs as we work our way down the vdev tree. + * The same logic applies to any form of nested replication: + * ditto + mirror, RAID-Z + replacing, etc. This covers them all. + */ + if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && + zio->io_txg != 0 && /* not a delegated i/o */ + !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio_vdev_io_bypass(zio); + return (ZIO_PIPELINE_CONTINUE); + } if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return (ZIO_PIPELINE_STOP); + return (ZIO_PIPELINE_CONTINUE); if ((zio = vdev_queue_io(zio)) == NULL) return (ZIO_PIPELINE_STOP); @@ -1809,7 +1833,6 @@ zio_vdev_io_start(zio_t *zio) zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } - } return (vd->vdev_ops->vdev_op_io_start(zio)); @@ -1852,7 +1875,7 @@ zio_vdev_io_done(zio_t *zio) ops->vdev_op_io_done(zio); if (unexpected_error) - return (zio_vdev_io_probe(zio)); + VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); } @@ -2048,13 +2071,12 @@ static int zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; - zio_t *pio = zio->io_parent; + zio_t *pio, *pio_next; - if (zio->io_ready) { - if (BP_IS_GANG(bp) && - zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) - return (ZIO_PIPELINE_STOP); + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); @@ -2068,8 +2090,22 @@ zio_ready(zio_t *zio) if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (pio != NULL) + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_READY] = 1; + pio = zio_walk_parents(zio); + mutex_exit(&zio->io_lock); + + /* + * As we notify zio's parents, new parents could be added. + * New parents go to the head of zio's io_parent_list, however, + * so we will (correctly) not notify them. The remainder of zio's + * io_parent_list, from 'pio_next' onward, cannot change because + * all parents must wait for us to be done before they can be done. + */ + for (; pio != NULL; pio = pio_next) { + pio_next = zio_walk_parents(zio); zio_notify_parent(pio, zio, ZIO_WAIT_READY); + } return (ZIO_PIPELINE_CONTINUE); } @@ -2078,14 +2114,14 @@ static int zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_t *pio = zio->io_parent; zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; + zio_t *pio, *pio_next; /* - * If our of children haven't all completed, + * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || @@ -2102,7 +2138,7 @@ zio_done(zio_t *zio) ASSERT(bp->blk_pad[1] == 0); ASSERT(bp->blk_pad[2] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || - (pio != NULL && bp == pio->io_bp)); + (bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(bp)); @@ -2160,6 +2196,7 @@ zio_done(zio_t *zio) if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && zio->io_error == ENXIO && + spa->spa_load_state == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; @@ -2175,6 +2212,21 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); + if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && + zio->io_child_type == ZIO_CHILD_LOGICAL) { + ASSERT(zio->io_child_type != ZIO_CHILD_GANG); + zio_dva_unallocate(zio, zio->io_gang_tree, bp); + } + + zio_gang_tree_free(&zio->io_gang_tree); + + /* + * Godfather I/Os should never suspend. + */ + if ((zio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) + zio->io_reexecute = 0; + if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. @@ -2191,17 +2243,37 @@ zio_done(zio_t *zio) */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - if (IO_IS_ALLOCATING(zio)) - zio_dva_unallocate(zio, zio->io_gang_tree, bp); + zio->io_gang_leader = NULL; - zio_gang_tree_free(&zio->io_gang_tree); + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); + + /* + * "The Godfather" I/O monitors its children but is + * not a true parent to them. It will track them through + * the pipeline but severs its ties whenever they get into + * trouble (e.g. suspended). This allows "The Godfather" + * I/O to return status without blocking. + */ + for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { + zio_link_t *zl = zio->io_walk_link; + pio_next = zio_walk_parents(zio); + + if ((pio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { + zio_remove_child(pio, zio, zl); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } + } - if (pio != NULL) { + if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { @@ -2222,20 +2294,26 @@ zio_done(zio_t *zio) return (ZIO_PIPELINE_STOP); } - ASSERT(zio->io_child == NULL); + ASSERT(zio_walk_children(zio) == NULL); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); + /* + * It is the responsibility of the done callback to ensure that this + * particular zio is no longer discoverable for adoption, and as + * such, cannot acquire any new parents. + */ if (zio->io_done) zio->io_done(zio); - zio_gang_tree_free(&zio->io_gang_tree); - - ASSERT(zio->io_delegate_list == NULL); - ASSERT(zio->io_delegate_next == NULL); + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); - if (pio != NULL) { - zio_remove_child(pio, zio); + for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { + zio_link_t *zl = zio->io_walk_link; + pio_next = zio_walk_parents(zio); + zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 79a9966..8f769e6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -72,6 +72,7 @@ #include <sys/zfs_rlock.h> #include <sys/vdev_impl.h> #include <sys/zvol.h> +#include <sys/zil_impl.h> #include <geom/geom.h> #include "zfs_namecheck.h" @@ -115,7 +116,6 @@ typedef struct zvol_state { uint32_t zv_total_opens; /* total open count */ zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ - uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ znode_t zv_znode; /* for range locking */ int zv_state; struct bio_queue_head zv_queue; @@ -287,8 +287,16 @@ static void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) { uint32_t blocksize = zv->zv_volblocksize; + zilog_t *zilog = zv->zv_zilog; lr_write_t *lr; + if (zilog->zl_replay) { + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = + zilog->zl_replaying_seq; + return; + } + while (len) { ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); @@ -303,7 +311,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); BP_ZERO(&lr->lr_blkptr); - (void) zil_itx_assign(zv->zv_zilog, itx, tx); + (void) zil_itx_assign(zilog, itx, tx); len -= nbytes; off += nbytes; } @@ -373,7 +381,8 @@ zvol_serve_one(zvol_state_t *zv, struct bio *bp) size = volsize - off; if (doread) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr); + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); @@ -576,9 +585,13 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + /* If it's a dmu_sync() block get the data and write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) + zil_get_replay_data(dmu_objset_zil(os), lr); + tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); - error = dmu_tx_assign(tx, zv->zv_txg_assign); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { @@ -614,6 +627,13 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ + zvol_replay_err, /* TX_CREATE_ATTR */ + zvol_replay_err, /* TX_CREATE_ACL_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL */ + zvol_replay_err, /* TX_MKDIR_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ + zvol_replay_err, /* TX_WRITE2 */ }; /* @@ -678,7 +698,7 @@ zvol_create_minor(const char *name, major_t maj) ASSERT(error == 0); zv->zv_volblocksize = doi.doi_data_block_size; - zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL); + zil_replay(os, zv, zvol_replay_vector); /* XXX this should handle the possible i/o error */ VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), @@ -983,7 +1003,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) /* immediate write */ - return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); + return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf, + DMU_READ_NO_PREFETCH)); zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zv->zv_zilog; @@ -1000,10 +1021,19 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd->zgd_rl = rl; VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); + error = dmu_sync(zio, db, &lr->lr_blkptr, lr->lr_common.lrc_txg, zvol_get_done, zgd); - if (error == 0) + if (error == 0) { + /* + * dmu_sync() can compress a block of zeros to a null blkptr + * but the block size still needs to be passed through to + * replay. + */ + BP_SET_LSIZE(&lr->lr_blkptr, db->db_size); zil_add_block(zv->zv_zilog, &lr->lr_blkptr); + } + /* * If we get EINPROGRESS, then we need to wait for a * write IO initiated by dmu_sync() to complete before diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h index f10fec6..b0ec063 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ACL_H #define _SYS_ACL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/acl_impl.h> @@ -168,6 +166,10 @@ typedef struct ace_object { ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ ACE_WRITE_OWNER|ACE_SYNCHRONIZE) +#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) + #define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ ACE_READ_NAMED_ATTRS) diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h index 432e6be..5fabb14 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,8 +30,6 @@ #ifndef _SYS_DEBUG_H #define _SYS_DEBUG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #ifdef __cplusplus @@ -50,7 +47,7 @@ extern "C" { extern int assfail(const char *, const char *, int); #define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) #ifdef DEBUG -#define ASSERT(EX) VERIFY(EX) +#define ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) #else #define ASSERT(x) ((void)0) #endif @@ -58,7 +55,7 @@ extern int assfail(const char *, const char *, int); extern int assfail(); #define VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__))) #ifdef DEBUG -#define ASSERT(EX) VERIFY(EX) +#define ASSERT(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__))) #else #define ASSERT(x) ((void)0) #endif @@ -98,9 +95,9 @@ _NOTE(CONSTCOND) } while (0) #define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) #define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) #ifdef DEBUG -#define ASSERT3S(x, y, z) VERIFY3S(x, y, z) -#define ASSERT3U(x, y, z) VERIFY3U(x, y, z) -#define ASSERT3P(x, y, z) VERIFY3P(x, y, z) +#define ASSERT3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t) +#define ASSERT3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) +#define ASSERT3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) #else #define ASSERT3S(x, y, z) ((void)0) #define ASSERT3U(x, y, z) ((void)0) diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h index 66ca9c5..21b7dbe 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -57,6 +55,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index 2f7e747..8400dc1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -109,9 +109,20 @@ typedef enum { ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, + ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_NUM_PROPS } zfs_prop_t; +typedef enum { + ZFS_PROP_USERUSED, + ZFS_PROP_USERQUOTA, + ZFS_PROP_GROUPUSED, + ZFS_PROP_GROUPQUOTA, + ZFS_NUM_USERQUOTA_PROPS +} zfs_userquota_prop_t; + +extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; + /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected @@ -173,6 +184,7 @@ boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); +boolean_t zfs_prop_userquota(const char *name); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); boolean_t zfs_prop_valid_for_type(int, zfs_type_t); @@ -217,6 +229,9 @@ typedef enum { #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" +#define ZFS_SMB_ACL_SRC "src" +#define ZFS_SMB_ACL_TARGET "target" + typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, @@ -230,6 +245,13 @@ typedef enum zfs_share_op { ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; +typedef enum zfs_smb_acl_op { + ZFS_SMB_ACL_ADD, + ZFS_SMB_ACL_REMOVE, + ZFS_SMB_ACL_RENAME, + ZFS_SMB_ACL_PURGE +} zfs_smb_acl_op_t; + typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, @@ -254,13 +276,15 @@ typedef enum zfs_cache_type { #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL +#define SPA_VERSION_15 15ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, - * and do the appropriate changes. + * and do the appropriate changes. Also bump the version number in + * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_14 -#define SPA_VERSION_STRING "14" +#define SPA_VERSION SPA_VERSION_15 +#define SPA_VERSION_STRING "15" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -296,6 +320,7 @@ typedef enum zfs_cache_type { #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 +#define SPA_VERSION_USERSPACE SPA_VERSION_15 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -308,14 +333,16 @@ typedef enum zfs_cache_type { #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL -#define ZPL_VERSION ZPL_VERSION_3 -#define ZPL_VERSION_STRING "3" +#define ZPL_VERSION_4 4ULL +#define ZPL_VERSION ZPL_VERSION_4 +#define ZPL_VERSION_STRING "4" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 +#define ZPL_VERSION_USERSPACE ZPL_VERSION_4 /* * The following are configuration names used in the nvlist describing a pool's @@ -365,6 +392,7 @@ typedef enum zfs_cache_type { #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" +#define ZPOOL_CONFIG_FRU "fru" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -568,6 +596,11 @@ typedef unsigned long zfs_ioc_t; #define ZFS_IOC_INHERIT_PROP _IOWR('Z', 46, struct zfs_cmd) #define ZFS_IOC_JAIL _IOWR('Z', 47, struct zfs_cmd) #define ZFS_IOC_UNJAIL _IOWR('Z', 48, struct zfs_cmd) +#define ZFS_IOC_SMB_ACL _IOWR('Z', 49, struct zfs_cmd) +#define ZFS_IOC_USERSPACE_ONE _IOWR('Z', 50, struct zfs_cmd) +#define ZFS_IOC_USERSPACE_MANY _IOWR('Z', 51, struct zfs_cmd) +#define ZFS_IOC_USERSPACE_UPGRADE _IOWR('Z', 52, struct zfs_cmd) +#define ZFS_IOC_SETFRU _IOWR('Z', 53, struct zfs_cmd) /* * Internal SPA load state. Used by FMA diagnosis engine. diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h index ac21686..c46223f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h @@ -234,6 +234,9 @@ extern "C" { #define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear" #define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check" #define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync" +#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start" +#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish" +#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare" /* * datalink subclass definitions. diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h index 5f1f4b4..5a7c9e6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h @@ -39,8 +39,6 @@ #ifndef _SYS_VNODE_H #define _SYS_VNODE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include_next <sys/vnode.h> #ifdef __cplusplus @@ -266,6 +264,14 @@ typedef struct xvattr { ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ ASSERT((xvap)->xva_magic == XVA_MAGIC); \ (xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr) +/* + * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap + * of requested attributes (xva_reqattrmap[]). + */ +#define XVA_CLR_REQ(xvap, attr) \ + ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR); \ + ASSERT((xvap)->xva_magic == XVA_MAGIC); \ + (xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr) /* * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap |