From aac8087bead638965fd8df72ec05b082185fa599 Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 30 Jul 2013 21:35:02 +0000 Subject: MFV r253783: Skip eviction step of processing free records when doing ZFS receive to avoid the expensive search operation of non-existent dbufs in dn_dbufs. Illumos ZFS issues: 3834 incremental replication of 'holey' file systems is slow MFC after: 2 weeks --- .../contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 20 +++++++- .../opensolaris/uts/common/fs/zfs/dmu_send.c | 54 +++++++++++++++++++++- .../contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c | 49 ++++++++++++-------- .../opensolaris/uts/common/fs/zfs/sys/dmu_impl.h | 6 +++ .../opensolaris/uts/common/fs/zfs/sys/dmu_send.h | 3 +- 5 files changed, 110 insertions(+), 22 deletions(-) (limited to 'sys/cddl') diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 28aa330..759fd7a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -796,9 +797,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find - * empty blocks. Also, if we happen accross any level-1 dbufs in the + * empty blocks. Also, if we happen across any level-1 dbufs in the * range that have not already been marked dirty, mark them dirty so * they stay in memory. + * + * This is a no-op if the dataset is in the middle of an incremental + * receive; see comment below for details. */ void dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) @@ -814,6 +818,20 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) last_l1 = end >> epbs; } dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + + if (dmu_objset_is_receiving(dn->dn_objset)) { + /* + * When processing a free record from a zfs receive, + * there should have been no previous modifications to the + * data in this range. Therefore there should be no dbufs + * in the range. Searching dn_dbufs for these non-existent + * dbufs can be very expensive, so simply ignore this. + */ + VERIFY3P(dbuf_find(dn, 0, start), ==, NULL); + VERIFY3P(dbuf_find(dn, 0, end), ==, NULL); + return; + } + mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index a86814a..f636253 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -96,6 +96,32 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, { struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); + /* + * When we receive a free record, dbuf_free_range() assumes + * that the receiving system doesn't have any dbufs in the range + * being freed. This is always true because there is a one-record + * constraint: we only send one WRITE record for any given + * object+offset. We know that the one-record constraint is + * true because we always send data in increasing order by + * object,offset. + * + * If the increasing-order constraint ever changes, we should find + * another way to assert that the one-record constraint is still + * satisfied. + */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + + /* + * If we are doing a non-incremental send, then there can't + * be any data in the dataset we're receiving into. Therefore + * a free record would simply be a no-op. Save space by not + * sending it to begin with. + */ + if (!dsp->dsa_incremental) + return (0); + if (length != -1ULL && offset + length < offset) length = -1ULL; @@ -162,6 +188,15 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, { struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); + /* + * We send data in increasing object, offset order. + * See comment in dump_free() for details. + */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + dsp->dsa_last_data_object = object; + dsp->dsa_last_data_offset = offset + blksz - 1; /* * If there is any kind of pending aggregation (currently either @@ -229,6 +264,10 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); + /* See comment in dump_free(). */ + if (!dsp->dsa_incremental) + return (0); + /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for @@ -305,9 +344,9 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (SET_ERROR(EINTR)); - /* free anything past the end of the file */ + /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) return (SET_ERROR(EINTR)); if (dsp->dsa_err != 0) return (SET_ERROR(EINTR)); @@ -495,6 +534,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dsp->dsa_toguid = ds->ds_phys->ds_guid; ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; + dsp->dsa_incremental = (fromtxg != 0); mutex_enter(&ds->ds_sendstream_lock); list_insert_head(&ds->ds_sendstreams, dsp); @@ -1819,3 +1859,13 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) else return (dmu_recv_existing_end(drc)); } + +/* + * Return TRUE if this objset is currently being received into. + */ +boolean_t +dmu_objset_is_receiving(objset_t *os) +{ + return (os->os_dsl_dataset != NULL && + os->os_dsl_dataset->ds_owner == dmu_recv_tag); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index 5338425..929b0c4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -587,8 +587,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; dnode_t *dn; - uint64_t start, end, i; - int err, shift; + int err; zio_t *zio; ASSERT(tx->tx_txg == 0); @@ -599,34 +598,48 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) return; dn = txh->txh_dnode; - /* first block */ - if (off != 0) - dmu_tx_count_write(txh, off, 1); - /* last block */ - if (len != DMU_OBJECT_END) - dmu_tx_count_write(txh, off+len, 1); - - dmu_tx_count_dnode(txh); - if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + dmu_tx_count_dnode(txh); + /* - * For i/o error checking, read the first and last level-0 - * blocks, and all the level-1 blocks. The above count_write's - * have already taken care of the level-0 blocks. + * For i/o error checking, we read the first and last level-0 + * blocks if they are not aligned, and all the level-1 blocks. + * + * Note: dbuf_free_range() assumes that we have not instantiated + * any level-0 dbufs that will be completely freed. Therefore we must + * exercise care to not read or count the first and last blocks + * if they are blocksize-aligned. + */ + if (dn->dn_datablkshift == 0) { + dmu_tx_count_write(txh, off, len); + } else { + /* first block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off, 1); + /* last block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off+len, 1); + } + + /* + * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { - shift = dn->dn_datablkshift + dn->dn_indblkshift - + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; - start = off >> shift; - end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; + uint64_t start = off >> shift; + uint64_t end = (off + len) >> shift; + + ASSERT(dn->dn_datablkshift != 0); + ASSERT(dn->dn_indblkshift != 0); zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - for (i = start; i <= end; i++) { + for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h index e281121..f841989 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h @@ -21,8 +21,11 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + */ +/* * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -293,6 +296,9 @@ typedef struct dmu_sendarg { uint64_t dsa_toguid; int dsa_err; dmu_pendop_t dsa_pending_op; + boolean_t dsa_incremental; + uint64_t dsa_last_data_object; + uint64_t dsa_last_data_offset; } dmu_sendarg_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h index de17f0c..13ef511 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -74,5 +74,6 @@ int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, #endif int cleanup_fd, uint64_t *action_handlep); int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); +boolean_t dmu_objset_is_receiving(objset_t *os); #endif /* _DMU_SEND_H */ -- cgit v1.1