summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormav <mav@FreeBSD.org>2016-01-26 13:14:39 +0000
committermav <mav@FreeBSD.org>2016-01-26 13:14:39 +0000
commitf904c5a333da8e5dc66548da80c974216ef23c97 (patch)
tree0678f1462276d98c1b7765a8f41bf60d4dfddd8e
parent71d7abc46e4defbcf77033c417935b944c13084a (diff)
downloadFreeBSD-src-f904c5a333da8e5dc66548da80c974216ef23c97.zip
FreeBSD-src-f904c5a333da8e5dc66548da80c974216ef23c97.tar.gz
MFV r294814: 6393 zfs receive a full send as a clone
Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Prakash Surya <prakash.surya@delphix.com> Reviewed by: Richard Elling <Richard.Elling@RichardElling.com> Approved by: Dan McDonald <danmcd@omniti.com> Author: Paul Dagnelie <pcd@delphix.com> illumos/illumos-gate@68ecb2ec930c4b0f00acaf8e0abb2b19c4b8b76f This allows doing a full (non-incremental) send and receiving it as a clone of an existing dataset. It can leverage nopwrite to share blocks with the origin. This can be used to change the relationship of datasets on the target. For example, maybe on the source you have: A ---- B ---- C And you have sent to the target a full send of B, and the incremental B->C: B ---- C You later realize that you want to have A on the target. You will have to do a full send of A, but nopwrite can save you space on the target if you receive it as a clone of B, assuming that A and B have some blocks in common: B ---- C \ A
-rw-r--r--cddl/contrib/opensolaris/cmd/zfs/zfs.87
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c158
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h12
4 files changed, 122 insertions, 58 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zfs/zfs.8 b/cddl/contrib/opensolaris/cmd/zfs/zfs.8
index 71ac878..dedfc0a 100644
--- a/cddl/contrib/opensolaris/cmd/zfs/zfs.8
+++ b/cddl/contrib/opensolaris/cmd/zfs/zfs.8
@@ -2841,8 +2841,11 @@ Do not actually receive the stream. This can be useful in conjunction with the
option to verify the name the receive operation would use.
.It Fl o Sy origin Ns = Ns Ar snapshot
Forces the stream to be received as a clone of the given snapshot.
-This is only valid if the stream is an incremental stream whose source
-is the same as the provided origin.
+If the stream is a full send stream, this will create the filesystem
+described by the stream as a clone of the specified snapshot. Which
+snapshot was specified will not affect the success or failure of the
+receive, as long as the snapshot does exist. If the stream is an
+incremental send stream, all the normal verification will be performed.
.It Fl F
Force a rollback of the file system to the most recent snapshot before
performing the receive operation. If receiving an incremental replication
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index ede1555..00f52d6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -158,6 +158,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
return (0);
}
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
uint64_t length)
@@ -181,15 +189,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
(object == dsp->dsa_last_data_object &&
offset > dsp->dsa_last_data_offset));
- /*
- * If we are doing a non-incremental send, then there can't
- * be any data in the dataset we're receiving into. Therefore
- * a free record would simply be a no-op. Save space by not
- * sending it to begin with.
- */
- if (!dsp->dsa_incremental)
- return (0);
-
if (length != -1ULL && offset + length < offset)
length = -1ULL;
@@ -368,10 +367,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
- /* See comment in dump_free(). */
- if (!dsp->dsa_incremental)
- return (0);
-
/*
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
* push it out, since free block aggregation can only be done for
@@ -776,6 +771,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
if (ancestor_zb != NULL) {
drr->drr_u.drr_begin.drr_fromguid =
@@ -799,7 +795,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsp->dsa_off = off;
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
dsp->dsa_pending_op = PENDING_NONE;
- dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags;
dsp->dsa_resume_object = resumeobj;
dsp->dsa_resume_offset = resumeoff;
@@ -1321,7 +1316,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* target fs already exists; recv into temp clone */
/* Can't recv a clone into an existing fs */
- if (flags & DRR_FLAG_CLONE) {
+ if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -1340,6 +1335,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
drba->drba_origin))
return (SET_ERROR(ENOENT));
+ /*
+ * If we're receiving a full send as a clone, and it doesn't
+ * contain all the necessary free records and freeobject
+ * records, reject it.
+ */
+ if (fromguid == 0 && drba->drba_origin &&
+ !(flags & DRR_FLAG_FREERECORDS))
+ return (SET_ERROR(EINVAL));
+
/* Open the parent of tofs */
ASSERT3U(strlen(tofs), <, MAXNAMELEN);
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
@@ -1379,7 +1383,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
- if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
+ if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+ fromguid != 0) {
dsl_dataset_rele(origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENODEV));
@@ -1709,6 +1714,20 @@ struct receive_writer_arg {
uint64_t bytes_read; /* bytes read when current record created */
};
+struct objlist {
+ list_t list; /* List of struct receive_objnode. */
+ /*
+ * Last object looked up. Used to assert that objects are being looked
+ * up in ascending order.
+ */
+ uint64_t last_lookup;
+};
+
+struct receive_objnode {
+ list_node_t node;
+ uint64_t object;
+};
+
struct receive_arg {
objset_t *os;
kthread_t *td;
@@ -1727,12 +1746,7 @@ struct receive_arg {
int err;
boolean_t byteswap;
/* Sorted list of objects not to issue prefetches for. */
- list_t ignore_obj_list;
-};
-
-struct receive_ign_obj_node {
- list_node_t node;
- uint64_t object;
+ struct objlist ignore_objlist;
};
typedef struct guid_map_entry {
@@ -2068,13 +2082,14 @@ receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo)
{
uint64_t obj;
+ int next_err = 0;
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));
for (obj = drrfo->drr_firstobj;
- obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
- (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
+ obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
+ next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
int err;
if (dmu_object_info(rwa->os, obj, NULL) != 0)
@@ -2084,7 +2099,8 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (err != 0)
return (err);
}
-
+ if (next_err != ESRCH)
+ return (next_err);
return (0);
}
@@ -2414,6 +2430,66 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
return (0);
}
+static void
+objlist_create(struct objlist *list)
+{
+ list_create(&list->list, sizeof (struct receive_objnode),
+ offsetof(struct receive_objnode, node));
+ list->last_lookup = 0;
+}
+
+static void
+objlist_destroy(struct objlist *list)
+{
+ for (struct receive_objnode *n = list_remove_head(&list->list);
+ n != NULL; n = list_remove_head(&list->list)) {
+ kmem_free(n, sizeof (*n));
+ }
+ list_destroy(&list->list);
+}
+
+/*
+ * This function looks through the objlist to see if the specified object number
+ * is contained in the objlist. In the process, it will remove all object
+ * numbers in the list that are smaller than the specified object number. Thus,
+ * any lookup of an object number smaller than a previously looked up object
+ * number will always return false; therefore, all lookups should be done in
+ * ascending order.
+ */
+static boolean_t
+objlist_exists(struct objlist *list, uint64_t object)
+{
+ struct receive_objnode *node = list_head(&list->list);
+ ASSERT3U(object, >=, list->last_lookup);
+ list->last_lookup = object;
+ while (node != NULL && node->object < object) {
+ VERIFY3P(node, ==, list_remove_head(&list->list));
+ kmem_free(node, sizeof (*node));
+ node = list_head(&list->list);
+ }
+ return (node != NULL && node->object == object);
+}
+
+/*
+ * The objlist is a list of object numbers stored in ascending order. However,
+ * the insertion of new object numbers does not seek out the correct location to
+ * store a new object number; instead, it appends it to the list for simplicity.
+ * Thus, any users must take care to only insert new object numbers in ascending
+ * order.
+ */
+static void
+objlist_insert(struct objlist *list, uint64_t object)
+{
+ struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+ node->object = object;
+#ifdef ZFS_DEBUG
+ struct receive_objnode *last_object = list_tail(&list->list);
+ uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
+ ASSERT3U(node->object, >, last_objnum);
+#endif
+ list_insert_tail(&list->list, node);
+}
+
/*
* Issue the prefetch reads for any necessary indirect blocks.
*
@@ -2436,13 +2512,7 @@ static void
receive_read_prefetch(struct receive_arg *ra,
uint64_t object, uint64_t offset, uint64_t length)
{
- struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
- while (node != NULL && node->object < object) {
- VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
- kmem_free(node, sizeof (*node));
- node = list_head(&ra->ignore_obj_list);
- }
- if (node == NULL || node->object > object) {
+ if (!objlist_exists(&ra->ignore_objlist, object)) {
dmu_prefetch(ra->os, object, 1, offset, length,
ZIO_PRIORITY_SYNC_READ);
}
@@ -2475,18 +2545,7 @@ receive_read_record(struct receive_arg *ra)
*/
if (err == ENOENT ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
- struct receive_ign_obj_node *node =
- kmem_zalloc(sizeof (*node),
- KM_SLEEP);
- node->object = drro->drr_object;
-#ifdef ZFS_DEBUG
- struct receive_ign_obj_node *last_object =
- list_tail(&ra->ignore_obj_list);
- uint64_t last_objnum = (last_object != NULL ?
- last_object->object : 0);
- ASSERT3U(node->object, >, last_objnum);
-#endif
- list_insert_tail(&ra->ignore_obj_list, node);
+ objlist_insert(&ra->ignore_objlist, drro->drr_object);
err = 0;
}
return (err);
@@ -2704,7 +2763,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
return (0);
}
-
/*
* Read in the stream's records, one by one, and apply them to the pool. There
* are two threads involved; the thread that calls this function will spin up a
@@ -2739,8 +2797,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
sizeof (ra.bytes_read), 1, &ra.bytes_read);
}
- list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
- offsetof(struct receive_ign_obj_node, node));
+ objlist_create(&ra.ignore_objlist);
/* these were verified in dmu_recv_begin */
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -2894,12 +2951,7 @@ out:
}
*voffp = ra.voff;
- for (struct receive_ign_obj_node *n =
- list_remove_head(&ra.ignore_obj_list); n != NULL;
- n = list_remove_head(&ra.ignore_obj_list)) {
- kmem_free(n, sizeof (*n));
- }
- list_destroy(&ra.ignore_obj_list);
+ objlist_destroy(&ra.ignore_objlist);
return (err);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
index e8d6294..8cb6341 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
@@ -25,7 +25,7 @@
/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_IMPL_H
@@ -296,7 +296,6 @@ typedef struct dmu_sendarg {
uint64_t dsa_toguid;
int dsa_err;
dmu_pendop_t dsa_pending_op;
- boolean_t dsa_incremental;
uint64_t dsa_featureflags;
uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
index 20bf545..2a50e19 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_IOCTL_H
@@ -126,6 +126,16 @@ typedef enum dmu_send_resume_token_version {
#define DRR_FLAG_CLONE (1<<0)
#define DRR_FLAG_CI_DATA (1<<1)
+/*
+ * This send stream, if it is a full send, includes the FREE and FREEOBJECT
+ * records that are created by the sending process. This means that the send
+ * stream can be received as a clone, even though it is not an incremental.
+ * This is not implemented as a feature flag, because the receiving side does
+ * not need to have implemented it to receive this stream; it is fully backwards
+ * compatible. We need a flag, though, because full send streams without it
+ * cannot necessarily be received as a clone correctly.
+ */
+#define DRR_FLAG_FREERECORDS (1<<2)
/*
* flags in the drr_checksumflags field in the DRR_WRITE and
OpenPOWER on IntegriCloud