diff options
author | mav <mav@FreeBSD.org> | 2018-04-16 03:32:41 +0000 |
---|---|---|
committer | mav <mav@FreeBSD.org> | 2018-04-16 03:32:41 +0000 |
commit | 13f49c3c8f0f20b613f308073363007c7d62ea57 (patch) | |
tree | 51aabb1447f1077a7af9776bc016c6fbed3290e8 /cddl/contrib/opensolaris/lib | |
parent | 05908e9a0336c64217262cd3df008e560d0f67e7 (diff) | |
download | FreeBSD-src-13f49c3c8f0f20b613f308073363007c7d62ea57.zip FreeBSD-src-13f49c3c8f0f20b613f308073363007c7d62ea57.tar.gz |
MFC r329732: MFV r329502: 7614 zfs device evacuation/removal
illumos/illumos-gate@5cabbc6b49070407fb9610cfe73d4c0e0dea3e77
https://www.illumos.org/issues/7614:
This project allows top-level vdevs to be removed from the storage pool with
“zpool remove”, reducing the total amount of storage in the pool. This
operation copies all allocated regions of the device to be removed onto other
devices, recording the mapping from old to new location. After the removal is
complete, read and free operations to the removed (now “indirect”) vdev must
be remapped and performed at the new location on disk. The indirect mapping
table is kept in memory whenever the pool is loaded, so there is minimal
performance overhead when doing operations on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become “obsolete” because they are no longer used by any block pointers in
the pool. An entry becomes obsolete when all the blocks that use it are
freed. An entry can also become obsolete when all the snapshots that
reference it are deleted, and the block pointers that reference it have been
“remapped” in all filesystems/zvols (and clones). Whenever an indirect block
is written, all the block pointers in it will be “remapped” to their new
(concrete) locations if possible. This process can be accelerated by using
the “zfs remap” command to proactively rewrite all indirect blocks that
reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of the data
that is copied. This makes the process much faster, but if it were used on
redundant vdevs (i.e. mirror or raidz vdevs), it would be possible to copy
the wrong data, when we have the correct data on e.g. the other side of the
mirror. Therefore, mirror and raidz devices can not be removed.
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Author: Prashanth Sreenivasa <pks@delphix.com>
Diffstat (limited to 'cddl/contrib/opensolaris/lib')
6 files changed, 132 insertions, 20 deletions
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h index 3e15dd1..e382c0e 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. @@ -131,6 +131,7 @@ typedef enum zfs_error { EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ + EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ EZFS_UNKNOWN } zfs_error_t; @@ -267,6 +268,8 @@ extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); +extern int zpool_vdev_remove_cancel(zpool_handle_t *); +extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); @@ -825,6 +828,7 @@ extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); extern int zmount(const char *, const char *, int, char *, char *, int, char *, int); #endif +extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *); #ifdef __cplusplus } diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c index 691fb04..0b51e1cc 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c @@ -3829,6 +3829,24 @@ zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) return (rv); } +int +zfs_remap_indirects(libzfs_handle_t *hdl, const char *fs) +{ + int err; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot remap filesystem '%s' "), fs); + + err = lzc_remap(fs); + + if (err != 0) { + (void) zfs_standard_error(hdl, err, errbuf); + } + + return (err); +} + /* * Creates snapshots. The keys in the snaps nvlist are the snapshots to be * created. diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c index 612f37e..2b428ef 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com> @@ -1334,6 +1334,13 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) (void) zfs_error(hdl, EZFS_BADDEV, msg); break; + case EINVAL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid config; a pool with removing/removed " + "vdevs does not support adding raidz vdevs")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + case EOVERFLOW: /* * This occurrs when one of the devices is below @@ -2664,7 +2671,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, - &islog)) == 0) + &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) @@ -2773,7 +2780,8 @@ zpool_vdev_attach(zpool_handle_t *zhp, break; case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"), + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " + "or pool has removing/removed vdevs"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; @@ -2827,7 +2835,7 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == 0) + NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) @@ -3116,8 +3124,7 @@ out: } /* - * Remove the given device. Currently, this is supported only for hot spares - * and level 2 cache devices. + * Remove the given device. */ int zpool_vdev_remove(zpool_handle_t *zhp, const char *path) @@ -3134,26 +3141,61 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - &islog)) == 0) - return (zfs_error(hdl, EZFS_NODEVICE, msg)); - /* - * XXX - this should just go away. - */ - if (!avail_spare && !l2cache && !islog) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "only inactive hot spares, cache, top-level, " - "or log devices can be removed")); + &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); - } version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog && version < SPA_VERSION_HOLES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgrade to support log removal")); + "pool must be upgraded to support log removal")); return (zfs_error(hdl, EZFS_BADVERSION, msg)); } - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); + if (!islog && !avail_spare && !l2cache && zpool_is_bootable(zhp)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "root pool can not have removed devices, " + "because GRUB does not understand them")); + return (zfs_error(hdl, EINVAL, msg)); + } + + zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) + return (0); + + switch (errno) { + + case EINVAL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid config; all top-level vdevs must " + "have the same sector size and not be raidz.")); + (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); + break; + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Pool busy; removal may already be in progress")); + (void) zfs_error(hdl, EZFS_BUSY, msg); + break; + + default: + (void) zpool_standard_error(hdl, errno, msg); + } + return (-1); +} + +int +zpool_vdev_remove_cancel(zpool_handle_t *zhp) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot cancel removal")); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_cookie = 1; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); @@ -3161,6 +3203,36 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) return (zpool_standard_error(hdl, errno, msg)); } +int +zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path, + uint64_t *sizep) +{ + char msg[1024]; + nvlist_t *tgt; + boolean_t avail_spare, l2cache, islog; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"), + path); + + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, + &islog)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + if (avail_spare || l2cache || islog) { + *sizep = 0; + return (0); + } + + if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "indirect size not available")); + return (zfs_error(hdl, EINVAL, msg)); + } + return (0); +} + /* * Clear the errors for the pool, or the particular device if specified. */ @@ -3188,7 +3260,7 @@ zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (path) { if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, - &l2cache, NULL)) == 0) + &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); /* diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c index 5f5335d..6adab0b 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c @@ -240,6 +240,9 @@ libzfs_error_description(libzfs_handle_t *hdl) return (dgettext(TEXT_DOMAIN, "invalid diff data")); case EZFS_POOLREADONLY: return (dgettext(TEXT_DOMAIN, "pool is read-only")); + case EZFS_NO_PENDING: + return (dgettext(TEXT_DOMAIN, "operation is not " + "in progress")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -487,6 +490,10 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; + /* There is no pending operation to cancel */ + case ESRCH: + zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap); + break; default: zfs_error_aux(hdl, strerror(error)); diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index b1863be..a7c973f 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -286,6 +286,16 @@ lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen) return (0); } +int +lzc_remap(const char *fsname) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL); + nvlist_free(args); + return (error); +} + /* * Creates snapshots. * diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h index c61ad52..5202fd1 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h @@ -47,6 +47,7 @@ enum lzc_dataset_type { LZC_DATSET_TYPE_ZVOL }; +int lzc_remap(const char *fsname); int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *); int lzc_clone(const char *, const char *, nvlist_t *); |