From c998beca600e455ed5c90b088898d49b75c0b301 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 18 Jul 2012 07:48:04 +0000 Subject: Update vendor/opensolaris to last OpenSolaris state (13149:b23a4dab3d50) Add ZFS bits to vendor/opensolaris Obtained from: https://hg.openindiana.org/upstream/oracle/onnv-gate --- OPENSOLARIS.LICENSE | 384 +++ cmd/pyzfs/pyzfs.py | 82 + cmd/sgs/include/alist.h | 24 +- cmd/sgs/include/debug.h | 477 ++- cmd/sgs/include/sgs.h | 126 +- cmd/sgs/messages/sgs.ident | 8 +- cmd/sgs/tools/common/sgsmsg.c | 57 +- cmd/sgs/tools/common/string_table.c | 31 +- cmd/stat/common/statcommon.h | 328 ++ cmd/stat/common/timestamp.c | 54 + cmd/zdb/zdb.c | 3158 +++++++++++++++++++ cmd/zdb/zdb_il.c | 384 +++ cmd/zfs/zfs_iter.c | 464 +++ cmd/zfs/zfs_iter.h | 56 + cmd/zfs/zfs_main.c | 4160 ++++++++++++++++++++++++ cmd/zfs/zfs_util.h | 42 + cmd/zinject/translate.c | 494 +++ cmd/zinject/zinject.c | 972 ++++++ cmd/zinject/zinject.h | 70 + cmd/zlook/zlook.c | 411 +++ cmd/zpool/zpool_iter.c | 252 ++ cmd/zpool/zpool_main.c | 4467 ++++++++++++++++++++++++++ cmd/zpool/zpool_util.c | 86 + cmd/zpool/zpool_util.h | 72 + cmd/zpool/zpool_vdev.c | 1469 +++++++++ cmd/zstreamdump/zstreamdump.c | 429 +++ cmd/ztest/ztest.c | 5604 +++++++++++++++++++++++++++++++++ head/atomic.h | 34 + head/libintl.h | 125 + head/stdio_ext.h | 88 + head/synch.h | 277 ++ head/thread.h | 156 + lib/libdtrace/common/dt_decl.c | 14 +- lib/libdtrace/common/dt_ident.c | 8 +- lib/libdtrace/common/dt_parser.c | 23 +- lib/libdtrace/common/dt_pragma.c | 8 +- lib/libdtrace/common/dt_string.c | 17 - lib/libdtrace/common/dt_string.h | 12 +- lib/libdtrace/common/dt_subr.c | 6 +- lib/libnvpair/libnvpair.c | 1269 ++++++++ lib/libnvpair/libnvpair.h | 194 ++ lib/libnvpair/nvpair_alloc_system.c | 59 + lib/libuutil/common/libuutil.h | 390 +++ lib/libuutil/common/libuutil_common.h | 35 + lib/libuutil/common/libuutil_impl.h | 181 ++ lib/libuutil/common/uu_alloc.c | 135 + lib/libuutil/common/uu_avl.c | 569 ++++ 
lib/libuutil/common/uu_dprintf.c | 128 + lib/libuutil/common/uu_ident.c | 122 + lib/libuutil/common/uu_list.c | 718 +++++ lib/libuutil/common/uu_misc.c | 280 ++ lib/libuutil/common/uu_open.c | 70 + lib/libuutil/common/uu_pname.c | 207 ++ lib/libuutil/common/uu_string.c | 56 + lib/libuutil/common/uu_strtoint.c | 300 ++ lib/libzfs/common/libzfs.h | 705 +++++ lib/libzfs/common/libzfs_changelist.c | 693 ++++ lib/libzfs/common/libzfs_config.c | 370 +++ lib/libzfs/common/libzfs_dataset.c | 4058 ++++++++++++++++++++++++ lib/libzfs/common/libzfs_diff.c | 826 +++++ lib/libzfs/common/libzfs_fru.c | 452 +++ lib/libzfs/common/libzfs_graph.c | 653 ++++ lib/libzfs/common/libzfs_impl.h | 214 ++ lib/libzfs/common/libzfs_import.c | 1688 ++++++++++ lib/libzfs/common/libzfs_mount.c | 1266 ++++++++ lib/libzfs/common/libzfs_pool.c | 3803 ++++++++++++++++++++++ lib/libzfs/common/libzfs_sendrecv.c | 3021 ++++++++++++++++++ lib/libzfs/common/libzfs_status.c | 398 +++ lib/libzfs/common/libzfs_util.c | 1482 +++++++++ lib/libzpool/common/kernel.c | 981 ++++++ lib/libzpool/common/sys/zfs_context.h | 611 ++++ lib/libzpool/common/taskq.c | 303 ++ lib/libzpool/common/util.c | 155 + lib/pyzfs/common/__init__.py | 27 + lib/pyzfs/common/allow.py | 396 +++ lib/pyzfs/common/dataset.py | 234 ++ lib/pyzfs/common/groupspace.py | 28 + lib/pyzfs/common/holds.py | 75 + lib/pyzfs/common/ioctl.c | 543 ++++ lib/pyzfs/common/table.py | 70 + lib/pyzfs/common/unallow.py | 27 + lib/pyzfs/common/userspace.py | 246 ++ lib/pyzfs/common/util.py | 141 + 83 files changed, 52790 insertions(+), 318 deletions(-) create mode 100644 OPENSOLARIS.LICENSE create mode 100644 cmd/pyzfs/pyzfs.py create mode 100644 cmd/stat/common/statcommon.h create mode 100644 cmd/stat/common/timestamp.c create mode 100644 cmd/zdb/zdb.c create mode 100644 cmd/zdb/zdb_il.c create mode 100644 cmd/zfs/zfs_iter.c create mode 100644 cmd/zfs/zfs_iter.h create mode 100644 cmd/zfs/zfs_main.c create mode 100644 cmd/zfs/zfs_util.h create mode 100644 
cmd/zinject/translate.c create mode 100644 cmd/zinject/zinject.c create mode 100644 cmd/zinject/zinject.h create mode 100644 cmd/zlook/zlook.c create mode 100644 cmd/zpool/zpool_iter.c create mode 100644 cmd/zpool/zpool_main.c create mode 100644 cmd/zpool/zpool_util.c create mode 100644 cmd/zpool/zpool_util.h create mode 100644 cmd/zpool/zpool_vdev.c create mode 100644 cmd/zstreamdump/zstreamdump.c create mode 100644 cmd/ztest/ztest.c create mode 100644 head/atomic.h create mode 100644 head/libintl.h create mode 100644 head/stdio_ext.h create mode 100644 head/synch.h create mode 100644 head/thread.h create mode 100644 lib/libnvpair/libnvpair.c create mode 100644 lib/libnvpair/libnvpair.h create mode 100644 lib/libnvpair/nvpair_alloc_system.c create mode 100644 lib/libuutil/common/libuutil.h create mode 100644 lib/libuutil/common/libuutil_common.h create mode 100644 lib/libuutil/common/libuutil_impl.h create mode 100644 lib/libuutil/common/uu_alloc.c create mode 100644 lib/libuutil/common/uu_avl.c create mode 100644 lib/libuutil/common/uu_dprintf.c create mode 100644 lib/libuutil/common/uu_ident.c create mode 100644 lib/libuutil/common/uu_list.c create mode 100644 lib/libuutil/common/uu_misc.c create mode 100644 lib/libuutil/common/uu_open.c create mode 100644 lib/libuutil/common/uu_pname.c create mode 100644 lib/libuutil/common/uu_string.c create mode 100644 lib/libuutil/common/uu_strtoint.c create mode 100644 lib/libzfs/common/libzfs.h create mode 100644 lib/libzfs/common/libzfs_changelist.c create mode 100644 lib/libzfs/common/libzfs_config.c create mode 100644 lib/libzfs/common/libzfs_dataset.c create mode 100644 lib/libzfs/common/libzfs_diff.c create mode 100644 lib/libzfs/common/libzfs_fru.c create mode 100644 lib/libzfs/common/libzfs_graph.c create mode 100644 lib/libzfs/common/libzfs_impl.h create mode 100644 lib/libzfs/common/libzfs_import.c create mode 100644 lib/libzfs/common/libzfs_mount.c create mode 100644 lib/libzfs/common/libzfs_pool.c create mode 
100644 lib/libzfs/common/libzfs_sendrecv.c create mode 100644 lib/libzfs/common/libzfs_status.c create mode 100644 lib/libzfs/common/libzfs_util.c create mode 100644 lib/libzpool/common/kernel.c create mode 100644 lib/libzpool/common/sys/zfs_context.h create mode 100644 lib/libzpool/common/taskq.c create mode 100644 lib/libzpool/common/util.c create mode 100644 lib/pyzfs/common/__init__.py create mode 100644 lib/pyzfs/common/allow.py create mode 100644 lib/pyzfs/common/dataset.py create mode 100644 lib/pyzfs/common/groupspace.py create mode 100644 lib/pyzfs/common/holds.py create mode 100644 lib/pyzfs/common/ioctl.c create mode 100644 lib/pyzfs/common/table.py create mode 100644 lib/pyzfs/common/unallow.py create mode 100644 lib/pyzfs/common/userspace.py create mode 100644 lib/pyzfs/common/util.py diff --git a/OPENSOLARIS.LICENSE b/OPENSOLARIS.LICENSE new file mode 100644 index 0000000..da23621 --- /dev/null +++ b/OPENSOLARIS.LICENSE @@ -0,0 +1,384 @@ +Unless otherwise noted, all files in this distribution are released +under the Common Development and Distribution License (CDDL). +Exceptions are noted within the associated source files. + +-------------------------------------------------------------------- + + +COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0 + +1. Definitions. + + 1.1. "Contributor" means each individual or entity that creates + or contributes to the creation of Modifications. + + 1.2. "Contributor Version" means the combination of the Original + Software, prior Modifications used by a Contributor (if any), + and the Modifications made by that particular Contributor. + + 1.3. "Covered Software" means (a) the Original Software, or (b) + Modifications, or (c) the combination of files containing + Original Software with files containing Modifications, in + each case including portions thereof. + + 1.4. "Executable" means the Covered Software in any form other + than Source Code. + + 1.5. 
"Initial Developer" means the individual or entity that first + makes Original Software available under this License. + + 1.6. "Larger Work" means a work which combines Covered Software or + portions thereof with code not governed by the terms of this + License. + + 1.7. "License" means this document. + + 1.8. "Licensable" means having the right to grant, to the maximum + extent possible, whether at the time of the initial grant or + subsequently acquired, any and all of the rights conveyed + herein. + + 1.9. "Modifications" means the Source Code and Executable form of + any of the following: + + A. Any file that results from an addition to, deletion from or + modification of the contents of a file containing Original + Software or previous Modifications; + + B. Any new file that contains any part of the Original + Software or previous Modifications; or + + C. Any new file that is contributed or otherwise made + available under the terms of this License. + + 1.10. "Original Software" means the Source Code and Executable + form of computer software code that is originally released + under this License. + + 1.11. "Patent Claims" means any patent claim(s), now owned or + hereafter acquired, including without limitation, method, + process, and apparatus claims, in any patent Licensable by + grantor. + + 1.12. "Source Code" means (a) the common form of computer software + code in which modifications are made and (b) associated + documentation included in or with such code. + + 1.13. "You" (or "Your") means an individual or a legal entity + exercising rights under, and complying with all of the terms + of, this License. For legal entities, "You" includes any + entity which controls, is controlled by, or is under common + control with You. 
For purposes of this definition, + "control" means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by + contract or otherwise, or (b) ownership of more than fifty + percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants. + + 2.1. The Initial Developer Grant. + + Conditioned upon Your compliance with Section 3.1 below and + subject to third party intellectual property claims, the Initial + Developer hereby grants You a world-wide, royalty-free, + non-exclusive license: + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Initial Developer, to use, + reproduce, modify, display, perform, sublicense and + distribute the Original Software (or portions thereof), + with or without Modifications, and/or as part of a Larger + Work; and + + (b) under Patent Claims infringed by the making, using or + selling of Original Software, to make, have made, use, + practice, sell, and offer for sale, and/or otherwise + dispose of the Original Software (or portions thereof). + + (c) The licenses granted in Sections 2.1(a) and (b) are + effective on the date Initial Developer first distributes + or otherwise makes the Original Software available to a + third party under the terms of this License. + + (d) Notwithstanding Section 2.1(b) above, no patent license is + granted: (1) for code that You delete from the Original + Software, or (2) for infringements caused by: (i) the + modification of the Original Software, or (ii) the + combination of the Original Software with other software + or devices. + + 2.2. Contributor Grant. 
+ + Conditioned upon Your compliance with Section 3.1 below and + subject to third party intellectual property claims, each + Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Contributor to use, reproduce, + modify, display, perform, sublicense and distribute the + Modifications created by such Contributor (or portions + thereof), either on an unmodified basis, with other + Modifications, as Covered Software and/or as part of a + Larger Work; and + + (b) under Patent Claims infringed by the making, using, or + selling of Modifications made by that Contributor either + alone and/or in combination with its Contributor Version + (or portions of such combination), to make, use, sell, + offer for sale, have made, and/or otherwise dispose of: + (1) Modifications made by that Contributor (or portions + thereof); and (2) the combination of Modifications made by + that Contributor with its Contributor Version (or portions + of such combination). + + (c) The licenses granted in Sections 2.2(a) and 2.2(b) are + effective on the date Contributor first distributes or + otherwise makes the Modifications available to a third + party. + + (d) Notwithstanding Section 2.2(b) above, no patent license is + granted: (1) for any code that Contributor has deleted + from the Contributor Version; (2) for infringements caused + by: (i) third party modifications of Contributor Version, + or (ii) the combination of Modifications made by that + Contributor with other software (except as part of the + Contributor Version) or other devices; or (3) under Patent + Claims infringed by Covered Software in the absence of + Modifications made by that Contributor. + +3. Distribution Obligations. + + 3.1. Availability of Source Code. 
+ + Any Covered Software that You distribute or otherwise make + available in Executable form must also be made available in Source + Code form and that Source Code form must be distributed only under + the terms of this License. You must include a copy of this + License with every copy of the Source Code form of the Covered + Software You distribute or otherwise make available. You must + inform recipients of any such Covered Software in Executable form + as to how they can obtain such Covered Software in Source Code + form in a reasonable manner on or through a medium customarily + used for software exchange. + + 3.2. Modifications. + + The Modifications that You create or to which You contribute are + governed by the terms of this License. You represent that You + believe Your Modifications are Your original creation(s) and/or + You have sufficient rights to grant the rights conveyed by this + License. + + 3.3. Required Notices. + + You must include a notice in each of Your Modifications that + identifies You as the Contributor of the Modification. You may + not remove or alter any copyright, patent or trademark notices + contained within the Covered Software, or any notices of licensing + or any descriptive text giving attribution to any Contributor or + the Initial Developer. + + 3.4. Application of Additional Terms. + + You may not offer or impose any terms on any Covered Software in + Source Code form that alters or restricts the applicable version + of this License or the recipients' rights hereunder. You may + choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of + Covered Software. However, you may do so only on Your own behalf, + and not on behalf of the Initial Developer or any Contributor. 
+ You must make it absolutely clear that any such warranty, support, + indemnity or liability obligation is offered by You alone, and You + hereby agree to indemnify the Initial Developer and every + Contributor for any liability incurred by the Initial Developer or + such Contributor as a result of warranty, support, indemnity or + liability terms You offer. + + 3.5. Distribution of Executable Versions. + + You may distribute the Executable form of the Covered Software + under the terms of this License or under the terms of a license of + Your choice, which may contain terms different from this License, + provided that You are in compliance with the terms of this License + and that the license for the Executable form does not attempt to + limit or alter the recipient's rights in the Source Code form from + the rights set forth in this License. If You distribute the + Covered Software in Executable form under a different license, You + must make it absolutely clear that any terms which differ from + this License are offered by You alone, not by the Initial + Developer or Contributor. You hereby agree to indemnify the + Initial Developer and every Contributor for any liability incurred + by the Initial Developer or such Contributor as a result of any + such terms You offer. + + 3.6. Larger Works. + + You may create a Larger Work by combining Covered Software with + other code not governed by the terms of this License and + distribute the Larger Work as a single product. In such a case, + You must make sure the requirements of this License are fulfilled + for the Covered Software. + +4. Versions of the License. + + 4.1. New Versions. + + Sun Microsystems, Inc. is the initial license steward and may + publish revised and/or new versions of this License from time to + time. Each version will be given a distinguishing version number. + Except as provided in Section 4.3, no one other than the license + steward has the right to modify this License. + + 4.2. 
Effect of New Versions. + + You may always continue to use, distribute or otherwise make the + Covered Software available under the terms of the version of the + License under which You originally received the Covered Software. + If the Initial Developer includes a notice in the Original + Software prohibiting it from being distributed or otherwise made + available under any subsequent version of the License, You must + distribute and make the Covered Software available under the terms + of the version of the License under which You originally received + the Covered Software. Otherwise, You may also choose to use, + distribute or otherwise make the Covered Software available under + the terms of any subsequent version of the License published by + the license steward. + + 4.3. Modified Versions. + + When You are an Initial Developer and You want to create a new + license for Your Original Software, You may create and use a + modified version of this License if You: (a) rename the license + and remove any references to the name of the license steward + (except to note that the license differs from this License); and + (b) otherwise make it clear that the license contains terms which + differ from this License. + +5. DISCLAIMER OF WARRANTY. + + COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" + BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, + INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED + SOFTWARE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR + PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND + PERFORMANCE OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY + COVERED SOFTWARE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE + INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY + NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF + WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF + ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS + DISCLAIMER. + +6. 
TERMINATION. + + 6.1. This License and the rights granted hereunder will terminate + automatically if You fail to comply with terms herein and fail to + cure such breach within 30 days of becoming aware of the breach. + Provisions which, by their nature, must remain in effect beyond + the termination of this License shall survive. + + 6.2. If You assert a patent infringement claim (excluding + declaratory judgment actions) against Initial Developer or a + Contributor (the Initial Developer or Contributor against whom You + assert such claim is referred to as "Participant") alleging that + the Participant Software (meaning the Contributor Version where + the Participant is a Contributor or the Original Software where + the Participant is the Initial Developer) directly or indirectly + infringes any patent, then any and all rights granted directly or + indirectly to You by such Participant, the Initial Developer (if + the Initial Developer is not the Participant) and all Contributors + under Sections 2.1 and/or 2.2 of this License shall, upon 60 days + notice from Participant terminate prospectively and automatically + at the expiration of such 60 day notice period, unless if within + such 60 day period You withdraw Your claim with respect to the + Participant Software against such Participant either unilaterally + or pursuant to a written agreement with Participant. + + 6.3. In the event of termination under Sections 6.1 or 6.2 above, + all end user licenses that have been validly granted by You or any + distributor hereunder prior to termination (excluding licenses + granted to You by any distributor) shall survive termination. + +7. LIMITATION OF LIABILITY. 
+ + UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT + (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE + INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF + COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE + LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR + CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT + LIMITATION, DAMAGES FOR LOST PROFITS, LOSS OF GOODWILL, WORK + STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER + COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN + INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF + LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL + INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT + APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO + NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR + CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT + APPLY TO YOU. + +8. U.S. GOVERNMENT END USERS. + + The Covered Software is a "commercial item," as that term is + defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial + computer software" (as that term is defined at 48 + C.F.R. 252.227-7014(a)(1)) and "commercial computer software + documentation" as such terms are used in 48 C.F.R. 12.212 + (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 + C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all + U.S. Government End Users acquire Covered Software with only those + rights set forth herein. This U.S. Government Rights clause is in + lieu of, and supersedes, any other FAR, DFAR, or other clause or + provision that addresses Government rights in computer software + under this License. + +9. MISCELLANEOUS. + + This License represents the complete agreement concerning subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. 
This License shall be governed + by the law of the jurisdiction specified in a notice contained + within the Original Software (except to the extent applicable law, + if any, provides otherwise), excluding such jurisdiction's + conflict-of-law provisions. Any litigation relating to this + License shall be subject to the jurisdiction of the courts located + in the jurisdiction and venue specified in a notice contained + within the Original Software, with the losing party responsible + for costs, including, without limitation, court costs and + reasonable attorneys' fees and expenses. The application of the + United Nations Convention on Contracts for the International Sale + of Goods is expressly excluded. Any law or regulation which + provides that the language of a contract shall be construed + against the drafter shall not apply to this License. You agree + that You alone are responsible for compliance with the United + States export administration regulations (and the export control + laws and regulation of any other countries) when You use, + distribute or otherwise make available any Covered Software. + +10. RESPONSIBILITY FOR CLAIMS. + + As between Initial Developer and the Contributors, each party is + responsible for claims and damages arising, directly or + indirectly, out of its utilization of rights under this License + and You agree to work with Initial Developer and Contributors to + distribute such responsibility on an equitable basis. Nothing + herein is intended or shall be deemed to constitute any admission + of liability. + +-------------------------------------------------------------------- + +NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND +DISTRIBUTION LICENSE (CDDL) + +For Covered Software in this distribution, this License shall +be governed by the laws of the State of California (excluding +conflict-of-law provisions). 
+ +Any litigation relating to this License shall be subject to the +jurisdiction of the Federal Courts of the Northern District of +California and the state courts of the State of California, with +venue lying in Santa Clara County, California. diff --git a/cmd/pyzfs/pyzfs.py b/cmd/pyzfs/pyzfs.py new file mode 100644 index 0000000..2088993 --- /dev/null +++ b/cmd/pyzfs/pyzfs.py @@ -0,0 +1,82 @@ +#! /usr/bin/python2.6 -S +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. +# + +# Note, we want SIGINT (control-c) to exit the process quietly, to mimic +# the standard behavior of C programs. The best we can do with pure +# Python is to run with -S (to disable "import site"), and start our +# program with a "try" statement. Hopefully nobody hits ^C before our +# try statement is executed. + +try: + import site + import gettext + import zfs.util + import zfs.ioctl + import sys + import errno + import solaris.misc + + """This is the main script for doing zfs subcommands. It doesn't know + what subcommands there are, it just looks for a module zfs.<subcommand>
+ that implements that subcommand.""" + + try: + _ = gettext.translation("SUNW_OST_OSCMD", "/usr/lib/locale", + fallback=True).gettext + except: + _ = solaris.misc.gettext + + if len(sys.argv) < 2: + sys.exit(_("missing subcommand argument")) + + zfs.ioctl.set_cmdstr(" ".join(["zfs"] + sys.argv[1:])) + + try: + # import zfs.<subcommand> + # subfunc = zfs.<subcommand>.do_<subcommand> + + subcmd = sys.argv[1] + __import__("zfs." + subcmd) + submod = getattr(zfs, subcmd) + subfunc = getattr(submod, "do_" + subcmd) + except (ImportError, AttributeError): + sys.exit(_("invalid subcommand")) + + try: + subfunc() + except zfs.util.ZFSError, e: + print(e) + sys.exit(1) + +except IOError, e: + import errno + import sys + + if e.errno == errno.EPIPE: + sys.exit(1) + raise +except KeyboardInterrupt: + import sys + + sys.exit(1) diff --git a/cmd/sgs/include/alist.h b/cmd/sgs/include/alist.h index e92811f..c5071a8 100644 --- a/cmd/sgs/include/alist.h +++ b/cmd/sgs/include/alist.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
* * Define an Alist, a list maintained as a reallocable array, and a for() loop @@ -32,8 +32,6 @@ #ifndef _ALIST_H #define _ALIST_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -138,6 +136,13 @@ typedef struct { void *apl_data[1]; /* data area: (arrcnt * size) bytes */ } APlist; +#ifdef _SYSCALL32 /* required by librtld_db */ +typedef struct { + Elf32_Word apl_arritems; + Elf32_Word apl_nitems; + Elf32_Addr apl_data[1]; +} APlist32; +#endif /* _SYSCALL32 */ /* * The ALIST_OFF_DATA and APLIST_OFF_DATA macros give the byte offset @@ -224,7 +229,7 @@ typedef struct { * Possible values returned by aplist_test() */ typedef enum { - ALE_ALLOCFAIL = 0, /* Memory allocation error */ + ALE_ALLOCFAIL = 0, /* memory allocation error */ ALE_EXISTS = 1, /* alist entry already exists */ ALE_NOTFND = 2, /* item not found and insert not required */ ALE_CREATE = 3 /* alist entry created */ @@ -244,11 +249,14 @@ typedef enum { ((void *)((_off) + (char *)(_lp))) /* - * # of items currently found in a list. These macros handle the case - * where the list has not been allocated yet. + * The number of items currently found in a list (nitems), and the total number + * of slots in the current data allocation (arritems). These macros handle the + * case where the list has not been allocated yet. */ -#define alist_nitems(_lp) (((_lp) == NULL) ? 0 : (_lp)->al_nitems) -#define aplist_nitems(_lp) (((_lp) == NULL) ? 0 : (_lp)->apl_nitems) +#define alist_nitems(_lp) (((_lp) == NULL) ? 0 : (_lp)->al_nitems) +#define aplist_nitems(_lp) (((_lp) == NULL) ? 0 : (_lp)->apl_nitems) +#define alist_arritems(_lp) (((_lp) == NULL) ? 0 : (_lp)->al_arritems) +#define aplist_arritems(_lp) (((_lp) == NULL) ? 
0 : (_lp)->apl_arritems) extern void *alist_append(Alist **, const void *, size_t, Aliste); diff --git a/cmd/sgs/include/debug.h b/cmd/sgs/include/debug.h index 0a42f8d..9db2025 100644 --- a/cmd/sgs/include/debug.h +++ b/cmd/sgs/include/debug.h @@ -20,15 +20,12 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _DEBUG_H #define _DEBUG_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Global include file for lddbg debugging. * @@ -45,6 +42,7 @@ * start with the `Elf_' prefix. These latter routines are the only * routines used by the elfdump(1) utility. */ +#include #include #include #include @@ -55,13 +53,12 @@ extern "C" { #endif /* - * Define Dbg_*() interface flags. These flags direct the debugging routine to - * generate different diagnostics, thus the strings themselves are maintained + * Define Dbg_*() interface values. These values direct the debugging routine + * to generate different diagnostics, thus the strings themselves are maintained * in the debugging library. 
*/ #define DBG_SUP_ENVIRON 1 #define DBG_SUP_CMDLINE 2 -#define DBG_SUP_DEFAULT 3 #define DBG_CONF_IGNORE 1 /* configuration processing errors */ #define DBG_CONF_VERSION 2 @@ -87,18 +84,22 @@ extern "C" { #define DBG_DLSYM_SELF 3 #define DBG_DLSYM_PROBE 4 #define DBG_DLSYM_SINGLETON 5 +#define DBG_DLSYM_NUM DBG_DLSYM_SINGLETON + 1 #define DBG_DLCLOSE_NULL 0 #define DBG_DLCLOSE_IGNORE 1 #define DBG_DLCLOSE_RESCAN 2 -#define DBG_WAIT_INIT 1 -#define DBG_WAIT_FINI 2 -#define DBG_WAIT_SYMBOL 3 - #define DBG_SYM_REDUCE_GLOBAL 1 /* reporting global symbols to local */ #define DBG_SYM_REDUCE_RETAIN 2 /* reporting non reduced local syms */ +#define DBG_AUD_CALL 1 /* original call to auditor */ +#define DBG_AUD_RET 2 /* return from auditor diagnostic */ + +#define DBG_AUD_LOCAL 0 /* auditor is local */ +#define DBG_AUD_GLOBAL 1 /* auditor is global */ +#define DBG_AUD_PRELOAD 2 /* auditor is preloaded */ + /* * Group handle operations - passed to Dbg_file_hdl_title(). Indicate why * handle dependencies are being manipulated. @@ -139,12 +140,18 @@ extern "C" { #define DBG_BINFO_REF_PARENT 0x2000 /* reference to PARENT */ #define DBG_BINFO_REF_MSK 0xf000 - -#define DBG_CAP_INITIAL 0 -#define DBG_CAP_IGNORE 1 -#define DBG_CAP_OLD 2 -#define DBG_CAP_NEW 3 -#define DBG_CAP_RESOLVED 4 +/* + * ld.so.1(1) symbol capabilities processing. + */ +#define DBG_CAP_DEFAULT 0 +#define DBG_CAP_USED 1 +#define DBG_CAP_CANDIDATE 2 +#define DBG_CAP_REJECTED 3 +#define DBG_CAP_HW_1 4 +#define DBG_CAP_SF_1 5 +#define DBG_CAP_HW_2 6 +#define DBG_CAP_PLAT 7 +#define DBG_CAP_MACH 8 #define DBG_REL_START 1 #define DBG_REL_FINISH 2 @@ -153,23 +160,53 @@ extern "C" { #define DBG_NL_STD 0 /* newline controllers - standard and */ #define DBG_NL_FRC 2 /* forced. 
*/ -#define DBG_BNDREJ_NODIR 0 /* bind rejected, direct to nodirect */ -#define DBG_BNDREJ_SINGLE 1 /* bind rejected, singleton without */ +#define DBG_BNDREJ_DIRECT 0 /* bind rejected, direct to nodirect */ +#define DBG_BNDREJ_GROUP 1 /* bind rejected, group to nodirect */ +#define DBG_BNDREJ_SINGLE 2 /* bind rejected, singleton without */ /* default search model */ #define DBG_BNDREJ_NUM DBG_BNDREJ_SINGLE /* + * Dbg_state_str() is used to obtain commonly used "state transition" + * strings used in various debugging output. + */ +#define DBG_STATE_ADD 0 /* add */ +#define DBG_STATE_CURRENT 1 /* current */ +#define DBG_STATE_EXCLUDE 2 /* exclude */ +#define DBG_STATE_IGNORED 3 /* ignored */ +#define DBG_STATE_MOD_BEFORE 4 /* modify (before) */ +#define DBG_STATE_MOD_AFTER 5 /* modify (after) */ +#define DBG_STATE_NEW 6 /* new */ +#define DBG_STATE_NEW_IMPLICIT 7 /* new (implicit) */ +#define DBG_STATE_RESET 8 /* reset */ +#define DBG_STATE_ORIGINAL 9 /* original */ +#define DBG_STATE_RESOLVED 10 /* resolved */ + +#define DBG_STATE_NUM 11 +typedef uint_t dbg_state_t; +extern const char *Dbg_state_str(dbg_state_t); + +/* * Define a debug descriptor, and a user macro that inspects the descriptor as * a means of triggering a class of diagnostic output. */ typedef struct { uint_t d_class; /* debugging classes */ - uint_t d_extra; /* extra information for classes */ - APlist *d_list; /* associated strings */ + uint_t d_extra; /* extra public information */ + APlist *d_list; /* accepted link-map list names */ + struct timeval d_totaltime; /* total time since entry - */ + /* gettimeofday(3c) */ + struct timeval d_deltatime; /* delta time since last diagnostic - */ + /* gettimeofday(3c) */ } Dbg_desc; extern Dbg_desc *dbg_desc; +/* + * Macros used to avoid calls to liblddbg unless debugging is enabled. + * liblddbg is lazy loaded --- this prevents it from happening unless + * it will actually be used. 
+ */ #define DBG_ENABLED (dbg_desc->d_class) #define DBG_CALL(func) if (DBG_ENABLED) func @@ -180,26 +217,51 @@ extern Dbg_desc *dbg_desc; * may be interpreted by the debugging library itself or from the callers * dbg_print() routine. */ -#define DBG_E_DETAIL 0x0001 /* add detail to a class */ -#define DBG_E_LONG 0x0002 /* use long names (ie. no truncation) */ - -#define DBG_E_STDNL 0x0010 /* standard newline indicator */ +#define DBG_E_DETAIL 0x00000001 /* add detail to a class */ +#define DBG_E_LONG 0x00000002 /* use long names (ie. no truncation) */ +#define DBG_E_DEMANGLE 0x00000004 /* demangle symbol names */ +#define DBG_E_STDNL 0x00000008 /* standard newline indicator */ +#define DBG_E_HELP 0x00000010 /* help requested */ +#define DBG_E_HELP_EXIT 0x00000020 /* hint: user should exit after help */ +#define DBG_E_TTIME 0x00000040 /* prepend total time */ +#define DBG_E_DTIME 0x00000080 /* prepend delta time */ +#define DBG_E_RESET 0x00000100 /* reset times */ + +/* ld only */ +#define DBG_E_SNAME 0x00001000 /* prepend simple name */ +#define DBG_E_FNAME 0x00002000 /* prepend full name */ +#define DBG_E_CLASS 0x00004000 /* prepend ELF class */ + +/* ld.so.1 only */ +#define DBG_E_LMID 0x00100000 /* prepend link-map id */ +#define DBG_E_LMID_LDSO 0x00200000 /* show ldso link-map list */ +#define DBG_E_LMID_ALL 0x00400000 /* show all non-ldso link-map lists */ +#define DBG_E_LMID_ALT 0x00800000 /* show all ALT link-map lists */ +#define DBG_E_LMID_BASE 0x01000000 /* show BASE link-map list */ -#define DBG_E_SNAME 0x0100 /* prepend simple name (ld only) */ -#define DBG_E_FNAME 0x0200 /* prepend full name (ld only) */ -#define DBG_E_CLASS 0x0400 /* prepend ELF class (ld only) */ -#define DBG_E_LMID 0x0800 /* prepend link-map id (ld.so.1 only) */ -#define DBG_E_DEMANGLE 0x1000 /* demangle symbol names */ #define DBG_NOTDETAIL() !(dbg_desc->d_extra & DBG_E_DETAIL) #define DBG_NOTLONG() !(dbg_desc->d_extra & DBG_E_LONG) +#define DBG_ISDEMANGLE() \ + (dbg_desc->d_extra 
& DBG_E_DEMANGLE) + +#define DBG_TOTALTIME (dbg_desc->d_totaltime) +#define DBG_DELTATIME (dbg_desc->d_deltatime) + +#define DBG_ISTTIME() (dbg_desc->d_extra & DBG_E_TTIME) +#define DBG_ISDTIME() (dbg_desc->d_extra & DBG_E_DTIME) +#define DBG_ISTIME() (dbg_desc->d_extra & (DBG_E_TTIME | DBG_E_DTIME)) +#define DBG_NOTTIME() !(dbg_desc->d_extra & (DBG_E_TTIME | DBG_E_DTIME)) + +#define DBG_ISRESET() (dbg_desc->d_extra & DBG_E_RESET) +#define DBG_ONRESET() (dbg_desc->d_extra |= DBG_E_RESET) +#define DBG_OFFRESET() (dbg_desc->d_extra &= ~DBG_E_RESET) + #define DBG_ISSNAME() (dbg_desc->d_extra & DBG_E_SNAME) #define DBG_ISFNAME() (dbg_desc->d_extra & DBG_E_FNAME) #define DBG_ISCLASS() (dbg_desc->d_extra & DBG_E_CLASS) #define DBG_ISLMID() (dbg_desc->d_extra & DBG_E_LMID) -#define DBG_ISDEMANGLE() \ - (dbg_desc->d_extra & DBG_E_DEMANGLE) /* * Print routine, this must be supplied by the application. The initial @@ -209,7 +271,19 @@ extern Dbg_desc *dbg_desc; /* PRINTFLIKE2 */ extern void dbg_print(Lm_list *, const char *, ...); -extern uintptr_t Dbg_setup(const char *, Dbg_desc *); +/* + * Initialization routine, called before any other Dbg routines to + * establish the necessary state. + */ +typedef enum { DBG_CALLER_LD, DBG_CALLER_RTLD } dbg_setup_caller_t; +extern int Dbg_setup(dbg_setup_caller_t, const char *, + Dbg_desc *, const char **); + +/* Call dbg_print() to produce linker version output */ +extern void Dbg_version(void); + +/* Call dbg_print() to produce help output */ +extern void Dbg_help(void); /* * Establish ELF32 and ELF64 class Dbg_*() interfaces. 
@@ -225,12 +299,28 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_bind_reject Dbg64_bind_reject #define Dbg_bind_weak Dbg64_bind_weak -#define Dbg_cap_val_hw1 Dbg64_cap_val_hw1 -#define Dbg_cap_hw_candidate Dbg64_cap_hw_candidate -#define Dbg_cap_hw_filter Dbg64_cap_hw_filter -#define Dbg_cap_mapfile Dbg64_cap_mapfile -#define Dbg_cap_sec_entry Dbg64_cap_sec_entry +#define Dbg_cap_candidate Dbg64_cap_candidate +#define Dbg_cap_filter Dbg64_cap_filter +#define Dbg_cap_id Dbg64_cap_id +#define Dbg_cap_identical Dbg64_cap_identical +#define Dbg_cap_mapfile_title Dbg64_cap_mapfile_title +#define Dbg_cap_post_title Dbg64_cap_post_title #define Dbg_cap_sec_title Dbg64_cap_sec_title +#define Dbg_cap_val Dbg64_cap_val +#define Dbg_cap_ptr_entry Dbg64_cap_ptr_entry +#define Dbg_cap_val_entry Dbg64_cap_val_entry + +#define Dbg_dl_dladdr Dbg64_dl_dladdr +#define Dbg_dl_dlclose Dbg64_dl_dlclose +#define Dbg_dl_dldump Dbg64_dl_dldump +#define Dbg_dl_dlerror Dbg64_dl_dlerror +#define Dbg_dl_dlinfo Dbg64_dl_dlinfo +#define Dbg_dl_dlopen Dbg64_dl_dlopen +#define Dbg_dl_dlsym Dbg64_dl_dlsym +#define Dbg_dl_iphdr_enter Dbg64_dl_iphdr_enter +#define Dbg_dl_iphdr_callback Dbg64_dl_iphdr_callback +#define Dbg_dl_iphdr_mapchange Dbg64_dl_iphdr_mapchange +#define Dbg_dl_iphdr_unmap_ret Dbg64_dl_iphdr_unmap_ret #define Dbg_ent_entry Dbg64_ent_entry #define Dbg_ent_print Dbg64_ent_print @@ -241,15 +331,14 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_file_ar_rescan Dbg64_file_ar_rescan #define Dbg_file_bind_entry Dbg64_file_bind_entry #define Dbg_file_bindings Dbg64_file_bindings +#define Dbg_file_bindings_done Dbg64_file_bindings_done #define Dbg_file_cleanup Dbg64_file_cleanup #define Dbg_file_cntl Dbg64_file_cntl #define Dbg_file_config_dis Dbg64_file_config_dis #define Dbg_file_config_obj Dbg64_file_config_obj #define Dbg_file_del_rescan Dbg64_file_del_rescan #define Dbg_file_delete Dbg64_file_delete -#define Dbg_file_dlclose 
Dbg64_file_dlclose -#define Dbg_file_dldump Dbg64_file_dldump -#define Dbg_file_dlopen Dbg64_file_dlopen +#define Dbg_file_deferred Dbg64_file_deferred #define Dbg_file_elf Dbg64_file_elf #define Dbg_file_filtee Dbg64_file_filtee #define Dbg_file_filter Dbg64_file_filter @@ -260,6 +349,7 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_file_hdl_title Dbg64_file_hdl_title #define Dbg_file_lazyload Dbg64_file_lazyload #define Dbg_file_ldso Dbg64_file_ldso +#define Dbg_file_mmapobj Dbg64_file_mmapobj #define Dbg_file_mode_promote Dbg64_file_mode_promote #define Dbg_file_modified Dbg64_file_modified #define Dbg_file_needed Dbg64_file_needed @@ -275,7 +365,7 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_libs_audit Dbg64_libs_audit #define Dbg_libs_find Dbg64_libs_find #define Dbg_libs_found Dbg64_libs_found -#define Dbg_libs_ignore Dbg64_libs_ignore +#define Dbg_libs_insecure Dbg64_libs_insecure #define Dbg_libs_init Dbg64_libs_init #define Dbg_libs_l Dbg64_libs_l #define Dbg_libs_path Dbg64_libs_path @@ -284,22 +374,28 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_libs_yp Dbg64_libs_yp #define Dbg_libs_ylu Dbg64_libs_ylu -#define Dbg_map_dash Dbg64_map_dash +#define Dbg_map_cexp_id Dbg64_map_cexp_id +#define Dbg_map_dv Dbg64_map_dv +#define Dbg_map_dv_entry Dbg64_map_dv_entry #define Dbg_map_ent Dbg64_map_ent +#define Dbg_map_ent_ord_title Dbg64_map_ent_ord_title +#define Dbg_map_hdr_noalloc Dbg64_map_hdr_noalloc #define Dbg_map_parse Dbg64_map_parse -#define Dbg_map_pipe Dbg64_map_pipe +#define Dbg_map_pass Dbg64_map_pass +#define Dbg_map_post_title Dbg64_map_post_title #define Dbg_map_seg Dbg64_map_seg -#define Dbg_map_set_atsign Dbg64_map_set_atsign -#define Dbg_map_set_equal Dbg64_map_set_equal +#define Dbg_map_seg_order Dbg64_map_seg_order +#define Dbg_map_seg_os_order Dbg64_map_seg_os_order #define Dbg_map_size_new Dbg64_map_size_new #define Dbg_map_size_old Dbg64_map_size_old -#define 
Dbg_map_sort_fini Dbg64_map_sort_fini -#define Dbg_map_sort_orig Dbg64_map_sort_orig +#define Dbg_map_sort_seg Dbg64_map_sort_seg +#define Dbg_map_sort_title Dbg64_map_sort_title #define Dbg_map_symbol Dbg64_map_symbol #define Dbg_map_version Dbg64_map_version #define Dbg_move_adjexpandreloc Dbg64_move_adjexpandreloc #define Dbg_move_adjmovereloc Dbg64_move_adjmovereloc +#define Dbg_move_bad Dbg64_move_bad #define Dbg_move_data Dbg64_move_data #define Dbg_move_entry1 Dbg64_move_entry1 #define Dbg_move_entry2 Dbg64_move_entry2 @@ -329,13 +425,16 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_reloc_sloppycomdat Dbg64_reloc_sloppycomdat #define Dbg_sec_added Dbg64_sec_added +#define Dbg_sec_backing Dbg64_sec_backing #define Dbg_sec_created Dbg64_sec_created #define Dbg_sec_discarded Dbg64_sec_discarded #define Dbg_sec_genstr_compress Dbg64_sec_genstr_compress #define Dbg_sec_group Dbg64_sec_group +#define Dbg_sec_gnu_comdat Dbg64_sec_gnu_comdat #define Dbg_sec_in Dbg64_sec_in #define Dbg_sec_order_error Dbg64_sec_order_error #define Dbg_sec_order_list Dbg64_sec_order_list +#define Dbg_sec_redirected Dbg64_sec_redirected #define Dbg_sec_strtab Dbg64_sec_strtab #define Dbg_sec_unsup_strmerge Dbg64_sec_unsup_strmerge @@ -353,17 +452,24 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_support_action Dbg64_support_action #define Dbg_support_load Dbg64_support_load #define Dbg_support_req Dbg64_support_req +#define Dbg_support_vnone Dbg64_support_vnone #define Dbg_syminfo_entry Dbg64_syminfo_entry #define Dbg_syminfo_title Dbg64_syminfo_title #define Dbg_syms_ar_checking Dbg64_syms_ar_checking -#define Dbg_syms_ar_entry Dbg64_syms_ar_entry +#define Dbg_syms_ar_force Dbg64_syms_ar_force #define Dbg_syms_ar_resolve Dbg64_syms_ar_resolve +#define Dbg_syms_ar_skip Dbg64_syms_ar_skip #define Dbg_syms_ar_title Dbg64_syms_ar_title +#define Dbg_syms_cap_convert Dbg64_syms_cap_convert +#define Dbg_syms_cap_local Dbg64_syms_cap_local +#define 
Dbg_syms_cap_lookup Dbg64_syms_cap_lookup +#define Dbg_syms_cap_title Dbg64_syms_cap_title +#define Dbg_syms_copy_reloc Dbg64_syms_copy_reloc #define Dbg_syms_created Dbg64_syms_created #define Dbg_syms_discarded Dbg64_syms_discarded -#define Dbg_syms_dlsym Dbg64_syms_dlsym +#define Dbg_syms_dup_discarded Dbg64_syms_dup_discarded #define Dbg_syms_dup_sort_addr Dbg64_syms_dup_sort_addr #define Dbg_syms_entered Dbg64_syms_entered #define Dbg_syms_entry Dbg64_syms_entry @@ -376,7 +482,6 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_syms_old Dbg64_syms_old #define Dbg_syms_process Dbg64_syms_process #define Dbg_syms_reduce Dbg64_syms_reduce -#define Dbg_syms_reloc Dbg64_syms_reloc #define Dbg_syms_resolved Dbg64_syms_resolved #define Dbg_syms_resolving Dbg64_syms_resolving #define Dbg_syms_sec_entry Dbg64_syms_sec_entry @@ -384,8 +489,8 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_syms_spec_title Dbg64_syms_spec_title #define Dbg_syms_updated Dbg64_syms_updated #define Dbg_syms_up_title Dbg64_syms_up_title +#define Dbg_syms_wrap Dbg64_syms_wrap -#define Dbg_util_broadcast Dbg64_util_broadcast #define Dbg_util_call_array Dbg64_util_call_array #define Dbg_util_call_fini Dbg64_util_call_fini #define Dbg_util_call_init Dbg64_util_call_init @@ -401,7 +506,6 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_util_scc_entry Dbg64_util_scc_entry #define Dbg_util_scc_title Dbg64_util_scc_title #define Dbg_util_str Dbg64_util_str -#define Dbg_util_wait Dbg64_util_wait #define Dbg_unused_file Dbg64_unused_file #define Dbg_unused_lcinterface Dbg64_unused_lcinterface @@ -413,6 +517,7 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_ver_avail_title Dbg64_ver_avail_title #define Dbg_ver_def_title Dbg64_ver_def_title #define Dbg_ver_desc_entry Dbg64_ver_desc_entry +#define Dbg_ver_need_done Dbg64_ver_need_done #define Dbg_ver_need_entry Dbg64_ver_need_entry #define Dbg_ver_need_title 
Dbg64_ver_need_title #define Dbg_ver_nointerface Dbg64_ver_nointerface @@ -427,12 +532,28 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_bind_reject Dbg32_bind_reject #define Dbg_bind_weak Dbg32_bind_weak -#define Dbg_cap_val_hw1 Dbg32_cap_val_hw1 -#define Dbg_cap_hw_candidate Dbg32_cap_hw_candidate -#define Dbg_cap_hw_filter Dbg32_cap_hw_filter -#define Dbg_cap_mapfile Dbg32_cap_mapfile -#define Dbg_cap_sec_entry Dbg32_cap_sec_entry +#define Dbg_cap_candidate Dbg32_cap_candidate +#define Dbg_cap_filter Dbg32_cap_filter +#define Dbg_cap_id Dbg32_cap_id +#define Dbg_cap_identical Dbg32_cap_identical +#define Dbg_cap_mapfile_title Dbg32_cap_mapfile_title +#define Dbg_cap_post_title Dbg32_cap_post_title #define Dbg_cap_sec_title Dbg32_cap_sec_title +#define Dbg_cap_val Dbg32_cap_val +#define Dbg_cap_ptr_entry Dbg32_cap_ptr_entry +#define Dbg_cap_val_entry Dbg32_cap_val_entry + +#define Dbg_dl_dladdr Dbg32_dl_dladdr +#define Dbg_dl_dlclose Dbg32_dl_dlclose +#define Dbg_dl_dldump Dbg32_dl_dldump +#define Dbg_dl_dlerror Dbg32_dl_dlerror +#define Dbg_dl_dlinfo Dbg32_dl_dlinfo +#define Dbg_dl_dlopen Dbg32_dl_dlopen +#define Dbg_dl_dlsym Dbg32_dl_dlsym +#define Dbg_dl_iphdr_enter Dbg32_dl_iphdr_enter +#define Dbg_dl_iphdr_callback Dbg32_dl_iphdr_callback +#define Dbg_dl_iphdr_mapchange Dbg32_dl_iphdr_mapchange +#define Dbg_dl_iphdr_unmap_ret Dbg32_dl_iphdr_unmap_ret #define Dbg_ent_entry Dbg32_ent_entry #define Dbg_ent_print Dbg32_ent_print @@ -443,15 +564,14 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_file_ar_rescan Dbg32_file_ar_rescan #define Dbg_file_bind_entry Dbg32_file_bind_entry #define Dbg_file_bindings Dbg32_file_bindings +#define Dbg_file_bindings_done Dbg32_file_bindings_done #define Dbg_file_cleanup Dbg32_file_cleanup #define Dbg_file_cntl Dbg32_file_cntl #define Dbg_file_config_dis Dbg32_file_config_dis #define Dbg_file_config_obj Dbg32_file_config_obj #define Dbg_file_del_rescan Dbg32_file_del_rescan #define 
Dbg_file_delete Dbg32_file_delete -#define Dbg_file_dlclose Dbg32_file_dlclose -#define Dbg_file_dldump Dbg32_file_dldump -#define Dbg_file_dlopen Dbg32_file_dlopen +#define Dbg_file_deferred Dbg32_file_deferred #define Dbg_file_elf Dbg32_file_elf #define Dbg_file_filtee Dbg32_file_filtee #define Dbg_file_filter Dbg32_file_filter @@ -462,6 +582,7 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_file_hdl_title Dbg32_file_hdl_title #define Dbg_file_lazyload Dbg32_file_lazyload #define Dbg_file_ldso Dbg32_file_ldso +#define Dbg_file_mmapobj Dbg32_file_mmapobj #define Dbg_file_mode_promote Dbg32_file_mode_promote #define Dbg_file_modified Dbg32_file_modified #define Dbg_file_needed Dbg32_file_needed @@ -477,7 +598,7 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_libs_audit Dbg32_libs_audit #define Dbg_libs_find Dbg32_libs_find #define Dbg_libs_found Dbg32_libs_found -#define Dbg_libs_ignore Dbg32_libs_ignore +#define Dbg_libs_insecure Dbg32_libs_insecure #define Dbg_libs_init Dbg32_libs_init #define Dbg_libs_l Dbg32_libs_l #define Dbg_libs_path Dbg32_libs_path @@ -486,22 +607,28 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_libs_yp Dbg32_libs_yp #define Dbg_libs_ylu Dbg32_libs_ylu -#define Dbg_map_dash Dbg32_map_dash +#define Dbg_map_cexp_id Dbg32_map_cexp_id +#define Dbg_map_dv Dbg32_map_dv +#define Dbg_map_dv_entry Dbg32_map_dv_entry #define Dbg_map_ent Dbg32_map_ent +#define Dbg_map_ent_ord_title Dbg32_map_ent_ord_title +#define Dbg_map_hdr_noalloc Dbg32_map_hdr_noalloc #define Dbg_map_parse Dbg32_map_parse -#define Dbg_map_pipe Dbg32_map_pipe +#define Dbg_map_pass Dbg32_map_pass +#define Dbg_map_post_title Dbg32_map_post_title #define Dbg_map_seg Dbg32_map_seg -#define Dbg_map_set_atsign Dbg32_map_set_atsign -#define Dbg_map_set_equal Dbg32_map_set_equal +#define Dbg_map_seg_order Dbg32_map_seg_order +#define Dbg_map_seg_os_order Dbg32_map_seg_os_order #define Dbg_map_size_new Dbg32_map_size_new 
#define Dbg_map_size_old Dbg32_map_size_old -#define Dbg_map_sort_fini Dbg32_map_sort_fini -#define Dbg_map_sort_orig Dbg32_map_sort_orig +#define Dbg_map_sort_seg Dbg32_map_sort_seg +#define Dbg_map_sort_title Dbg32_map_sort_title #define Dbg_map_symbol Dbg32_map_symbol #define Dbg_map_version Dbg32_map_version #define Dbg_move_adjexpandreloc Dbg32_move_adjexpandreloc #define Dbg_move_adjmovereloc Dbg32_move_adjmovereloc +#define Dbg_move_bad Dbg32_move_bad #define Dbg_move_data Dbg32_move_data #define Dbg_move_entry1 Dbg32_move_entry1 #define Dbg_move_entry2 Dbg32_move_entry2 @@ -531,13 +658,16 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_reloc_sloppycomdat Dbg32_reloc_sloppycomdat #define Dbg_sec_added Dbg32_sec_added +#define Dbg_sec_backing Dbg32_sec_backing #define Dbg_sec_created Dbg32_sec_created #define Dbg_sec_discarded Dbg32_sec_discarded #define Dbg_sec_genstr_compress Dbg32_sec_genstr_compress #define Dbg_sec_group Dbg32_sec_group +#define Dbg_sec_gnu_comdat Dbg32_sec_gnu_comdat #define Dbg_sec_in Dbg32_sec_in #define Dbg_sec_order_error Dbg32_sec_order_error #define Dbg_sec_order_list Dbg32_sec_order_list +#define Dbg_sec_redirected Dbg32_sec_redirected #define Dbg_sec_strtab Dbg32_sec_strtab #define Dbg_sec_unsup_strmerge Dbg32_sec_unsup_strmerge @@ -555,17 +685,24 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_support_action Dbg32_support_action #define Dbg_support_load Dbg32_support_load #define Dbg_support_req Dbg32_support_req +#define Dbg_support_vnone Dbg32_support_vnone #define Dbg_syminfo_entry Dbg32_syminfo_entry #define Dbg_syminfo_title Dbg32_syminfo_title #define Dbg_syms_ar_checking Dbg32_syms_ar_checking -#define Dbg_syms_ar_entry Dbg32_syms_ar_entry +#define Dbg_syms_ar_force Dbg32_syms_ar_force #define Dbg_syms_ar_resolve Dbg32_syms_ar_resolve +#define Dbg_syms_ar_skip Dbg32_syms_ar_skip #define Dbg_syms_ar_title Dbg32_syms_ar_title +#define Dbg_syms_cap_convert Dbg32_syms_cap_convert 
+#define Dbg_syms_cap_local Dbg32_syms_cap_local +#define Dbg_syms_cap_lookup Dbg32_syms_cap_lookup +#define Dbg_syms_cap_title Dbg32_syms_cap_title +#define Dbg_syms_copy_reloc Dbg32_syms_copy_reloc #define Dbg_syms_created Dbg32_syms_created #define Dbg_syms_discarded Dbg32_syms_discarded -#define Dbg_syms_dlsym Dbg32_syms_dlsym +#define Dbg_syms_dup_discarded Dbg32_syms_dup_discarded #define Dbg_syms_dup_sort_addr Dbg32_syms_dup_sort_addr #define Dbg_syms_entered Dbg32_syms_entered #define Dbg_syms_entry Dbg32_syms_entry @@ -579,7 +716,6 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_syms_old Dbg32_syms_old #define Dbg_syms_process Dbg32_syms_process #define Dbg_syms_reduce Dbg32_syms_reduce -#define Dbg_syms_reloc Dbg32_syms_reloc #define Dbg_syms_resolved Dbg32_syms_resolved #define Dbg_syms_resolving Dbg32_syms_resolving #define Dbg_syms_sec_entry Dbg32_syms_sec_entry @@ -587,8 +723,8 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_syms_spec_title Dbg32_syms_spec_title #define Dbg_syms_updated Dbg32_syms_updated #define Dbg_syms_up_title Dbg32_syms_up_title +#define Dbg_syms_wrap Dbg32_syms_wrap -#define Dbg_util_broadcast Dbg32_util_broadcast #define Dbg_util_call_array Dbg32_util_call_array #define Dbg_util_call_fini Dbg32_util_call_fini #define Dbg_util_call_init Dbg32_util_call_init @@ -604,7 +740,6 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_util_scc_entry Dbg32_util_scc_entry #define Dbg_util_scc_title Dbg32_util_scc_title #define Dbg_util_str Dbg32_util_str -#define Dbg_util_wait Dbg32_util_wait #define Dbg_unused_file Dbg32_unused_file #define Dbg_unused_lcinterface Dbg32_unused_lcinterface @@ -616,6 +751,7 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); #define Dbg_ver_avail_title Dbg32_ver_avail_title #define Dbg_ver_def_title Dbg32_ver_def_title #define Dbg_ver_desc_entry Dbg32_ver_desc_entry +#define Dbg_ver_need_done Dbg32_ver_need_done #define Dbg_ver_need_entry 
Dbg32_ver_need_entry #define Dbg_ver_need_title Dbg32_ver_need_title #define Dbg_ver_nointerface Dbg32_ver_nointerface @@ -626,17 +762,40 @@ extern uintptr_t Dbg_setup(const char *, Dbg_desc *); /* * External Dbg_*() interface routines. */ -extern void Dbg_args_files(Lm_list *, int, char *); -extern void Dbg_args_flags(Lm_list *, int, int); +extern void Dbg_args_file(Lm_list *, int, char *); +extern void Dbg_args_guidance_unknown(Lm_list *, const char *); +extern void Dbg_args_option(Lm_list *, int, int, char *); +extern void Dbg_args_str2chr(Lm_list *, int, const char *, int); +extern void Dbg_args_Wldel(Lm_list *, int, const char *); +extern void Dbg_audit_activity(Lm_list *, const char *, const char *, + uint_t); extern void Dbg_audit_ignore(Rt_map *); extern void Dbg_audit_interface(Lm_list *, const char *, const char *); -extern void Dbg_audit_lib(Lm_list *, const char *); -extern void Dbg_audit_object(Lm_list *, const char *, const char *); -extern void Dbg_audit_symval(Lm_list *, const char *, const char *, - const char *, Addr, Addr); +extern void Dbg_audit_lib(Rt_map *, const char *, int); +extern void Dbg_audit_objclose(Lm_list *, const char *, const char *); +extern void Dbg_audit_objopen(Lm_list *, int, const char *, const char *, + uint_t, Boolean); +extern void Dbg_audit_objfilter(Lm_list *, int, const char *, const char *, + const char *, const char *); +extern void Dbg_audit_objsearch(Lm_list *, int, const char *, const char *, + uint_t, const char *); +extern void Dbg_audit_pltenter(Lm_list *, int, const char *, const char *, + Addr); +extern void Dbg_audit_pltexit(Lm_list *, const char *, const char *); +extern void Dbg_audit_preinit(Lm_list *, const char *, const char *); +extern void Dbg_audit_symbind(Lm_list *, int, const char *, const char *, + Addr, uint_t); extern void Dbg_audit_skip(Lm_list *, const char *, const char *); extern void Dbg_audit_terminate(Lm_list *, const char *); -extern void Dbg_audit_version(Lm_list *, const char *, 
ulong_t); +extern void Dbg_audit_version(Lm_list *, const char *, uint_t, uint_t); + +extern void Dbg_basic_collect(Lm_list *); +extern void Dbg_basic_create(Lm_list *); +extern void Dbg_basic_finish(Lm_list *); +extern void Dbg_basic_files(Lm_list *); +extern void Dbg_basic_options(Lm_list *); +extern void Dbg_basic_relocate(Lm_list *); +extern void Dbg_basic_validate(Lm_list *); extern void Dbg_bind_global(Rt_map *, Addr, Off, Xword, Pltbindtype, Rt_map *, Addr, Off, const char *, uint_t); @@ -649,26 +808,43 @@ extern void Dbg_bind_pltpad_to(Rt_map *, Addr, const char *, const char *); extern void Dbg_bind_reject(Rt_map *, Rt_map *, const char *, int); extern void Dbg_bind_weak(Rt_map *, Addr, Addr, const char *); -extern void Dbg_cap_hw_candidate(Lm_list *, const char *); -extern void Dbg_cap_hw_filter(Lm_list *, const char *, Rt_map *); -extern void Dbg_cap_mapfile(Lm_list *, Xword, Xword, Half); -extern void Dbg_cap_sec_entry(Lm_list *, uint_t, Xword, Xword, Half); -extern void Dbg_cap_sec_title(Ofl_desc *); -extern void Dbg_cap_val_hw1(Lm_list *, Xword, Half); +extern void Dbg_cap_candidate(Lm_list *, const char *); +extern void Dbg_cap_filter(Lm_list *, const char *, Rt_map *); +extern void Dbg_cap_id(Lm_list *, Lineno, const char *, const char *); +extern void Dbg_cap_identical(Lm_list *, const char *, const char *); +extern void Dbg_cap_mapfile_title(Lm_list *, Lineno); +extern void Dbg_cap_post_title(Lm_list *, int *); +extern void Dbg_cap_sec_title(Lm_list *, const char *); +extern void Dbg_cap_val(Lm_list *, Syscapset *, Syscapset *, Half); +extern void Dbg_cap_ptr_entry(Lm_list *, dbg_state_t, Xword, const char *); +extern void Dbg_cap_val_entry(Lm_list *, dbg_state_t, Xword, Xword, Half); + +extern void Dbg_dl_dladdr(Rt_map *, void *); +extern void Dbg_dl_dlclose(Rt_map *, const char *, int); +extern void Dbg_dl_dldump(Rt_map *, const char *, const char *, int); +extern void Dbg_dl_dlerror(Rt_map *, const char *); +extern void Dbg_dl_dlinfo(Rt_map *, 
const char *, int, void *); +extern void Dbg_dl_dlopen(Rt_map *, const char *, int *, int); +extern void Dbg_dl_dlsym(Rt_map *, const char *, int *, const char *, int); +extern void Dbg_dl_iphdr_enter(Rt_map *, u_longlong_t, u_longlong_t); +extern void Dbg_dl_iphdr_callback(Lm_list *, struct dl_phdr_info *); +extern void Dbg_dl_iphdr_mapchange(Lm_list *, u_longlong_t, u_longlong_t); +extern void Dbg_dl_iphdr_unmap_ret(Lm_list *); extern const char * Dbg_demangle_name(const char *); -extern void Dbg_ent_entry(Lm_list *, Half, Ent_desc *); -extern void Dbg_ent_print(Lm_list *, Half, List *, Boolean); +extern void Dbg_ent_entry(Lm_list *, uchar_t, Half, Ent_desc *); +extern void Dbg_ent_print(Lm_list *, uchar_t, Half, APlist *); extern void Dbg_file_analyze(Rt_map *); -extern void Dbg_file_aout(Lm_list *, const char *, ulong_t, ulong_t, - ulong_t, const char *, Aliste); -extern void Dbg_file_ar(Lm_list *, const char *, int); -extern void Dbg_file_ar_rescan(Lm_list *); +extern void Dbg_file_aout(Lm_list *, const char *, Addr, size_t, + const char *, Aliste); +extern void Dbg_file_ar(Lm_list *, const char *, Boolean); +extern void Dbg_file_ar_rescan(Lm_list *, int, int); extern void Dbg_file_bind_entry(Lm_list *, Bnd_desc *); extern void Dbg_file_bindings(Rt_map *, int); +extern void Dbg_file_bindings_done(Lm_list *); extern void Dbg_file_cleanup(Lm_list *, const char *, Aliste); extern void Dbg_file_cntl(Lm_list *, Aliste, Aliste); extern void Dbg_file_config_dis(Lm_list *, const char *, int); @@ -676,11 +852,9 @@ extern void Dbg_file_config_obj(Lm_list *, const char *, const char *, const char *); extern void Dbg_file_del_rescan(Lm_list *); extern void Dbg_file_delete(Rt_map *); -extern void Dbg_file_dlclose(Lm_list *, const char *, int); -extern void Dbg_file_dldump(Rt_map *, const char *, int); -extern void Dbg_file_dlopen(Rt_map *, const char *, int *, int); -extern void Dbg_file_elf(Lm_list *, const char *, ulong_t, ulong_t, - ulong_t, ulong_t, const char *, 
Aliste); +extern void Dbg_file_deferred(Lm_list *, const char *, const char *); +extern void Dbg_file_elf(Lm_list *, const char *, Addr, size_t, + const char *, Aliste); extern void Dbg_file_filtee(Lm_list *, const char *, const char *, int); extern void Dbg_file_filter(Lm_list *, const char *, const char *, int); extern void Dbg_file_fixname(Lm_list *, const char *, const char *); @@ -691,6 +865,8 @@ extern void Dbg_file_hdl_title(int); extern void Dbg_file_lazyload(Rt_map *, const char *, const char *); extern void Dbg_file_ldso(Rt_map *, char **, auxv_t *, const char *, Aliste); +extern void Dbg_file_mmapobj(Lm_list *, const char *, mmapobj_result_t *, + uint_t); extern void Dbg_file_mode_promote(Rt_map *, int); extern void Dbg_file_modified(Lm_list *, const char *, const char *, const char *, int, int, Elf *, Elf *); @@ -707,32 +883,42 @@ extern void Dbg_got_display(Ofl_desc *, Off, int, Word, size_t); extern void Dbg_libs_audit(Lm_list *, const char *, const char *); extern void Dbg_libs_find(Lm_list *, const char *); extern void Dbg_libs_found(Lm_list *, const char *, int); -extern void Dbg_libs_ignore(Lm_list *, const char *); -extern void Dbg_libs_init(Lm_list *, List *, List *); +extern void Dbg_libs_insecure(Lm_list *, const char *, int); +extern void Dbg_libs_init(Lm_list *, APlist *, APlist *); extern void Dbg_libs_l(Lm_list *, const char *, const char *); extern void Dbg_libs_path(Lm_list *, const char *, uint_t, const char *); extern void Dbg_libs_req(Lm_list *, const char *, const char *, const char *); -extern void Dbg_libs_update(Lm_list *, List *, List *); +extern void Dbg_libs_update(Lm_list *, APlist *, APlist *); extern void Dbg_libs_yp(Lm_list *, const char *); extern void Dbg_libs_ylu(Lm_list *, const char *, const char *, int); -extern void Dbg_map_dash(Lm_list *, const char *, Sdf_desc *); -extern void Dbg_map_ent(Lm_list *, Boolean, Ent_desc *, Ofl_desc *); -extern void Dbg_map_parse(Lm_list *, const char *); -extern void 
Dbg_map_pipe(Lm_list *, Sg_desc *, const char *, const Word); -extern void Dbg_map_seg(Ofl_desc *, int, Sg_desc *); -extern void Dbg_map_set_atsign(Boolean); -extern void Dbg_map_set_equal(Boolean); -extern void Dbg_map_size_new(Lm_list *, const char *); -extern void Dbg_map_size_old(Ofl_desc *, Sym_desc *); -extern void Dbg_map_sort_fini(Lm_list *, Sg_desc *); -extern void Dbg_map_sort_orig(Lm_list *, Sg_desc *); +extern void Dbg_map_cexp_id(Lm_list *, Boolean, const char *, Lineno, + const char *); +extern void Dbg_map_dv(Lm_list *, const char *, Lineno); +extern void Dbg_map_dv_entry(Lm_list *, Lineno, int, const char *); +extern void Dbg_map_ent(Lm_list *, Ent_desc *, Ofl_desc *, Lineno); +extern void Dbg_map_ent_ord_title(Lm_list *, const char *); +extern void Dbg_map_hdr_noalloc(Lm_list *, Lineno); +extern void Dbg_map_parse(Lm_list *, const char *, int); +extern void Dbg_map_pass(Lm_list *, Boolean, const char *, Lineno, + const char *); +extern void Dbg_map_post_title(Lm_list *); +extern void Dbg_map_seg(Ofl_desc *, dbg_state_t, int, Sg_desc *, Lineno); +extern void Dbg_map_seg_order(Ofl_desc *, uchar_t, Half, dbg_state_t, + Lineno); +extern void Dbg_map_seg_os_order(Lm_list *, Sg_desc *, const char *, + Word, Lineno); +extern void Dbg_map_size_new(Lm_list *, const char *, const char *, Lineno); +extern void Dbg_map_size_old(Ofl_desc *, Sym_desc *, const char *, Lineno); +extern void Dbg_map_sort_title(Lm_list *, Boolean); +extern void Dbg_map_sort_seg(Lm_list *, uchar_t, Half, Sg_desc *); extern void Dbg_map_symbol(Ofl_desc *, Sym_desc *); extern void Dbg_map_version(Lm_list *, const char *, const char *, int); extern void Dbg_move_adjexpandreloc(Lm_list *, Xword, const char *); extern void Dbg_move_adjmovereloc(Lm_list *, Xword, Xword, const char *); +extern void Dbg_move_bad(Lm_list *, ulong_t, const char *, Addr); extern void Dbg_move_data(Rt_map *); extern void Dbg_move_entry1(Lm_list *, int, Move *, Sym_desc *); extern void Dbg_move_entry2(Lm_list *, 
Move *, Word, const char *); @@ -747,8 +933,8 @@ extern void Dbg_reloc_apply_val(Lm_list *, int, Xword, Xword); extern void Dbg_reloc_ars_entry(Lm_list *, int, Word, Half, Rel_desc *); extern void Dbg_reloc_copy(Rt_map *, Rt_map *, const char *, int); extern void Dbg_reloc_discard(Lm_list *, Half, Rel_desc *); -extern void Dbg_reloc_doact(Lm_list *, int, Half, Word, Word, Xword, Xword, - const char *, Os_desc *); +extern void Dbg_reloc_doact(Lm_list *, int, Half, Word, Rel_desc *, + Xword, Xword, rel_desc_sname_func_t); extern void Dbg_reloc_doact_title(Lm_list *); extern void Dbg_reloc_dooutrel(Lm_list *, Word); extern void Dbg_reloc_entry(Lm_list *, const char *, Half, Word, void *, @@ -757,35 +943,40 @@ extern void Dbg_reloc_error(Lm_list *, int, Half, Word, void *, const char *); extern void Dbg_reloc_generate(Lm_list *, Os_desc *, Word); extern void Dbg_reloc_in(Lm_list *, int, Half, Word, void *, const char *, - const char *); + Word, const char *); extern void Dbg_reloc_ors_entry(Lm_list *, int, Word, Half, Rel_desc *); extern void Dbg_reloc_out(Ofl_desc *, int, Word, void *, const char *, const char *); extern void Dbg_reloc_proc(Lm_list *, Os_desc *, Is_desc *, Is_desc *); extern void Dbg_reloc_run(Rt_map *, uint_t, int, int); -extern void Dbg_reloc_transition(Lm_list *, Half, Word, Rel_desc *); -extern void Dbg_reloc_sloppycomdat(Lm_list *, const char *, Sym_desc *); +extern void Dbg_reloc_transition(Lm_list *, Half, Word, Rel_desc *, + rel_desc_sname_func_t); +extern void Dbg_reloc_sloppycomdat(Lm_list *, Sym_desc *); extern void Dbg_sec_added(Lm_list *, Os_desc *, Sg_desc *); +extern void Dbg_sec_backing(Lm_list *); extern void Dbg_sec_created(Lm_list *, Os_desc *, Sg_desc *); extern void Dbg_sec_discarded(Lm_list *, Is_desc *, Is_desc *); extern void Dbg_sec_genstr_compress(Lm_list *, const char *, Xword, Xword); extern void Dbg_sec_group(Lm_list *, Is_desc *, Group_desc *); +extern void Dbg_sec_gnu_comdat(Lm_list *, Is_desc *, Boolean, Boolean); 
extern void Dbg_sec_in(Lm_list *, Is_desc *); extern void Dbg_sec_order_error(Lm_list *, Ifl_desc *, Word, int); extern void Dbg_sec_order_list(Ofl_desc *, int); +extern void Dbg_sec_redirected(Lm_list *, Is_desc *, const char *); extern void Dbg_sec_strtab(Lm_list *, Os_desc *, Str_tbl *); extern void Dbg_sec_unsup_strmerge(Lm_list *, Is_desc *); -extern void Dbg_seg_desc_entry(Lm_list *, Half, int, Sg_desc *); +extern void Dbg_seg_desc_entry(Lm_list *, uchar_t, Half, int, Sg_desc *, + Boolean); extern void Dbg_seg_entry(Ofl_desc *, int, Sg_desc *); -extern void Dbg_seg_list(Lm_list *, Half, List *); +extern void Dbg_seg_list(Lm_list *, uchar_t, Half, APlist *); extern void Dbg_seg_os(Ofl_desc *, Os_desc *, int); extern void Dbg_seg_title(Lm_list *); -extern void Dbg_shdr_modified(Lm_list *, const char *, Half, Shdr *, Shdr *, - const char *); +extern void Dbg_shdr_modified(Lm_list *, const char *, uchar_t, Half, + Word, Shdr *, Shdr *, const char *); extern void Dbg_statistics_ar(Ofl_desc *); extern void Dbg_statistics_ld(Ofl_desc *); @@ -794,21 +985,29 @@ extern void Dbg_support_action(Lm_list *, const char *, const char *, Support_ndx, const char *); extern void Dbg_support_load(Lm_list *, const char *, const char *); extern void Dbg_support_req(Lm_list *, const char *, int); +extern void Dbg_support_vnone(Lm_list *, const char *); extern void Dbg_syminfo_entry(Lm_list *, Word, Syminfo *, Sym *, const char *, Dyn *); extern void Dbg_syminfo_title(Lm_list *); -extern void Dbg_syms_ar_checking(Lm_list *, Xword, Elf_Arsym *, - const char *); -extern void Dbg_syms_ar_entry(Lm_list *, Xword, Elf_Arsym *); -extern void Dbg_syms_ar_resolve(Lm_list *, Xword, Elf_Arsym *, - const char *, int); -extern void Dbg_syms_ar_title(Lm_list *, const char *, int); +extern void Dbg_syms_ar_checking(Lm_list *, const char *, const char *, + Elf_Arsym *); +extern void Dbg_syms_ar_force(Lm_list *, const char *, const char *); +extern void Dbg_syms_ar_resolve(Lm_list *, const char *, 
const char *, + Elf_Arsym *); +extern void Dbg_syms_ar_skip(Lm_list *, const char *, Elf_Arsym *); +extern void Dbg_syms_ar_title(Lm_list *, const char *, Boolean); +extern void Dbg_syms_cap_convert(Ofl_desc *, Word, const char *, Sym *); +extern void Dbg_syms_cap_local(Ofl_desc *, Word, const char *, Sym *, + Sym_desc *); +extern void Dbg_syms_cap_lookup(Rt_map *, uint_t, const char *, uint_t, + Half, Syscapset *); +extern void Dbg_syms_cap_title(Ofl_desc *); +extern void Dbg_syms_copy_reloc(Ofl_desc *, Sym_desc *, Word); extern void Dbg_syms_created(Lm_list *, const char *); extern void Dbg_syms_discarded(Lm_list *, Sym_desc *); -extern void Dbg_syms_dlsym(Rt_map *, const char *, int *, const char *, - int); +extern void Dbg_syms_dup_discarded(Lm_list *, Word ndx, Sym_desc *); extern void Dbg_syms_dup_sort_addr(Lm_list *, const char *, const char *, const char *, Addr); extern void Dbg_syms_entered(Ofl_desc *, Sym *, Sym_desc *); @@ -826,7 +1025,6 @@ extern void Dbg_syms_old(Ofl_desc *, Sym_desc *); extern void Dbg_syms_process(Lm_list *, Ifl_desc *); extern void Dbg_syms_reduce(Ofl_desc *, int, Sym_desc *, int, const char *); -extern void Dbg_syms_reloc(Ofl_desc *, Sym_desc *); extern void Dbg_syms_resolved(Ofl_desc *, Sym_desc *); extern void Dbg_syms_resolving(Ofl_desc *, Word, const char *, int, int, Sym *, Sym *, Sym_desc *, Ifl_desc *); @@ -835,12 +1033,12 @@ extern void Dbg_syms_sec_title(Lm_list *); extern void Dbg_syms_spec_title(Lm_list *); extern void Dbg_syms_updated(Ofl_desc *, Sym_desc *, const char *); extern void Dbg_syms_up_title(Lm_list *); +extern void Dbg_syms_wrap(Lm_list *, Word, const char *, const char *); extern void Dbg_tls_modactivity(Lm_list *, void *, uint_t); extern void Dbg_tls_static_block(Lm_list *, void *, ulong_t, ulong_t); extern void Dbg_tls_static_resv(Rt_map *, ulong_t, ulong_t); -extern void Dbg_util_broadcast(Rt_map *); extern void Dbg_util_call_array(Rt_map *, void *, int, Word); extern void Dbg_util_call_fini(Rt_map *); 
extern void Dbg_util_call_init(Rt_map *, int); @@ -854,10 +1052,9 @@ extern void Dbg_util_intoolate(Rt_map *); extern void Dbg_util_lcinterface(Rt_map *, int, char *); extern void Dbg_util_nl(Lm_list *, int); extern void Dbg_util_no_init(Rt_map *); -extern void Dbg_util_str(Lm_list *, const char *); extern void Dbg_util_scc_entry(Rt_map *, uint_t); extern void Dbg_util_scc_title(Lm_list *, int); -extern void Dbg_util_wait(Rt_map *, Rt_map *, int); +extern void Dbg_util_str(Lm_list *, const char *); extern void Dbg_unused_file(Lm_list *, const char *, int, uint_t); extern void Dbg_unused_lcinterface(Rt_map *, Rt_map *, int); @@ -870,6 +1067,7 @@ extern void Dbg_ver_avail_entry(Lm_list *, Ver_index *, const char *); extern void Dbg_ver_avail_title(Lm_list *, const char *); extern void Dbg_ver_def_title(Lm_list *, const char *); extern void Dbg_ver_desc_entry(Lm_list *, Ver_desc *); +extern void Dbg_ver_need_done(Lm_list *); extern void Dbg_ver_need_entry(Lm_list *, Half, const char *, const char *); extern void Dbg_ver_need_title(Lm_list *, const char *); @@ -882,6 +1080,7 @@ extern void Dbg_ver_symbol(Lm_list *, const char *); #define ELF_DBG_ELFDUMP 1 #define ELF_DBG_RTLD 2 #define ELF_DBG_LD 3 +#define ELF_DBG_LD_ACT 4 /* * Define generic Elf_*() interfaces. 
@@ -967,12 +1166,14 @@ extern void Elf_syminfo_title(Lm_list *); #endif -extern void Elf_cap_entry(Lm_list *, Cap *, int, Half); +extern void Elf_cap_entry(Lm_list *, Cap *, int, const char *, size_t, + Half); extern void Elf_cap_title(Lm_list *); extern const char \ *Elf_demangle_name(const char *); -extern void Elf_dyn_entry(Lm_list *, Dyn *, int, const char *, Half); +extern void Elf_dyn_entry(Lm_list *, Dyn *, int, const char *, + uchar_t, Half); extern void Elf_dyn_null_entry(Lm_list *, Dyn *, int, int); extern void Elf_dyn_title(Lm_list *); @@ -982,7 +1183,7 @@ extern void Elf_got_entry(Lm_list *, Sword, Addr, Xword, Half, uchar_t, uchar_t, Word, void *, const char *); extern void Elf_got_title(Lm_list *); -extern void Elf_phdr(Lm_list *, Half, Phdr *); +extern void Elf_phdr(Lm_list *, uchar_t, Half, Phdr *); extern void Elf_reloc_apply_val(Lm_list *, int, Xword, Xword); extern void Elf_reloc_apply_reg(Lm_list *, int, Half, Xword, Xword); @@ -993,10 +1194,10 @@ extern void Elf_reloc_entry_2(Lm_list *, int, const char *, Word, const char *); extern void Elf_reloc_title(Lm_list *, int, Word); -extern void Elf_shdr(Lm_list *, Half, Shdr *); +extern void Elf_shdr(Lm_list *, uchar_t, Half, Shdr *); -extern void Elf_syms_table_entry(Lm_list *, int, const char *, Half, Sym *, - Versym, int, const char *, const char *); +extern void Elf_syms_table_entry(Lm_list *, int, const char *, uchar_t, + Half, Sym *, Versym, int, const char *, const char *); extern void Elf_syms_table_title(Lm_list *, int); extern void Elf_ver_def_title(Lm_list *); diff --git a/cmd/sgs/include/sgs.h b/cmd/sgs/include/sgs.h index 388ec97..fc62481 100644 --- a/cmd/sgs/include/sgs.h +++ b/cmd/sgs/include/sgs.h @@ -24,8 +24,7 @@ * All Rights Reserved * * - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * * Global include file for all sgs. 
*/ @@ -33,9 +32,6 @@ #ifndef _SGS_H #define _SGS_H -#pragma ident "%Z%%M% %I% %E% SMI" - - #ifdef __cplusplus extern "C" { #endif @@ -51,6 +47,7 @@ extern "C" { #include #include #include +#include #include #include #include @@ -122,17 +119,27 @@ typedef enum { } Boolean; /* - * Types of errors (used by eprintf()), together with a generic error return + * Types of errors (used by veprintf()), together with a generic error return * value. */ typedef enum { - ERR_NONE, - ERR_WARNING, - ERR_FATAL, - ERR_ELF, - ERR_NUM /* Must be last */ + ERR_NONE, /* plain message */ + ERR_WARNING_NF, /* warning that cannot be promoted to fatal */ + ERR_WARNING, /* warning that can be promoted to fatal */ + ERR_GUIDANCE, /* guidance warning that can be promoted */ + ERR_FATAL, /* fatal error */ + ERR_ELF, /* fatal libelf error */ + ERR_NUM /* # of Error codes. Must be last */ } Error; +/* + * Type used to represent line numbers within files, and a corresponding + * printing macro for it. + */ +typedef ulong_t Lineno; +#define EC_LINENO(_x) EC_XWORD(_x) /* "llu" */ + + #if defined(_LP64) && !defined(_ELF64) #define S_ERROR (~(uint_t)0) #else @@ -140,48 +147,16 @@ typedef enum { #endif /* - * LIST_TRAVERSE() is used as the only "argument" of a "for" loop to - * traverse a linked list. The node pointer `node' is set to each node in - * turn and the corresponding data pointer is copied to `data'. The macro - * is used as in - * for (LIST_TRAVERSE(List *list, Listnode *node, void *data)) { - * process(data); - * } + * CTF currently does not handle automatic array variables sized via function + * arguments (VLA arrays) properly, when the code is compiled with gcc. + * Adding 1 to the size is a workaround. VLA_SIZE, and its use, should be + * pulled when CTF is fixed or replaced. 
*/ -#define LIST_TRAVERSE(L, N, D) \ - (void) (((N) = (L)->head) != NULL && ((D) = (N)->data) != NULL); \ - (N) != NULL; \ - (void) (((N) = (N)->next) != NULL && ((D) = (N)->data) != NULL) - -typedef struct listnode Listnode; -typedef struct list List; - -struct listnode { /* a node on a linked list */ - void *data; /* the data item */ - Listnode *next; /* the next element */ -}; - -struct list { /* a linked list */ - Listnode *head; /* the first element */ - Listnode *tail; /* the last element */ -}; - - -#ifdef _SYSCALL32 -typedef struct listnode32 Listnode32; -typedef struct list32 List32; - -struct listnode32 { /* a node on a linked list */ - Elf32_Addr data; /* the data item */ - Elf32_Addr next; /* the next element */ -}; - -struct list32 { /* a linked list */ - Elf32_Addr head; /* the first element */ - Elf32_Addr tail; /* the last element */ -}; -#endif /* _SYSCALL32 */ - +#ifdef __GNUC__ +#define VLA_SIZE(_arg) ((_arg) + 1) +#else +#define VLA_SIZE(_arg) (_arg) +#endif /* * Structure to maintain rejected files elf information. 
Files that are not @@ -192,7 +167,7 @@ struct list32 { /* a linked list */ */ typedef struct { ushort_t rej_type; /* SGS_REJ_ value */ - ushort_t rej_flag; /* additional information */ + ushort_t rej_flags; /* additional information */ uint_t rej_info; /* numeric and string information */ const char *rej_str; /* associated with error */ const char *rej_name; /* object name - expanded library */ @@ -212,7 +187,16 @@ typedef struct { /* required */ #define SGS_REJ_STR 10 /* generic error - info is a string */ #define SGS_REJ_UNKFILE 11 /* unknown file type */ -#define SGS_REJ_HWCAP_1 12 /* hardware capabilities mismatch */ +#define SGS_REJ_UNKCAP 12 /* unknown capabilities */ +#define SGS_REJ_HWCAP_1 13 /* hardware capabilities mismatch */ +#define SGS_REJ_SFCAP_1 14 /* software capabilities mismatch */ +#define SGS_REJ_MACHCAP 15 /* machine capability mismatch */ +#define SGS_REJ_PLATCAP 16 /* platform capability mismatch */ +#define SGS_REJ_HWCAP_2 17 /* hardware capabilities mismatch */ +#define SGS_REJ_ARCHIVE 18 /* archive used in invalid context */ +#define SGS_REJ_NUM 19 + +#define FLG_REJ_ALTER 0x01 /* object name is an alternative */ /* * For those source files used both inside and outside of the @@ -227,44 +211,42 @@ typedef struct { #define realloc libld_realloc #define libld_calloc(x, a) libld_malloc(((size_t)x) * ((size_t)a)) -extern void libld_free(void *); -extern void *libld_malloc(size_t); -extern void *libld_realloc(void *, size_t); +extern void libld_free(void *); +extern void *libld_malloc(size_t); +extern void *libld_realloc(void *, size_t); #endif - /* * Data structures (defined in libld.h). 
*/ +typedef struct audit_desc Audit_desc; +typedef struct audit_info Audit_info; +typedef struct audit_list Audit_list; +typedef struct cap_desc Cap_desc; typedef struct ent_desc Ent_desc; typedef struct group_desc Group_desc; typedef struct ifl_desc Ifl_desc; typedef struct is_desc Is_desc; typedef struct isa_desc Isa_desc; typedef struct isa_opt Isa_opt; -typedef struct mv_desc Mv_desc; -typedef struct ofl_desc Ofl_desc; typedef struct os_desc Os_desc; +typedef struct ofl_desc Ofl_desc; typedef struct rel_cache Rel_cache; +typedef struct rel_cachebuf Rel_cachebuf; +typedef struct rel_aux_cachebuf Rel_aux_cachebuf; +typedef struct rel_aux Rel_aux; +typedef struct rel_desc Rel_desc; typedef struct sdf_desc Sdf_desc; typedef struct sdv_desc Sdv_desc; +typedef struct sec_order Sec_order; typedef struct sg_desc Sg_desc; typedef struct sort_desc Sort_desc; -typedef struct sec_order Sec_order; -typedef struct sym_desc Sym_desc; -typedef struct sym_aux Sym_aux; typedef struct sym_avlnode Sym_avlnode; +typedef struct sym_aux Sym_aux; +typedef struct sym_desc Sym_desc; typedef struct uts_desc Uts_desc; typedef struct ver_desc Ver_desc; typedef struct ver_index Ver_index; -typedef struct audit_desc Audit_desc; -typedef struct audit_info Audit_info; -typedef struct audit_list Audit_list; - -/* - * Data structures defined in machrel.h. - */ -typedef struct rel_desc Rel_desc; /* * Data structures defined in rtld.h. @@ -279,7 +261,7 @@ typedef struct lm_list32 Lm_list32; */ extern int assfail(const char *, const char *, int); extern void eprintf(Lm_list *, Error, const char *, ...); -extern char *sgs_demangle(char *); +extern void veprintf(Lm_list *, Error, const char *, va_list); extern uint_t sgs_str_hash(const char *); extern uint_t findprime(uint_t); diff --git a/cmd/sgs/messages/sgs.ident b/cmd/sgs/messages/sgs.ident index 6afbf5f..9aa3696 100644 --- a/cmd/sgs/messages/sgs.ident +++ b/cmd/sgs/messages/sgs.ident @@ -1,6 +1,5 @@ # -# Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. -# Use is subject to license terms. +# Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. # # CDDL HEADER START # @@ -21,8 +20,6 @@ # # CDDL HEADER END # -# ident "%Z%%M% %I% %E% SMI" -# # # Global message identifiers for the sgs utilities. This information is read # by sgsmsg(1l) using the -i option. @@ -37,7 +34,7 @@ MSG_ID_RTLD 1 SUNW_OST_SGS /* sgs/rtld */ MSG_ID_LIBRTLD 2 SUNW_OST_SGS /* sgs/librtld */ MSG_ID_LIBLD 3 SUNW_OST_SGS /* sgs/libld */ MSG_ID_LIBLDDBG 4 SUNW_OST_SGS /* sgs/liblddbg */ -MSG_ID_LIBLDSTAB 5 SUNW_OST_SGS /* sgs/libldstab */ + MSG_ID_LIBRTLD_DB 6 SUNW_OST_SGS /* sgs/librtld_db */ MSG_ID_LIBPROF 7 SUNW_OST_SGS /* sgs/libprof */ MSG_ID_LIBCRLE 8 SUNW_OST_SGS /* sgs/libcrle */ @@ -60,3 +57,4 @@ MSG_ID_ELFEDIT_STR 27 SUNW_OST_SGS /* str: */ MSG_ID_ELFEDIT_SYM 27 SUNW_OST_SGS /* sym: */ MSG_ID_ELFEDIT_SYMINFO 27 SUNW_OST_SGS /* syminfo: */ MSG_ID_ELFWRAP 28 SUNW_OST_SGS /* sgs/elfwrap */ +MSG_ID_AR 29 SUNW_OST_SGS /* sgs/ar */ diff --git a/cmd/sgs/tools/common/sgsmsg.c b/cmd/sgs/tools/common/sgsmsg.c index 9b2e37b..1cdd584 100644 --- a/cmd/sgs/tools/common/sgsmsg.c +++ b/cmd/sgs/tools/common/sgsmsg.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. * * sgsmsg generates several message files from an input template file. Messages * are constructed for use with gettext(3i) - the default - or catgets(3c). The @@ -66,7 +65,6 @@ * the data array being built in msg.c. The index into this array * becomes the `message' identifier created in the msg.h file. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" #include #include @@ -393,15 +391,22 @@ init_defs(void) return (1); } + if (fprintf(fddefs, "#include \t/* Msg typedef */\n\n") < 0) { + (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); + return (1); + } + if (fprintf(fddefs, "#ifndef\t__lint\n\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } /* - * add "typedef int Msg;" + * The MSG_SGS_LOCAL_ARRAY macro supplies a generic way to + * reference the string table regardless of its name. */ - if (fprintf(fddefs, "typedef int\tMsg;\n\n") < 0) { + if (fprintf(fddefs, "#define\tMSG_SGS_LOCAL_ARRAY\t__%s\n\n", + interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } @@ -418,7 +423,13 @@ init_defs(void) return (1); } } - if (fprintf(fddefs, "#define\tMSG_ORIG(x)\t&__%s[x]\n\n", + if (fprintf(fddefs, + "#define\tMSG_ORIG_STRTAB(_x, _s)\t&_s[_x]\n\n") < 0) { + (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); + return (1); + } + if (fprintf(fddefs, + "#define\tMSG_ORIG(x)\tMSG_ORIG_STRTAB(x, __%s)\n\n", interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); @@ -453,17 +464,14 @@ fini_defs(void) return (1); } - /* - * When __lint is defined, Msg is a char *. This allows lint to - * check our format strings against it's arguments. 
- */ - if (fprintf(fddefs, "\ntypedef char *\tMsg;\n\n") < 0) { + if (fprintf(fddefs, "extern\tconst char *\t_%s(Msg);\n\n", + interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } - if (fprintf(fddefs, "extern\tconst char *\t_%s(Msg);\n\n", - interface) < 0) { + if (fprintf(fddefs, "#ifndef MSG_SGS_LOCAL_ARRAY\n" + "#define\tMSG_SGS_LOCAL_ARRAY\t\"\"\n#endif\n\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } @@ -478,12 +486,25 @@ fini_defs(void) } if (fprintf(fddefs, + "#define MSG_ORIG_STRTAB(_x, _s)\t_x\n" "#define MSG_ORIG(x)\tx\n#define MSG_INTL(x)\tx\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } /* + * Provide a way to get the array and function declarations above + * without also getting the actual messages. This is useful in + * our lintsup.c files that include more than one message header. + * lintsup doesn't need the actual messages, and this prevents + * macro name collisions. + */ + if (fprintf(fddefs, "\n#ifndef LINTSUP_SUPPRESS_STRINGS\n") < 0) { + (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); + return (1); + } + + /* * Copy the temporary lint defs file into the new header. 
*/ if (fdlint) { @@ -510,6 +531,11 @@ fini_defs(void) (void) free(buf); } + if (fprintf(fddefs, "\n#endif\t/* LINTSUP_SUPPRESS_STRINGS */\n") < 0) { + (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); + return (1); + } + if (fprintf(fddefs, "\n#endif\t/* __lint */\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); @@ -1102,8 +1128,9 @@ main(int argc, char ** argv) } } if (fddefs && fddata) { - (void) sprintf(fllint, "%s.%d", nmlint, (int)getpid()); - if ((fdlint = fopen(fllint, "w+")) == NULL) { + (void) sprintf(fllint, "%s.%d.XXXXXX", nmlint, (int)getpid()); + if ((mkstemp(fllint) == -1) || + ((fdlint = fopen(fllint, "w+")) == NULL)) { (void) fprintf(stderr, Errmsg_opne, fllint, strerror(errno)); return (1); diff --git a/cmd/sgs/tools/common/string_table.c b/cmd/sgs/tools/common/string_table.c index e174aca..c154731 100644 --- a/cmd/sgs/tools/common/string_table.c +++ b/cmd/sgs/tools/common/string_table.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <_string_table.h> #include #include @@ -160,7 +158,7 @@ st_new(uint_t flags) { Str_tbl *stp; - if ((stp = calloc(sizeof (Str_tbl), 1)) == NULL) + if ((stp = calloc(sizeof (*stp), 1)) == NULL) return (NULL); /* @@ -175,7 +173,7 @@ st_new(uint_t flags) if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) return (stp); - if ((stp->st_lentree = calloc(sizeof (avl_tree_t), 1)) == NULL) + if ((stp->st_lentree = calloc(sizeof (*stp->st_lentree), 1)) == NULL) return (NULL); avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode), @@ -187,9 +185,9 @@ st_new(uint_t flags) /* * Insert a new string into the Str_tbl. There are two AVL trees used. * - * . 
The first LenNode AVL tree maintains a tree of nodes based on string + * - The first LenNode AVL tree maintains a tree of nodes based on string * sizes. - * . Each LenNode maintains a StrNode AVL tree for each string. Large + * - Each LenNode maintains a StrNode AVL tree for each string. Large * applications have been known to contribute thousands of strings of * the same size. Should strings need to be removed (-z ignore), then * the string AVL tree makes this removal efficient and scalable. @@ -227,12 +225,13 @@ st_insert(Str_tbl *stp, const char *str) */ ln.ln_strlen = len; if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) { - if ((lnp = calloc(sizeof (LenNode), 1)) == NULL) + if ((lnp = calloc(sizeof (*lnp), 1)) == NULL) return (-1); lnp->ln_strlen = len; avl_insert(stp->st_lentree, lnp, where); - if ((lnp->ln_strtree = calloc(sizeof (avl_tree_t), 1)) == NULL) + if ((lnp->ln_strtree = calloc(sizeof (*lnp->ln_strtree), 1)) == + NULL) return (0); avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode), @@ -246,7 +245,7 @@ st_insert(Str_tbl *stp, const char *str) */ sn.sn_str = str; if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) { - if ((snp = calloc(sizeof (StrNode), 1)) == NULL) + if ((snp = calloc(sizeof (*snp), 1)) == NULL) return (-1); snp->sn_str = str; avl_insert(lnp->ln_strtree, snp, where); @@ -513,7 +512,7 @@ st_hash_insert(Str_tbl *stp, const char *str, size_t len) /* * allocate a new master string */ - if ((mstr = calloc(sizeof (Str_hash), 1)) == 0) + if ((mstr = calloc(sizeof (*mstr), 1)) == 0) return (-1); mstr->sm_next = stp->st_mstrlist; stp->st_mstrlist = mstr; @@ -528,7 +527,7 @@ st_hash_insert(Str_tbl *stp, const char *str, size_t len) stp->st_strsize += len - mstr->sm_strlen; } - if ((sthash = calloc(sizeof (Str_hash), 1)) == 0) + if ((sthash = calloc(sizeof (*sthash), 1)) == 0) return (-1); mstr->sm_hashval = sthash->hi_hashval = hashval; @@ -569,8 +568,8 @@ st_getstrtab_sz(Str_tbl *stp) * strings input. 
*/ stp->st_hbckcnt = findprime(stp->st_strcnt); - if ((stp->st_hashbcks = - calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL) + if ((stp->st_hashbcks = calloc(sizeof (*stp->st_hashbcks), + stp->st_hbckcnt)) == NULL) return (0); /* @@ -673,8 +672,8 @@ st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize) #ifdef DEBUG /* * for debug builds - start with a stringtable filled in - * with '0xff'. This makes it very easy to find wholes - * which we failed to fill in - in the strtab. + * with '0xff'. This makes it very easy to spot unfilled + * holes in the strtab. */ memset(stbuf, 0xff, bufsize); stbuf[0] = '\0'; diff --git a/cmd/stat/common/statcommon.h b/cmd/stat/common/statcommon.h new file mode 100644 index 0000000..9ee0774 --- /dev/null +++ b/cmd/stat/common/statcommon.h @@ -0,0 +1,328 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Common routines for acquiring snapshots of kstats for + * iostat, mpstat, and vmstat. 
+ */ + +#ifndef _STATCOMMON_H +#define _STATCOMMON_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* No CPU present at this CPU position */ +#define ID_NO_CPU -1 +/* CPU belongs to no pset (we number this as "pset 0") */ +#define ID_NO_PSET 0 +/* CPU is usable */ +#define CPU_ONLINE(s) ((s) == P_ONLINE || (s) == P_NOINTR) +/* will the CPU have kstats */ +#define CPU_ACTIVE(c) (CPU_ONLINE((c)->cs_state) && (c)->cs_id != ID_NO_CPU) +/* IO device has no identified ID */ +#define IODEV_NO_ID -1 +/* no limit to iodevs to collect */ +#define UNLIMITED_IODEVS ((size_t)-1) + +#define NODATE 0 /* Default: No time stamp */ +#define DDATE 1 /* Standard date format */ +#define UDATE 2 /* Internal representation of Unix time */ + + +enum snapshot_types { + /* All CPUs separately */ + SNAP_CPUS = 1 << 0, + /* Aggregated processor sets */ + SNAP_PSETS = 1 << 1, + /* sys-wide stats including aggregated CPU stats */ + SNAP_SYSTEM = 1 << 2, + /* interrupt sources and counts */ + SNAP_INTERRUPTS = 1 << 3, + /* disk etc. 
stats */ + SNAP_IODEVS = 1 << 4, + /* disk controller aggregates */ + SNAP_CONTROLLERS = 1 << 5, + /* mpxio L I (multipath) paths: -X: Lun,LunInitiator */ + SNAP_IOPATHS_LI = 1 << 6, + /* mpxio LTI (multipath) paths: -Y: Lun,LunTarget,LunTargetInitiator */ + SNAP_IOPATHS_LTI = 1 << 7, + /* disk error stats */ + SNAP_IODEV_ERRORS = 1 << 8, + /* pretty names for iodevs */ + SNAP_IODEV_PRETTY = 1 << 9, + /* devid for iodevs */ + SNAP_IODEV_DEVID = 1 << 10 +}; + +struct cpu_snapshot { + /* may be ID_NO_CPU if no CPU present */ + processorid_t cs_id; + /* may be ID_NO_PSET if no pset */ + psetid_t cs_pset_id; + /* as in p_online(2) */ + int cs_state; + /* stats for this CPU */ + kstat_t cs_vm; + kstat_t cs_sys; +}; + +struct pset_snapshot { + /* ID may be zero to indicate the "none set" */ + psetid_t ps_id; + /* number of CPUs in set */ + size_t ps_nr_cpus; + /* the CPUs in this set */ + struct cpu_snapshot **ps_cpus; +}; + +struct intr_snapshot { + /* name of interrupt source */ + char is_name[KSTAT_STRLEN]; + /* total number of interrupts from this source */ + ulong_t is_total; +}; + +struct sys_snapshot { + sysinfo_t ss_sysinfo; + vminfo_t ss_vminfo; + struct nc_stats ss_nc; + /* vm/sys stats aggregated across all CPUs */ + kstat_t ss_agg_vm; + kstat_t ss_agg_sys; + /* ticks since boot */ + ulong_t ss_ticks; + long ss_deficit; +}; + +/* order is significant (see sort_before()) */ +enum iodev_type { + IODEV_CONTROLLER = 1 << 0, + IODEV_DISK = 1 << 1, + IODEV_PARTITION = 1 << 2, + IODEV_TAPE = 1 << 3, + IODEV_NFS = 1 << 4, + IODEV_IOPATH_LT = 1 << 5, /* synthetic LunTarget */ + IODEV_IOPATH_LI = 1 << 6, /* synthetic LunInitiator */ + IODEV_IOPATH_LTI = 1 << 7, /* LunTgtInitiator (pathinfo) */ + IODEV_UNKNOWN = 1 << 8 +}; + +/* identify a disk, partition, etc. 
*/ +struct iodev_id { + int id; + /* target id (for disks) */ + char tid[KSTAT_STRLEN]; +}; + +/* + * Used for disks, partitions, tapes, nfs, controllers, iopaths + * Each entry can be a branch of a tree; for example, the disks + * of a controller constitute the children of the controller + * iodev_snapshot. This relationship is not strictly maintained + * if is_pretty can't be found. + */ +struct iodev_snapshot { + /* original kstat name */ + char is_name[KSTAT_STRLEN]; + /* type of kstat */ + enum iodev_type is_type; + /* ID if meaningful */ + struct iodev_id is_id; + /* parent ID if meaningful */ + struct iodev_id is_parent_id; + /* user-friendly name if found */ + char *is_pretty; + /* device ID if applicable */ + char *is_devid; + /* mount-point if applicable */ + char *is_dname; + /* number of direct children */ + int is_nr_children; + /* children of this I/O device */ + struct iodev_snapshot *is_children; + /* standard I/O stats */ + kstat_io_t is_stats; + /* iodev error stats */ + kstat_t is_errors; + /* creation time of the stats */ + hrtime_t is_crtime; + /* time at which iodev snapshot was taken */ + hrtime_t is_snaptime; + /* kstat module */ + char is_module[KSTAT_STRLEN]; + /* kstat instance */ + int is_instance; + /* kstat (only used temporarily) */ + kstat_t *is_ksp; + struct iodev_snapshot *is_prev; + struct iodev_snapshot *is_next; + /* AVL structures to speed up insertion */ + avl_tree_t *avl_list; /* list this element belongs to */ + avl_node_t avl_link; +}; + +/* which iodevs to show. */ +struct iodev_filter { + /* nr. of iodevs to choose */ + size_t if_max_iodevs; + /* bit mask of enum iodev_type values to allow */ + int if_allowed_types; + /* should we show floppy ? if_names can override this */ + int if_skip_floppy; + /* nr. of named iodevs */ + size_t if_nr_names; + char **if_names; +}; + +/* The primary structure of a system snapshot. 
*/ +struct snapshot { + /* what types were *requested* */ + enum snapshot_types s_types; + size_t s_nr_cpus; + struct cpu_snapshot *s_cpus; + size_t s_nr_psets; + struct pset_snapshot *s_psets; + size_t s_nr_intrs; + struct intr_snapshot *s_intrs; + size_t s_nr_iodevs; + struct iodev_snapshot *s_iodevs; + size_t s_iodevs_is_name_maxlen; + struct sys_snapshot s_sys; + struct biostats s_biostats; +}; + +/* print a message and exit with failure */ +void fail(int do_perror, char *message, ...); + +/* strdup str, or exit with failure */ +char *safe_strdup(char *str); + +/* malloc successfully, or exit with failure */ +void *safe_alloc(size_t size); + +/* + * Copy a kstat from src to dst. If the source kstat contains no data, + * then set the destination kstat data to NULL and size to zero. + * Returns 0 on success. + */ +int kstat_copy(const kstat_t *src, kstat_t *dst); + +/* + * Look up the named kstat, and give the ui64 difference i.e. + * new - old, or if old is NULL, return new. + */ +uint64_t kstat_delta(kstat_t *old, kstat_t *new, char *name); + +/* Return the number of ticks delta between two hrtime_t values. */ +uint64_t hrtime_delta(hrtime_t old, hrtime_t new); + +/* + * Add the integer-valued stats from "src" to the + * existing ones in "dst". If "dst" does not contain + * stats, then a kstat_copy() is performed. + */ +int kstat_add(const kstat_t *src, kstat_t *dst); + +/* return the number of CPUs with kstats (i.e. present and online) */ +int nr_active_cpus(struct snapshot *ss); + +/* + * Return the difference in CPU ticks between the two sys + * kstats. + */ +uint64_t cpu_ticks_delta(kstat_t *old, kstat_t *new); + +/* + * Open the kstat chain. Cannot fail. + */ +kstat_ctl_t *open_kstat(void); + +/* + * Return a struct snapshot based on the snapshot_types parameter + * passed in. iodev_filter may be NULL in which case all iodevs + * are selected if SNAP_IODEVS is passed. 
+ */ +struct snapshot *acquire_snapshot(kstat_ctl_t *, int, struct iodev_filter *); + +/* free a snapshot */ +void free_snapshot(struct snapshot *ss); + +typedef void (*snapshot_cb)(void *old, void *new, void *data); + +/* + * Call the call back for each pair of data items of the given type, + * passing the data pointer passed in as well. If an item has been + * added, the first pointer will be NULL; if removed, the second pointer + * will be NULL. + * + * A non-zero return value indicates configuration has changed. + */ +int snapshot_walk(enum snapshot_types type, struct snapshot *old, + struct snapshot *new, snapshot_cb cb, void *data); + +/* + * Output a line detailing any configuration changes such as a CPU + * brought online, etc, bracketed by << >>. + */ +void snapshot_report_changes(struct snapshot *old, struct snapshot *new); + +/* Return non-zero if configuration has changed. */ +int snapshot_has_changed(struct snapshot *old, struct snapshot *new); + +/* free the given iodev */ +void free_iodev(struct iodev_snapshot *iodev); + +/* acquire the I/O devices */ +int acquire_iodevs(struct snapshot *ss, kstat_ctl_t *kc, + struct iodev_filter *df); + +/* strcmp-style I/O device comparator */ +int iodev_cmp(struct iodev_snapshot *io1, struct iodev_snapshot *io2); + +/* sleep until *wakeup + interval, keeping cadence where desired */ +void sleep_until(hrtime_t *wakeup, hrtime_t interval, int forever, + int *caught_cont); + +/* signal handler - so we can be aware of SIGCONT */ +void cont_handler(int sig_number); + +/* Print a timestamp in either Unix or standard format. 
*/ +void print_timestamp(uint_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _STATCOMMON_H */ diff --git a/cmd/stat/common/timestamp.c b/cmd/stat/common/timestamp.c new file mode 100644 index 0000000..e0592f3 --- /dev/null +++ b/cmd/stat/common/timestamp.c @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include "statcommon.h" + +#include + +/* + * Print timestamp as decimal representation of time_t value (-T u was specified) + * or in date(1) format (-T d was specified). 
+ */ +void +print_timestamp(uint_t timestamp_fmt) +{ + time_t t = time(NULL); + static char *fmt = NULL; + + /* We only need to retrieve this once per invocation */ + if (fmt == NULL) + fmt = nl_langinfo(_DATE_FMT); + + if (timestamp_fmt == UDATE) { + (void) printf("%ld\n", t); + } else if (timestamp_fmt == DDATE) { + char dstr[64]; + int len; + + len = strftime(dstr, sizeof (dstr), fmt, localtime(&t)); + if (len > 0) + (void) printf("%s\n", dstr); + } +} diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c new file mode 100644 index 0000000..c6e219d --- /dev/null +++ b/cmd/zdb/zdb.c @@ -0,0 +1,3158 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#undef ZFS_MAXNAMELEN +#undef verify +#include + +#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? 
\
+    zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
+    zio_checksum_table[(idx)].ci_name : "UNKNOWN")
+#define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
+    dmu_ot[(idx)].ot_name : "UNKNOWN")
+/* Clamp an out-of-range object type to DMU_OT_NUMTYPES. */
+#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES)
+
+/* lint builds get a local definition instead of the extern declaration. */
+#ifndef lint
+extern int zfs_recover;
+#else
+int zfs_recover;
+#endif
+
+const char cmdname[] = "zdb";
+/*
+ * Option state indexed by option character, e.g. dump_opt['P'].  Entries
+ * are compared against thresholds (dump_opt['m'] > 1, dump_opt['D'] < 5),
+ * so each one is presumably a repeat count set by the option parser
+ * (TODO confirm).
+ */
+uint8_t dump_opt[256];
+
+/* Signature shared by the per-object-type routines in object_viewer[]. */
+typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
+
+extern void dump_intent_log(zilog_t *);
+/*
+ * Numeric selectors and their count.  dump_metaslabs() reads entry 0 as a
+ * vdev id and the remaining entries as metaslab numbers.
+ */
+uint64_t *zopt_object = NULL;
+int zopt_objects = 0;
+libzfs_handle_t *g_zfs;
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init()
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+/*
+ * Print the command synopsis and the option summary to stderr, then
+ * terminate the process with exit status 1.
+ */
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "Usage: %s [-CumdibcsDvhL] poolname [object...]\n"
+	    " %s [-div] dataset [object...]\n"
+	    " %s -m [-L] poolname [vdev [metaslab...]]\n"
+	    " %s -R poolname vdev:offset:size[:flags]\n"
+	    " %s -S poolname\n"
+	    " %s -l [-u] device\n"
+	    " %s -C\n\n",
+	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
+
+	(void) fprintf(stderr, " Dataset name must include at least one "
+	    "separator character '/' or '@'\n");
+	(void) fprintf(stderr, " If dataset name is specified, only that "
+	    "dataset is dumped\n");
+	(void) fprintf(stderr, " If object numbers are specified, only "
+	    "those objects are dumped\n\n");
+	(void) fprintf(stderr, " Options to control amount of output:\n");
+	(void) fprintf(stderr, " -u uberblock\n");
+	(void) fprintf(stderr, " -d dataset(s)\n");
+	(void) fprintf(stderr, " -i intent logs\n");
+	(void) fprintf(stderr, " -C config (or cachefile if alone)\n");
+	(void) fprintf(stderr, " -h pool history\n");
+	(void) 
fprintf(stderr, " -b block statistics\n"); + (void) fprintf(stderr, " -m metaslabs\n"); + (void) fprintf(stderr, " -c checksum all metadata (twice for " + "all data) blocks\n"); + (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); + (void) fprintf(stderr, " -D dedup statistics\n"); + (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); + (void) fprintf(stderr, " -v verbose (applies to all others)\n"); + (void) fprintf(stderr, " -l dump label contents\n"); + (void) fprintf(stderr, " -L disable leak tracking (do not " + "load spacemaps)\n"); + (void) fprintf(stderr, " -R read and display block from a " + "device\n\n"); + (void) fprintf(stderr, " Below options are intended for use " + "with other options (except -l):\n"); + (void) fprintf(stderr, " -A ignore assertions (-A), enable " + "panic recovery (-AA) or both (-AAA)\n"); + (void) fprintf(stderr, " -F attempt automatic rewind within " + "safe range of transaction groups\n"); + (void) fprintf(stderr, " -U -- use alternate " + "cachefile\n"); + (void) fprintf(stderr, " -X attempt extreme rewind (does not " + "work with dataset)\n"); + (void) fprintf(stderr, " -e pool is exported/destroyed/" + "has altroot/not in a cachefile\n"); + (void) fprintf(stderr, " -p -- use one or more with " + "-e to specify path to vdev dir\n"); + (void) fprintf(stderr, " -P print numbers parsable\n"); + (void) fprintf(stderr, " -t -- highest txg to use when " + "searching for uberblocks\n"); + (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " + "to make only that option verbose\n"); + (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); + exit(1); +} + +/* + * Called for usage errors that are discovered after a call to spa_open(), + * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. + */ + +static void +fatal(const char *fmt, ...) 
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) fprintf(stderr, "%s: ", cmdname);
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	(void) fprintf(stderr, "\n");
+
+	exit(1);
+}
+
+/*
+ * Viewer for "packed nvlist size" objects.  'data' points at a uint64_t
+ * holding the packed length; that many bytes are read from offset 0 of
+ * 'object', unpacked, and printed with dump_nvlist().  Read or unpack
+ * failures trip VERIFY().
+ */
+/* ARGSUSED */
+static void
+dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	nvlist_t *nv;
+	size_t nvsize = *(uint64_t *)data;
+	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
+
+	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
+
+	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
+
+	/* The packed copy is no longer needed once it has been unpacked. */
+	umem_free(packed, nvsize);
+
+	dump_nvlist(nv, 8);
+
+	nvlist_free(nv);
+}
+
+/*
+ * Format 'num' into 'buf': as an exact decimal number when dump_opt['P']
+ * is set, otherwise through nicenum().  No buffer length is passed, so the
+ * caller must supply a buffer large enough for either form.
+ */
+static void
+zdb_nicenum(uint64_t num, char *buf)
+{
+	if (dump_opt['P'])
+		/* %llu takes an unsigned long long argument. */
+		(void) sprintf(buf, "%llu", (u_longlong_t)num);
+	else
+		nicenum(num, buf);
+}
+
+/* Bar of stars for dump_zap_histogram(); the width excludes the NUL. */
+const char dump_zap_stars[] = "****************************************";
+const int dump_zap_width = sizeof (dump_zap_stars) - 1;
+
+/*
+ * Print the buckets between the first and the last non-zero one, each with
+ * its count and a bar of stars.  The largest bucket gets the full bar and
+ * the others are scaled against it; when the largest count is smaller than
+ * the bar width, each bucket gets one star per count instead.  Nothing is
+ * printed if every bucket is zero.
+ */
+static void
+dump_zap_histogram(uint64_t histo[ZAP_HISTOGRAM_SIZE])
+{
+	int i;
+	int minidx = ZAP_HISTOGRAM_SIZE - 1;
+	int maxidx = 0;
+	uint64_t max = 0;
+
+	for (i = 0; i < ZAP_HISTOGRAM_SIZE; i++) {
+		if (histo[i] > max)
+			max = histo[i];
+		if (histo[i] > 0 && i > maxidx)
+			maxidx = i;
+		if (histo[i] > 0 && i < minidx)
+			minidx = i;
+	}
+
+	if (max < dump_zap_width)
+		max = dump_zap_width;
+
+	/* Index into the star string: offset 0 is the full-width bar. */
+	for (i = minidx; i <= maxidx; i++)
+		(void) printf("\t\t\t%u: %6llu %s\n", i, (u_longlong_t)histo[i],
+		    &dump_zap_stars[(max - histo[i]) * dump_zap_width / max]);
+}
+
+/*
+ * Print the statistics returned by zap_get_stats() for a ZAP object.
+ * Returns silently if the statistics cannot be obtained.  A pointer table
+ * length of zero is reported as a microzap with a one-line summary.
+ */
+static void
+dump_zap_stats(objset_t *os, uint64_t object)
+{
+	int error;
+	zap_stats_t zs;
+
+	error = zap_get_stats(os, object, &zs);
+	if (error)
+		return;
+
+	if (zs.zs_ptrtbl_len == 0) {
+		ASSERT(zs.zs_num_blocks == 1);
+		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
+		    (u_longlong_t)zs.zs_blocksize,
+		    (u_longlong_t)zs.zs_num_entries);
+		return;
+	}
+
+	(void) printf("\tFat ZAP stats:\n");
+
+	(void) printf("\t\tPointer table:\n");
+	(void) printf("\t\t\t%llu elements\n",
+	    (u_longlong_t)zs.zs_ptrtbl_len);
+	
(void) printf("\t\t\tzt_blk: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
+	(void) printf("\t\t\tzt_numblks: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
+	(void) printf("\t\t\tzt_shift: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
+	(void) printf("\t\t\tzt_blks_copied: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
+	(void) printf("\t\t\tzt_nextblk: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
+
+	(void) printf("\t\tZAP entries: %llu\n",
+	    (u_longlong_t)zs.zs_num_entries);
+	(void) printf("\t\tLeaf blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_leafs);
+	(void) printf("\t\tTotal blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_blocks);
+	(void) printf("\t\tzap_block_type: 0x%llx\n",
+	    (u_longlong_t)zs.zs_block_type);
+	(void) printf("\t\tzap_magic: 0x%llx\n",
+	    (u_longlong_t)zs.zs_magic);
+	(void) printf("\t\tzap_salt: 0x%llx\n",
+	    (u_longlong_t)zs.zs_salt);
+
+	/* One histogram per distribution collected in the stats. */
+	(void) printf("\t\tLeafs with 2^n pointers:\n");
+	dump_zap_histogram(zs.zs_leafs_with_2n_pointers);
+
+	(void) printf("\t\tBlocks with n*5 entries:\n");
+	dump_zap_histogram(zs.zs_blocks_with_n5_entries);
+
+	(void) printf("\t\tBlocks n/10 full:\n");
+	dump_zap_histogram(zs.zs_blocks_n_tenths_full);
+
+	(void) printf("\t\tEntries with n chunks:\n");
+	dump_zap_histogram(zs.zs_entries_using_n_chunks);
+
+	(void) printf("\t\tBuckets with n entries:\n");
+	dump_zap_histogram(zs.zs_buckets_with_n_entries);
+}
+
+/* Viewer for object types that have no type-specific output. */
+/*ARGSUSED*/
+static void
+dump_none(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/* Viewer for the final, catch-all slot of object_viewer[]. */
+/*ARGSUSED*/
+static void
+dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	(void) printf("\tUNKNOWN OBJECT TYPE\n");
+}
+
+/*
+ * Viewer for uint8[] style objects; prints nothing.
+ * NOTE(review): unlike the other viewers this one is not static.
+ */
+/*ARGSUSED*/
+void
+dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/* Viewer for uint64[] style objects; prints nothing. */
+/*ARGSUSED*/
+static void
+dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Viewer for generic ZAP objects: prints the statistics from
+ * dump_zap_stats() followed by one "name = value(s)" line per attribute.
+ */
+/*ARGSUSED*/
+static void
+dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	void 
*prop;
+	int i;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = ", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+		/*
+		 * The buffer is zero-filled and the lookup result is
+		 * ignored, so a failed lookup prints zeros (or an empty
+		 * string) rather than aborting the dump.
+		 */
+		prop = umem_zalloc(attr.za_num_integers *
+		    attr.za_integer_length, UMEM_NOFAIL);
+		(void) zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length, attr.za_num_integers, prop);
+		if (attr.za_integer_length == 1) {
+			/* assumes byte values are NUL-terminated strings */
+			(void) printf("%s", (char *)prop);
+		} else {
+			for (i = 0; i < attr.za_num_integers; i++) {
+				switch (attr.za_integer_length) {
+				case 2:
+					(void) printf("%u ",
+					    ((uint16_t *)prop)[i]);
+					break;
+				case 4:
+					(void) printf("%u ",
+					    ((uint32_t *)prop)[i]);
+					break;
+				case 8:
+					/* %lld takes a signed long long */
+					(void) printf("%lld ",
+					    (longlong_t)((int64_t *)prop)[i]);
+					break;
+				}
+			}
+		}
+		(void) printf("\n");
+		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/* Viewer for DDT ZAP objects: statistics only. */
+/*ARGSUSED*/
+static void
+dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dump_zap_stats(os, object);
+	/* contents are printed elsewhere, properly decoded */
+}
+
+/*
+ * Viewer for the SA attribute registration ZAP: after the ZAP statistics,
+ * each entry is printed as its raw first integer followed by the
+ * ATTR_LENGTH, ATTR_BSWAP and ATTR_NUM fields decoded from it.
+ */
+/*ARGSUSED*/
+static void
+dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = ", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+		(void) printf(" %llx : [%d:%d:%d]\n",
+		    (u_longlong_t)attr.za_first_integer,
+		    (int)ATTR_LENGTH(attr.za_first_integer),
+		    (int)ATTR_BSWAP(attr.za_first_integer),
+		    (int)ATTR_NUM(attr.za_first_integer));
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Viewer for the SA attribute layouts ZAP: each entry is an array of
+ * 2-byte integers, printed as a bracketed list.
+ */
+/*ARGSUSED*/
+static void
+dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	
zap_attribute_t attr;
+	uint16_t *layout_attrs;
+	int i;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = [", attr.za_name);
+		/* NOTE: an empty entry leaves the '[' unclosed. */
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+
+		VERIFY(attr.za_integer_length == 2);
+		layout_attrs = umem_zalloc(attr.za_num_integers *
+		    attr.za_integer_length, UMEM_NOFAIL);
+
+		VERIFY(zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length,
+		    attr.za_num_integers, layout_attrs) == 0);
+
+		for (i = 0; i != attr.za_num_integers; i++)
+			(void) printf(" %d ", (int)layout_attrs[i]);
+		(void) printf("]\n");
+		umem_free(layout_attrs,
+		    attr.za_num_integers * attr.za_integer_length);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Viewer for ZFS directories: after the ZAP statistics, prints each
+ * entry's name, its object number (ZFS_DIRENT_OBJ) and the name of its
+ * type (ZFS_DIRENT_TYPE used as an index into typenames[]).
+ */
+/*ARGSUSED*/
+static void
+dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	/* 16 entries; assumes ZFS_DIRENT_TYPE() yields 0..15 -- TODO confirm */
+	const char *typenames[] = {
+		/* 0 */ "not specified",
+		/* 1 */ "FIFO",
+		/* 2 */ "Character Device",
+		/* 3 */ "3 (invalid)",
+		/* 4 */ "Directory",
+		/* 5 */ "5 (invalid)",
+		/* 6 */ "Block Device",
+		/* 7 */ "7 (invalid)",
+		/* 8 */ "Regular File",
+		/* 9 */ "9 (invalid)",
+		/* 10 */ "Symbolic Link",
+		/* 11 */ "11 (invalid)",
+		/* 12 */ "Socket",
+		/* 13 */ "Door",
+		/* 14 */ "Event Port",
+		/* 15 */ "15 (invalid)",
+	};
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = %lld (type: %s)\n",
+		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
+		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Dump the on-disk space map object described by 'smo' one 64-bit entry at
+ * a time.  Debug entries print their action, txg and sync pass; all other
+ * entries print as an allocated ('A') or freed ('F') range.  The running
+ * total of allocated minus freed bytes is compared with smo_alloc at the
+ * end and a mismatch is reported.  Does nothing if smo_object is 0.
+ */
+static void
+dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
+{
+	uint64_t alloc, offset, entry;
+	uint8_t mapshift = sm->sm_shift;
+	uint64_t mapstart = sm->sm_start;
+	/* Names indexed by SM_DEBUG_ACTION_DECODE(). */
+	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
+	    "INVALID", "INVALID", "INVALID", "INVALID" };
+
+	if (smo->smo_object == 0)
+		return;
+
+	/*
+	 * Print out the freelist entries in both encoded and decoded form.
+	 */
+	alloc = 0;
+	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
+		VERIFY3U(0, ==, dmu_read(os, smo->smo_object, offset,
+		    sizeof (entry), &entry, DMU_READ_PREFETCH));
+		if (SM_DEBUG_DECODE(entry)) {
+			(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
+			    (u_longlong_t)(offset / sizeof (entry)),
+			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
+			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
+			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
+		} else {
+			/* Offsets and run lengths are stored in map units. */
+			(void) printf("\t [%6llu] %c range:"
+			    " %010llx-%010llx size: %06llx\n",
+			    (u_longlong_t)(offset / sizeof (entry)),
+			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
+			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
+			    mapshift) + mapstart),
+			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
+			    mapshift) + mapstart + (SM_RUN_DECODE(entry) <<
+			    mapshift)),
+			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
+			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
+				alloc += SM_RUN_DECODE(entry) << mapshift;
+			else
+				alloc -= SM_RUN_DECODE(entry) << mapshift;
+		}
+	}
+	if (alloc != smo->smo_alloc) {
+		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
+		    "with space map summary (%llu)\n",
+		    (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc);
+	}
+}
+
+/*
+ * Print a one-line summary of a metaslab's space map: the number of nodes
+ * in sm_pp_root ("segments"), space_map_maxsize() and the percentage
+ * sm_space * 100 / sm_size ("freepct").
+ */
+static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+	char maxbuf[32];
+	space_map_t *sm = &msp->ms_map;
+	avl_tree_t *t = sm->sm_pp_root;
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+	zdb_nicenum(space_map_maxsize(sm), maxbuf);
+
+	(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
+	    "segments", avl_numnodes(t), "maxsize", maxbuf,
+	    "freepct", free_pct);
+}
+
+/*
+ * Print one line describing a metaslab.  When dump_opt['m'] > 1 and
+ * dump_opt['L'] is clear, the space map is loaded to print its statistics
+ * and then unloaded again.  When dump_opt['d'] > 5 or dump_opt['m'] > 2,
+ * the raw space map entries are dumped as well.
+ */
+static void
+dump_metaslab(metaslab_t *msp)
+{
+	vdev_t *vd = msp->ms_group->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	space_map_t *sm = &msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo;
+	char freebuf[32];
+
+	zdb_nicenum(sm->sm_size - 
smo->smo_alloc, freebuf);
+
+	(void) printf(
+	    "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
+	    (u_longlong_t)(sm->sm_start / sm->sm_size),
+	    (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
+
+	if (dump_opt['m'] > 1 && !dump_opt['L']) {
+		mutex_enter(&msp->ms_lock);
+		space_map_load_wait(sm);
+		if (!sm->sm_loaded)
+			VERIFY(space_map_load(sm, zfs_metaslab_ops,
+			    SM_FREE, smo, spa->spa_meta_objset) == 0);
+		dump_metaslab_stats(msp);
+		space_map_unload(sm);
+		mutex_exit(&msp->ms_lock);
+	}
+
+	if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
+		ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
+
+		mutex_enter(&msp->ms_lock);
+		dump_spacemap(spa->spa_meta_objset, smo, sm);
+		mutex_exit(&msp->ms_lock);
+	}
+}
+
+/*
+ * Print the vdev id, its metaslab count and the column headings for the
+ * per-metaslab lines produced by dump_metaslab().
+ */
+static void
+print_vdev_metaslab_header(vdev_t *vd)
+{
+	(void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
+	    (u_longlong_t)vd->vdev_id,
+	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
+	    "offset", "spacemap", "free");
+	(void) printf("\t%15s %19s %15s %10s\n",
+	    "---------------", "-------------------",
+	    "---------------", "-------------");
+}
+
+/*
+ * Dump the metaslabs of a pool.  With dump_opt['d'] clear and at least one
+ * numeric argument, zopt_object[0] selects a single child of the root vdev
+ * (fatal() if out of range) and any further arguments select individual
+ * metaslabs within it; bad metaslab numbers are reported on stderr and
+ * skipped.  Otherwise every child of the root vdev is dumped in full.
+ */
+static void
+dump_metaslabs(spa_t *spa)
+{
+	vdev_t *vd, *rvd = spa->spa_root_vdev;
+	uint64_t m, c = 0, children = rvd->vdev_children;
+
+	(void) printf("\nMetaslabs:\n");
+
+	if (!dump_opt['d'] && zopt_objects > 0) {
+		c = zopt_object[0];
+
+		if (c >= children)
+			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
+
+		if (zopt_objects > 1) {
+			vd = rvd->vdev_child[c];
+			print_vdev_metaslab_header(vd);
+
+			for (m = 1; m < zopt_objects; m++) {
+				if (zopt_object[m] < vd->vdev_ms_count)
+					dump_metaslab(
+					    vd->vdev_ms[zopt_object[m]]);
+				else
+					(void) fprintf(stderr, "bad metaslab "
+					    "number %llu\n",
+					    (u_longlong_t)zopt_object[m]);
+			}
+			(void) printf("\n");
+			return;
+		}
+		/* Only a vdev was named: restrict the loop below to it. */
+		children = c + 1;
+	}
+	for (; c < children; c++) {
+		vd = rvd->vdev_child[c];
+		print_vdev_metaslab_header(vd);
+
+		for (m = 0; m < vd->vdev_ms_count; m++)
+			dump_metaslab(vd->vdev_ms[m]);
+		(void) 
printf("\n");
+	}
+}
+
+/*
+ * Print every populated physical variant of a DDT entry (those whose
+ * ddp_phys_birth is non-zero) as a block pointer built by ddt_bp_create(),
+ * together with the walk index and the variant's reference count.
+ */
+static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+	const ddt_phys_t *ddp = dde->dde_phys;
+	const ddt_key_t *ddk = &dde->dde_key;
+	/* indexed by p below; assumes DDT_PHYS_TYPES <= 4 -- TODO confirm */
+	char *types[4] = { "ditto", "single", "double", "triple" };
+	char blkbuf[BP_SPRINTF_LEN];
+	blkptr_t blk;
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0)
+			continue;
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+		sprintf_blkptr(blkbuf, &blk);
+		(void) printf("index %llx refcnt %llu %s %s\n",
+		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+		    types[p], blkbuf);
+	}
+}
+
+/*
+ * Print the dedup, compress and copies ratios derived from the referenced
+ * and actual sizes in 'dds', plus their combination.  Prints nothing when
+ * dds_blocks is zero.
+ */
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+	double rL, rP, rD, D, dedup, compress, copies;
+
+	if (dds->dds_blocks == 0)
+		return;
+
+	rL = (double)dds->dds_ref_lsize;
+	rP = (double)dds->dds_ref_psize;
+	rD = (double)dds->dds_ref_dsize;
+	D = (double)dds->dds_dsize;
+
+	dedup = rD / D;
+	compress = rL / rP;
+	copies = rD / rP;
+
+	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+	    "dedup * compress / copies = %.2f\n\n",
+	    dedup, compress, copies, dedup * compress / copies);
+}
+
+/*
+ * Dump one DDT object, identified by (type, class).  Objects that do not
+ * exist (ENOENT) or hold no entries are skipped silently.  The amount of
+ * output grows with dump_opt['D']: a one-line summary is always printed,
+ * the histogram from 3, the entries of non-unique classes from 4 and the
+ * entries of DDT_CLASS_UNIQUE from 5.
+ */
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	char name[DDT_NAMELEN];
+	ddt_entry_t dde;
+	uint64_t walk = 0;
+	dmu_object_info_t doi;
+	uint64_t count, dspace, mspace;
+	int error;
+
+	error = ddt_object_info(ddt, type, class, &doi);
+
+	if (error == ENOENT)
+		return;
+	ASSERT(error == 0);
+
+	if ((count = ddt_object_count(ddt, type, class)) == 0)
+		return;
+
+	/* doi_physical_blocks_512 counts 512-byte units: << 9 gives bytes. */
+	dspace = doi.doi_physical_blocks_512 << 9;
+	mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+	ddt_object_name(ddt, type, class, name);
+
+	/* Sizes are reported per entry, hence the division by count. */
+	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+	    name,
+	    (u_longlong_t)count,
+	    (u_longlong_t)(dspace / count),
+	    (u_longlong_t)(mspace / count));
+
+	if (dump_opt['D'] < 3)
+		return;
+
+	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+	if (dump_opt['D'] < 4)
+		return;
+
+	if 
(dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+		return;
+
+	(void) printf("%s contents:\n\n", name);
+
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+		dump_dde(ddt, &dde, walk);
+
+	ASSERT(error == ENOENT);
+
+	(void) printf("\n");
+}
+
+/*
+ * Dump every DDT object of the pool (all checksums, types and classes),
+ * then the pool-wide totals: the aggregated histogram when
+ * dump_opt['D'] > 1, and the dedup ratios.  Reports "All DDTs are empty"
+ * and stops if the aggregated statistics contain no blocks.
+ */
+static void
+dump_all_ddts(spa_t *spa)
+{
+	ddt_histogram_t ddh_total = { 0 };
+	ddt_stat_t dds_total = { 0 };
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+			for (enum ddt_class class = 0; class < DDT_CLASSES;
+			    class++) {
+				dump_ddt(ddt, type, class);
+			}
+		}
+	}
+
+	ddt_get_dedup_stats(spa, &dds_total);
+
+	if (dds_total.dds_blocks == 0) {
+		(void) printf("All DDTs are empty\n");
+		return;
+	}
+
+	(void) printf("\n");
+
+	if (dump_opt['D'] > 1) {
+		(void) printf("DDT histogram (aggregated over all DDTs):\n");
+		ddt_get_dedup_histogram(spa, &ddh_total);
+		zpool_dump_ddt(&dds_total, &ddh_total);
+	}
+
+	dump_dedup_ratio(&dds_total);
+}
+
+/*
+ * space_map_walk() callback used by dump_dtl().  The first argument is not
+ * a real space map here: dump_dtl() passes its prefix string as the walk
+ * argument and it is recovered by the cast below.
+ */
+static void
+dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	char *prefix = (void *)sm;
+
+	(void) printf("%s [%llu,%llu) length %llu\n",
+	    prefix,
+	    (u_longlong_t)start,
+	    (u_longlong_t)(start + size),
+	    (u_longlong_t)(size));
+}
+
+/*
+ * Print the dirty time logs of 'vd' and, recursively, of its children,
+ * indenting each level by four more columns.  Each vdev line shows its
+ * path (or its type, or the pool name for the root) and whether
+ * vdev_dtl_required() reports it as required.  Empty DTLs are skipped.
+ * For leaf vdevs the raw DTL space map is dumped too when
+ * dump_opt['d'] > 5.
+ */
+static void
+dump_dtl(vdev_t *vd, int indent)
+{
+	spa_t *spa = vd->vdev_spa;
+	boolean_t required;
+	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
+	char prefix[256];
+
+	spa_vdev_state_enter(spa, SCL_NONE);
+	required = vdev_dtl_required(vd);
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+
+	/* The heading is printed once, by the outermost call. */
+	if (indent == 0)
+		(void) printf("\nDirty time logs:\n\n");
+
+	(void) printf("\t%*s%s [%s]\n", indent, "",
+	    vd->vdev_path ? vd->vdev_path :
+	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
+	    required ? "DTL-required" : "DTL-expendable");
+
+	for (int t = 0; t < DTL_TYPES; t++) {
+		space_map_t *sm = &vd->vdev_dtl[t];
+		if (sm->sm_space == 0)
+			continue;
+		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
+		    indent + 2, "", name[t]);
+		mutex_enter(sm->sm_lock);
+		space_map_walk(sm, dump_dtl_seg, (void *)prefix);
+		mutex_exit(sm->sm_lock);
+		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
+			dump_spacemap(spa->spa_meta_objset,
+			    &vd->vdev_dtl_smo, sm);
+	}
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		dump_dtl(vd->vdev_child[c], indent + 4);
+}
+
+/*
+ * Print the pool history.  The log is read through spa_history_get() one
+ * buffer at a time and unpacked into an array of nvlists; a read error is
+ * reported on stderr and nothing is printed.  Each event with a timestamp
+ * is then printed as "<date>.<time> <command>".  Events without a command
+ * string are internal events and are rendered as
+ * "[internal <name> txg:<n>] <text>"; those whose event number is not
+ * below LOG_END are skipped.
+ */
+static void
+dump_history(spa_t *spa)
+{
+	nvlist_t **events = NULL;
+	char buf[SPA_MAXBLOCKSIZE];
+	uint64_t resid, len, off = 0;
+	uint_t num = 0;
+	int error;
+	time_t tsec;
+	struct tm t;
+	char tbuf[30];
+	char internalstr[MAXPATHLEN];
+
+	do {
+		len = sizeof (buf);
+
+		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+			(void) fprintf(stderr, "Unable to read history: "
+			    "error %d\n", error);
+			return;
+		}
+
+		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+			break;
+
+		/*
+		 * Presumably 'resid' is the tail of the buffer that held
+		 * only part of a record; back up so it is read again.
+		 */
+		off -= resid;
+	} while (len != 0);
+
+	(void) printf("\nHistory:\n");
+	for (int i = 0; i < num; i++) {
+		uint64_t time, txg, ievent;
+		char *cmd, *intstr;
+
+		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+		    &time) != 0)
+			continue;
+		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+		    &cmd) != 0) {
+			if (nvlist_lookup_uint64(events[i],
+			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+				continue;
+			verify(nvlist_lookup_uint64(events[i],
+			    ZPOOL_HIST_TXG, &txg) == 0);
+			verify(nvlist_lookup_string(events[i],
+			    ZPOOL_HIST_INT_STR, &intstr) == 0);
+			if (ievent >= LOG_END)
+				continue;
+
+			/* %lld takes a long long; uint64_t may differ. */
+			(void) snprintf(internalstr,
+			    sizeof (internalstr),
+			    "[internal %s txg:%lld] %s",
+			    zfs_history_event_names[ievent], (longlong_t)txg,
+			    intstr);
+			cmd = internalstr;
+		}
+		tsec = time;
+		(void) localtime_r(&tsec, &t);
+		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+		(void) printf("%s %s\n", tbuf, cmd);
+	}
+}
+
+/*ARGSUSED*/
+static void 
+dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	/* Intentionally empty: no type-specific output for this type. */
+}
+
+/*
+ * Convert the block id in bookmark 'zb' to an offset.
+ *
+ * Without a dnode (dnp == NULL) the bookmark level must be negative: for
+ * object 0 the block id is returned unchanged, otherwise it is multiplied
+ * by the logical size of 'bp'.
+ *
+ * With a dnode, the block id is shifted left once per level by
+ * (dn_indblkshift - SPA_BLKPTRSHIFT), multiplied by dn_datablkszsec and
+ * shifted by SPA_MINBLOCKSHIFT.
+ */
+static uint64_t
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb)
+{
+	if (dnp == NULL) {
+		ASSERT(zb->zb_level < 0);
+		if (zb->zb_object == 0)
+			return (zb->zb_blkid);
+		return (zb->zb_blkid * BP_GET_LSIZE(bp));
+	}
+
+	ASSERT(zb->zb_level >= 0);
+
+	return ((zb->zb_blkid <<
+	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
+	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+}
+
+/*
+ * Format a block pointer into 'blkbuf' in compact form: each DVA as
+ * vdev:offset:asize (all DVAs when dump_opt['d'] > 5, otherwise only the
+ * first), followed by logical/physical size, fill count and birth txgs.
+ * When dump_opt['b'] >= 5 the full sprintf_blkptr() form is used instead.
+ * The output is not bounds-checked; callers in this file pass buffers of
+ * BP_SPRINTF_LEN bytes.
+ */
+static void
+sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+
+	if (dump_opt['b'] >= 5) {
+		sprintf_blkptr(blkbuf, bp);
+		return;
+	}
+
+	blkbuf[0] = '\0';
+
+	/* Each piece is appended at the current end of the string. */
+	for (int i = 0; i < ndvas; i++)
+		(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
+		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
+		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
+		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+
+	(void) sprintf(blkbuf + strlen(blkbuf),
+	    "%llxL/%llxP F=%llu B=%llu/%llu",
+	    (u_longlong_t)BP_GET_LSIZE(bp),
+	    (u_longlong_t)BP_GET_PSIZE(bp),
+	    (u_longlong_t)bp->blk_fill,
+	    (u_longlong_t)bp->blk_birth,
+	    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+}
+
+/*
+ * Print one line for block pointer 'bp': its offset within the object, an
+ * "L<level>" marker placed in the column for its level, and the compact
+ * block pointer text.
+ */
+static void
+print_indirect(blkptr_t *bp, const zbookmark_t *zb,
+    const dnode_phys_t *dnp)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+	int l;
+
+	ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+	ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+
+	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
+
+	ASSERT(zb->zb_level >= 0);
+
+	/* One column per level, highest first; only this block's is filled. */
+	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
+		if (l == zb->zb_level) {
+			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
+		} else {
+			(void) printf(" ");
+		}
+	}
+
+	sprintf_blkptr_compact(blkbuf, bp);
+	(void) printf("%s\n", blkbuf);
+}
+
+/*
+ * Print block pointer 'bp' and, if it is an indirect block, read it and
+ * visit every block pointer it contains, recursively.  Holes
+ * (blk_birth == 0) are skipped.  Returns 0 on success or the first error
+ * returned by the read or by a recursive visit.
+ */
+static int
+visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
+    blkptr_t *bp, const zbookmark_t *zb)
+{
+	int err = 0;
+
+	if (bp->blk_birth == 0)
+		return (0);
+
+	
print_indirect(bp, zb, dnp);
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		uint32_t flags = ARC_WAIT;
+		int i;
+		blkptr_t *cbp;
+		/* number of block pointers held by this indirect block */
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		arc_buf_t *buf;
+		uint64_t fill = 0;
+
+		err = arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err)
+			return (err);
+		ASSERT(buf->b_data);
+
+		/* recursively visit blocks below this */
+		cbp = buf->b_data;
+		for (i = 0; i < epb; i++, cbp++) {
+			zbookmark_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			err = visit_indirect(spa, dnp, cbp, &czb);
+			if (err)
+				break;
+			fill += cbp->blk_fill;
+		}
+		/* The children's fill counts must add up to the parent's. */
+		if (!err)
+			ASSERT3U(fill, ==, bp->blk_fill);
+		(void) arc_buf_remove_ref(buf, &buf);
+	}
+
+	return (err);
+}
+
+/*
+ * Print the block tree of a dnode: every block pointer in the dnode is
+ * visited starting at the top level (dn_nlevels - 1).  Errors returned by
+ * visit_indirect() are ignored.
+ */
+/*ARGSUSED*/
+static void
+dump_indirect(dnode_t *dn)
+{
+	dnode_phys_t *dnp = dn->dn_phys;
+	int j;
+	zbookmark_t czb;
+
+	(void) printf("Indirect blocks:\n");
+
+	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
+	    dn->dn_object, dnp->dn_nlevels - 1, 0);
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		czb.zb_blkid = j;
+		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
+		    &dnp->dn_blkptr[j], &czb);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Viewer for DSL directories: 'data' points at a dsl_dir_phys_t whose
+ * fields are printed one per line, byte counts through zdb_nicenum().
+ * Prints nothing when 'data' is NULL.
+ */
+/*ARGSUSED*/
+static void
+dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dir_phys_t *dd = data;
+	time_t crtime;
+	char nice[32];
+
+	if (dd == NULL)
+		return;
+
+	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
+
+	/* ctime() needs a time_t; its result already ends in a newline. */
+	crtime = dd->dd_creation_time;
+	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
+	(void) printf("\t\thead_dataset_obj = %llu\n",
+	    (u_longlong_t)dd->dd_head_dataset_obj);
+	(void) printf("\t\tparent_dir_obj = %llu\n",
+	    (u_longlong_t)dd->dd_parent_obj);
+	(void) printf("\t\torigin_obj = %llu\n",
+	    (u_longlong_t)dd->dd_origin_obj);
+	(void) printf("\t\tchild_dir_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_child_dir_zapobj);
+	zdb_nicenum(dd->dd_used_bytes, nice);
+	(void) 
printf("\t\tused_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_compressed_bytes, nice);
+	(void) printf("\t\tcompressed_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_uncompressed_bytes, nice);
+	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_quota, nice);
+	(void) printf("\t\tquota = %s\n", nice);
+	zdb_nicenum(dd->dd_reserved, nice);
+	(void) printf("\t\treserved = %s\n", nice);
+	(void) printf("\t\tprops_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_props_zapobj);
+	(void) printf("\t\tdeleg_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_deleg_zapobj);
+	(void) printf("\t\tflags = %llx\n",
+	    (u_longlong_t)dd->dd_flags);
+
+	/*
+	 * DO(X) prints dd_used_breakdown[DD_USED_X] under the label X.  It
+	 * expands to two statements, so it may only be used as a complete
+	 * statement inside a braced block, as it is here.
+	 */
+#define	DO(which) \
+	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
+	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
+	DO(HEAD);
+	DO(SNAP);
+	DO(CHILD);
+	DO(CHILD_RSRV);
+	DO(REFRSRV);
+#undef DO
+}
+
+/*
+ * Viewer for DSL datasets: 'data' points at a dsl_dataset_phys_t whose
+ * fields are printed one per line, byte counts through zdb_nicenum() and
+ * the block pointer through sprintf_blkptr().  Prints nothing when 'data'
+ * is NULL.
+ */
+/*ARGSUSED*/
+static void
+dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dataset_phys_t *ds = data;
+	time_t crtime;
+	char used[32], compressed[32], uncompressed[32], unique[32];
+	char blkbuf[BP_SPRINTF_LEN];
+
+	if (ds == NULL)
+		return;
+
+	ASSERT(size == sizeof (*ds));
+	/* Format everything first; the values are printed further down. */
+	crtime = ds->ds_creation_time;
+	zdb_nicenum(ds->ds_used_bytes, used);
+	zdb_nicenum(ds->ds_compressed_bytes, compressed);
+	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
+	zdb_nicenum(ds->ds_unique_bytes, unique);
+	sprintf_blkptr(blkbuf, &ds->ds_bp);
+
+	(void) printf("\t\tdir_obj = %llu\n",
+	    (u_longlong_t)ds->ds_dir_obj);
+	(void) printf("\t\tprev_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_obj);
+	(void) printf("\t\tprev_snap_txg = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_txg);
+	(void) printf("\t\tnext_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_snap_obj);
+	(void) printf("\t\tsnapnames_zapobj = %llu\n",
+	    (u_longlong_t)ds->ds_snapnames_zapobj);
+	(void) printf("\t\tnum_children = %llu\n",
+	    (u_longlong_t)ds->ds_num_children);
+	(void) printf("\t\tuserrefs_obj = 
%llu\n",
+	    (u_longlong_t)ds->ds_userrefs_obj);
+	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
+	(void) printf("\t\tcreation_txg = %llu\n",
+	    (u_longlong_t)ds->ds_creation_txg);
+	(void) printf("\t\tdeadlist_obj = %llu\n",
+	    (u_longlong_t)ds->ds_deadlist_obj);
+	(void) printf("\t\tused_bytes = %s\n", used);
+	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
+	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
+	(void) printf("\t\tunique = %s\n", unique);
+	(void) printf("\t\tfsid_guid = %llu\n",
+	    (u_longlong_t)ds->ds_fsid_guid);
+	(void) printf("\t\tguid = %llu\n",
+	    (u_longlong_t)ds->ds_guid);
+	(void) printf("\t\tflags = %llx\n",
+	    (u_longlong_t)ds->ds_flags);
+	(void) printf("\t\tnext_clones_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_clones_obj);
+	(void) printf("\t\tprops_obj = %llu\n",
+	    (u_longlong_t)ds->ds_props_obj);
+	(void) printf("\t\tbp = %s\n", blkbuf);
+}
+
+/*
+ * bpobj_iterate_nofree() callback: print one block pointer in compact
+ * form.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+
+	ASSERT(bp->blk_birth != 0);
+	sprintf_blkptr_compact(blkbuf, bp);
+	(void) printf("\t%s\n", blkbuf);
+	return (0);
+}
+
+/*
+ * Print a summary of block pointer object 'bpo' under the heading 'name'.
+ * Nothing is printed when dump_opt['d'] < 3.  When the object has
+ * sub-objects, their count and the compressed/uncompressed totals are
+ * included.  From dump_opt['d'] >= 5 every block pointer is listed too.
+ */
+static void
+dump_bpobj(bpobj_t *bpo, char *name)
+{
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
+	if (bpo->bpo_havesubobj) {
+		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
+		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
+		(void) printf("\n %s: %llu local blkptrs, %llu subobjs, "
+		    "%s (%s/%s comp)\n",
+		    name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+		    bytes, comp, uncomp);
+	} else {
+		(void) printf("\n %s: %llu blkptrs, %s\n",
+		    name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes);
+	}
+
+	if (dump_opt['d'] < 5)
+		return;
+
+	(void) printf("\n");
+
+	(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+}
+
+/*
+ * Print a deadlist: its totals when dump_opt['d'] >= 3, one line per
+ * entry of dl_tree from 4, and each entry's bpobj from 5.
+ */
+static void
+dump_deadlist(dsl_deadlist_t *dl)
+{
+	
dsl_deadlist_entry_t *dle; + char bytes[32]; + char comp[32]; + char uncomp[32]; + + if (dump_opt['d'] < 3) + return; + + zdb_nicenum(dl->dl_phys->dl_used, bytes); + zdb_nicenum(dl->dl_phys->dl_comp, comp); + zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp); + (void) printf("\n Deadlist: %s (%s/%s comp)\n", + bytes, comp, uncomp); + + if (dump_opt['d'] < 4) + return; + + (void) printf("\n"); + + for (dle = avl_first(&dl->dl_tree); dle; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + (void) printf(" mintxg %llu -> obj %llu\n", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + + if (dump_opt['d'] >= 5) + dump_bpobj(&dle->dle_bpobj, ""); + } +} + +static avl_tree_t idx_tree; +static avl_tree_t domain_tree; +static boolean_t fuid_table_loaded; +static boolean_t sa_loaded; +sa_attr_type_t *sa_attr_table; + +static void +fuid_table_destroy() +{ + if (fuid_table_loaded) { + zfs_fuid_table_destroy(&idx_tree, &domain_tree); + fuid_table_loaded = B_FALSE; + } +} + +/* + * print uid or gid information. + * For normal POSIX id just the id is printed in decimal format. + * For CIFS files with FUID the fuid is printed in hex followed by + * the doman-rid string. + */ +static void +print_idstr(uint64_t id, const char *id_type) +{ + if (FUID_INDEX(id)) { + char *domain; + + domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); + (void) printf("\t%s %llx [%s-%d]\n", id_type, + (u_longlong_t)id, domain, (int)FUID_RID(id)); + } else { + (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); + } + +} + +static void +dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) +{ + uint32_t uid_idx, gid_idx; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + + /* Load domain table, if not already loaded */ + if (!fuid_table_loaded && (uid_idx || gid_idx)) { + uint64_t fuid_obj; + + /* first find the fuid object. 
It lives in the master node */
+		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
+		    8, 1, &fuid_obj) == 0);
+		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
+		(void) zfs_fuid_table_load(os, fuid_obj,
+		    &idx_tree, &domain_tree);
+		fuid_table_loaded = B_TRUE;
+	}
+
+	print_idstr(uid, "uid");
+	print_idstr(gid, "gid");
+}
+
+/*
+ * Viewer for znodes and SA objects.  On first use the SA attribute table
+ * is set up from the master node; if that fails a message is printed and
+ * nothing else is shown.  The attributes are then fetched with one bulk
+ * lookup.  When dump_opt['d'] < 3 only the path is printed; otherwise the
+ * path, owner, timestamps and the remaining attributes are printed,
+ * followed by xattr and rdev when those attributes exist.  If the path
+ * cannot be resolved, a placeholder naming the object number is shown.
+ */
+/*ARGSUSED*/
+static void
+dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
+	sa_handle_t *hdl;
+	uint64_t xattr, rdev, gen;
+	uint64_t uid, gid, mode, fsize, parent, links;
+	uint64_t pflags;
+	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
+	time_t z_crtime, z_atime, z_mtime, z_ctime;
+	sa_bulk_attr_t bulk[12];	/* one slot per attribute added below */
+	int idx = 0;
+	int error;
+
+	if (!sa_loaded) {
+		uint64_t sa_attrs = 0;
+		uint64_t version;
+
+		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+		    8, 1, &version) == 0);
+		/* Older versions have no SA attribute object: leave it 0. */
+		if (version >= ZPL_VERSION_SA) {
+			VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+			    8, 1, &sa_attrs) == 0);
+		}
+		if ((error = sa_setup(os, sa_attrs, zfs_attr_table,
+		    ZPL_END, &sa_attr_table)) != 0) {
+			(void) printf("sa_setup failed errno %d, can't "
+			    "display znode contents\n", error);
+			return;
+		}
+		sa_loaded = B_TRUE;
+	}
+
+	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
+		(void) printf("Failed to get handle for SA znode\n");
+		return;
+	}
+
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
+	    &links, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
+	    &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
+	    NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
+	    &fsize, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
+	    acctm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
+	    modtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
+	    crtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
+	    chgtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
+	    &pflags, 8);
+
+	if (sa_bulk_lookup(hdl, bulk, idx)) {
+		(void) sa_handle_destroy(hdl);
+		return;
+	}
+
+	error = zfs_obj_to_path(os, object, path, sizeof (path));
+	if (error != 0) {
+		/* Keep the object identifiable when no path is available. */
+		(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
+		    (u_longlong_t)object);
+	}
+	if (dump_opt['d'] < 3) {
+		(void) printf("\t%s\n", path);
+		(void) sa_handle_destroy(hdl);
+		return;
+	}
+
+	/* Only element 0 of each two-element timestamp is printed. */
+	z_crtime = (time_t)crtm[0];
+	z_atime = (time_t)acctm[0];
+	z_mtime = (time_t)modtm[0];
+	z_ctime = (time_t)chgtm[0];
+
+	(void) printf("\tpath %s\n", path);
+	dump_uidgid(os, uid, gid);
+	(void) printf("\tatime %s", ctime(&z_atime));
+	(void) printf("\tmtime %s", ctime(&z_mtime));
+	(void) printf("\tctime %s", ctime(&z_ctime));
+	(void) printf("\tcrtime %s", ctime(&z_crtime));
+	(void) printf("\tgen %llu\n", (u_longlong_t)gen);
+	(void) printf("\tmode %llo\n", (u_longlong_t)mode);
+	(void) printf("\tsize %llu\n", (u_longlong_t)fsize);
+	(void) printf("\tparent %llu\n", (u_longlong_t)parent);
+	(void) printf("\tlinks %llu\n", (u_longlong_t)links);
+	(void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
+	/* xattr and rdev are optional: printed only if the lookup succeeds. */
+	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+	    sizeof (uint64_t)) == 0)
+		(void) printf("\txattr %llu\n", (u_longlong_t)xattr);
+	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+	    sizeof (uint64_t)) == 0)
+		(void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
+	sa_handle_destroy(hdl);
+}
+
+/* Viewer for ACL objects; prints nothing. */
+/*ARGSUSED*/
+static void
+dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/* Viewer for DMU objset objects; prints nothing. */
+/*ARGSUSED*/
+static void
+dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Viewer for each DMU object type, in object type order, plus one extra
+ * final slot for unknown types.  Presumably indexed through ZDB_OT_TYPE(),
+ * which maps out-of-range types to that final slot -- TODO confirm at the
+ * call site.
+ */
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
+	dump_none,		/* unallocated */
+	
dump_zap, /* object directory */ + dump_uint64, /* object array */ + dump_none, /* packed nvlist */ + dump_packed_nvlist, /* packed nvlist size */ + dump_none, /* bplist */ + dump_none, /* bplist header */ + dump_none, /* SPA space map header */ + dump_none, /* SPA space map */ + dump_none, /* ZIL intent log */ + dump_dnode, /* DMU dnode */ + dump_dmu_objset, /* DMU objset */ + dump_dsl_dir, /* DSL directory */ + dump_zap, /* DSL directory child map */ + dump_zap, /* DSL dataset snap map */ + dump_zap, /* DSL props */ + dump_dsl_dataset, /* DSL dataset */ + dump_znode, /* ZFS znode */ + dump_acl, /* ZFS V0 ACL */ + dump_uint8, /* ZFS plain file */ + dump_zpldir, /* ZFS directory */ + dump_zap, /* ZFS master node */ + dump_zap, /* ZFS delete queue */ + dump_uint8, /* zvol object */ + dump_zap, /* zvol prop */ + dump_uint8, /* other uint8[] */ + dump_uint64, /* other uint64[] */ + dump_zap, /* other ZAP */ + dump_zap, /* persistent error log */ + dump_uint8, /* SPA history */ + dump_uint64, /* SPA history offsets */ + dump_zap, /* Pool properties */ + dump_zap, /* DSL permissions */ + dump_acl, /* ZFS ACL */ + dump_uint8, /* ZFS SYSACL */ + dump_none, /* FUID nvlist */ + dump_packed_nvlist, /* FUID nvlist size */ + dump_zap, /* DSL dataset next clones */ + dump_zap, /* DSL scrub queue */ + dump_zap, /* ZFS user/group used */ + dump_zap, /* ZFS user/group quota */ + dump_zap, /* snapshot refcount tags */ + dump_ddt_zap, /* DDT ZAP object */ + dump_zap, /* DDT statistics */ + dump_znode, /* SA object */ + dump_zap, /* SA Master Node */ + dump_sa_attrs, /* SA attribute registration */ + dump_sa_layouts, /* SA attribute layouts */ + dump_zap, /* DSL scrub translations */ + dump_none, /* fake dedup BP */ + dump_zap, /* deadlist */ + dump_none, /* deadlist hdr */ + dump_zap, /* dsl clones */ + dump_none, /* bpobj subobjs */ + dump_unknown, /* Unknown type, must be last */ +}; + +static void +dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) +{ + 
dmu_buf_t *db = NULL; + dmu_object_info_t doi; + dnode_t *dn; + void *bonus = NULL; + size_t bsize = 0; + char iblk[32], dblk[32], lsize[32], asize[32], fill[32]; + char bonus_size[32]; + char aux[50]; + int error; + + if (*print_header) { + (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n", + "Object", "lvl", "iblk", "dblk", "dsize", "lsize", + "%full", "type"); + *print_header = 0; + } + + if (object == 0) { + dn = DMU_META_DNODE(os); + } else { + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) + fatal("dmu_bonus_hold(%llu) failed, errno %u", + object, error); + bonus = db->db_data; + bsize = db->db_size; + dn = DB_DNODE((dmu_buf_impl_t *)db); + } + dmu_object_info_from_dnode(dn, &doi); + + zdb_nicenum(doi.doi_metadata_block_size, iblk); + zdb_nicenum(doi.doi_data_block_size, dblk); + zdb_nicenum(doi.doi_max_offset, lsize); + zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize); + zdb_nicenum(doi.doi_bonus_size, bonus_size); + (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * + doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / + doi.doi_max_offset); + + aux[0] = '\0'; + + if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", + ZDB_CHECKSUM_NAME(doi.doi_checksum)); + } + + if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", + ZDB_COMPRESS_NAME(doi.doi_compress)); + } + + (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n", + (u_longlong_t)object, doi.doi_indirection, iblk, dblk, + asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); + + if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { + (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n", + "", "", "", "", "", bonus_size, "bonus", + ZDB_OT_NAME(doi.doi_bonus_type)); + } + + if (verbosity >= 4) { + (void) printf("\tdnode flags: %s%s%s\n", + (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? 
+ "USED_BYTES " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? + "USERUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? + "SPILL_BLKPTR" : ""); + (void) printf("\tdnode maxblkid: %llu\n", + (longlong_t)dn->dn_phys->dn_maxblkid); + + object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, + bonus, bsize); + object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); + *print_header = 1; + } + + if (verbosity >= 5) + dump_indirect(dn); + + if (verbosity >= 5) { + /* + * Report the list of segments that comprise the object. + */ + uint64_t start = 0; + uint64_t end; + uint64_t blkfill = 1; + int minlvl = 1; + + if (dn->dn_type == DMU_OT_DNODE) { + minlvl = 0; + blkfill = DNODES_PER_BLOCK; + } + + for (;;) { + char segsize[32]; + error = dnode_next_offset(dn, + 0, &start, minlvl, blkfill, 0); + if (error) + break; + end = start; + error = dnode_next_offset(dn, + DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); + zdb_nicenum(end - start, segsize); + (void) printf("\t\tsegment [%016llx, %016llx)" + " size %5s\n", (u_longlong_t)start, + (u_longlong_t)end, segsize); + if (error) + break; + start = end; + } + } + + if (db != NULL) + dmu_buf_rele(db, FTAG); +} + +static char *objset_types[DMU_OST_NUMTYPES] = { + "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; + +static void +dump_dir(objset_t *os) +{ + dmu_objset_stats_t dds; + uint64_t object, object_count; + uint64_t refdbytes, usedobjs, scratch; + char numbuf[32]; + char blkbuf[BP_SPRINTF_LEN + 20]; + char osname[MAXNAMELEN]; + char *type = "UNKNOWN"; + int verbosity = dump_opt['d']; + int print_header = 1; + int i, error; + + dmu_objset_fast_stat(os, &dds); + + if (dds.dds_type < DMU_OST_NUMTYPES) + type = objset_types[dds.dds_type]; + + if (dds.dds_type == DMU_OST_META) { + dds.dds_creation_txg = TXG_INITIAL; + usedobjs = os->os_rootbp->blk_fill; + refdbytes = os->os_spa->spa_dsl_pool-> + dp_mos_dir->dd_phys->dd_used_bytes; + } else { + dmu_objset_space(os, &refdbytes, 
&scratch, &usedobjs, &scratch); + } + + ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill); + + zdb_nicenum(refdbytes, numbuf); + + if (verbosity >= 4) { + (void) sprintf(blkbuf, ", rootbp "); + (void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp); + } else { + blkbuf[0] = '\0'; + } + + dmu_objset_name(os, osname); + + (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " + "%s, %llu objects%s\n", + osname, type, (u_longlong_t)dmu_objset_id(os), + (u_longlong_t)dds.dds_creation_txg, + numbuf, (u_longlong_t)usedobjs, blkbuf); + + if (zopt_objects != 0) { + for (i = 0; i < zopt_objects; i++) + dump_object(os, zopt_object[i], verbosity, + &print_header); + (void) printf("\n"); + return; + } + + if (dump_opt['i'] != 0 || verbosity >= 2) + dump_intent_log(dmu_objset_zil(os)); + + if (dmu_objset_ds(os) != NULL) + dump_deadlist(&dmu_objset_ds(os)->ds_deadlist); + + if (verbosity < 2) + return; + + if (os->os_rootbp->blk_birth == 0) + return; + + dump_object(os, 0, verbosity, &print_header); + object_count = 0; + if (DMU_USERUSED_DNODE(os) != NULL && + DMU_USERUSED_DNODE(os)->dn_type != 0) { + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); + } + + object = 0; + while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { + dump_object(os, object, verbosity, &print_header); + object_count++; + } + + ASSERT3U(object_count, ==, usedobjs); + + (void) printf("\n"); + + if (error != ESRCH) { + (void) fprintf(stderr, "dmu_object_next() = %d\n", error); + abort(); + } +} + +static void +dump_uberblock(uberblock_t *ub, const char *header, const char *footer) +{ + time_t timestamp = ub->ub_timestamp; + + (void) printf(header ? 
header : ""); + (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); + (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); + (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); + (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); + (void) printf("\ttimestamp = %llu UTC = %s", + (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); + if (dump_opt['u'] >= 3) { + char blkbuf[BP_SPRINTF_LEN]; + sprintf_blkptr(blkbuf, &ub->ub_rootbp); + (void) printf("\trootbp = %s\n", blkbuf); + } + (void) printf(footer ? footer : ""); +} + +static void +dump_config(spa_t *spa) +{ + dmu_buf_t *db; + size_t nvsize = 0; + int error = 0; + + + error = dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db); + + if (error == 0) { + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + (void) printf("\nMOS Configuration:\n"); + dump_packed_nvlist(spa->spa_meta_objset, + spa->spa_config_object, (void *)&nvsize, 1); + } else { + (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", + (u_longlong_t)spa->spa_config_object, error); + } +} + +static void +dump_cachefile(const char *cachefile) +{ + int fd; + struct stat64 statbuf; + char *buf; + nvlist_t *config; + + if ((fd = open64(cachefile, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", cachefile, + strerror(errno)); + exit(1); + } + + if (fstat64(fd, &statbuf) != 0) { + (void) printf("failed to stat '%s': %s\n", cachefile, + strerror(errno)); + exit(1); + } + + if ((buf = malloc(statbuf.st_size)) == NULL) { + (void) fprintf(stderr, "failed to allocate %llu bytes\n", + (u_longlong_t)statbuf.st_size); + exit(1); + } + + if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { + (void) fprintf(stderr, "failed to read %llu bytes\n", + (u_longlong_t)statbuf.st_size); + exit(1); + } + + (void) close(fd); + + if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "failed to unpack nvlist\n"); + exit(1); + } + + 
free(buf); + + dump_nvlist(config, 0); + + nvlist_free(config); +} + +#define ZDB_MAX_UB_HEADER_SIZE 32 + +static void +dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) +{ + vdev_t vd; + vdev_t *vdp = &vd; + char header[ZDB_MAX_UB_HEADER_SIZE]; + + vd.vdev_ashift = ashift; + vdp->vdev_top = vdp; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); + uberblock_t *ub = (void *)((char *)lbl + uoff); + + if (uberblock_verify(ub)) + continue; + (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, + "Uberblock[%d]\n", i); + dump_uberblock(ub, header, ""); + } +} + +static void +dump_label(const char *dev) +{ + int fd; + vdev_label_t label; + char *path, *buf = label.vl_vdev_phys.vp_nvlist; + size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); + struct stat64 statbuf; + uint64_t psize, ashift; + int len = strlen(dev) + 1; + + if (strncmp(dev, "/dev/dsk/", 9) == 0) { + len++; + path = malloc(len); + (void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9); + } else { + path = strdup(dev); + } + + if ((fd = open64(path, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", path, strerror(errno)); + free(path); + exit(1); + } + + if (fstat64(fd, &statbuf) != 0) { + (void) printf("failed to stat '%s': %s\n", path, + strerror(errno)); + free(path); + (void) close(fd); + exit(1); + } + + if (S_ISBLK(statbuf.st_mode)) { + (void) printf("cannot use '%s': character device required\n", + path); + free(path); + (void) close(fd); + exit(1); + } + + psize = statbuf.st_size; + psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + + for (int l = 0; l < VDEV_LABELS; l++) { + nvlist_t *config = NULL; + + (void) printf("--------------------------------------------\n"); + (void) printf("LABEL %d\n", l); + (void) printf("--------------------------------------------\n"); + + if (pread64(fd, &label, sizeof (label), + vdev_label_offset(psize, l, 0)) != sizeof (label)) { + (void) printf("failed to read label %d\n", l); + 
continue; + } + + if (nvlist_unpack(buf, buflen, &config, 0) != 0) { + (void) printf("failed to unpack label %d\n", l); + ashift = SPA_MINBLOCKSHIFT; + } else { + nvlist_t *vdev_tree = NULL; + + dump_nvlist(config, 4); + if ((nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || + (nvlist_lookup_uint64(vdev_tree, + ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) + ashift = SPA_MINBLOCKSHIFT; + nvlist_free(config); + } + if (dump_opt['u']) + dump_label_uberblocks(&label, ashift); + } + + free(path); + (void) close(fd); +} + +/*ARGSUSED*/ +static int +dump_one_dir(const char *dsname, void *arg) +{ + int error; + objset_t *os; + + error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os); + if (error) { + (void) printf("Could not open %s, error %d\n", dsname, error); + return (0); + } + dump_dir(os); + dmu_objset_disown(os, FTAG); + fuid_table_destroy(); + sa_loaded = B_FALSE; + return (0); +} + +/* + * Block statistics. + */ +typedef struct zdb_blkstats { + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_count; +} zdb_blkstats_t; + +/* + * Extended object types to report deferred frees and dedup auto-ditto blocks. + */ +#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) +#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 2) + +static char *zdb_ot_extname[] = { + "deferred free", + "dedup ditto", + "Total", +}; + +#define ZB_TOTAL DN_MAX_LEVELS + +typedef struct zdb_cb { + zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; + uint64_t zcb_dedup_asize; + uint64_t zcb_dedup_blocks; + uint64_t zcb_errors[256]; + int zcb_readfails; + int zcb_haderrors; + spa_t *zcb_spa; +} zdb_cb_t; + +static void +zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, + dmu_object_type_t type) +{ + uint64_t refcnt = 0; + + ASSERT(type < ZDB_OT_TOTAL); + + if (zilog && zil_bp_tree_add(zilog, bp) != 0) + return; + + for (int i = 0; i < 4; i++) { + int l = (i < 2) ? 
BP_GET_LEVEL(bp) : ZB_TOTAL; + int t = (i & 1) ? type : ZDB_OT_TOTAL; + zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; + + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_count++; + } + + if (dump_opt['L']) + return; + + if (BP_GET_DEDUP(bp)) { + ddt_t *ddt; + ddt_entry_t *dde; + + ddt = ddt_select(zcb->zcb_spa, bp); + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_FALSE); + + if (dde == NULL) { + refcnt = 0; + } else { + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + refcnt = ddp->ddp_refcnt; + if (ddt_phys_total_refcnt(dde) == 0) + ddt_remove(ddt, dde); + } + ddt_exit(ddt); + } + + VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, + refcnt ? 0 : spa_first_txg(zcb->zcb_spa), + bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); +} + +/* ARGSUSED */ +static int +zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + zdb_cb_t *zcb = arg; + char blkbuf[BP_SPRINTF_LEN]; + dmu_object_type_t type; + boolean_t is_metadata; + + if (bp == NULL) + return (0); + + type = BP_GET_TYPE(bp); + + zdb_count_block(zcb, zilog, bp, type); + + is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata); + + if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { + int ioerr; + size_t size = BP_GET_PSIZE(bp); + void *data = malloc(size); + int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; + + /* If it's an intent log block, failure is expected. 
*/ + if (zb->zb_level == ZB_ZIL_LEVEL) + flags |= ZIO_FLAG_SPECULATIVE; + + ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + + free(data); + + if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) { + zcb->zcb_haderrors = 1; + zcb->zcb_errors[ioerr]++; + + if (dump_opt['b'] >= 2) + sprintf_blkptr(blkbuf, bp); + else + blkbuf[0] = '\0'; + + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } + } + + zcb->zcb_readfails = 0; + + if (dump_opt['b'] >= 4) { + sprintf_blkptr(blkbuf, bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), + blkbuf); + } + + return (0); +} + +static void +zdb_leak(space_map_t *sm, uint64_t start, uint64_t size) +{ + vdev_t *vd = sm->sm_ppd; + + (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", + (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); +} + +/* ARGSUSED */ +static void +zdb_space_map_load(space_map_t *sm) +{ +} + +static void +zdb_space_map_unload(space_map_t *sm) +{ + space_map_vacate(sm, zdb_leak, sm); +} + +/* ARGSUSED */ +static void +zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ +} + +static space_map_ops_t zdb_space_map_ops = { + zdb_space_map_load, + zdb_space_map_unload, + NULL, /* alloc */ + zdb_space_map_claim, + NULL, /* free */ + NULL /* maxsize */ +}; + +static void +zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + ddt_bookmark_t ddb = { 0 }; + ddt_entry_t dde; + int error; + + while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { + blkptr_t blk; + ddt_phys_t *ddp = dde.dde_phys; + + if (ddb.ddb_class == DDT_CLASS_UNIQUE) + return; + + 
ASSERT(ddt_phys_total_refcnt(&dde) > 1); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddb.ddb_checksum, + &dde.dde_key, ddp, &blk); + if (p == DDT_PHYS_DITTO) { + zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); + } else { + zcb->zcb_dedup_asize += + BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); + zcb->zcb_dedup_blocks++; + } + } + if (!dump_opt['L']) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); + } + } + + ASSERT(error == ENOENT); +} + +static void +zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + zcb->zcb_spa = spa; + + if (!dump_opt['L']) { + vdev_t *rvd = spa->spa_root_vdev; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + mutex_enter(&msp->ms_lock); + space_map_unload(&msp->ms_map); + VERIFY(space_map_load(&msp->ms_map, + &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo, + spa->spa_meta_objset) == 0); + msp->ms_map.sm_ppd = vd; + mutex_exit(&msp->ms_lock); + } + } + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + zdb_ddt_leak_init(spa, zcb); + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static void +zdb_leak_fini(spa_t *spa) +{ + if (!dump_opt['L']) { + vdev_t *rvd = spa->spa_root_vdev; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + mutex_enter(&msp->ms_lock); + space_map_unload(&msp->ms_map); + mutex_exit(&msp->ms_lock); + } + } + } +} + +/* ARGSUSED */ +static int +count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zdb_cb_t *zcb = arg; + + if (dump_opt['b'] >= 4) { + char blkbuf[BP_SPRINTF_LEN]; + sprintf_blkptr(blkbuf, bp); + (void) printf("[%s] %s\n", + "deferred free", blkbuf); + } + zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); + return 
(0); +} + +static int +dump_block_stats(spa_t *spa) +{ + zdb_cb_t zcb = { 0 }; + zdb_blkstats_t *zb, *tzb; + uint64_t norm_alloc, norm_space, total_alloc, total_found; + int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; + int leaks = 0; + + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n", + (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", + (dump_opt['c'] == 1) ? "metadata " : "", + dump_opt['c'] ? "checksums " : "", + (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", + !dump_opt['L'] ? "nothing leaked " : ""); + + /* + * Load all space maps as SM_ALLOC maps, then traverse the pool + * claiming each block we discover. If the pool is perfectly + * consistent, the space maps will be empty when we're done. + * Anything left over is a leak; any block we can't claim (because + * it's not part of any space map) is a double allocation, + * reference to a freed block, or an unclaimed log block. + */ + zdb_leak_init(spa, &zcb); + + /* + * If there's a deferred-free bplist, process that first. + */ + (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, + count_block_cb, &zcb, NULL); + (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, + count_block_cb, &zcb, NULL); + + if (dump_opt['c'] > 1) + flags |= TRAVERSE_PREFETCH_DATA; + + zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + + if (zcb.zcb_haderrors) { + (void) printf("\nError counts:\n\n"); + (void) printf("\t%5s %s\n", "errno", "count"); + for (int e = 0; e < 256; e++) { + if (zcb.zcb_errors[e] != 0) { + (void) printf("\t%5d %llu\n", + e, (u_longlong_t)zcb.zcb_errors[e]); + } + } + } + + /* + * Report any leaked segments. 
+ */ + zdb_leak_fini(spa); + + tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; + + norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + norm_space = metaslab_class_get_space(spa_normal_class(spa)); + + total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); + total_found = tzb->zb_asize - zcb.zcb_dedup_asize; + + if (total_found == total_alloc) { + if (!dump_opt['L']) + (void) printf("\n\tNo leaks (block sum matches space" + " maps exactly)\n"); + } else { + (void) printf("block traversal size %llu != alloc %llu " + "(%s %lld)\n", + (u_longlong_t)total_found, + (u_longlong_t)total_alloc, + (dump_opt['L']) ? "unreachable" : "leaked", + (longlong_t)(total_alloc - total_found)); + leaks = 1; + } + + if (tzb->zb_count == 0) + return (2); + + (void) printf("\n"); + (void) printf("\tbp count: %10llu\n", + (u_longlong_t)tzb->zb_count); + (void) printf("\tbp logical: %10llu avg: %6llu\n", + (u_longlong_t)tzb->zb_lsize, + (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); + (void) printf("\tbp physical: %10llu avg:" + " %6llu compression: %6.2f\n", + (u_longlong_t)tzb->zb_psize, + (u_longlong_t)(tzb->zb_psize / tzb->zb_count), + (double)tzb->zb_lsize / tzb->zb_psize); + (void) printf("\tbp allocated: %10llu avg:" + " %6llu compression: %6.2f\n", + (u_longlong_t)tzb->zb_asize, + (u_longlong_t)(tzb->zb_asize / tzb->zb_count), + (double)tzb->zb_lsize / tzb->zb_asize); + (void) printf("\tbp deduped: %10llu ref>1:" + " %6llu deduplication: %6.2f\n", + (u_longlong_t)zcb.zcb_dedup_asize, + (u_longlong_t)zcb.zcb_dedup_blocks, + (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", + (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); + + if (dump_opt['b'] >= 2) { + int l, t, level; + (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" + "\t avg\t comp\t%%Total\tType\n"); + + for (t = 0; t <= ZDB_OT_TOTAL; t++) { + char csize[32], lsize[32], psize[32], asize[32]; + char avg[32]; + char *typename; + + if 
(t < DMU_OT_NUMTYPES) + typename = dmu_ot[t].ot_name; + else + typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; + + if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { + (void) printf("%6s\t%5s\t%5s\t%5s" + "\t%5s\t%5s\t%6s\t%s\n", + "-", + "-", + "-", + "-", + "-", + "-", + "-", + typename); + continue; + } + + for (l = ZB_TOTAL - 1; l >= -1; l--) { + level = (l == -1 ? ZB_TOTAL : l); + zb = &zcb.zcb_type[level][t]; + + if (zb->zb_asize == 0) + continue; + + if (dump_opt['b'] < 3 && level != ZB_TOTAL) + continue; + + if (level == 0 && zb->zb_asize == + zcb.zcb_type[ZB_TOTAL][t].zb_asize) + continue; + + zdb_nicenum(zb->zb_count, csize); + zdb_nicenum(zb->zb_lsize, lsize); + zdb_nicenum(zb->zb_psize, psize); + zdb_nicenum(zb->zb_asize, asize); + zdb_nicenum(zb->zb_asize / zb->zb_count, avg); + + (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" + "\t%5.2f\t%6.2f\t", + csize, lsize, psize, asize, avg, + (double)zb->zb_lsize / zb->zb_psize, + 100.0 * zb->zb_asize / tzb->zb_asize); + + if (level == ZB_TOTAL) + (void) printf("%s\n", typename); + else + (void) printf(" L%d %s\n", + level, typename); + } + } + } + + (void) printf("\n"); + + if (leaks) + return (2); + + if (zcb.zcb_haderrors) + return (3); + + return (0); +} + +typedef struct zdb_ddt_entry { + ddt_key_t zdde_key; + uint64_t zdde_ref_blocks; + uint64_t zdde_ref_lsize; + uint64_t zdde_ref_psize; + uint64_t zdde_ref_dsize; + avl_node_t zdde_node; +} zdb_ddt_entry_t; + +/* ARGSUSED */ +static int +zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + avl_tree_t *t = arg; + avl_index_t where; + zdb_ddt_entry_t *zdde, zdde_search; + + if (bp == NULL) + return (0); + + if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { + (void) printf("traversing objset %llu, %llu objects, " + "%lu blocks so far\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)bp->blk_fill, + avl_numnodes(t)); + } + + if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == 
ZIO_CHECKSUM_OFF || + BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) + return (0); + + ddt_key_fill(&zdde_search.zdde_key, bp); + + zdde = avl_find(t, &zdde_search, &where); + + if (zdde == NULL) { + zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); + zdde->zdde_key = zdde_search.zdde_key; + avl_insert(t, zdde, where); + } + + zdde->zdde_ref_blocks += 1; + zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); + zdde->zdde_ref_psize += BP_GET_PSIZE(bp); + zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); + + return (0); +} + +static void +dump_simulated_ddt(spa_t *spa) +{ + avl_tree_t t; + void *cookie = NULL; + zdb_ddt_entry_t *zdde; + ddt_histogram_t ddh_total = { 0 }; + ddt_stat_t dds_total = { 0 }; + + avl_create(&t, ddt_entry_compare, + sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, + zdb_ddt_add_cb, &t); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { + ddt_stat_t dds; + uint64_t refcnt = zdde->zdde_ref_blocks; + ASSERT(refcnt != 0); + + dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; + dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; + dds.dds_psize = zdde->zdde_ref_psize / refcnt; + dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + + dds.dds_ref_blocks = zdde->zdde_ref_blocks; + dds.dds_ref_lsize = zdde->zdde_ref_lsize; + dds.dds_ref_psize = zdde->zdde_ref_psize; + dds.dds_ref_dsize = zdde->zdde_ref_dsize; + + ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0); + + umem_free(zdde, sizeof (*zdde)); + } + + avl_destroy(&t); + + ddt_histogram_stat(&dds_total, &ddh_total); + + (void) printf("Simulated DDT histogram:\n"); + + zpool_dump_ddt(&dds_total, &ddh_total); + + dump_dedup_ratio(&dds_total); +} + +static void +dump_zpool(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + int rc = 0; + + if (dump_opt['S']) { + dump_simulated_ddt(spa); + 
return; + } + + if (!dump_opt['e'] && dump_opt['C'] > 1) { + (void) printf("\nCached configuration:\n"); + dump_nvlist(spa->spa_config, 8); + } + + if (dump_opt['C']) + dump_config(spa); + + if (dump_opt['u']) + dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); + + if (dump_opt['D']) + dump_all_ddts(spa); + + if (dump_opt['d'] > 2 || dump_opt['m']) + dump_metaslabs(spa); + + if (dump_opt['d'] || dump_opt['i']) { + dump_dir(dp->dp_meta_objset); + if (dump_opt['d'] >= 3) { + dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees"); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj, + "Pool frees"); + } + dump_dtl(spa->spa_root_vdev, 0); + } + (void) dmu_objset_find(spa_name(spa), dump_one_dir, + NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + } + if (dump_opt['b'] || dump_opt['c']) + rc = dump_block_stats(spa); + + if (dump_opt['s']) + show_pool_stats(spa); + + if (dump_opt['h']) + dump_history(spa); + + if (rc != 0) + exit(rc); +} + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_PHYS 0x0020 +#define ZDB_FLAG_RAW 0x0040 +#define ZDB_FLAG_PRINT_BLKPTR 0x0080 + +int flagbits[256]; + +static void +zdb_print_blkptr(blkptr_t *bp, int flags) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); + + sprintf_blkptr(blkbuf, bp); + (void) printf("%s\n", blkbuf); +} + +static void +zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) +{ + int i; + + for (i = 0; i < nbps; i++) + zdb_print_blkptr(&bp[i], flags); +} + +static void +zdb_dump_gbh(void *buf, int flags) +{ + zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); +} + +static void +zdb_dump_block_raw(void *buf, uint64_t size, int flags) +{ + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array(buf, size); + (void) write(1, buf, size); +} + +static void 
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags) +{ + uint64_t *d = (uint64_t *)buf; + int nwords = size / sizeof (uint64_t); + int do_bswap = !!(flags & ZDB_FLAG_BSWAP); + int i, j; + char *hdr, *c; + + + if (do_bswap) + hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; + else + hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; + + (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); + + for (i = 0; i < nwords; i += 2) { + (void) printf("%06llx: %016llx %016llx ", + (u_longlong_t)(i * sizeof (uint64_t)), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); + + c = (char *)&d[i]; + for (j = 0; j < 2 * sizeof (uint64_t); j++) + (void) printf("%c", isprint(c[j]) ? c[j] : '.'); + (void) printf("\n"); + } +} + +/* + * There are two acceptable formats: + * leaf_name - For example: c1t0d0 or /tmp/ztest.0a + * child[.child]* - For example: 0.1.1 + * + * The second form can be used to specify arbitrary vdevs anywhere + * in the heirarchy. For example, in a pool with a mirror of + * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . + */ +static vdev_t * +zdb_vdev_lookup(vdev_t *vdev, char *path) +{ + char *s, *p, *q; + int i; + + if (vdev == NULL) + return (NULL); + + /* First, assume the x.x.x.x format */ + i = (int)strtoul(path, &s, 10); + if (s == path || (s && *s != '.' && *s != '\0')) + goto name; + if (i < 0 || i >= vdev->vdev_children) + return (NULL); + + vdev = vdev->vdev_child[i]; + if (*s == '\0') + return (vdev); + return (zdb_vdev_lookup(vdev, s+1)); + +name: + for (i = 0; i < vdev->vdev_children; i++) { + vdev_t *vc = vdev->vdev_child[i]; + + if (vc->vdev_path == NULL) { + vc = zdb_vdev_lookup(vc, path); + if (vc == NULL) + continue; + else + return (vc); + } + + p = strrchr(vc->vdev_path, '/'); + p = p ? 
p + 1 : vc->vdev_path;
+		/*
+		 * q addresses the last two characters of the path so that a
+		 * trailing "s0" can be ignored below.  A path shorter than two
+		 * characters has no such suffix; indexing it at (length - 2)
+		 * would point before the start of the string.
+		 */
+		q = (strlen(vc->vdev_path) >= 2) ?
+		    &vc->vdev_path[strlen(vc->vdev_path) - 2] : NULL;
+
+		if (strcmp(vc->vdev_path, path) == 0)
+			return (vc);
+		if (strcmp(p, path) == 0)
+			return (vc);
+		if (q != NULL && strcmp(q, "s0") == 0 &&
+		    strncmp(p, path, q - p) == 0)
+			return (vc);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Read a block from a pool and print it out.  The syntax of the
+ * block descriptor is:
+ *
+ *	pool:vdev_specifier:offset:size[:flags]
+ *
+ *	pool           - The name of the pool you wish to read from
+ *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ *	offset         - offset, in hex, in bytes
+ *	size           - Amount of data to read, in hex, in bytes
+ *	flags          - A string of characters specifying options
+ *		 b: Decode a blkptr at given offset within block
+ *		*c: Calculate and display checksums
+ *		 d: Decompress data before dumping
+ *		 e: Byteswap data before dumping
+ *		 g: Display data as a gang block header
+ *		 i: Display as an indirect block
+ *		 p: Do I/O to physical offset
+ *		 r: Dump raw data to stdout
+ *
+ *              * = not yet implemented
+ */
+static void
+zdb_read_block(char *thing, spa_t *spa)
+{
+	blkptr_t blk, *bp = &blk;
+	dva_t *dva = bp->blk_dva;
+	int flags = 0;
+	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
+	zio_t *zio;
+	vdev_t *vd;
+	void *pbuf, *lbuf, *buf;
+	char *s, *p, *dup, *vdev, *flagstr;
+	int i, error;
+
+	/* strtok() writes into its argument, so tokenize a private copy. */
+	dup = strdup(thing);
+	if (dup == NULL) {
+		(void) printf("Read of %s failed, error: %d\n", thing, ENOMEM);
+		return;
+	}
+	s = strtok(dup, ":");
+	vdev = s ? s : "";
+	s = strtok(NULL, ":");
+	offset = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	size = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	flagstr = s ?
s : "";
+
+	/*
+	 * Validate the request.  Both buffers allocated below are
+	 * SPA_MAXBLOCKSIZE bytes, so a larger read would overrun them.
+	 */
+	s = NULL;
+	if (size == 0)
+		s = "size must not be zero";
+	if (size > SPA_MAXBLOCKSIZE)
+		s = "size must not exceed the maximum block size";
+	if (!IS_P2ALIGNED(size, DEV_BSIZE))
+		s = "size must be a multiple of sector size";
+	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+		s = "offset must be a multiple of sector size";
+	if (s) {
+		(void) printf("Invalid block specifier: %s - %s\n", thing, s);
+		free(dup);
+		return;
+	}
+
+	/* Translate each flag character into its ZDB_FLAG_* bit. */
+	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
+		for (i = 0; flagstr[i]; i++) {
+			int bit = flagbits[(uchar_t)flagstr[i]];
+
+			if (bit == 0) {
+				(void) printf("***Invalid flag: %c\n",
+				    flagstr[i]);
+				continue;
+			}
+			flags |= bit;
+
+			/* If it's not something with an argument, keep going */
+			if ((bit & (ZDB_FLAG_CHECKSUM |
+			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
+				continue;
+
+			p = &flagstr[i + 1];
+			if (bit == ZDB_FLAG_PRINT_BLKPTR)
+				blkptr_offset = strtoull(p, &p, 16);
+			if (*p != ':' && *p != '\0') {
+				(void) printf("***Invalid flag arg: '%s'\n", s);
+				free(dup);
+				return;
+			}
+
+			/*
+			 * Resume scanning after the argument; otherwise its
+			 * hex digits (which include the flag letters b, c, d
+			 * and e) would be re-read as flags.
+			 */
+			i = (int)(p - flagstr) - 1;
+		}
+	}
+
+	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+	if (vd == NULL) {
+		(void) printf("***Invalid vdev: %s\n", vdev);
+		free(dup);
+		return;
+	} else {
+		if (vd->vdev_path)
+			(void) fprintf(stderr, "Found vdev: %s\n",
+			    vd->vdev_path);
+		else
+			(void) fprintf(stderr, "Found vdev type: %s\n",
+			    vd->vdev_ops->vdev_op_type);
+	}
+
+	psize = size;
+	lsize = size;
+
+	pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+	/*
+	 * Build a block pointer by hand that describes the requested
+	 * region: one DVA, compression and checksum both off.
+	 */
+	BP_ZERO(bp);
+
+	DVA_SET_VDEV(&dva[0], vd->vdev_id);
+	DVA_SET_OFFSET(&dva[0], offset);
+	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+	BP_SET_LSIZE(bp, lsize);
+	BP_SET_PSIZE(bp, psize);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+	BP_SET_TYPE(bp, DMU_OT_NONE);
+	BP_SET_LEVEL(bp, 0);
+	BP_SET_DEDUP(bp, 0);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	zio =
zio_root(spa, NULL, NULL, 0);
+
+	if (vd == vd->vdev_top) {
+		/*
+		 * Treat this as a normal block read.
+		 */
+		zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+		    ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+	} else {
+		/*
+		 * Treat this as a vdev child I/O.
+		 */
+		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
+		    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
+	}
+
+	error = zio_wait(zio);
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	if (error) {
+		(void) printf("Read of %s failed, error: %d\n", thing, error);
+		goto out;
+	}
+
+	if (flags & ZDB_FLAG_DECOMPRESS) {
+		/*
+		 * We don't know how the data was compressed, so just try
+		 * every decompress function at every inflated blocksize.
+		 *
+		 * Two copies of the input are decompressed, each padded
+		 * past psize with different random bytes; a candidate is
+		 * accepted only when both copies succeed and produce the
+		 * same output.
+		 */
+		enum zio_compress c;
+		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+		bcopy(pbuf, pbuf2, psize);
+
+		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
+		    SPA_MAXBLOCKSIZE - psize) == 0);
+
+		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+		    SPA_MAXBLOCKSIZE - psize) == 0);
+
+		/*
+		 * The loop header alone steps lsize down by one minimum
+		 * block per pass.  A second decrement inside the body would
+		 * skip every other candidate size.
+		 */
+		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
+		    lsize -= SPA_MINBLOCKSIZE) {
+			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
+				if (zio_decompress_data(c, pbuf, lbuf,
+				    psize, lsize) == 0 &&
+				    zio_decompress_data(c, pbuf2, lbuf2,
+				    psize, lsize) == 0 &&
+				    bcmp(lbuf, lbuf2, lsize) == 0)
+					break;
+			}
+			if (c != ZIO_COMPRESS_FUNCTIONS)
+				break;
+		}
+
+		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
+		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+		/* Falling out of the loop means no function/size pair worked. */
+		if (lsize <= psize) {
+			(void) printf("Decompress of %s failed\n", thing);
+			goto out;
+		}
+		buf = lbuf;
+		size = lsize;
+	} else {
+		buf = pbuf;
+		size = psize;
+	}
+
+	if (flags & ZDB_FLAG_PRINT_BLKPTR)
+		zdb_print_blkptr((blkptr_t *)(void *)
+		    ((uintptr_t)buf +
(uintptr_t)blkptr_offset), flags); + else if (flags & ZDB_FLAG_RAW) + zdb_dump_block_raw(buf, size, flags); + else if (flags & ZDB_FLAG_INDIRECT) + zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), + flags); + else if (flags & ZDB_FLAG_GBH) + zdb_dump_gbh(buf, flags); + else + zdb_dump_block(thing, buf, size, flags); + +out: + umem_free(pbuf, SPA_MAXBLOCKSIZE); + umem_free(lbuf, SPA_MAXBLOCKSIZE); + free(dup); +} + +static boolean_t +pool_match(nvlist_t *cfg, char *tgt) +{ + uint64_t v, guid = strtoull(tgt, NULL, 0); + char *s; + + if (guid != 0) { + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) + return (v == guid); + } else { + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) + return (strcmp(s, tgt) == 0); + } + return (B_FALSE); +} + +static char * +find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv) +{ + nvlist_t *pools; + nvlist_t *match = NULL; + char *name = NULL; + char *sepp = NULL; + char sep; + int count = 0; + importargs_t args = { 0 }; + + args.paths = dirc; + args.path = dirv; + args.can_be_active = B_TRUE; + + if ((sepp = strpbrk(*target, "/@")) != NULL) { + sep = *sepp; + *sepp = '\0'; + } + + pools = zpool_search_import(g_zfs, &args); + + if (pools != NULL) { + nvpair_t *elem = NULL; + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + verify(nvpair_value_nvlist(elem, configp) == 0); + if (pool_match(*configp, *target)) { + count++; + if (match != NULL) { + /* print previously found config */ + if (name != NULL) { + (void) printf("%s\n", name); + dump_nvlist(match, 8); + name = NULL; + } + (void) printf("%s\n", + nvpair_name(elem)); + dump_nvlist(*configp, 8); + } else { + match = *configp; + name = nvpair_name(elem); + } + } + } + } + if (count > 1) + (void) fatal("\tMatched %d pools - use pool GUID " + "instead of pool name or \n" + "\tpool name part of a dataset name to select pool", count); + + if (sepp) + *sepp = sep; + /* + * If pool GUID was specified for pool id, 
replace it with pool name + */ + if (name && (strstr(*target, name) != *target)) { + int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0); + + *target = umem_alloc(sz, UMEM_NOFAIL); + (void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : ""); + } + + *configp = name ? match : NULL; + + return (name); +} + +int +main(int argc, char **argv) +{ + int i, c; + struct rlimit rl = { 1024, 1024 }; + spa_t *spa = NULL; + objset_t *os = NULL; + int dump_all = 1; + int verbose = 0; + int error = 0; + char **searchdirs = NULL; + int nsearch = 0; + char *target; + nvlist_t *policy = NULL; + uint64_t max_txg = UINT64_MAX; + int rewind = ZPOOL_NEVER_REWIND; + + (void) setrlimit(RLIMIT_NOFILE, &rl); + (void) enable_extended_FILE_stdio(-1, -1); + + dprintf_setup(&argc, argv); + + while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) { + switch (c) { + case 'b': + case 'c': + case 'd': + case 'h': + case 'i': + case 'l': + case 'm': + case 's': + case 'u': + case 'C': + case 'D': + case 'R': + case 'S': + dump_opt[c]++; + dump_all = 0; + break; + case 'A': + case 'F': + case 'L': + case 'X': + case 'e': + case 'P': + dump_opt[c]++; + break; + case 'v': + verbose++; + break; + case 'p': + if (searchdirs == NULL) { + searchdirs = umem_alloc(sizeof (char *), + UMEM_NOFAIL); + } else { + char **tmp = umem_alloc((nsearch + 1) * + sizeof (char *), UMEM_NOFAIL); + bcopy(searchdirs, tmp, nsearch * + sizeof (char *)); + umem_free(searchdirs, + nsearch * sizeof (char *)); + searchdirs = tmp; + } + searchdirs[nsearch++] = optarg; + break; + case 't': + max_txg = strtoull(optarg, NULL, 0); + if (max_txg < TXG_INITIAL) { + (void) fprintf(stderr, "incorrect txg " + "specified: %s\n", optarg); + usage(); + } + break; + case 'U': + spa_config_path = optarg; + break; + default: + usage(); + break; + } + } + + if (!dump_opt['e'] && searchdirs != NULL) { + (void) fprintf(stderr, "-p option requires use of -e\n"); + usage(); + } + + kernel_init(FREAD); + g_zfs = libzfs_init(); + 
ASSERT(g_zfs != NULL); + + if (dump_all) + verbose = MAX(verbose, 1); + + for (c = 0; c < 256; c++) { + if (dump_all && !strchr("elAFLRSXP", c)) + dump_opt[c] = 1; + if (dump_opt[c]) + dump_opt[c] += verbose; + } + + aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); + zfs_recover = (dump_opt['A'] > 1); + + argc -= optind; + argv += optind; + + if (argc < 2 && dump_opt['R']) + usage(); + if (argc < 1) { + if (!dump_opt['e'] && dump_opt['C']) { + dump_cachefile(spa_config_path); + return (0); + } + usage(); + } + + if (dump_opt['l']) { + dump_label(argv[0]); + return (0); + } + + if (dump_opt['X'] || dump_opt['F']) + rewind = ZPOOL_DO_REWIND | + (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); + + if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0) + fatal("internal error: %s", strerror(ENOMEM)); + + error = 0; + target = argv[0]; + + if (dump_opt['e']) { + nvlist_t *cfg = NULL; + char *name = find_zpool(&target, &cfg, nsearch, searchdirs); + + error = ENOENT; + if (name) { + if (dump_opt['C'] > 1) { + (void) printf("\nConfiguration for import:\n"); + dump_nvlist(cfg, 8); + } + if (nvlist_add_nvlist(cfg, + ZPOOL_REWIND_POLICY, policy) != 0) { + fatal("can't open '%s': %s", + target, strerror(ENOMEM)); + } + if ((error = spa_import(name, cfg, NULL, + ZFS_IMPORT_MISSING_LOG)) != 0) { + error = spa_import(name, cfg, NULL, + ZFS_IMPORT_VERBATIM); + } + } + } + + if (error == 0) { + if (strpbrk(target, "/@") == NULL || dump_opt['R']) { + error = spa_open_rewind(target, &spa, FTAG, policy, + NULL); + if (error) { + /* + * If we're missing the log device then + * try opening the pool after clearing the + * log state. 
+ */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(target)) != NULL && + spa->spa_log_state == SPA_LOG_MISSING) { + spa->spa_log_state = SPA_LOG_CLEAR; + error = 0; + } + mutex_exit(&spa_namespace_lock); + + if (!error) { + error = spa_open_rewind(target, &spa, + FTAG, policy, NULL); + } + } + } else { + error = dmu_objset_own(target, DMU_OST_ANY, + B_TRUE, FTAG, &os); + } + } + nvlist_free(policy); + + if (error) + fatal("can't open '%s': %s", target, strerror(error)); + + argv++; + argc--; + if (!dump_opt['R']) { + if (argc > 0) { + zopt_objects = argc; + zopt_object = calloc(zopt_objects, sizeof (uint64_t)); + for (i = 0; i < zopt_objects; i++) { + errno = 0; + zopt_object[i] = strtoull(argv[i], NULL, 0); + if (zopt_object[i] == 0 && errno != 0) + fatal("bad number %s: %s", + argv[i], strerror(errno)); + } + } + (os != NULL) ? dump_dir(os) : dump_zpool(spa); + } else { + flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; + flagbits['c'] = ZDB_FLAG_CHECKSUM; + flagbits['d'] = ZDB_FLAG_DECOMPRESS; + flagbits['e'] = ZDB_FLAG_BSWAP; + flagbits['g'] = ZDB_FLAG_GBH; + flagbits['i'] = ZDB_FLAG_INDIRECT; + flagbits['p'] = ZDB_FLAG_PHYS; + flagbits['r'] = ZDB_FLAG_RAW; + + for (i = 0; i < argc; i++) + zdb_read_block(argv[i], spa); + } + + (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG); + + fuid_table_destroy(); + sa_loaded = B_FALSE; + + libzfs_fini(g_zfs); + kernel_fini(); + + return (0); +} diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c new file mode 100644 index 0000000..a0ed985 --- /dev/null +++ b/cmd/zdb/zdb_il.c @@ -0,0 +1,384 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Print intent log header and statistics. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern uint8_t dump_opt[256]; + +static char prefix[4] = "\t\t\t"; + +static void +print_log_bp(const blkptr_t *bp, const char *prefix) +{ + char blkbuf[BP_SPRINTF_LEN]; + + sprintf_blkptr(blkbuf, bp); + (void) printf("%s%s\n", prefix, blkbuf); +} + +/* ARGSUSED */ +static void +zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr) +{ + time_t crtime = lr->lr_crtime[0]; + char *name, *link; + lr_attr_t *lrattr; + + name = (char *)(lr + 1); + + if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || + lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { + lrattr = (lr_attr_t *)(lr + 1); + name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + if (txtype == TX_SYMLINK) { + link = name + strlen(name) + 1; + (void) printf("%s%s -> %s\n", prefix, name, link); + } else if (txtype != TX_MKXATTR) { + (void) printf("%s%s\n", prefix, name); + } + + (void) printf("%s%s", prefix, ctime(&crtime)); + (void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix, + (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid, + (longlong_t)lr->lr_mode); + (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix, + (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, + (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); +} + +/* 
ARGSUSED */
+static void
+zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
+{
+	/* The name is read from the bytes immediately following the record. */
+	(void) printf("%sdoid %llu, name %s\n", prefix,
+	    (u_longlong_t)lr->lr_doid, (char *)(lr + 1));
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
+{
+	/* The name is read from the bytes immediately following the record. */
+	(void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix,
+	    (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
+	    (char *)(lr + 1));
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
+{
+	/* Source name follows the record; target name follows the source. */
+	char *snm = (char *)(lr + 1);
+	char *tnm = snm + strlen(snm) + 1;
+
+	(void) printf("%ssdoid %llu, tdoid %llu\n", prefix,
+	    (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
+	(void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
+}
+
+/*
+ * Print a write record: always the object, offset and length.  Unless the
+ * record is a TX_WRITE2 or verbosity is below 5, the written data is shown
+ * too, limited to the first 20 bytes when verbosity is below 6.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
+{
+	char *data, *dlimit;
+	blkptr_t *bp = &lr->lr_blkptr;
+	zbookmark_t zb;
+	char buf[SPA_MAXBLOCKSIZE];
+	int verbose = MAX(dump_opt['d'], dump_opt['i']);
+	int error;
+
+	(void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length);
+
+	if (txtype == TX_WRITE2 || verbose < 5)
+		return;
+
+	/*
+	 * A record exactly sizeof (lr_write_t) long has no data appended;
+	 * the data is fetched through the record's block pointer instead.
+	 */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		(void) printf("%shas blkptr, %s\n", prefix,
+		    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
+		    "will claim" : "won't claim");
+		print_log_bp(bp, prefix);
+
+		if (BP_IS_HOLE(bp)) {
+			(void) printf("\t\t\tLSIZE 0x%llx\n",
+			    (u_longlong_t)BP_GET_LSIZE(bp));
+		}
+		/*
+		 * NOTE(review): the two early returns below print only the
+		 * prefix and a newline; a descriptive message may have been
+		 * lost from this copy -- confirm against upstream.
+		 */
+		if (bp->blk_birth == 0) {
+			bzero(buf, sizeof (buf));
+			(void) printf("%s\n", prefix);
+			return;
+		}
+		if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+			(void) printf("%s\n", prefix);
+			return;
+		}
+
+		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+		    lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
+
+		error = zio_wait(zio_read(NULL, zilog->zl_spa,
+		    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+		if (error)
+			return;
+		data = buf;
+	} else {
+		data = (char *)(lr + 1);
+	}
+
+	dlimit = data + MIN(lr->lr_length,
+	    (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
+
+	(void) printf("%s", prefix);
+	while (data < dlimit) {
+		/*
+		 * Work on the byte as unsigned char: isprint() is undefined
+		 * for negative values, and a negative char passed to %X is
+		 * sign-extended (for example FFFFFF80 instead of 80).
+		 */
+		unsigned char byte = (unsigned char)*data;
+
+		if (isprint(byte))
+			(void) printf("%c ", byte);
+		else
+			(void) printf("%2X", byte);
+		data++;
+	}
+	(void) printf("\n");
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
+{
+	(void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix,
+	    (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length);
+}
+
+/*
+ * Print a setattr record: the object and attribute mask, then one line
+ * for each attribute whose bit is set in the mask.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
+{
+	time_t atime = (time_t)lr->lr_atime[0];
+	time_t mtime = (time_t)lr->lr_mtime[0];
+
+	(void) printf("%sfoid %llu, mask 0x%llx\n", prefix,
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
+
+	if (lr->lr_mask & AT_MODE) {
+		(void) printf("%sAT_MODE %llo\n", prefix,
+		    (longlong_t)lr->lr_mode);
+	}
+
+	if (lr->lr_mask & AT_UID) {
+		(void) printf("%sAT_UID %llu\n", prefix,
+		    (u_longlong_t)lr->lr_uid);
+	}
+
+	if (lr->lr_mask & AT_GID) {
+		(void) printf("%sAT_GID %llu\n", prefix,
+		    (u_longlong_t)lr->lr_gid);
+	}
+
+	if (lr->lr_mask & AT_SIZE) {
+		(void) printf("%sAT_SIZE %llu\n", prefix,
+		    (u_longlong_t)lr->lr_size);
+	}
+
+	if
(lr->lr_mask & AT_ATIME) { + (void) printf("%sAT_ATIME %llu.%09llu %s", prefix, + (u_longlong_t)lr->lr_atime[0], + (u_longlong_t)lr->lr_atime[1], + ctime(&atime)); + } + + if (lr->lr_mask & AT_MTIME) { + (void) printf("%sAT_MTIME %llu.%09llu %s", prefix, + (u_longlong_t)lr->lr_mtime[0], + (u_longlong_t)lr->lr_mtime[1], + ctime(&mtime)); + } +} + +/* ARGSUSED */ +static void +zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr) +{ + (void) printf("%sfoid %llu, aclcnt %llu\n", prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); +} + +typedef void (*zil_prt_rec_func_t)(); +typedef struct zil_rec_info { + zil_prt_rec_func_t zri_print; + char *zri_name; + uint64_t zri_count; +} zil_rec_info_t; + +static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { + { NULL, "Total " }, + { zil_prt_rec_create, "TX_CREATE " }, + { zil_prt_rec_create, "TX_MKDIR " }, + { zil_prt_rec_create, "TX_MKXATTR " }, + { zil_prt_rec_create, "TX_SYMLINK " }, + { zil_prt_rec_remove, "TX_REMOVE " }, + { zil_prt_rec_remove, "TX_RMDIR " }, + { zil_prt_rec_link, "TX_LINK " }, + { zil_prt_rec_rename, "TX_RENAME " }, + { zil_prt_rec_write, "TX_WRITE " }, + { zil_prt_rec_truncate, "TX_TRUNCATE " }, + { zil_prt_rec_setattr, "TX_SETATTR " }, + { zil_prt_rec_acl, "TX_ACL_V0 " }, + { zil_prt_rec_acl, "TX_ACL_ACL " }, + { zil_prt_rec_create, "TX_CREATE_ACL " }, + { zil_prt_rec_create, "TX_CREATE_ATTR " }, + { zil_prt_rec_create, "TX_CREATE_ACL_ATTR " }, + { zil_prt_rec_create, "TX_MKDIR_ACL " }, + { zil_prt_rec_create, "TX_MKDIR_ATTR " }, + { zil_prt_rec_create, "TX_MKDIR_ACL_ATTR " }, + { zil_prt_rec_write, "TX_WRITE2 " }, +}; + +/* ARGSUSED */ +static int +print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) +{ + int txtype; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + /* reduce size of txtype to strip off TX_CI bit */ + txtype = lr->lrc_txtype; + + ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE); + ASSERT(lr->lrc_txg); + + (void) printf("\t\t%s%s len %6llu, 
txg %llu, seq %llu\n", + (lr->lrc_txtype & TX_CI) ? "CI-" : "", + zil_rec_info[txtype].zri_name, + (u_longlong_t)lr->lrc_reclen, + (u_longlong_t)lr->lrc_txg, + (u_longlong_t)lr->lrc_seq); + + if (txtype && verbose >= 3) + zil_rec_info[txtype].zri_print(zilog, txtype, lr); + + zil_rec_info[txtype].zri_count++; + zil_rec_info[0].zri_count++; + + return (0); +} + +/* ARGSUSED */ +static int +print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + char blkbuf[BP_SPRINTF_LEN + 10]; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + char *claim; + + if (verbose <= 3) + return (0); + + if (verbose >= 5) { + (void) strcpy(blkbuf, ", "); + sprintf_blkptr(blkbuf + strlen(blkbuf), bp); + } else { + blkbuf[0] = '\0'; + } + + if (claim_txg != 0) + claim = "already claimed"; + else if (bp->blk_birth >= spa_first_txg(zilog->zl_spa)) + claim = "will claim"; + else + claim = "won't claim"; + + (void) printf("\tBlock seqno %llu, %s%s\n", + (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); + + return (0); +} + +static void +print_log_stats(int verbose) +{ + int i, w, p10; + + if (verbose > 3) + (void) printf("\n"); + + if (zil_rec_info[0].zri_count == 0) + return; + + for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10) + w++; + + for (i = 0; i < TX_MAX_TYPE; i++) + if (zil_rec_info[i].zri_count || verbose >= 3) + (void) printf("\t\t%s %*llu\n", + zil_rec_info[i].zri_name, w, + (u_longlong_t)zil_rec_info[i].zri_count); + (void) printf("\n"); +} + +/* ARGSUSED */ +void +dump_intent_log(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + int i; + + if (zh->zh_log.blk_birth == 0 || verbose < 1) + return; + + (void) printf("\n ZIL header: claim_txg %llu, " + "claim_blk_seq %llu, claim_lr_seq %llu", + (u_longlong_t)zh->zh_claim_txg, + (u_longlong_t)zh->zh_claim_blk_seq, + (u_longlong_t)zh->zh_claim_lr_seq); + (void) printf(" replay_seq %llu, flags 0x%llx\n", + 
(u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); + + for (i = 0; i < TX_MAX_TYPE; i++) + zil_rec_info[i].zri_count = 0; + + if (verbose >= 2) { + (void) printf("\n"); + (void) zil_parse(zilog, print_log_block, print_log_record, NULL, + zh->zh_claim_txg); + print_log_stats(verbose); + } +} diff --git a/cmd/zfs/zfs_iter.c b/cmd/zfs/zfs_iter.c new file mode 100644 index 0000000..e2ab90e --- /dev/null +++ b/cmd/zfs/zfs_iter.c @@ -0,0 +1,464 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "zfs_util.h" +#include "zfs_iter.h" + +/* + * This is a private interface used to gather up all the datasets specified on + * the command line so that we can iterate over them in order. + * + * First, we iterate over all filesystems, gathering them together into an + * AVL tree. We report errors for any explicitly specified datasets + * that we couldn't open. + * + * When finished, we have an AVL tree of ZFS handles. 
We go through and execute + * the provided callback for each one, passing whatever data the user supplied. + */ + +typedef struct zfs_node { + zfs_handle_t *zn_handle; + uu_avl_node_t zn_avlnode; +} zfs_node_t; + +typedef struct callback_data { + uu_avl_t *cb_avl; + int cb_flags; + zfs_type_t cb_types; + zfs_sort_column_t *cb_sortcol; + zprop_list_t **cb_proplist; + int cb_depth_limit; + int cb_depth; + uint8_t cb_props_table[ZFS_NUM_PROPS]; +} callback_data_t; + +uu_avl_pool_t *avl_pool; + +/* + * Include snaps if they were requested or if this a zfs list where types + * were not specified and the "listsnapshots" property is set on this pool. + */ +static int +zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb) +{ + zpool_handle_t *zph; + + if ((cb->cb_flags & ZFS_ITER_PROP_LISTSNAPS) == 0) + return (cb->cb_types & ZFS_TYPE_SNAPSHOT); + + zph = zfs_get_pool_handle(zhp); + return (zpool_get_prop_int(zph, ZPOOL_PROP_LISTSNAPS, NULL)); +} + +/* + * Called for each dataset. If the object is of an appropriate type, + * add it to the avl tree and recurse over any children as necessary. 
+ */
+static int
+zfs_callback(zfs_handle_t *zhp, void *data)
+{
+	callback_data_t *cb = data;
+	int dontclose = 0;
+	int include_snaps = zfs_include_snapshots(zhp, cb);
+
+	if ((zfs_get_type(zhp) & cb->cb_types) ||
+	    ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) {
+		uu_avl_index_t idx;
+		zfs_node_t *node = safe_malloc(sizeof (zfs_node_t));
+
+		node->zn_handle = zhp;
+		uu_avl_node_init(node, &node->zn_avlnode, avl_pool);
+		if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol,
+		    &idx) == NULL) {
+			if (cb->cb_proplist) {
+				if ((*cb->cb_proplist) &&
+				    !(*cb->cb_proplist)->pl_all)
+					zfs_prune_proplist(zhp,
+					    cb->cb_props_table);
+
+				if (zfs_expand_proplist(zhp, cb->cb_proplist,
+				    (cb->cb_flags & ZFS_ITER_RECVD_PROPS))
+				    != 0) {
+					/*
+					 * The handle was not stored in the
+					 * tree, so nothing else will close
+					 * it; release it here.
+					 */
+					free(node);
+					zfs_close(zhp);
+					return (-1);
+				}
+			}
+			/* The tree now holds the handle; keep it open. */
+			uu_avl_insert(cb->cb_avl, node, idx);
+			dontclose = 1;
+		} else {
+			/* An equal node is already present; drop this one. */
+			free(node);
+		}
+	}
+
+	/*
+	 * Recurse if necessary.
+	 */
+	if (cb->cb_flags & ZFS_ITER_RECURSE &&
+	    ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 ||
+	    cb->cb_depth < cb->cb_depth_limit)) {
+		cb->cb_depth++;
+		if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM)
+			(void) zfs_iter_filesystems(zhp, zfs_callback, data);
+		if ((zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) && include_snaps)
+			(void) zfs_iter_snapshots(zhp, zfs_callback, data);
+		cb->cb_depth--;
+	}
+
+	if (!dontclose)
+		zfs_close(zhp);
+
+	return (0);
+}
+
+/*
+ * Append a sort column for property 'name' to the list at *sc, creating
+ * the list if *sc is NULL.  Returns 0 on success, or -1 if 'name' is
+ * neither a known property nor a user property.
+ */
+int
+zfs_add_sort_column(zfs_sort_column_t **sc, const char *name,
+    boolean_t reverse)
+{
+	zfs_sort_column_t *col;
+	zfs_prop_t prop;
+
+	if ((prop = zfs_name_to_prop(name)) == ZPROP_INVAL &&
+	    !zfs_prop_user(name))
+		return (-1);
+
+	col = safe_malloc(sizeof (zfs_sort_column_t));
+
+	/*
+	 * Set the link and user-property fields explicitly rather than
+	 * relying on safe_malloc() returning zeroed memory (not visible
+	 * from here): zfs_free_sort_columns() follows sc_next and frees
+	 * sc_user_prop unconditionally.
+	 */
+	col->sc_next = NULL;
+	col->sc_user_prop = NULL;
+	col->sc_prop = prop;
+	col->sc_reverse = reverse;
+	if (prop == ZPROP_INVAL) {
+		col->sc_user_prop = safe_malloc(strlen(name) + 1);
+		(void) strcpy(col->sc_user_prop, name);
+	}
+
+	/* The head node's sc_last tracks the tail of the list. */
+	if (*sc == NULL) {
+		col->sc_last = col;
+		*sc = col;
+	} else {
+		(*sc)->sc_last->sc_next = col;
+		(*sc)->sc_last = col;
+	}
+
+	return (0);
+}
+
+void
+zfs_free_sort_columns(zfs_sort_column_t *sc) +{ + zfs_sort_column_t *col; + + while (sc != NULL) { + col = sc->sc_next; + free(sc->sc_user_prop); + free(sc); + sc = col; + } +} + +/* ARGSUSED */ +static int +zfs_compare(const void *larg, const void *rarg, void *unused) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + const char *lname = zfs_get_name(l); + const char *rname = zfs_get_name(r); + char *lat, *rat; + uint64_t lcreate, rcreate; + int ret; + + lat = (char *)strchr(lname, '@'); + rat = (char *)strchr(rname, '@'); + + if (lat != NULL) + *lat = '\0'; + if (rat != NULL) + *rat = '\0'; + + ret = strcmp(lname, rname); + if (ret == 0) { + /* + * If we're comparing a dataset to one of its snapshots, we + * always make the full dataset first. + */ + if (lat == NULL) { + ret = -1; + } else if (rat == NULL) { + ret = 1; + } else { + /* + * If we have two snapshots from the same dataset, then + * we want to sort them according to creation time. We + * use the hidden CREATETXG property to get an absolute + * ordering of snapshots. + */ + lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); + rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); + + if (lcreate < rcreate) + ret = -1; + else if (lcreate > rcreate) + ret = 1; + } + } + + if (lat != NULL) + *lat = '@'; + if (rat != NULL) + *rat = '@'; + + return (ret); +} + +/* + * Sort datasets by specified columns. + * + * o Numeric types sort in ascending order. + * o String types sort in alphabetical order. + * o Types inappropriate for a row sort that row to the literal + * bottom, regardless of the specified ordering. + * + * If no sort columns are specified, or two datasets compare equally + * across all specified columns, they are sorted alphabetically by name + * with snapshots grouped under their parents. 
+ */ +static int +zfs_sort(const void *larg, const void *rarg, void *data) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + zfs_sort_column_t *sc = (zfs_sort_column_t *)data; + zfs_sort_column_t *psc; + + for (psc = sc; psc != NULL; psc = psc->sc_next) { + char lbuf[ZFS_MAXPROPLEN], rbuf[ZFS_MAXPROPLEN]; + char *lstr, *rstr; + uint64_t lnum, rnum; + boolean_t lvalid, rvalid; + int ret = 0; + + /* + * We group the checks below the generic code. If 'lstr' and + * 'rstr' are non-NULL, then we do a string based comparison. + * Otherwise, we compare 'lnum' and 'rnum'. + */ + lstr = rstr = NULL; + if (psc->sc_prop == ZPROP_INVAL) { + nvlist_t *luser, *ruser; + nvlist_t *lval, *rval; + + luser = zfs_get_user_props(l); + ruser = zfs_get_user_props(r); + + lvalid = (nvlist_lookup_nvlist(luser, + psc->sc_user_prop, &lval) == 0); + rvalid = (nvlist_lookup_nvlist(ruser, + psc->sc_user_prop, &rval) == 0); + + if (lvalid) + verify(nvlist_lookup_string(lval, + ZPROP_VALUE, &lstr) == 0); + if (rvalid) + verify(nvlist_lookup_string(rval, + ZPROP_VALUE, &rstr) == 0); + + } else if (zfs_prop_is_string(psc->sc_prop)) { + lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf, + sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0); + rvalid = (zfs_prop_get(r, psc->sc_prop, rbuf, + sizeof (rbuf), NULL, NULL, 0, B_TRUE) == 0); + + lstr = lbuf; + rstr = rbuf; + } else { + lvalid = zfs_prop_valid_for_type(psc->sc_prop, + zfs_get_type(l)); + rvalid = zfs_prop_valid_for_type(psc->sc_prop, + zfs_get_type(r)); + + if (lvalid) + (void) zfs_prop_get_numeric(l, psc->sc_prop, + &lnum, NULL, NULL, 0); + if (rvalid) + (void) zfs_prop_get_numeric(r, psc->sc_prop, + &rnum, NULL, NULL, 0); + } + + if (!lvalid && !rvalid) + continue; + else if (!lvalid) + return (1); + else if (!rvalid) + return (-1); + + if (lstr) + ret = strcmp(lstr, rstr); + else if (lnum < rnum) + ret = -1; + else if (lnum > rnum) + ret = 1; + + if (ret != 0) { + if (psc->sc_reverse == 
B_TRUE) + ret = (ret < 0) ? 1 : -1; + return (ret); + } + } + + return (zfs_compare(larg, rarg, NULL)); +} + +int +zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, + zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit, + zfs_iter_f callback, void *data) +{ + callback_data_t cb = {0}; + int ret = 0; + zfs_node_t *node; + uu_avl_walk_t *walk; + + avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t), + offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT); + + if (avl_pool == NULL) + nomem(); + + cb.cb_sortcol = sortcol; + cb.cb_flags = flags; + cb.cb_proplist = proplist; + cb.cb_types = types; + cb.cb_depth_limit = limit; + /* + * If cb_proplist is provided then in the zfs_handles created we + * retain only those properties listed in cb_proplist and sortcol. + * The rest are pruned. So, the caller should make sure that no other + * properties other than those listed in cb_proplist/sortcol are + * accessed. + * + * If cb_proplist is NULL then we retain all the properties. We + * always retain the zoned property, which some other properties + * need (userquota & friends), and the createtxg property, which + * we need to sort snapshots. + */ + if (cb.cb_proplist && *cb.cb_proplist) { + zprop_list_t *p = *cb.cb_proplist; + + while (p) { + if (p->pl_prop >= ZFS_PROP_TYPE && + p->pl_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[p->pl_prop] = B_TRUE; + } + p = p->pl_next; + } + + while (sortcol) { + if (sortcol->sc_prop >= ZFS_PROP_TYPE && + sortcol->sc_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[sortcol->sc_prop] = B_TRUE; + } + sortcol = sortcol->sc_next; + } + + cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE; + cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE; + } else { + (void) memset(cb.cb_props_table, B_TRUE, + sizeof (cb.cb_props_table)); + } + + if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) + nomem(); + + if (argc == 0) { + /* + * If given no arguments, iterate over all datasets. 
+ */ + cb.cb_flags |= ZFS_ITER_RECURSE; + ret = zfs_iter_root(g_zfs, zfs_callback, &cb); + } else { + int i; + zfs_handle_t *zhp; + zfs_type_t argtype; + + /* + * If we're recursive, then we always allow filesystems as + * arguments. If we also are interested in snapshots, then we + * can take volumes as well. + */ + argtype = types; + if (flags & ZFS_ITER_RECURSE) { + argtype |= ZFS_TYPE_FILESYSTEM; + if (types & ZFS_TYPE_SNAPSHOT) + argtype |= ZFS_TYPE_VOLUME; + } + + for (i = 0; i < argc; i++) { + if (flags & ZFS_ITER_ARGS_CAN_BE_PATHS) { + zhp = zfs_path_to_zhandle(g_zfs, argv[i], + argtype); + } else { + zhp = zfs_open(g_zfs, argv[i], argtype); + } + if (zhp != NULL) + ret |= zfs_callback(zhp, &cb); + else + ret = 1; + } + } + + /* + * At this point we've got our AVL tree full of zfs handles, so iterate + * over each one and execute the real user callback. + */ + for (node = uu_avl_first(cb.cb_avl); node != NULL; + node = uu_avl_next(cb.cb_avl, node)) + ret |= callback(node->zn_handle, data); + + /* + * Finally, clean up the AVL tree. + */ + if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(cb.cb_avl, node); + zfs_close(node->zn_handle); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(cb.cb_avl); + uu_avl_pool_destroy(avl_pool); + + return (ret); +} diff --git a/cmd/zfs/zfs_iter.h b/cmd/zfs/zfs_iter.h new file mode 100644 index 0000000..8c6b9fd --- /dev/null +++ b/cmd/zfs/zfs_iter.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef ZFS_ITER_H +#define ZFS_ITER_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfs_sort_column { + struct zfs_sort_column *sc_next; + struct zfs_sort_column *sc_last; + zfs_prop_t sc_prop; + char *sc_user_prop; + boolean_t sc_reverse; +} zfs_sort_column_t; + +#define ZFS_ITER_RECURSE (1 << 0) +#define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) +#define ZFS_ITER_PROP_LISTSNAPS (1 << 2) +#define ZFS_ITER_DEPTH_LIMIT (1 << 3) +#define ZFS_ITER_RECVD_PROPS (1 << 4) + +int zfs_for_each(int, char **, int options, zfs_type_t, + zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *); +int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t); +void zfs_free_sort_columns(zfs_sort_column_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* ZFS_ITER_H */ diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c new file mode 100644 index 0000000..9516697 --- /dev/null +++ b/cmd/zfs/zfs_main.c @@ -0,0 +1,4160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "zfs_iter.h" +#include "zfs_util.h" +#include "zfs_comutil.h" + +libzfs_handle_t *g_zfs; + +static FILE *mnttab_file; +static char history_str[HIS_MAX_RECORD_LEN]; +const char *pypath = "/usr/lib/zfs/pyzfs.py"; + +static int zfs_do_clone(int argc, char **argv); +static int zfs_do_create(int argc, char **argv); +static int zfs_do_destroy(int argc, char **argv); +static int zfs_do_get(int argc, char **argv); +static int zfs_do_inherit(int argc, char **argv); +static int zfs_do_list(int argc, char **argv); +static int zfs_do_mount(int argc, char **argv); +static int zfs_do_rename(int argc, char **argv); +static int zfs_do_rollback(int argc, char **argv); +static int zfs_do_set(int argc, char **argv); +static int zfs_do_upgrade(int argc, char **argv); +static int zfs_do_snapshot(int argc, char **argv); +static int zfs_do_unmount(int argc, char **argv); +static int zfs_do_share(int argc, char **argv); +static int zfs_do_unshare(int argc, char **argv); +static int zfs_do_send(int argc, char **argv); +static int zfs_do_receive(int argc, char **argv); +static int zfs_do_promote(int argc, char **argv); +static int zfs_do_userspace(int argc, char **argv); +static int zfs_do_python(int argc, char **argv); +static int 
zfs_do_hold(int argc, char **argv); +static int zfs_do_release(int argc, char **argv); +static int zfs_do_diff(int argc, char **argv); + +/* + * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +typedef enum { + HELP_CLONE, + HELP_CREATE, + HELP_DESTROY, + HELP_GET, + HELP_INHERIT, + HELP_UPGRADE, + HELP_LIST, + HELP_MOUNT, + HELP_PROMOTE, + HELP_RECEIVE, + HELP_RENAME, + HELP_ROLLBACK, + HELP_SEND, + HELP_SET, + HELP_SHARE, + HELP_SNAPSHOT, + HELP_UNMOUNT, + HELP_UNSHARE, + HELP_ALLOW, + HELP_UNALLOW, + HELP_USERSPACE, + HELP_GROUPSPACE, + HELP_HOLD, + HELP_HOLDS, + HELP_RELEASE, + HELP_DIFF +} zfs_help_t; + +typedef struct zfs_command { + const char *name; + int (*func)(int argc, char **argv); + zfs_help_t usage; +} zfs_command_t; + +/* + * Master command table. Each ZFS command has a name, associated function, and + * usage message. The usage messages need to be internationalized, so we have + * to have a function to return the usage message based on a command index. + * + * These commands are organized according to how they are displayed in the usage + * message. An empty command (one with a NULL name) indicates an empty line in + * the generic usage message. 
+ */ +static zfs_command_t command_table[] = { + { "create", zfs_do_create, HELP_CREATE }, + { "destroy", zfs_do_destroy, HELP_DESTROY }, + { NULL }, + { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, + { "rollback", zfs_do_rollback, HELP_ROLLBACK }, + { "clone", zfs_do_clone, HELP_CLONE }, + { "promote", zfs_do_promote, HELP_PROMOTE }, + { "rename", zfs_do_rename, HELP_RENAME }, + { NULL }, + { "list", zfs_do_list, HELP_LIST }, + { NULL }, + { "set", zfs_do_set, HELP_SET }, + { "get", zfs_do_get, HELP_GET }, + { "inherit", zfs_do_inherit, HELP_INHERIT }, + { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, + { "userspace", zfs_do_userspace, HELP_USERSPACE }, + { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, + { NULL }, + { "mount", zfs_do_mount, HELP_MOUNT }, + { "unmount", zfs_do_unmount, HELP_UNMOUNT }, + { "share", zfs_do_share, HELP_SHARE }, + { "unshare", zfs_do_unshare, HELP_UNSHARE }, + { NULL }, + { "send", zfs_do_send, HELP_SEND }, + { "receive", zfs_do_receive, HELP_RECEIVE }, + { NULL }, + { "allow", zfs_do_python, HELP_ALLOW }, + { NULL }, + { "unallow", zfs_do_python, HELP_UNALLOW }, + { NULL }, + { "hold", zfs_do_hold, HELP_HOLD }, + { "holds", zfs_do_python, HELP_HOLDS }, + { "release", zfs_do_release, HELP_RELEASE }, + { "diff", zfs_do_diff, HELP_DIFF }, +}; + +#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) + +zfs_command_t *current_command; + +static const char * +get_usage(zfs_help_t idx) +{ + switch (idx) { + case HELP_CLONE: + return (gettext("\tclone [-p] [-o property=value] ... " + " \n")); + case HELP_CREATE: + return (gettext("\tcreate [-p] [-o property=value] ... " + "\n" + "\tcreate [-ps] [-b blocksize] [-o property=value] ... 
" + "-V \n")); + case HELP_DESTROY: + return (gettext("\tdestroy [-rRf] \n" + "\tdestroy [-rRd] \n")); + case HELP_GET: + return (gettext("\tget [-rHp] [-d max] " + "[-o \"all\" | field[,...]] [-s source[,...]]\n" + "\t <\"all\" | property[,...]> " + "[filesystem|volume|snapshot] ...\n")); + case HELP_INHERIT: + return (gettext("\tinherit [-rS] " + " ...\n")); + case HELP_UPGRADE: + return (gettext("\tupgrade [-v]\n" + "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); + case HELP_LIST: + return (gettext("\tlist [-rH][-d max] " + "[-o property[,...]] [-t type[,...]] [-s property] ...\n" + "\t [-S property] ... " + "[filesystem|volume|snapshot] ...\n")); + case HELP_MOUNT: + return (gettext("\tmount\n" + "\tmount [-vO] [-o opts] <-a | filesystem>\n")); + case HELP_PROMOTE: + return (gettext("\tpromote \n")); + case HELP_RECEIVE: + return (gettext("\treceive [-vnFu] \n" + "\treceive [-vnFu] [-d | -e] \n")); + case HELP_RENAME: + return (gettext("\trename " + "\n" + "\trename -p \n" + "\trename -r ")); + case HELP_ROLLBACK: + return (gettext("\trollback [-rRf] \n")); + case HELP_SEND: + return (gettext("\tsend [-RDp] [-[iI] snapshot] \n")); + case HELP_SET: + return (gettext("\tset " + " ...\n")); + case HELP_SHARE: + return (gettext("\tshare <-a | filesystem>\n")); + case HELP_SNAPSHOT: + return (gettext("\tsnapshot [-r] [-o property=value] ... " + "\n")); + case HELP_UNMOUNT: + return (gettext("\tunmount [-f] " + "<-a | filesystem|mountpoint>\n")); + case HELP_UNSHARE: + return (gettext("\tunshare " + "<-a | filesystem|mountpoint>\n")); + case HELP_ALLOW: + return (gettext("\tallow \n" + "\tallow [-ldug] " + "<\"everyone\"|user|group>[,...] [,...]\n" + "\t \n" + "\tallow [-ld] -e [,...] " + "\n" + "\tallow -c [,...] \n" + "\tallow -s @setname [,...] 
" + "\n")); + case HELP_UNALLOW: + return (gettext("\tunallow [-rldug] " + "<\"everyone\"|user|group>[,...]\n" + "\t [[,...]] \n" + "\tunallow [-rld] -e [[,...]] " + "\n" + "\tunallow [-r] -c [[,...]] " + "\n" + "\tunallow [-r] -s @setname [[,...]] " + "\n")); + case HELP_USERSPACE: + return (gettext("\tuserspace [-hniHp] [-o field[,...]] " + "[-sS field] ... [-t type[,...]]\n" + "\t \n")); + case HELP_GROUPSPACE: + return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] " + "[-sS field] ... [-t type[,...]]\n" + "\t \n")); + case HELP_HOLD: + return (gettext("\thold [-r] ...\n")); + case HELP_HOLDS: + return (gettext("\tholds [-r] ...\n")); + case HELP_RELEASE: + return (gettext("\trelease [-r] ...\n")); + case HELP_DIFF: + return (gettext("\tdiff [-FHt] " + "[snapshot|filesystem]\n")); + } + + abort(); + /* NOTREACHED */ +} + +void +nomem(void) +{ + (void) fprintf(stderr, gettext("internal error: out of memory\n")); + exit(1); +} + +/* + * Utility function to guarantee malloc() success. + */ + +void * +safe_malloc(size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) + nomem(); + + return (data); +} + +static char * +safe_strdup(char *str) +{ + char *dupstr = strdup(str); + + if (dupstr == NULL) + nomem(); + + return (dupstr); +} + +/* + * Callback routine that will print out information for each of + * the properties. + */ +static int +usage_prop_cb(int prop, void *cb) +{ + FILE *fp = cb; + + (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); + + if (zfs_prop_readonly(prop)) + (void) fprintf(fp, " NO "); + else + (void) fprintf(fp, "YES "); + + if (zfs_prop_inheritable(prop)) + (void) fprintf(fp, " YES "); + else + (void) fprintf(fp, " NO "); + + if (zfs_prop_values(prop) == NULL) + (void) fprintf(fp, "-\n"); + else + (void) fprintf(fp, "%s\n", zfs_prop_values(prop)); + + return (ZPROP_CONT); +} + +/* + * Display usage message. If we're inside a command, display only the usage for + * that command. 
Otherwise, iterate over the entire command table and display + * a complete usage message. + */ +static void +usage(boolean_t requested) +{ + int i; + boolean_t show_properties = B_FALSE; + FILE *fp = requested ? stdout : stderr; + + if (current_command == NULL) { + + (void) fprintf(fp, gettext("usage: zfs command args ...\n")); + (void) fprintf(fp, + gettext("where 'command' is one of the following:\n\n")); + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + (void) fprintf(fp, "\n"); + else + (void) fprintf(fp, "%s", + get_usage(command_table[i].usage)); + } + + (void) fprintf(fp, gettext("\nEach dataset is of the form: " + "pool/[dataset/]*dataset[@name]\n")); + } else { + (void) fprintf(fp, gettext("usage:\n")); + (void) fprintf(fp, "%s", get_usage(current_command->usage)); + } + + if (current_command != NULL && + (strcmp(current_command->name, "set") == 0 || + strcmp(current_command->name, "get") == 0 || + strcmp(current_command->name, "inherit") == 0 || + strcmp(current_command->name, "list") == 0)) + show_properties = B_TRUE; + + if (show_properties) { + (void) fprintf(fp, + gettext("\nThe following properties are supported:\n")); + + (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", + "PROPERTY", "EDIT", "INHERIT", "VALUES"); + + /* Iterate over all properties */ + (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, + ZFS_TYPE_DATASET); + + (void) fprintf(fp, "\t%-15s ", "userused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "userquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + + (void) fprintf(fp, gettext("\nSizes are specified in bytes " + "with standard units such as K, M, G, etc.\n")); + (void) fprintf(fp, gettext("\nUser-defined properties can " + "be specified by using a name containing a colon (:).\n")); + (void) 
fprintf(fp, gettext("\nThe {user|group}{used|quota}@ " + "properties must be appended with\n" + "a user or group specifier of one of these forms:\n" + " POSIX name (eg: \"matt\")\n" + " POSIX id (eg: \"126829\")\n" + " SMB name@domain (eg: \"matt@sun\")\n" + " SMB SID (eg: \"S-1-234-567-89\")\n")); + } else { + (void) fprintf(fp, + gettext("\nFor the property list, run: %s\n"), + "zfs set|get"); + (void) fprintf(fp, + gettext("\nFor the delegated permission list, run: %s\n"), + "zfs allow|unallow"); + } + + /* + * See comments at end of main(). + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + exit(requested ? 0 : 2); +} + +static int +parseprop(nvlist_t *props) +{ + char *propname = optarg; + char *propval, *strval; + + if ((propval = strchr(propname, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + return (-1); + } + *propval = '\0'; + propval++; + if (nvlist_lookup_string(props, propname, &strval) == 0) { + (void) fprintf(stderr, gettext("property '%s' " + "specified multiple times\n"), propname); + return (-1); + } + if (nvlist_add_string(props, propname, propval) != 0) + nomem(); + return (0); +} + +static int +parse_depth(char *opt, int *flags) +{ + char *tmp; + int depth; + + depth = (int)strtol(opt, &tmp, 0); + if (*tmp) { + (void) fprintf(stderr, + gettext("%s is not an integer\n"), optarg); + usage(B_FALSE); + } + if (depth < 0) { + (void) fprintf(stderr, + gettext("Depth can not be negative.\n")); + usage(B_FALSE); + } + *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); + return (depth); +} + +#define PROGRESS_DELAY 2 /* seconds */ + +static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; +static time_t pt_begin; +static char *pt_header = NULL; +static boolean_t pt_shown; + +static void +start_progress_timer(void) +{ + pt_begin = time(NULL) + PROGRESS_DELAY; + pt_shown = B_FALSE; +} + +static void +set_progress_header(char *header) 
+{ + assert(pt_header == NULL); + pt_header = safe_strdup(header); + if (pt_shown) { + (void) printf("%s: ", header); + (void) fflush(stdout); + } +} + +static void +update_progress(char *update) +{ + if (!pt_shown && time(NULL) > pt_begin) { + int len = strlen(update); + + (void) printf("%s: %s%*.*s", pt_header, update, len, len, + pt_reverse); + (void) fflush(stdout); + pt_shown = B_TRUE; + } else if (pt_shown) { + int len = strlen(update); + + (void) printf("%s%*.*s", update, len, len, pt_reverse); + (void) fflush(stdout); + } +} + +static void +finish_progress(char *done) +{ + if (pt_shown) { + (void) printf("%s\n", done); + (void) fflush(stdout); + } + free(pt_header); + pt_header = NULL; +} +/* + * zfs clone [-p] [-o prop=value] ... + * + * Given an existing dataset, create a writable copy whose initial contents + * are the same as the source. The newly created dataset maintains a + * dependency on the original; the original cannot be destroyed so long as + * the clone exists. + * + * The '-p' flag creates all the non-existing ancestors of the target first. 
+ */ +static int +zfs_do_clone(int argc, char **argv) +{ + zfs_handle_t *zhp = NULL; + boolean_t parents = B_FALSE; + nvlist_t *props; + int ret; + int c; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, "o:p")) != -1) { + switch (c) { + case 'o': + if (parseprop(props)) + return (1); + break; + case 'p': + parents = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing source dataset " + "argument\n")); + goto usage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing target dataset " + "argument\n")); + goto usage; + } + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + goto usage; + } + + /* open the source dataset */ + if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) + return (1); + + if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) { + /* + * Now create the ancestors of the target dataset. If the + * target already exists and '-p' option was used we should not + * complain. 
+ */ + if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) + return (0); + if (zfs_create_ancestors(g_zfs, argv[1]) != 0) + return (1); + } + + /* pass to libzfs */ + ret = zfs_clone(zhp, argv[1], props); + + /* create the mountpoint if necessary */ + if (ret == 0) { + zfs_handle_t *clone; + + clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET); + if (clone != NULL) { + if (zfs_get_type(clone) != ZFS_TYPE_VOLUME) + if ((ret = zfs_mount(clone, NULL, 0)) == 0) + ret = zfs_share(clone); + zfs_close(clone); + } + } + + zfs_close(zhp); + nvlist_free(props); + + return (!!ret); + +usage: + if (zhp) + zfs_close(zhp); + nvlist_free(props); + usage(B_FALSE); + return (-1); +} + +/* + * zfs create [-p] [-o prop=value] ... fs + * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size + * + * Create a new dataset. This command can be used to create filesystems + * and volumes. Snapshot creation is handled by 'zfs snapshot'. + * For volumes, the user must specify a size to be used. + * + * The '-s' flag applies only to volumes, and indicates that we should not try + * to set the reservation for this volume. By default we set a reservation + * equal to the size for any volume. For pools with SPA_VERSION >= + * SPA_VERSION_REFRESERVATION, we set a refreservation instead. + * + * The '-p' flag creates all the non-existing ancestors of the target first. 
+ */ +static int +zfs_do_create(int argc, char **argv) +{ + zfs_type_t type = ZFS_TYPE_FILESYSTEM; + zfs_handle_t *zhp = NULL; + uint64_t volsize; + int c; + boolean_t noreserve = B_FALSE; + boolean_t bflag = B_FALSE; + boolean_t parents = B_FALSE; + int ret = 1; + nvlist_t *props; + uint64_t intval; + int canmount = ZFS_CANMOUNT_OFF; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, ":V:b:so:p")) != -1) { + switch (c) { + case 'V': + type = ZFS_TYPE_VOLUME; + if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { + (void) fprintf(stderr, gettext("bad volume " + "size '%s': %s\n"), optarg, + libzfs_error_description(g_zfs)); + goto error; + } + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) + nomem(); + volsize = intval; + break; + case 'p': + parents = B_TRUE; + break; + case 'b': + bflag = B_TRUE; + if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { + (void) fprintf(stderr, gettext("bad volume " + "block size '%s': %s\n"), optarg, + libzfs_error_description(g_zfs)); + goto error; + } + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + intval) != 0) + nomem(); + break; + case 'o': + if (parseprop(props)) + goto error; + break; + case 's': + noreserve = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing size " + "argument\n")); + goto badusage; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto badusage; + } + } + + if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { + (void) fprintf(stderr, gettext("'-s' and '-b' can only be " + "used when creating a volume\n")); + goto badusage; + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc == 0) { + (void) fprintf(stderr, gettext("missing %s argument\n"), + zfs_type_to_name(type)); + goto badusage; + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + goto 
badusage; + } + + if (type == ZFS_TYPE_VOLUME && !noreserve) { + zpool_handle_t *zpool_handle; + uint64_t spa_version; + char *p; + zfs_prop_t resv_prop; + char *strval; + + if (p = strchr(argv[0], '/')) + *p = '\0'; + zpool_handle = zpool_open(g_zfs, argv[0]); + if (p != NULL) + *p = '/'; + if (zpool_handle == NULL) + goto error; + spa_version = zpool_get_prop_int(zpool_handle, + ZPOOL_PROP_VERSION, NULL); + zpool_close(zpool_handle); + if (spa_version >= SPA_VERSION_REFRESERVATION) + resv_prop = ZFS_PROP_REFRESERVATION; + else + resv_prop = ZFS_PROP_RESERVATION; + volsize = zvol_volsize_to_reservation(volsize, props); + + if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), + &strval) != 0) { + if (nvlist_add_uint64(props, + zfs_prop_to_name(resv_prop), volsize) != 0) { + nvlist_free(props); + nomem(); + } + } + } + + if (parents && zfs_name_valid(argv[0], type)) { + /* + * Now create the ancestors of target dataset. If the target + * already exists and '-p' option was used we should not + * complain. + */ + if (zfs_dataset_exists(g_zfs, argv[0], type)) { + ret = 0; + goto error; + } + if (zfs_create_ancestors(g_zfs, argv[0]) != 0) + goto error; + } + + /* pass to libzfs */ + if (zfs_create(g_zfs, argv[0], type, props) != 0) + goto error; + + if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) + goto error; + + ret = 0; + /* + * if the user doesn't want the dataset automatically mounted, + * then skip the mount/share step + */ + if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type)) + canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); + + /* + * Mount and/or share the new filesystem as appropriate. We provide a + * verbose error message to let the user know that their filesystem was + * in fact created, even if we failed to mount or share it. 
+ */ + if (canmount == ZFS_CANMOUNT_ON) { + if (zfs_mount(zhp, NULL, 0) != 0) { + (void) fprintf(stderr, gettext("filesystem " + "successfully created, but not mounted\n")); + ret = 1; + } else if (zfs_share(zhp) != 0) { + (void) fprintf(stderr, gettext("filesystem " + "successfully created, but not shared\n")); + ret = 1; + } + } + +error: + if (zhp) + zfs_close(zhp); + nvlist_free(props); + return (ret); +badusage: + nvlist_free(props); + usage(B_FALSE); + return (2); +} + +/* + * zfs destroy [-rRf] + * zfs destroy [-rRd] + * + * -r Recursively destroy all children + * -R Recursively destroy all dependents, including clones + * -f Force unmounting of any dependents + * -d If we can't destroy now, mark for deferred destruction + * + * Destroys the given dataset. By default, it will unmount any filesystems, + * and refuse to destroy a dataset that has any dependents. A dependent can + * either be a child, or a clone of a child. + */ +typedef struct destroy_cbdata { + boolean_t cb_first; + int cb_force; + int cb_recurse; + int cb_error; + int cb_needforce; + int cb_doclones; + boolean_t cb_closezhp; + zfs_handle_t *cb_target; + char *cb_snapname; + boolean_t cb_defer_destroy; +} destroy_cbdata_t; + +/* + * Check for any dependents based on the '-r' or '-R' flags. + */ +static int +destroy_check_dependent(zfs_handle_t *zhp, void *data) +{ + destroy_cbdata_t *cbp = data; + const char *tname = zfs_get_name(cbp->cb_target); + const char *name = zfs_get_name(zhp); + + if (strncmp(tname, name, strlen(tname)) == 0 && + (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { + /* + * This is a direct descendant, not a clone somewhere else in + * the hierarchy. 
+ */ + if (cbp->cb_recurse) + goto out; + + if (cbp->cb_first) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "%s has children\n"), + zfs_get_name(cbp->cb_target), + zfs_type_to_name(zfs_get_type(cbp->cb_target))); + (void) fprintf(stderr, gettext("use '-r' to destroy " + "the following datasets:\n")); + cbp->cb_first = B_FALSE; + cbp->cb_error = 1; + } + + (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + } else { + /* + * This is a clone. We only want to report this if the '-r' + * wasn't specified, or the target is a snapshot. + */ + if (!cbp->cb_recurse && + zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) + goto out; + + if (cbp->cb_first) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "%s has dependent clones\n"), + zfs_get_name(cbp->cb_target), + zfs_type_to_name(zfs_get_type(cbp->cb_target))); + (void) fprintf(stderr, gettext("use '-R' to destroy " + "the following datasets:\n")); + cbp->cb_first = B_FALSE; + cbp->cb_error = 1; + } + + (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + } + +out: + zfs_close(zhp); + return (0); +} + +static int +destroy_callback(zfs_handle_t *zhp, void *data) +{ + destroy_cbdata_t *cbp = data; + + /* + * Ignore pools (which we've already flagged as an error before getting + * here). + */ + if (strchr(zfs_get_name(zhp), '/') == NULL && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + zfs_close(zhp); + return (0); + } + + /* + * Bail out on the first error. + */ + if (zfs_unmount(zhp, NULL, cbp->cb_force ? 
MS_FORCE : 0) != 0 || + zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) { + zfs_close(zhp); + return (-1); + } + + zfs_close(zhp); + return (0); +} + +static int +destroy_snap_clones(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cbp = arg; + char thissnap[MAXPATHLEN]; + zfs_handle_t *szhp; + boolean_t closezhp = cbp->cb_closezhp; + int rv; + + (void) snprintf(thissnap, sizeof (thissnap), + "%s@%s", zfs_get_name(zhp), cbp->cb_snapname); + + libzfs_print_on_error(g_zfs, B_FALSE); + szhp = zfs_open(g_zfs, thissnap, ZFS_TYPE_SNAPSHOT); + libzfs_print_on_error(g_zfs, B_TRUE); + if (szhp) { + /* + * Destroy any clones of this snapshot + */ + if (zfs_iter_dependents(szhp, B_FALSE, destroy_callback, + cbp) != 0) { + zfs_close(szhp); + if (closezhp) + zfs_close(zhp); + return (-1); + } + zfs_close(szhp); + } + + cbp->cb_closezhp = B_TRUE; + rv = zfs_iter_filesystems(zhp, destroy_snap_clones, arg); + if (closezhp) + zfs_close(zhp); + return (rv); +} + +static int +zfs_do_destroy(int argc, char **argv) +{ + destroy_cbdata_t cb = { 0 }; + int c; + zfs_handle_t *zhp; + char *cp; + zfs_type_t type = ZFS_TYPE_DATASET; + + /* check options */ + while ((c = getopt(argc, argv, "dfrR")) != -1) { + switch (c) { + case 'd': + cb.cb_defer_destroy = B_TRUE; + type = ZFS_TYPE_SNAPSHOT; + break; + case 'f': + cb.cb_force = 1; + break; + case 'r': + cb.cb_recurse = 1; + break; + case 'R': + cb.cb_recurse = 1; + cb.cb_doclones = 1; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc == 0) { + (void) fprintf(stderr, gettext("missing path argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* + * If we are doing recursive destroy of a snapshot, then the + * named snapshot may not exist. Go straight to libzfs. 
+ */ + if (cb.cb_recurse && (cp = strchr(argv[0], '@'))) { + int ret; + + *cp = '\0'; + if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) + return (1); + *cp = '@'; + cp++; + + if (cb.cb_doclones) { + boolean_t defer = cb.cb_defer_destroy; + + /* + * Temporarily ignore the defer_destroy setting since + * it's not supported for clones. + */ + cb.cb_defer_destroy = B_FALSE; + cb.cb_snapname = cp; + if (destroy_snap_clones(zhp, &cb) != 0) { + zfs_close(zhp); + return (1); + } + cb.cb_defer_destroy = defer; + } + + ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy); + zfs_close(zhp); + if (ret) { + (void) fprintf(stderr, + gettext("no snapshots destroyed\n")); + } + return (ret != 0); + } + + /* Open the given dataset */ + if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) + return (1); + + cb.cb_target = zhp; + + /* + * Perform an explicit check for pools before going any further. + */ + if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "operation does not apply to pools\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zfs destroy -r " + "%s' to destroy all datasets in the pool\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zpool destroy %s' " + "to destroy the pool itself\n"), zfs_get_name(zhp)); + zfs_close(zhp); + return (1); + } + + /* + * Check for any dependents and/or clones. + */ + cb.cb_first = B_TRUE; + if (!cb.cb_doclones && !cb.cb_defer_destroy && + zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, + &cb) != 0) { + zfs_close(zhp); + return (1); + } + + if (cb.cb_error || (!cb.cb_defer_destroy && + (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) { + zfs_close(zhp); + return (1); + } + + /* + * Do the real thing. The callback will close the handle regardless of + * whether it succeeds or not. 
+ */ + + if (destroy_callback(zhp, &cb) != 0) + return (1); + + return (0); +} + +static boolean_t +is_recvd_column(zprop_get_cbdata_t *cbp) +{ + int i; + zfs_get_column_t col; + + for (i = 0; i < ZFS_GET_NCOLS && + (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) + if (col == GET_COL_RECVD) + return (B_TRUE); + return (B_FALSE); +} + +/* + * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] + * < all | property[,property]... > < fs | snap | vol > ... + * + * -r recurse over any child datasets + * -H scripted mode. Headers are stripped, and fields are separated + * by tabs instead of spaces. + * -o Set of fields to display. One of "name,property,value, + * received,source". Default is "name,property,value,source". + * "all" is an alias for all five. + * -s Set of sources to allow. One of + * "local,default,inherited,received,temporary,none". Default is + * all six. + * -p Display values in parsable (literal) format. + * + * Prints properties for the given datasets. The user can control which + * columns to display as well as which property types to allow. + */ + +/* + * Invoked to display the properties for a single dataset. + */ +static int +get_callback(zfs_handle_t *zhp, void *data) +{ + char buf[ZFS_MAXPROPLEN]; + char rbuf[ZFS_MAXPROPLEN]; + zprop_source_t sourcetype; + char source[ZFS_MAXNAMELEN]; + zprop_get_cbdata_t *cbp = data; + nvlist_t *user_props = zfs_get_user_props(zhp); + zprop_list_t *pl = cbp->cb_proplist; + nvlist_t *propval; + char *strval; + char *sourceval; + boolean_t received = is_recvd_column(cbp); + + for (; pl != NULL; pl = pl->pl_next) { + char *recvdval = NULL; + /* + * Skip the special fake placeholder. This will also skip over + * the name property when 'all' is specified. 
+ */ + if (pl->pl_prop == ZFS_PROP_NAME && + pl == cbp->cb_proplist) + continue; + + if (pl->pl_prop != ZPROP_INVAL) { + if (zfs_prop_get(zhp, pl->pl_prop, buf, + sizeof (buf), &sourcetype, source, + sizeof (source), + cbp->cb_literal) != 0) { + if (pl->pl_all) + continue; + if (!zfs_prop_valid_for_type(pl->pl_prop, + ZFS_TYPE_DATASET)) { + (void) fprintf(stderr, + gettext("No such property '%s'\n"), + zfs_prop_to_name(pl->pl_prop)); + continue; + } + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + if (received && (zfs_prop_get_recvd(zhp, + zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + + zprop_print_one_property(zfs_get_name(zhp), cbp, + zfs_prop_to_name(pl->pl_prop), + buf, sourcetype, source, recvdval); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); + } else { + if (nvlist_lookup_nvlist(user_props, + pl->pl_user_prop, &propval) != 0) { + if (pl->pl_all) + continue; + sourcetype = ZPROP_SRC_NONE; + strval = "-"; + } else { + verify(nvlist_lookup_string(propval, + ZPROP_VALUE, &strval) == 0); + verify(nvlist_lookup_string(propval, + ZPROP_SOURCE, &sourceval) == 0); + + if (strcmp(sourceval, + zfs_get_name(zhp)) == 0) { + sourcetype = ZPROP_SRC_LOCAL; + } else if (strcmp(sourceval, + ZPROP_SOURCE_VAL_RECVD) == 0) { + sourcetype = ZPROP_SRC_RECEIVED; + } else { + sourcetype = ZPROP_SRC_INHERITED; + (void) strlcpy(source, + sourceval, sizeof (source)); + } + } + + if (received && (zfs_prop_get_recvd(zhp, + pl->pl_user_prop, rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, strval, 
sourcetype, + source, recvdval); + } + } + + return (0); +} + +static int +zfs_do_get(int argc, char **argv) +{ + zprop_get_cbdata_t cb = { 0 }; + int i, c, flags = 0; + char *value, *fields; + int ret; + int limit = 0; + zprop_list_t fake_name = { 0 }; + + /* + * Set up default columns and sources. + */ + cb.cb_sources = ZPROP_SRC_ALL; + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + cb.cb_type = ZFS_TYPE_DATASET; + + /* check options */ + while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) { + switch (c) { + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'd': + limit = parse_depth(optarg, &flags); + break; + case 'r': + flags |= ZFS_ITER_RECURSE; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case 'o': + /* + * Process the set of columns to display. We zero out + * the structure to give us a blank slate. 
+ */ + bzero(&cb.cb_columns, sizeof (cb.cb_columns)); + i = 0; + while (*optarg != '\0') { + static char *col_subopts[] = + { "name", "property", "value", "received", + "source", "all", NULL }; + + if (i == ZFS_GET_NCOLS) { + (void) fprintf(stderr, gettext("too " + "many fields given to -o " + "option\n")); + usage(B_FALSE); + } + + switch (getsubopt(&optarg, col_subopts, + &value)) { + case 0: + cb.cb_columns[i++] = GET_COL_NAME; + break; + case 1: + cb.cb_columns[i++] = GET_COL_PROPERTY; + break; + case 2: + cb.cb_columns[i++] = GET_COL_VALUE; + break; + case 3: + cb.cb_columns[i++] = GET_COL_RECVD; + flags |= ZFS_ITER_RECVD_PROPS; + break; + case 4: + cb.cb_columns[i++] = GET_COL_SOURCE; + break; + case 5: + if (i > 0) { + (void) fprintf(stderr, + gettext("\"all\" conflicts " + "with specific fields " + "given to -o option\n")); + usage(B_FALSE); + } + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_RECVD; + cb.cb_columns[4] = GET_COL_SOURCE; + flags |= ZFS_ITER_RECVD_PROPS; + i = ZFS_GET_NCOLS; + break; + default: + (void) fprintf(stderr, + gettext("invalid column name " + "'%s'\n"), value); + usage(B_FALSE); + } + } + break; + + case 's': + cb.cb_sources = 0; + while (*optarg != '\0') { + static char *source_subopts[] = { + "local", "default", "inherited", + "received", "temporary", "none", + NULL }; + + switch (getsubopt(&optarg, source_subopts, + &value)) { + case 0: + cb.cb_sources |= ZPROP_SRC_LOCAL; + break; + case 1: + cb.cb_sources |= ZPROP_SRC_DEFAULT; + break; + case 2: + cb.cb_sources |= ZPROP_SRC_INHERITED; + break; + case 3: + cb.cb_sources |= ZPROP_SRC_RECEIVED; + break; + case 4: + cb.cb_sources |= ZPROP_SRC_TEMPORARY; + break; + case 5: + cb.cb_sources |= ZPROP_SRC_NONE; + break; + default: + (void) fprintf(stderr, + gettext("invalid source " + "'%s'\n"), value); + usage(B_FALSE); + } + } + break; + + case '?': + (void) fprintf(stderr, gettext("invalid 
option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing property " + "argument\n")); + usage(B_FALSE); + } + + fields = argv[0]; + + if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) + != 0) + usage(B_FALSE); + + argc--; + argv++; + + /* + * As part of zfs_expand_proplist(), we keep track of the maximum column + * width for each property. For the 'NAME' (and 'SOURCE') columns, we + * need to know the maximum name length. However, the user likely did + * not specify 'name' as one of the properties to fetch, so we need to + * make sure we always include at least this property for + * print_get_headers() to work properly. + */ + if (cb.cb_proplist != NULL) { + fake_name.pl_prop = ZFS_PROP_NAME; + fake_name.pl_width = strlen(gettext("NAME")); + fake_name.pl_next = cb.cb_proplist; + cb.cb_proplist = &fake_name; + } + + cb.cb_first = B_TRUE; + + /* run for each object */ + ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, + &cb.cb_proplist, limit, get_callback, &cb); + + if (cb.cb_proplist == &fake_name) + zprop_free_list(fake_name.pl_next); + else + zprop_free_list(cb.cb_proplist); + + return (ret); +} + +/* + * inherit [-rS] ... + * + * -r Recurse over all children + * -S Revert to received value, if any + * + * For each dataset specified on the command line, inherit the given property + * from its parent. Inheriting a property at the pool level will cause it to + * use the default value. The '-r' flag will recurse over all children, and is + * useful for setting a property on a hierarchy-wide basis, regardless of any + * local modifications for each dataset. 
+ */
+
+/* Arguments handed to the 'zfs inherit' per-dataset callbacks. */
+typedef struct inherit_cbdata {
+	const char *cb_propname;	/* property named on the command line */
+	boolean_t cb_received;		/* B_TRUE when -S was given */
+} inherit_cbdata_t;
+
+/*
+ * Callback used by 'zfs inherit -r'.  Native properties that are not valid
+ * for this dataset's type are skipped (returning 0); otherwise the result of
+ * zfs_prop_inherit() is folded to 0 (success) or 1 (failure).
+ */
+static int
+inherit_recurse_cb(zfs_handle_t *zhp, void *data)
+{
+	inherit_cbdata_t *cb = data;
+	zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
+
+	/*
+	 * If we're doing it recursively, then ignore properties that
+	 * are not valid for this type of dataset.
+	 */
+	if (prop != ZPROP_INVAL &&
+	    !zfs_prop_valid_for_type(prop, zfs_get_type(zhp)))
+		return (0);
+
+	return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
+}
+
+/*
+ * Callback used by non-recursive 'zfs inherit'.  Returns 1 when
+ * zfs_prop_inherit() fails and 0 otherwise.
+ */
+static int
+inherit_cb(zfs_handle_t *zhp, void *data)
+{
+	inherit_cbdata_t *cb = data;
+
+	return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
+}
+
+/*
+ * Entry point for 'zfs inherit'.  Parses -r and -S, validates the property
+ * name, then runs the matching callback over every dataset argument.
+ * Returns 1 when the property is read-only, not inheritable, or cannot be
+ * reverted to a received value; otherwise returns what zfs_for_each() returns.
+ */
+static int
+zfs_do_inherit(int argc, char **argv)
+{
+	int c;
+	zfs_prop_t prop;
+	inherit_cbdata_t cb = { 0 };
+	char *propname;
+	int ret;
+	int flags = 0;
+	boolean_t received = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rS")) != -1) {
+		switch (c) {
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'S':
+			received = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+
+	propname = argv[0];
+	argc--;
+	argv++;
+
+	if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
+		if (zfs_prop_readonly(prop)) {
+			(void) fprintf(stderr, gettext(
+			    "%s property is read-only\n"),
+			    propname);
+			return (1);
+		}
+		if (!zfs_prop_inheritable(prop) && !received) {
+			(void) fprintf(stderr, gettext("'%s' property cannot "
+			    "be inherited\n"), propname);
+			if (prop == ZFS_PROP_QUOTA ||
+			    prop == ZFS_PROP_RESERVATION ||
+			    prop == ZFS_PROP_REFQUOTA ||
+			    prop == ZFS_PROP_REFRESERVATION)
+				(void) fprintf(stderr, gettext("use 'zfs set "
+				    "%s=none' to clear\n"), propname);
+			return (1);
+		}
+		if (received && (prop == ZFS_PROP_VOLSIZE ||
+		    prop == ZFS_PROP_VERSION)) {
+			(void) fprintf(stderr, gettext("'%s' property cannot "
+			    "be reverted to a received value\n"), propname);
+			return (1);
+		}
+	} else if (!zfs_prop_user(propname)) {
+		(void) fprintf(stderr, gettext("invalid property '%s'\n"),
+		    propname);
+		usage(B_FALSE);
+	}
+
+	cb.cb_propname = propname;
+	cb.cb_received = received;
+
+	if (flags & ZFS_ITER_RECURSE) {
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
+		    NULL, NULL, 0, inherit_recurse_cb, &cb);
+	} else {
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
+		    NULL, NULL, 0, inherit_cb, &cb);
+	}
+
+	return (ret);
+}
+
+/* Counters and options shared by the 'zfs upgrade' callbacks. */
+typedef struct upgrade_cbdata {
+	uint64_t cb_numupgraded;	/* filesystems successfully upgraded */
+	uint64_t cb_numsamegraded;	/* filesystems already at cb_version */
+	uint64_t cb_numfailed;		/* filesystems that could not be changed */
+	uint64_t cb_version;		/* target version (-V, or ZPL_VERSION) */
+	boolean_t cb_newer;		/* list newer, rather than older, versions */
+	boolean_t cb_foundone;		/* listing header already printed */
+	char cb_lastfs[ZFS_MAXNAMELEN];	/* last filesystem we tried to upgrade */
+} upgrade_cbdata_t;
+
+/*
+ * Return non-zero when 'name' and the dataset behind 'zhp' share the same
+ * leading component, i.e. everything before the first '/' or '@'.
+ */
+static int
+same_pool(zfs_handle_t *zhp, const char *name)
+{
+	const char *zhname = zfs_get_name(zhp);
+	size_t len1 = strcspn(name, "/@");
+	size_t len2 = strcspn(zhname, "/@");
+
+	if (len1 != len2)
+		return (B_FALSE);
+	return (strncmp(name, zhname, len1) == 0);
+}
+
+/*
+ * Listing callback for 'zfs upgrade' with no arguments.  Prints the
+ * filesystem when its version is older than ZPL_VERSION (or newer, when
+ * cb_newer is set), emitting the explanatory banner before the first hit.
+ */
+static int
+upgrade_list_callback(zfs_handle_t *zhp, void *data)
+{
+	upgrade_cbdata_t *cb = data;
+	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+
+	/* list if it's old/new */
+	if ((!cb->cb_newer && version < ZPL_VERSION) ||
+	    (cb->cb_newer && version > ZPL_VERSION)) {
+		char *str;
+		if (cb->cb_newer) {
+			str = gettext("The following filesystems are "
+			    "formatted using a newer software version and\n"
+			    "cannot be accessed on the current system.\n\n");
+		} else {
+			str = gettext("The following filesystems are "
+			    "out of date, and can be upgraded. After being\n"
+			    "upgraded, these filesystems (and any 'zfs send' "
+			    "streams generated from\n"
+			    "subsequent snapshots) will no longer be "
+			    "accessible by older software versions.\n\n");
+		}
+
+		if (!cb->cb_foundone) {
+			(void) puts(str);
+			(void) printf(gettext("VER FILESYSTEM\n"));
+			(void) printf(gettext("--- ------------\n"));
+			cb->cb_foundone = B_TRUE;
+		}
+
+		(void) printf("%2u %s\n", version, zfs_get_name(zhp));
+	}
+
+	return (0);
+}
+
+/*
+ * Upgrade callback for 'zfs upgrade <-a | filesystem>'.  Bumps the
+ * filesystem's "version" property to cb_version when it is older, and
+ * keeps the upgraded/same/failed counters in 'data' up to date.  Returns -1
+ * only when the pool version cannot be determined or mapped.
+ */
+static int
+upgrade_set_callback(zfs_handle_t *zhp, void *data)
+{
+	upgrade_cbdata_t *cb = data;
+	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+	int needed_spa_version;
+	int spa_version;
+
+	if (zfs_spa_version(zhp, &spa_version) < 0)
+		return (-1);
+
+	needed_spa_version = zfs_spa_version_map(cb->cb_version);
+
+	if (needed_spa_version < 0)
+		return (-1);
+
+	if (spa_version < needed_spa_version) {
+		/* can't upgrade */
+		(void) printf(gettext("%s: can not be "
+		    "upgraded; the pool version needs to first "
+		    "be upgraded\nto version %d\n\n"),
+		    zfs_get_name(zhp), needed_spa_version);
+		cb->cb_numfailed++;
+		return (0);
+	}
+
+	/* upgrade */
+	if (version < cb->cb_version) {
+		char verstr[16];
+
+		/* uint64_t is not guaranteed to be 'unsigned long long' */
+		(void) snprintf(verstr, sizeof (verstr),
+		    "%llu", (unsigned long long)cb->cb_version);
+		if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) {
+			/*
+			 * If they did "zfs upgrade -a", then we could
+			 * be doing ioctls to different pools. We need
+			 * to log this history once to each pool.
+			 */
+			verify(zpool_stage_history(g_zfs, history_str) == 0);
+		}
+		if (zfs_prop_set(zhp, "version", verstr) == 0)
+			cb->cb_numupgraded++;
+		else
+			cb->cb_numfailed++;
+		/* bounded copy: never write past cb_lastfs */
+		(void) strlcpy(cb->cb_lastfs, zfs_get_name(zhp),
+		    sizeof (cb->cb_lastfs));
+	} else if (version > cb->cb_version) {
+		/* can't downgrade */
+		(void) printf(gettext("%s: can not be downgraded; "
+		    "it is already at version %u\n"),
+		    zfs_get_name(zhp), version);
+		cb->cb_numfailed++;
+	} else {
+		cb->cb_numsamegraded++;
+	}
+	return (0);
+}
+
+/*
+ * zfs upgrade
+ * zfs upgrade -v
+ * zfs upgrade [-r] [-V <version>] <-a | filesystem>
+ *
+ * With no arguments, lists filesystems whose version differs from
+ * ZPL_VERSION.  With -v, prints the table of known versions.  Otherwise
+ * upgrades the named filesystems (or all of them with -a) and returns 1
+ * if any could not be upgraded.
+ */
+static int
+zfs_do_upgrade(int argc, char **argv)
+{
+	boolean_t all = B_FALSE;
+	boolean_t showversions = B_FALSE;
+	int ret;
+	upgrade_cbdata_t cb = { 0 };
+	int c;	/* getopt() returns int; a plain char may never equal -1 */
+	int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rvV:a")) != -1) {
+		switch (c) {
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'v':
+			showversions = B_TRUE;
+			break;
+		case 'V':
+			if (zfs_prop_string_to_index(ZFS_PROP_VERSION,
+			    optarg, &cb.cb_version) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid version %s\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 'a':
+			all = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version))
+		usage(B_FALSE);
+	if (showversions && (flags & ZFS_ITER_RECURSE || all ||
+	    cb.cb_version || argc))
+		usage(B_FALSE);
+	if ((all || argc) && (showversions))
+		usage(B_FALSE);
+	if (all && argc)
+		usage(B_FALSE);
+
+	if (showversions) {
+		/* Show info on available versions. */
+		(void) printf(gettext("The following filesystem versions are "
+		    "supported:\n\n"));
+		(void) printf(gettext("VER DESCRIPTION\n"));
+		(void) printf("--- -----------------------------------------"
+		    "---------------\n");
+		(void) printf(gettext(" 1 Initial ZFS filesystem version\n"));
+		(void) printf(gettext(" 2 Enhanced directory entries\n"));
+		(void) printf(gettext(" 3 Case insensitive and File system "
+		    "unique identifier (FUID)\n"));
+		(void) printf(gettext(" 4 userquota, groupquota "
+		    "properties\n"));
+		(void) printf(gettext(" 5 System attributes\n"));
+		(void) printf(gettext("\nFor more information on a particular "
+		    "version, including supported releases,\n"));
+		(void) printf("see the ZFS Administration Guide.\n\n");
+		ret = 0;
+	} else if (argc || all) {
+		/* Upgrade filesystems */
+		if (cb.cb_version == 0)
+			cb.cb_version = ZPL_VERSION;
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_set_callback, &cb);
+		(void) printf(gettext("%llu filesystems upgraded\n"),
+		    (unsigned long long)cb.cb_numupgraded);
+		if (cb.cb_numsamegraded) {
+			(void) printf(gettext("%llu filesystems already at "
+			    "this version\n"),
+			    (unsigned long long)cb.cb_numsamegraded);
+		}
+		if (cb.cb_numfailed != 0)
+			ret = 1;
+	} else {
+		/* List old-version filesystems */
+		boolean_t found;
+		(void) printf(gettext("This system is currently running "
+		    "ZFS filesystem version %llu.\n\n"),
+		    (unsigned long long)ZPL_VERSION);
+
+		flags |= ZFS_ITER_RECURSE;
+		ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_list_callback, &cb);
+
+		/* second pass: same walk, but report newer-than-us versions */
+		found = cb.cb_foundone;
+		cb.cb_foundone = B_FALSE;
+		cb.cb_newer = B_TRUE;
+
+		ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_list_callback, &cb);
+
+		if (!cb.cb_foundone && !found) {
+			(void) printf(gettext("All filesystems are "
+			    "formatted with the current version.\n"));
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs userspace
+ *
+ * Callback for zfs_userspace(): prints one "PROP TYPE NAME VALUE" row.
+ * 'arg' points at the zfs_userquota_prop_t being walked.  When 'domain' is
+ * NULL or empty the numeric id is resolved through getgrgid()/getpwuid(),
+ * falling back to the number itself.
+ */
+static int
+userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
+{
+	zfs_userquota_prop_t *typep = arg;
+	zfs_userquota_prop_t p = *typep;
+	char *name = NULL;
+	char *ug, *propname;
+	char namebuf[32];
+	char sizebuf[32];
+
+	/*
+	 * Normalize a missing domain to the empty string so the printf()
+	 * below never dereferences or formats a NULL pointer.
+	 */
+	if (domain == NULL)
+		domain = "";
+
+	if (domain[0] == '\0') {
+		if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) {
+			struct group *g = getgrgid(rid);
+			if (g)
+				name = g->gr_name;
+		} else {
+			struct passwd *pw = getpwuid(rid);
+			if (pw)
+				name = pw->pw_name;
+		}
+	}
+
+	if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA)
+		ug = "group";
+	else
+		ug = "user";
+
+	if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED)
+		propname = "used";
+	else
+		propname = "quota";
+
+	if (name == NULL) {
+		(void) snprintf(namebuf, sizeof (namebuf),
+		    "%llu", (longlong_t)rid);
+		name = namebuf;
+	}
+	zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+
+	(void) printf("%s %s %s%c%s %s\n", propname, ug, domain,
+	    domain[0] ? '-' : ' ', name, sizebuf);
+
+	return (0);
+}
+
+/*
+ * Entry point for 'zfs userspace'.  First tries to exec the python
+ * implementation at 'pypath'; if that fails, prints a warning and walks every
+ * userquota property of the last argument with userspace_cb().  Returns 1 if
+ * the dataset cannot be opened, otherwise the last zfs_userspace() result.
+ */
+static int
+zfs_do_userspace(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	zfs_userquota_prop_t p;
+	int error = 0;
+
+	/*
+	 * Try the python version. If the execv fails, we'll continue
+	 * and do a simplistic implementation.
+	 */
+	(void) execv(pypath, argv-1);
+
+	(void) printf("internal error: %s not found\n"
+	    "falling back on built-in implementation, "
+	    "some features will not work\n", pypath);
+
+	if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL)
+		return (1);
+
+	(void) printf("PROP TYPE NAME VALUE\n");
+
+	for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
+		error = zfs_userspace(zhp, p, userspace_cb, &p);
+		if (error)
+			break;
+	}
+
+	/* release the handle opened above on every exit path */
+	zfs_close(zhp);
+	return (error);
+}
+
+/*
+ * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...]
+ * [-s property [-s property]...] [-S property [-S property]...]
+ * ...
+ *
+ * -r Recurse over all children
+ * -d Limit recursion by depth.
+ * -H Scripted mode; elide headers and separate columns by tabs
+ * -o Control which fields to display.
+ * -t Control which object types to display.
+ * -s Specify sort columns, ascending order.
+ * -S Specify sort columns, descending order.
+ *
+ * When given no arguments, lists all filesystems in the system.
+ * Otherwise, list the specified datasets, optionally recursing down them if
+ * '-r' is specified.
+ */
+typedef struct list_cbdata {
+	boolean_t cb_first;		/* header not yet printed */
+	boolean_t cb_scripted;		/* -H: no header, tab separators */
+	zprop_list_t *cb_proplist;	/* columns to display */
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ * Native properties use their column name and alignment; any other entry is
+ * shown as its own name converted to upper case, left-justified.
+ */
+static void
+print_header(zprop_list_t *pl)
+{
+	char headerbuf[ZFS_MAXPROPLEN];
+	const char *header;
+	size_t i;
+	boolean_t first = B_TRUE;
+	boolean_t right_justify;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first) {
+			(void) printf(" ");
+		} else {
+			first = B_FALSE;
+		}
+
+		right_justify = B_FALSE;
+		if (pl->pl_prop != ZPROP_INVAL) {
+			header = zfs_prop_column_name(pl->pl_prop);
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else {
+			/*
+			 * Stop one short of the buffer so the terminator
+			 * always fits, and go through unsigned char because
+			 * toupper() is undefined for negative values.
+			 */
+			for (i = 0; i < sizeof (headerbuf) - 1 &&
+			    pl->pl_user_prop[i] != '\0'; i++)
+				headerbuf[i] = (char)toupper(
+				    (unsigned char)pl->pl_user_prop[i]);
+			headerbuf[i] = '\0';
+			header = headerbuf;
+		}
+
+		/*
+		 * The last column is not padded when left-justified.  The
+		 * '*' width consumes an int, so narrow pl_width explicitly,
+		 * as print_dataset() does through its 'int width' local.
+		 */
+		if (pl->pl_next == NULL && !right_justify)
+			(void) printf("%s", header);
+		else if (right_justify)
+			(void) printf("%*s", (int)pl->pl_width, header);
+		else
+			(void) printf("%-*s", (int)pl->pl_width, header);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Given a dataset and a list of fields, print out all the properties according
+ * to the described layout.
+ */
+static void
+print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted)
+{
+	nvlist_t *userprops = zfs_get_user_props(zhp);
+	char valbuf[ZFS_MAXPROPLEN];
+	zprop_list_t *col;
+
+	for (col = pl; col != NULL; col = col->pl_next) {
+		nvlist_t *entry;
+		char *text = "-";	/* shown when the value is unavailable */
+		boolean_t align_right = B_FALSE;
+		int width;
+
+		/* A separator precedes every column except the first. */
+		if (col != pl)
+			(void) printf("%s", scripted ? "\t" : " ");
+
+		if (col->pl_prop != ZPROP_INVAL) {
+			/* native property */
+			if (zfs_prop_get(zhp, col->pl_prop, valbuf,
+			    sizeof (valbuf), NULL, NULL, 0, B_FALSE) == 0)
+				text = valbuf;
+			align_right = zfs_prop_align_right(col->pl_prop);
+		} else if (zfs_prop_userquota(col->pl_user_prop)) {
+			/* userquota-style property: always right-justified */
+			if (zfs_prop_get_userquota(zhp, col->pl_user_prop,
+			    valbuf, sizeof (valbuf), B_FALSE) == 0)
+				text = valbuf;
+			align_right = B_TRUE;
+		} else if (nvlist_lookup_nvlist(userprops,
+		    col->pl_user_prop, &entry) == 0) {
+			/* user property: value lives in the nested nvlist */
+			verify(nvlist_lookup_string(entry,
+			    ZPROP_VALUE, &text) == 0);
+		}
+
+		width = col->pl_width;
+
+		/*
+		 * If this is being called in scripted mode, or if this is the
+		 * last column and it is left-justified, don't include a width
+		 * format specifier.
+		 */
+		if (scripted || (col->pl_next == NULL && !align_right))
+			(void) printf("%s", text);
+		else if (align_right)
+			(void) printf("%*s", width, text);
+		else
+			(void) printf("%-*s", width, text);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Generic callback function to list a dataset or snapshot.
+ */
+static int
+list_callback(zfs_handle_t *zhp, void *data)
+{
+	list_cbdata_t *state = data;
+
+	/* The header is printed once, ahead of the first dataset. */
+	if (state->cb_first) {
+		if (!state->cb_scripted)
+			print_header(state->cb_proplist);
+		state->cb_first = B_FALSE;
+	}
+
+	print_dataset(zhp, state->cb_proplist, state->cb_scripted);
+
+	return (0);
+}
+
+/*
+ * Entry point for 'zfs list'.  Parses the options, builds the column list
+ * and sort order, then prints every matching dataset through
+ * list_callback().  Returns the result of zfs_for_each().
+ */
+static int
+zfs_do_list(int argc, char **argv)
+{
+	static char default_fields[] =
+	    "name,used,available,referenced,mountpoint";
+	list_cbdata_t cb = { 0 };
+	zfs_sort_column_t *sortcol = NULL;
+	boolean_t scripted = B_FALSE;
+	boolean_t types_specified = B_FALSE;
+	char *fields = NULL;
+	char *value;
+	int types = ZFS_TYPE_DATASET;
+	int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
+	int limit = 0;
+	int ret;
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) {
+		switch (c) {
+		case 'o':
+			fields = optarg;
+			break;
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			break;
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case 's':
+		case 'S':
+			/* -s passes B_FALSE as the last argument, -S B_TRUE */
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    (c == 'S') ? B_TRUE : B_FALSE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid property '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 't':
+			types = 0;
+			types_specified = B_TRUE;
+			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+			while (*optarg != '\0') {
+				static char *type_subopts[] = { "filesystem",
+				    "volume", "snapshot", "all", NULL };
+				int which = getsubopt(&optarg, type_subopts,
+				    &value);
+
+				if (which == 0) {
+					types |= ZFS_TYPE_FILESYSTEM;
+				} else if (which == 1) {
+					types |= ZFS_TYPE_VOLUME;
+				} else if (which == 2) {
+					types |= ZFS_TYPE_SNAPSHOT;
+				} else if (which == 3) {
+					types = ZFS_TYPE_DATASET;
+				} else {
+					(void) fprintf(stderr,
+					    gettext("invalid type '%s'\n"),
+					    value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (fields == NULL)
+		fields = default_fields;
+
+	/*
+	 * If "-o space" and no types were specified, don't display snapshots.
+	 */
+	if (!types_specified && strcmp(fields, "space") == 0)
+		types &= ~ZFS_TYPE_SNAPSHOT;
+
+	/*
+	 * If the user specifies '-o all', the zprop_get_list() doesn't
+	 * normally include the name of the dataset. For 'zfs list', we always
+	 * want this property to be first.
+	 */
+	if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
+	    != 0)
+		usage(B_FALSE);
+
+	cb.cb_scripted = scripted;
+	cb.cb_first = B_TRUE;
+
+	ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
+	    limit, list_callback, &cb);
+
+	zprop_free_list(cb.cb_proplist);
+	zfs_free_sort_columns(sortcol);
+
+	/* cb_first still set means the callback never ran */
+	if (ret == 0 && cb.cb_first && !cb.cb_scripted)
+		(void) printf(gettext("no datasets available\n"));
+
+	return (ret);
+}
+
+/*
+ * zfs rename
+ * zfs rename -p
+ * zfs rename -r
+ *
+ * Renames the given dataset to another of the same type.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ */
+/* ARGSUSED */
+static int
+zfs_do_rename(int argc, char **argv)
+{
+	boolean_t recurse = B_FALSE;
+	boolean_t parents = B_FALSE;
+	zfs_handle_t *zhp;
+	int src_types;
+	int ret;
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "pr")) != -1) {
+		switch (c) {
+		case 'p':
+			parents = B_TRUE;
+			break;
+		case 'r':
+			recurse = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing source dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing target dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if (recurse && parents) {
+		(void) fprintf(stderr, gettext("-p and -r options are mutually "
+		    "exclusive\n"));
+		usage(B_FALSE);
+	}
+
+	if (recurse && strchr(argv[0], '@') == NULL) {
+		(void) fprintf(stderr, gettext("source dataset for recursive "
+		    "rename must be a snapshot\n"));
+		usage(B_FALSE);
+	}
+
+	/* With -p the source may only be a filesystem or a volume. */
+	if (parents)
+		src_types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+	else
+		src_types = ZFS_TYPE_DATASET;
+
+	zhp = zfs_open(g_zfs, argv[0], src_types);
+	if (zhp == NULL)
+		return (1);
+
+	/* If we were asked and the name looks good, try to create ancestors. */
+	if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) &&
+	    zfs_create_ancestors(g_zfs, argv[1]) != 0) {
+		zfs_close(zhp);
+		return (1);
+	}
+
+	ret = (zfs_rename(zhp, argv[1], recurse) != 0);
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs promote
+ *
+ * Promotes the given clone fs to be the parent
+ */
+/* ARGSUSED */
+static int
+zfs_do_promote(int argc, char **argv)
+{
+	zfs_handle_t *clone;
+	int ret;
+
+	/* check options: this subcommand accepts none */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* check number of arguments */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing clone filesystem"
+		    " argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	clone = zfs_open(g_zfs, argv[1],
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (clone == NULL)
+		return (1);
+
+	ret = (zfs_promote(clone) != 0);
+
+	zfs_close(clone);
+	return (ret);
+}
+
+/*
+ * zfs rollback [-rRf]
+ *
+ * -r Delete any intervening snapshots before doing rollback
+ * -R Delete any snapshots and their clones
+ * -f ignored for backwards compatibility
+ *
+ * Given a filesystem, rollback to a specific snapshot, discarding any changes
+ * since then and making it the active dataset. If more recent snapshots exist,
+ * the command will complain unless the '-r' flag is given.
+ */
+typedef struct rollback_cbdata {
+	uint64_t cb_create;	/* creation txg of the target snapshot */
+	boolean_t cb_first;	/* explanatory message not yet printed */
+	int cb_doclones;	/* set by -R */
+	char *cb_target;	/* name of the target snapshot */
+	int cb_error;		/* set once a blocking dataset is reported */
+	boolean_t cb_recurse;	/* set by -r and -R */
+	boolean_t cb_dependent;	/* currently visiting a snapshot's dependents */
+} rollback_cbdata_t;
+
+/*
+ * Report any snapshots more recent than the one specified. Used when '-r' is
+ * not specified. We reuse this same callback for the snapshot dependents - if
+ * 'cb_dependent' is set, then this is a dependent and we should report it
+ * without checking the transaction group.
+ */
+static int
+rollback_check(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *cbp = data;
+	int result = 0;
+
+	/* With -R nothing needs reporting. */
+	if (cbp->cb_doclones)
+		goto done;
+
+	if (cbp->cb_dependent) {
+		/* Reached through a newer snapshot: always report it. */
+		if (cbp->cb_first && cbp->cb_recurse) {
+			(void) fprintf(stderr, gettext("cannot rollback to "
+			    "'%s': clones of previous snapshots exist\n"),
+			    cbp->cb_target);
+			(void) fprintf(stderr, gettext("use '-R' to "
+			    "force deletion of the following clones and "
+			    "dependents:\n"));
+			cbp->cb_first = 0;
+			cbp->cb_error = 1;
+		}
+
+		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+		goto done;
+	}
+
+	/* Only snapshots created after the target are of interest. */
+	if (strcmp(zfs_get_name(zhp), cbp->cb_target) == 0 ||
+	    zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT ||
+	    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= cbp->cb_create)
+		goto done;
+
+	if (!cbp->cb_recurse) {
+		/* Without -r, list each newer snapshot under one banner. */
+		if (cbp->cb_first) {
+			(void) fprintf(stderr, gettext("cannot "
+			    "rollback to '%s': more recent snapshots "
+			    "exist\n"),
+			    cbp->cb_target);
+			(void) fprintf(stderr, gettext("use '-r' to "
+			    "force deletion of the following "
+			    "snapshots:\n"));
+			cbp->cb_first = 0;
+			cbp->cb_error = 1;
+		}
+		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+	} else {
+		/* With -r, look for dependents of the newer snapshot. */
+		cbp->cb_dependent = B_TRUE;
+		if (zfs_iter_dependents(zhp, B_TRUE,
+		    rollback_check, cbp) != 0)
+			result = -1;
+		else
+			cbp->cb_dependent = B_FALSE;
+	}
+
+done:
+	/* this callback closes the handle on every path */
+	zfs_close(zhp);
+	return (result);
+}
+
+/*
+ * Entry point for 'zfs rollback'.  Opens the snapshot and its parent, uses
+ * rollback_check() to look for newer snapshots or clones, and rolls the
+ * parent back when nothing was reported.  Returns 0 on success, 1 otherwise.
+ */
+static int
+zfs_do_rollback(int argc, char **argv)
+{
+	rollback_cbdata_t cb = { 0 };
+	char parentname[ZFS_MAXNAMELEN];
+	zfs_handle_t *zhp, *snap;
+	boolean_t force = B_FALSE;
+	char *delim;
+	int ret;
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rRf")) != -1) {
+		switch (c) {
+		case 'r':
+			cb.cb_recurse = 1;
+			break;
+		case 'R':
+			cb.cb_recurse = 1;
+			cb.cb_doclones = 1;
+			break;
+		case 'f':
+			force = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* open the snapshot */
+	snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
+	if (snap == NULL)
+		return (1);
+
+	/* open the parent dataset: the name up to the last '@' */
+	(void) strlcpy(parentname, argv[0], sizeof (parentname));
+	verify((delim = strrchr(parentname, '@')) != NULL);
+	*delim = '\0';
+	zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET);
+	if (zhp == NULL) {
+		zfs_close(snap);
+		return (1);
+	}
+
+	/*
+	 * Check for more recent snapshots and/or clones based on the presence
+	 * of '-r' and '-R'.
+	 */
+	cb.cb_target = argv[0];
+	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
+	cb.cb_first = B_TRUE;
+	cb.cb_error = 0;
+
+	ret = zfs_iter_children(zhp, rollback_check, &cb);
+	if (ret == 0)
+		ret = cb.cb_error;
+
+	/*
+	 * Rollback parent to the given snapshot.
+	 */
+	if (ret == 0)
+		ret = zfs_rollback(zhp, snap, force);
+
+	zfs_close(snap);
+	zfs_close(zhp);
+
+	return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * zfs set property=value { fs | snap | vol } ...
+ *
+ * Sets the given property for all datasets specified on the command line.
+ */
+typedef struct set_cbdata {
+	char *cb_propname;	/* text before the '=' */
+	char *cb_value;		/* text after the '=' */
+} set_cbdata_t;
+
+/*
+ * Per-dataset callback for 'zfs set'.  Returns 0 when zfs_prop_set()
+ * succeeds and 1 otherwise; for the mount and share failure codes it also
+ * prints a hint that the property itself may have been set.
+ */
+static int
+set_callback(zfs_handle_t *zhp, void *data)
+{
+	set_cbdata_t *cbp = data;
+
+	if (zfs_prop_set(zhp, cbp->cb_propname, cbp->cb_value) != 0) {
+		switch (libzfs_errno(g_zfs)) {
+		case EZFS_MOUNTFAILED:
+			(void) fprintf(stderr, gettext("property may be set "
+			    "but unable to remount filesystem\n"));
+			break;
+		case EZFS_SHARENFSFAILED:
+			(void) fprintf(stderr, gettext("property may be set "
+			    "but unable to reshare filesystem\n"));
+			break;
+		default:
+			/* no extra hint for other errors */
+			break;
+		}
+		return (1);
+	}
+	return (0);
+}
+
+/*
+ * Entry point for 'zfs set'.  Splits argv[1] at its first '=' into a
+ * property name and value (both must be non-empty) and applies them to
+ * every remaining argument.  Returns the result of zfs_for_each().
+ */
+static int
+zfs_do_set(int argc, char **argv)
+{
+	set_cbdata_t cb;
+	int ret;
+
+	/* check for options */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* check number of arguments */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing property=value "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing dataset name\n"));
+		usage(B_FALSE);
+	}
+
+	/* validate property=value argument */
+	cb.cb_propname = argv[1];
+	if (((cb.cb_value = strchr(cb.cb_propname, '=')) == NULL) ||
+	    (cb.cb_value[1] == '\0')) {
+		(void) fprintf(stderr, gettext("missing value in "
+		    "property=value argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* split in place: the '=' becomes the name's terminator */
+	*cb.cb_value = '\0';
+	cb.cb_value++;
+
+	if (*cb.cb_propname == '\0') {
+		(void) fprintf(stderr,
+		    gettext("missing property in property=value argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* the third argument is an int flags word, so pass 0 rather than NULL */
+	ret = zfs_for_each(argc - 2, argv + 2, 0,
+	    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb);
+
+	return (ret);
+}
+
+/*
+ * zfs snapshot [-r] [-o prop=value] ...
+ *
+ * Creates a snapshot with the given name. While functionally equivalent to
+ * 'zfs create', it is a separate command to differentiate intent.
+ */ +static int +zfs_do_snapshot(int argc, char **argv) +{ + boolean_t recursive = B_FALSE; + int ret; + char c; + nvlist_t *props; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, "ro:")) != -1) { + switch (c) { + case 'o': + if (parseprop(props)) + return (1); + break; + case 'r': + recursive = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing snapshot argument\n")); + goto usage; + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + goto usage; + } + + ret = zfs_snapshot(g_zfs, argv[0], recursive, props); + nvlist_free(props); + if (ret && recursive) + (void) fprintf(stderr, gettext("no snapshots were created\n")); + return (ret != 0); + +usage: + nvlist_free(props); + usage(B_FALSE); + return (-1); +} + +/* + * zfs send [-vDp] -R [-i|-I <@snap>] + * zfs send [-vDp] [-i|-I <@snap>] + * + * Send a backup stream to stdout. 
+ */ +static int +zfs_do_send(int argc, char **argv) +{ + char *fromname = NULL; + char *toname = NULL; + char *cp; + zfs_handle_t *zhp; + sendflags_t flags = { 0 }; + int c, err; + nvlist_t *dbgnv; + boolean_t extraverbose = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) { + switch (c) { + case 'i': + if (fromname) + usage(B_FALSE); + fromname = optarg; + break; + case 'I': + if (fromname) + usage(B_FALSE); + fromname = optarg; + flags.doall = B_TRUE; + break; + case 'R': + flags.replicate = B_TRUE; + break; + case 'p': + flags.props = B_TRUE; + break; + case 'v': + if (flags.verbose) + extraverbose = B_TRUE; + flags.verbose = B_TRUE; + break; + case 'D': + flags.dedup = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing snapshot argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (isatty(STDOUT_FILENO)) { + (void) fprintf(stderr, + gettext("Error: Stream can not be written to a terminal.\n" + "You must redirect standard output.\n")); + return (1); + } + + cp = strchr(argv[0], '@'); + if (cp == NULL) { + (void) fprintf(stderr, + gettext("argument must be a snapshot\n")); + usage(B_FALSE); + } + *cp = '\0'; + toname = cp + 1; + zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return (1); + + /* + * If they specified the full path to the snapshot, chop off + * everything except the short name of the snapshot, but special + * case if they specify the origin. 
+ */ + if (fromname && (cp = strchr(fromname, '@')) != NULL) { + char origin[ZFS_MAXNAMELEN]; + zprop_source_t src; + + (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, + origin, sizeof (origin), &src, NULL, 0, B_FALSE); + + if (strcmp(origin, fromname) == 0) { + fromname = NULL; + flags.fromorigin = B_TRUE; + } else { + *cp = '\0'; + if (cp != fromname && strcmp(argv[0], fromname)) { + (void) fprintf(stderr, + gettext("incremental source must be " + "in same filesystem\n")); + usage(B_FALSE); + } + fromname = cp + 1; + if (strchr(fromname, '@') || strchr(fromname, '/')) { + (void) fprintf(stderr, + gettext("invalid incremental source\n")); + usage(B_FALSE); + } + } + } + + if (flags.replicate && fromname == NULL) + flags.doall = B_TRUE; + + err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0, + extraverbose ? &dbgnv : NULL); + + if (extraverbose) { + /* + * dump_nvlist prints to stdout, but that's been + * redirected to a file. Make it print to stderr + * instead. + */ + (void) dup2(STDERR_FILENO, STDOUT_FILENO); + dump_nvlist(dbgnv, 0); + nvlist_free(dbgnv); + } + zfs_close(zhp); + + return (err != 0); +} + +/* + * zfs receive [-vnFu] [-d | -e] + * + * Restore a backup stream from stdin. 
+ */ +static int +zfs_do_receive(int argc, char **argv) +{ + int c, err; + recvflags_t flags = { 0 }; + + /* check options */ + while ((c = getopt(argc, argv, ":denuvF")) != -1) { + switch (c) { + case 'd': + flags.isprefix = B_TRUE; + break; + case 'e': + flags.isprefix = B_TRUE; + flags.istail = B_TRUE; + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'u': + flags.nomount = B_TRUE; + break; + case 'v': + flags.verbose = B_TRUE; + break; + case 'F': + flags.force = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing snapshot argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (isatty(STDIN_FILENO)) { + (void) fprintf(stderr, + gettext("Error: Backup stream can not be read " + "from a terminal.\n" + "You must redirect standard input.\n")); + return (1); + } + + err = zfs_receive(g_zfs, argv[0], flags, STDIN_FILENO, NULL); + + return (err != 0); +} + +static int +zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) +{ + int errors = 0; + int i; + const char *tag; + boolean_t recursive = B_FALSE; + boolean_t temphold = B_FALSE; + const char *opts = holding ? 
"rt" : "r"; + int c; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case 't': + temphold = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 2) + usage(B_FALSE); + + tag = argv[0]; + --argc; + ++argv; + + if (holding && tag[0] == '.') { + /* tags starting with '.' are reserved for libzfs */ + (void) fprintf(stderr, gettext("tag may not start with '.'\n")); + usage(B_FALSE); + } + + for (i = 0; i < argc; ++i) { + zfs_handle_t *zhp; + char parent[ZFS_MAXNAMELEN]; + const char *delim; + char *path = argv[i]; + + delim = strchr(path, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), path); + ++errors; + continue; + } + (void) strncpy(parent, path, delim - path); + parent[delim - path] = '\0'; + + zhp = zfs_open(g_zfs, parent, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + ++errors; + continue; + } + if (holding) { + if (zfs_hold(zhp, delim+1, tag, recursive, + temphold, B_FALSE, -1, 0, 0) != 0) + ++errors; + } else { + if (zfs_release(zhp, delim+1, tag, recursive) != 0) + ++errors; + } + zfs_close(zhp); + } + + return (errors != 0); +} + +/* + * zfs hold [-r] [-t] ... + * + * -r Recursively hold + * -t Temporary hold (hidden option) + * + * Apply a user-hold with the given tag to the list of snapshots. + */ +static int +zfs_do_hold(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); +} + +/* + * zfs release [-r] ... + * + * -r Recursively release + * + * Release a user-hold with the given tag from the list of snapshots. 
+ */ +static int +zfs_do_release(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); +} + +#define CHECK_SPINNER 30 +#define SPINNER_TIME 3 /* seconds */ +#define MOUNT_TIME 5 /* seconds */ + +static int +get_one_dataset(zfs_handle_t *zhp, void *data) +{ + static char *spin[] = { "-", "\\", "|", "/" }; + static int spinval = 0; + static int spincheck = 0; + static time_t last_spin_time = (time_t)0; + get_all_cb_t *cbp = data; + zfs_type_t type = zfs_get_type(zhp); + + if (cbp->cb_verbose) { + if (--spincheck < 0) { + time_t now = time(NULL); + if (last_spin_time + SPINNER_TIME < now) { + update_progress(spin[spinval++ % 4]); + last_spin_time = now; + } + spincheck = CHECK_SPINNER; + } + } + + /* + * Interate over any nested datasets. + */ + if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { + zfs_close(zhp); + return (1); + } + + /* + * Skip any datasets whose type does not match. + */ + if ((type & ZFS_TYPE_FILESYSTEM) == 0) { + zfs_close(zhp); + return (0); + } + libzfs_add_handle(cbp, zhp); + assert(cbp->cb_used <= cbp->cb_alloc); + + return (0); +} + +static void +get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose) +{ + get_all_cb_t cb = { 0 }; + cb.cb_verbose = verbose; + cb.cb_getone = get_one_dataset; + + if (verbose) + set_progress_header(gettext("Reading ZFS config")); + (void) zfs_iter_root(g_zfs, get_one_dataset, &cb); + + *dslist = cb.cb_handles; + *count = cb.cb_used; + + if (verbose) + finish_progress(gettext("done.")); +} + +/* + * Generic callback for sharing or mounting filesystems. Because the code is so + * similar, we have a common function with an extra parameter to determine which + * mode we are using. + */ +#define OP_SHARE 0x1 +#define OP_MOUNT 0x2 + +/* + * Share or mount a dataset. 
+ */ +static int +share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, + boolean_t explicit, const char *options) +{ + char mountpoint[ZFS_MAXPROPLEN]; + char shareopts[ZFS_MAXPROPLEN]; + char smbshareopts[ZFS_MAXPROPLEN]; + const char *cmdname = op == OP_SHARE ? "share" : "mount"; + struct mnttab mnt; + uint64_t zoned, canmount; + boolean_t shared_nfs, shared_smb; + + assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); + + /* + * Check to make sure we can mount/share this dataset. If we + * are in the global zone and the filesystem is exported to a + * local zone, or if we are in a local zone and the + * filesystem is not exported, then it is an error. + */ + zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + + if (zoned && getzoneid() == GLOBAL_ZONEID) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "dataset is exported to a local zone\n"), cmdname, + zfs_get_name(zhp)); + return (1); + + } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "permission denied\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } + + /* + * Ignore any filesystems which don't apply to us. This + * includes those with a legacy mountpoint, or those with + * legacy share options. 
+ */ + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, + sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, + sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); + + if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && + strcmp(smbshareopts, "off") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot share '%s': " + "legacy share\n"), zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use share(1M) to " + "share this filesystem, or set " + "sharenfs property on\n")); + return (1); + } + + /* + * We cannot share or mount legacy filesystems. If the + * shareopts is non-legacy but the mountpoint is legacy, we + * treat it as a legacy share. + */ + if (strcmp(mountpoint, "legacy") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use %s(1M) to " + "%s this filesystem\n"), cmdname, cmdname); + return (1); + } + + if (strcmp(mountpoint, "none") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': no " + "mountpoint set\n"), cmdname, zfs_get_name(zhp)); + return (1); + } + + /* + * canmount explicit outcome + * on no pass through + * on yes pass through + * off no return 0 + * off yes display error, return 1 + * noauto no return 0 + * noauto yes pass through + */ + canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); + if (canmount == ZFS_CANMOUNT_OFF) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "'canmount' property is set to 'off'\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { + return (0); + } + + /* + * At this point, we have verified that the mountpoint and/or + * shareopts are 
appropriate for auto management. If the + * filesystem is already mounted or shared, return (failing + * for explicit requests); otherwise mount or share the + * filesystem. + */ + switch (op) { + case OP_SHARE: + + shared_nfs = zfs_is_shared_nfs(zhp, NULL); + shared_smb = zfs_is_shared_smb(zhp, NULL); + + if (shared_nfs && shared_smb || + (shared_nfs && strcmp(shareopts, "on") == 0 && + strcmp(smbshareopts, "off") == 0) || + (shared_smb && strcmp(smbshareopts, "on") == 0 && + strcmp(shareopts, "off") == 0)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot share " + "'%s': filesystem already shared\n"), + zfs_get_name(zhp)); + return (1); + } + + if (!zfs_is_mounted(zhp, NULL) && + zfs_mount(zhp, NULL, 0) != 0) + return (1); + + if (protocol == NULL) { + if (zfs_shareall(zhp) != 0) + return (1); + } else if (strcmp(protocol, "nfs") == 0) { + if (zfs_share_nfs(zhp)) + return (1); + } else if (strcmp(protocol, "smb") == 0) { + if (zfs_share_smb(zhp)) + return (1); + } else { + (void) fprintf(stderr, gettext("cannot share " + "'%s': invalid share type '%s' " + "specified\n"), + zfs_get_name(zhp), protocol); + return (1); + } + + break; + + case OP_MOUNT: + if (options == NULL) + mnt.mnt_mntopts = ""; + else + mnt.mnt_mntopts = (char *)options; + + if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && + zfs_is_mounted(zhp, NULL)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot mount " + "'%s': filesystem already mounted\n"), + zfs_get_name(zhp)); + return (1); + } + + if (zfs_mount(zhp, options, flags) != 0) + return (1); + break; + } + + return (0); +} + +/* + * Reports progress in the form "(current/total)". Not thread-safe. 
+ */ +static void +report_mount_progress(int current, int total) +{ + static time_t last_progress_time = 0; + time_t now = time(NULL); + char info[32]; + + /* report 1..n instead of 0..n-1 */ + ++current; + + /* display header if we're here for the first time */ + if (current == 1) { + set_progress_header(gettext("Mounting ZFS filesystems")); + } else if (current != total && last_progress_time + MOUNT_TIME >= now) { + /* too soon to report again */ + return; + } + + last_progress_time = now; + + (void) sprintf(info, "(%d/%d)", current, total); + + if (current == total) + finish_progress(info); + else + update_progress(info); +} + +static void +append_options(char *mntopts, char *newopts) +{ + int len = strlen(mntopts); + + /* original length plus new string to append plus 1 for the comma */ + if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { + (void) fprintf(stderr, gettext("the opts argument for " + "'%c' option is too long (more than %d chars)\n"), + "-o", MNT_LINE_MAX); + usage(B_FALSE); + } + + if (*mntopts) + mntopts[len++] = ','; + + (void) strcpy(&mntopts[len], newopts); +} + +static int +share_mount(int op, int argc, char **argv) +{ + int do_all = 0; + boolean_t verbose = B_FALSE; + int c, ret = 0; + char *options = NULL; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":avo:O" : "a")) + != -1) { + switch (c) { + case 'a': + do_all = 1; + break; + case 'v': + verbose = B_TRUE; + break; + case 'o': + if (*optarg == '\0') { + (void) fprintf(stderr, gettext("empty mount " + "options (-o) specified\n")); + usage(B_FALSE); + } + + if (options == NULL) + options = safe_malloc(MNT_LINE_MAX + 1); + + /* option validation is done later */ + append_options(options, optarg); + break; + + case 'O': + flags |= MS_OVERLAY; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (do_all) { + zfs_handle_t **dslist = NULL; + size_t i, count = 0; + char *protocol = NULL; + + if (op == OP_SHARE && argc > 0) { + if (strcmp(argv[0], "nfs") != 0 && + strcmp(argv[0], "smb") != 0) { + (void) fprintf(stderr, gettext("share type " + "must be 'nfs' or 'smb'\n")); + usage(B_FALSE); + } + protocol = argv[0]; + argc--; + argv++; + } + + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + start_progress_timer(); + get_all_datasets(&dslist, &count, verbose); + + if (count == 0) + return (0); + + qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp); + + for (i = 0; i < count; i++) { + if (verbose) + report_mount_progress(i, count); + + if (share_mount_one(dslist[i], op, flags, protocol, + B_FALSE, options) != 0) + ret = 1; + zfs_close(dslist[i]); + } + + free(dslist); + } else if (argc == 0) { + struct mnttab entry; + + if ((op == OP_SHARE) || (options != NULL)) { + (void) fprintf(stderr, gettext("missing filesystem " + "argument (specify -a for all)\n")); + usage(B_FALSE); + } + + /* + * When mount is given no arguments, go through /etc/mnttab and + * display any active ZFS mounts. 
We hide any snapshots, since + * they are controlled automatically. + */ + rewind(mnttab_file); + while (getmntent(mnttab_file, &entry) == 0) { + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || + strchr(entry.mnt_special, '@') != NULL) + continue; + + (void) printf("%-30s %s\n", entry.mnt_special, + entry.mnt_mountp); + } + + } else { + zfs_handle_t *zhp; + + if (argc > 1) { + (void) fprintf(stderr, + gettext("too many arguments\n")); + usage(B_FALSE); + } + + if ((zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM)) == NULL) { + ret = 1; + } else { + ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, + options); + zfs_close(zhp); + } + } + + return (ret); +} + +/* + * zfs mount -a [nfs] + * zfs mount filesystem + * + * Mount all filesystems, or mount the given filesystem. + */ +static int +zfs_do_mount(int argc, char **argv) +{ + return (share_mount(OP_MOUNT, argc, argv)); +} + +/* + * zfs share -a [nfs | smb] + * zfs share filesystem + * + * Share all filesystems, or share the given filesystem. + */ +static int +zfs_do_share(int argc, char **argv) +{ + return (share_mount(OP_SHARE, argc, argv)); +} + +typedef struct unshare_unmount_node { + zfs_handle_t *un_zhp; + char *un_mountp; + uu_avl_node_t un_avlnode; +} unshare_unmount_node_t; + +/* ARGSUSED */ +static int +unshare_unmount_compare(const void *larg, const void *rarg, void *unused) +{ + const unshare_unmount_node_t *l = larg; + const unshare_unmount_node_t *r = rarg; + + return (strcmp(l->un_mountp, r->un_mountp)); +} + +/* + * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an + * absolute path, find the entry /etc/mnttab, verify that its a ZFS filesystem, + * and unmount it appropriately. + */ +static int +unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) +{ + zfs_handle_t *zhp; + int ret; + struct stat64 statbuf; + struct extmnttab entry; + const char *cmdname = (op == OP_SHARE) ? 
"unshare" : "unmount"; + ino_t path_inode; + + /* + * Search for the path in /etc/mnttab. Rather than looking for the + * specific path, which can be fooled by non-standard paths (i.e. ".." + * or "//"), we stat() the path and search for the corresponding + * (major,minor) device pair. + */ + if (stat64(path, &statbuf) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), + cmdname, path, strerror(errno)); + return (1); + } + path_inode = statbuf.st_ino; + + /* + * Search for the given (major,minor) pair in the mount table. + */ + rewind(mnttab_file); + while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) { + if (entry.mnt_major == major(statbuf.st_dev) && + entry.mnt_minor == minor(statbuf.st_dev)) + break; + } + if (ret != 0) { + if (op == OP_SHARE) { + (void) fprintf(stderr, gettext("cannot %s '%s': not " + "currently mounted\n"), cmdname, path); + return (1); + } + (void) fprintf(stderr, gettext("warning: %s not in mnttab\n"), + path); + if ((ret = umount2(path, flags)) != 0) + (void) fprintf(stderr, gettext("%s: %s\n"), path, + strerror(errno)); + return (ret != 0); + } + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " + "filesystem\n"), cmdname, path); + return (1); + } + + if ((zhp = zfs_open(g_zfs, entry.mnt_special, + ZFS_TYPE_FILESYSTEM)) == NULL) + return (1); + + ret = 1; + if (stat64(entry.mnt_mountp, &statbuf) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), + cmdname, path, strerror(errno)); + goto out; + } else if (statbuf.st_ino != path_inode) { + (void) fprintf(stderr, gettext("cannot " + "%s '%s': not a mountpoint\n"), cmdname, path); + goto out; + } + + if (op == OP_SHARE) { + char nfs_mnt_prop[ZFS_MAXPROPLEN]; + char smbshare_prop[ZFS_MAXPROPLEN]; + + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, + sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, + sizeof 
(smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); + + if (strcmp(nfs_mnt_prop, "off") == 0 && + strcmp(smbshare_prop, "off") == 0) { + (void) fprintf(stderr, gettext("cannot unshare " + "'%s': legacy share\n"), path); + (void) fprintf(stderr, gettext("use " + "unshare(1M) to unshare this filesystem\n")); + } else if (!zfs_is_shared(zhp)) { + (void) fprintf(stderr, gettext("cannot unshare '%s': " + "not currently shared\n"), path); + } else { + ret = zfs_unshareall_bypath(zhp, path); + } + } else { + char mtpt_prop[ZFS_MAXPROPLEN]; + + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, + sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); + + if (is_manual) { + ret = zfs_unmount(zhp, NULL, flags); + } else if (strcmp(mtpt_prop, "legacy") == 0) { + (void) fprintf(stderr, gettext("cannot unmount " + "'%s': legacy mountpoint\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use umount(1M) " + "to unmount this filesystem\n")); + } else { + ret = zfs_unmountall(zhp, flags); + } + } + +out: + zfs_close(zhp); + + return (ret != 0); +} + +/* + * Generic callback for unsharing or unmounting a filesystem. + */ +static int +unshare_unmount(int op, int argc, char **argv) +{ + int do_all = 0; + int flags = 0; + int ret = 0; + int c; + zfs_handle_t *zhp; + char nfs_mnt_prop[ZFS_MAXPROPLEN]; + char sharesmb[ZFS_MAXPROPLEN]; + + /* check options */ + while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) { + switch (c) { + case 'a': + do_all = 1; + break; + case 'f': + flags = MS_FORCE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (do_all) { + /* + * We could make use of zfs_for_each() to walk all datasets in + * the system, but this would be very inefficient, especially + * since we would have to linearly search /etc/mnttab for each + * one. 
Instead, do one pass through /etc/mnttab looking for + * zfs entries and call zfs_unmount() for each one. + * + * Things get a little tricky if the administrator has created + * mountpoints beneath other ZFS filesystems. In this case, we + * have to unmount the deepest filesystems first. To accomplish + * this, we place all the mountpoints in an AVL tree sorted by + * the special type (dataset name), and walk the result in + * reverse to make sure to get any snapshots first. + */ + struct mnttab entry; + uu_avl_pool_t *pool; + uu_avl_t *tree; + unshare_unmount_node_t *node; + uu_avl_index_t idx; + uu_avl_walk_t *walk; + + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (((pool = uu_avl_pool_create("unmount_pool", + sizeof (unshare_unmount_node_t), + offsetof(unshare_unmount_node_t, un_avlnode), + unshare_unmount_compare, UU_DEFAULT)) == NULL) || + ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) + nomem(); + + rewind(mnttab_file); + while (getmntent(mnttab_file, &entry) == 0) { + + /* ignore non-ZFS entries */ + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + + /* ignore snapshots */ + if (strchr(entry.mnt_special, '@') != NULL) + continue; + + if ((zhp = zfs_open(g_zfs, entry.mnt_special, + ZFS_TYPE_FILESYSTEM)) == NULL) { + ret = 1; + continue; + } + + switch (op) { + case OP_SHARE: + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "off") != 0) + break; + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "off") == 0) + continue; + break; + case OP_MOUNT: + /* Ignore legacy mounts */ + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "legacy") == 0) + continue; + /* Ignore canmount=noauto mounts */ + 
if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == + ZFS_CANMOUNT_NOAUTO) + continue; + default: + break; + } + + node = safe_malloc(sizeof (unshare_unmount_node_t)); + node->un_zhp = zhp; + node->un_mountp = safe_strdup(entry.mnt_mountp); + + uu_avl_node_init(node, &node->un_avlnode, pool); + + if (uu_avl_find(tree, node, NULL, &idx) == NULL) { + uu_avl_insert(tree, node, idx); + } else { + zfs_close(node->un_zhp); + free(node->un_mountp); + free(node); + } + } + + /* + * Walk the AVL tree in reverse, unmounting each filesystem and + * removing it from the AVL tree in the process. + */ + if ((walk = uu_avl_walk_start(tree, + UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(tree, node); + + switch (op) { + case OP_SHARE: + if (zfs_unshareall_bypath(node->un_zhp, + node->un_mountp) != 0) + ret = 1; + break; + + case OP_MOUNT: + if (zfs_unmount(node->un_zhp, + node->un_mountp, flags) != 0) + ret = 1; + break; + } + + zfs_close(node->un_zhp); + free(node->un_mountp); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(tree); + uu_avl_pool_destroy(pool); + + } else { + if (argc != 1) { + if (argc == 0) + (void) fprintf(stderr, + gettext("missing filesystem argument\n")); + else + (void) fprintf(stderr, + gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* + * We have an argument, but it may be a full path or a ZFS + * filesystem. Pass full paths off to unmount_path() (shared by + * manual_unmount), otherwise open the filesystem and pass to + * zfs_unmount(). + */ + if (argv[0][0] == '/') + return (unshare_unmount_path(op, argv[0], + flags, B_FALSE)); + + if ((zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM)) == NULL) + return (1); + + verify(zfs_prop_get(zhp, op == OP_SHARE ? 
+ ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, + nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, + NULL, 0, B_FALSE) == 0); + + switch (op) { + case OP_SHARE: + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, + sharesmb, sizeof (sharesmb), NULL, NULL, + 0, B_FALSE) == 0); + + if (strcmp(nfs_mnt_prop, "off") == 0 && + strcmp(sharesmb, "off") == 0) { + (void) fprintf(stderr, gettext("cannot " + "unshare '%s': legacy share\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use " + "unshare(1M) to unshare this " + "filesystem\n")); + ret = 1; + } else if (!zfs_is_shared(zhp)) { + (void) fprintf(stderr, gettext("cannot " + "unshare '%s': not currently " + "shared\n"), zfs_get_name(zhp)); + ret = 1; + } else if (zfs_unshareall(zhp) != 0) { + ret = 1; + } + break; + + case OP_MOUNT: + if (strcmp(nfs_mnt_prop, "legacy") == 0) { + (void) fprintf(stderr, gettext("cannot " + "unmount '%s': legacy " + "mountpoint\n"), zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use " + "umount(1M) to unmount this " + "filesystem\n")); + ret = 1; + } else if (!zfs_is_mounted(zhp, NULL)) { + (void) fprintf(stderr, gettext("cannot " + "unmount '%s': not currently " + "mounted\n"), + zfs_get_name(zhp)); + ret = 1; + } else if (zfs_unmountall(zhp, flags) != 0) { + ret = 1; + } + break; + } + + zfs_close(zhp); + } + + return (ret); +} + +/* + * zfs unmount -a + * zfs unmount filesystem + * + * Unmount all filesystems, or a specific ZFS filesystem. + */ +static int +zfs_do_unmount(int argc, char **argv) +{ + return (unshare_unmount(OP_MOUNT, argc, argv)); +} + +/* + * zfs unshare -a + * zfs unshare filesystem + * + * Unshare all filesystems, or a specific ZFS filesystem. 
+ */ +static int +zfs_do_unshare(int argc, char **argv) +{ + return (unshare_unmount(OP_SHARE, argc, argv)); +} + +/* ARGSUSED */ +static int +zfs_do_python(int argc, char **argv) +{ + (void) execv(pypath, argv-1); + (void) printf("internal error: %s not found\n", pypath); + return (-1); +} + +/* + * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is + * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. + */ +static int +manual_mount(int argc, char **argv) +{ + zfs_handle_t *zhp; + char mountpoint[ZFS_MAXPROPLEN]; + char mntopts[MNT_LINE_MAX] = { '\0' }; + int ret; + int c; + int flags = 0; + char *dataset, *path; + + /* check options */ + while ((c = getopt(argc, argv, ":mo:O")) != -1) { + switch (c) { + case 'o': + (void) strlcpy(mntopts, optarg, sizeof (mntopts)); + break; + case 'O': + flags |= MS_OVERLAY; + break; + case 'm': + flags |= MS_NOMNTTAB; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + (void) fprintf(stderr, gettext("usage: mount [-o opts] " + "\n")); + return (2); + } + } + + argc -= optind; + argv += optind; + + /* check that we only have two arguments */ + if (argc != 2) { + if (argc == 0) + (void) fprintf(stderr, gettext("missing dataset " + "argument\n")); + else if (argc == 1) + (void) fprintf(stderr, + gettext("missing mountpoint argument\n")); + else + (void) fprintf(stderr, gettext("too many arguments\n")); + (void) fprintf(stderr, "usage: mount \n"); + return (2); + } + + dataset = argv[0]; + path = argv[1]; + + /* try to open the dataset */ + if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL) + return (1); + + (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, B_FALSE); + + /* check for legacy mountpoint and complain appropriately */ + ret = 0; + if (strcmp(mountpoint, 
ZFS_MOUNTPOINT_LEGACY) == 0) {
+		if (mount(dataset, path, MS_OPTIONSTR | flags, MNTTYPE_ZFS,
+		    NULL, 0, mntopts, sizeof (mntopts)) != 0) {
+			(void) fprintf(stderr, gettext("mount failed: %s\n"),
+			    strerror(errno));
+			ret = 1;
+		}
+	} else {
+		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+		    "mounted using 'mount -F zfs'\n"), dataset);
+		(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
+		    "instead.\n"), path);
+		(void) fprintf(stderr, gettext("If you must use 'mount -F zfs' "
+		    "or /etc/vfstab, use 'zfs set mountpoint=legacy'.\n"));
+		(void) fprintf(stderr, gettext("See zfs(1M) for more "
+		    "information.\n"));
+		ret = 1;
+	}
+
+	return (ret);
+}
+
+/*
+ * Called when invoked as /etc/fs/zfs/umount. Unlike a manual mount, we allow
+ * unmounts of non-legacy filesystems, as this is the dominant administrative
+ * interface.
+ *
+ * Accepts an optional '-f' (passed on as MS_FORCE) followed by exactly one
+ * path. Returns 2 on a usage error, otherwise whatever
+ * unshare_unmount_path() returns.
+ */
+static int
+manual_unmount(int argc, char **argv)
+{
+	int flags = 0;
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			flags = MS_FORCE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			(void) fprintf(stderr, gettext("usage: unmount [-f] "
+			    "<path>\n"));
+			return (2);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	if (argc != 1) {
+		if (argc == 0)
+			(void) fprintf(stderr, gettext("missing path "
+			    "argument\n"));
+		else
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+		(void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));
+		return (2);
+	}
+
+	return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE));
+}
+
+/*
+ * Look up 'command' in command_table. On a match, store the table index in
+ * '*idx' and return 0; return 1 if no entry matches. Entries whose name is
+ * NULL are skipped.
+ */
+static int
+find_command_idx(char *command, int *idx)
+{
+	int i;
+
+	for (i = 0; i < NCOMMAND; i++) {
+		if (command_table[i].name == NULL)
+			continue;
+
+		if (strcmp(command, command_table[i].name) == 0) {
+			*idx = i;
+			return (0);
+		}
+	}
+	return (1);
+}
+
+/*
+ * Handler for the 'diff' subcommand. Takes the options -F, -H and -t and one
+ * or two positional arguments. The filesystem to open is the part before the
+ * '@' of the first argument, or of the second argument when the first one
+ * starts with '@'. Returns 0 on success and 1 on failure.
+ */
+static int
+zfs_do_diff(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	int flags = 0;
+	char *tosnap = NULL;
+	char *fromsnap = NULL;
+	char *atp, *copy;
+	int err;
+	int c;
+
+	while ((c = getopt(argc, argv, "FHt")) != -1) {
+		switch (c) {
+		case 'F':
+			flags |= ZFS_DIFF_CLASSIFY;
+			break;
+		case 'H':
+			flags |= ZFS_DIFF_PARSEABLE;
+			break;
+		case 't':
+			flags |= ZFS_DIFF_TIMESTAMP;
+			break;
+		default:
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr,
+		    gettext("must provide at least one snapshot name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	fromsnap = argv[0];
+	tosnap = (argc == 2) ? argv[1] : NULL;
+
+	copy = NULL;
+	if (*fromsnap != '@')
+		copy = strdup(fromsnap);
+	else if (tosnap)
+		copy = strdup(tosnap);
+	if (copy == NULL)
+		usage(B_FALSE);
+
+	/* Keep only the filesystem part of the name. */
+	if ((atp = strchr(copy, '@')) != NULL)
+		*atp = '\0';
+
+	zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM);
+
+	/* The copy is not needed again, whether or not the open succeeded. */
+	free(copy);
+
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * Ignore SIGPIPE so that the library can give us
+	 * information on any failure
+	 */
+	(void) sigignore(SIGPIPE);
+
+	err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
+
+	zfs_close(zhp);
+
+	return (err != 0);
+}
+
+int
+main(int argc, char **argv)
+{
+	int ret;
+	int i;
+	char *progname;
+	char *cmdname;
+
+	(void) setlocale(LC_ALL, "");
+	(void) textdomain(TEXT_DOMAIN);
+
+	opterr = 0;
+
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, gettext("internal error: failed to "
+		    "initialize ZFS library\n"));
+		return (1);
+	}
+
+	zpool_set_history_str("zfs", argc, argv, history_str);
+	verify(zpool_stage_history(g_zfs, history_str) == 0);
+
+	libzfs_print_on_error(g_zfs, B_TRUE);
+
+	if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) {
+		(void) fprintf(stderr, gettext("internal error: unable to "
+		    "open %s\n"), MNTTAB);
+		/* Release the library handle taken above before bailing out. */
+		libzfs_fini(g_zfs);
+		return (1);
+	}
+
+	/*
+	 * This command also doubles as the /etc/fs mount and unmount program.
+	 * Determine if we should take this behavior based on argv[0].
+	 */
+	progname = basename(argv[0]);
+	if (strcmp(progname, "mount") == 0) {
+		ret = manual_mount(argc, argv);
+	} else if (strcmp(progname, "umount") == 0) {
+		ret = manual_unmount(argc, argv);
+	} else {
+		/*
+		 * Make sure the user has specified some command.
+		 */
+		if (argc < 2) {
+			(void) fprintf(stderr, gettext("missing command\n"));
+			usage(B_FALSE);
+		}
+
+		cmdname = argv[1];
+
+		/*
+		 * The 'umount' command is an alias for 'unmount'
+		 */
+		if (strcmp(cmdname, "umount") == 0)
+			cmdname = "unmount";
+
+		/*
+		 * The 'recv' command is an alias for 'receive'
+		 */
+		if (strcmp(cmdname, "recv") == 0)
+			cmdname = "receive";
+
+		/*
+		 * Special case '-?'
+		 */
+		if (strcmp(cmdname, "-?") == 0)
+			usage(B_TRUE);
+
+		/*
+		 * Run the appropriate command.
+		 */
+		libzfs_mnttab_cache(g_zfs, B_TRUE);
+		if (find_command_idx(cmdname, &i) == 0) {
+			current_command = &command_table[i];
+			ret = command_table[i].func(argc - 1, argv + 1);
+		} else if (strchr(cmdname, '=') != NULL) {
+			verify(find_command_idx("set", &i) == 0);
+			current_command = &command_table[i];
+			ret = command_table[i].func(argc, argv);
+		} else {
+			(void) fprintf(stderr, gettext("unrecognized "
+			    "command '%s'\n"), cmdname);
+			usage(B_FALSE);
+		}
+		libzfs_mnttab_cache(g_zfs, B_FALSE);
+	}
+
+	(void) fclose(mnttab_file);
+
+	libzfs_fini(g_zfs);
+
+	/*
+	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+	 * for the purposes of running ::findleaks.
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	return (ret);
+}
diff --git a/cmd/zfs/zfs_util.h b/cmd/zfs/zfs_util.h
new file mode 100644
index 0000000..3ddff9e
--- /dev/null
+++ b/cmd/zfs/zfs_util.h
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_ZFS_UTIL_H
+#define	_ZFS_UTIL_H
+
+#include <libzfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+void * safe_malloc(size_t size);
+void nomem(void);
+
+/*
+ * Declaration only: a header that defines the object would give every
+ * including translation unit its own definition. The single definition is
+ * expected to live in one .c file of the command (NOTE(review): confirm that
+ * zfs_main.c defines g_zfs).
+ */
+extern libzfs_handle_t *g_zfs;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_UTIL_H */
diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c
new file mode 100644
index 0000000..87751e3
--- /dev/null
+++ b/cmd/zinject/translate.c
@@ -0,0 +1,494 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <libzfs.h>
+
+#include <sys/zfs_context.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/file.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/vdev_impl.h>
+
+#include <sys/mkdev.h>
+
+#include "zinject.h"
+
+extern void kernel_init(int);
+extern void kernel_fini(void);
+
+/* Non-zero enables ziprintf() output; set from ZINJECT_DEBUG. */
+static int debug;
+
+/*
+ * printf()-style debugging output to stdout. Prints nothing unless 'debug'
+ * is non-zero.
+ */
+static void
+ziprintf(const char *fmt, ...)
+{
+	va_list ap;
+
+	if (!debug)
+		return;
+
+	va_start(ap, fmt);
+	(void) vprintf(fmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Copy 'src' to 'dest', collapsing every run of consecutive '/' characters
+ * into a single '/'. The result is never longer than 'src', so 'dest' must
+ * have room for strlen(src) + 1 bytes; no bound is checked here.
+ */
+static void
+compress_slashes(const char *src, char *dest)
+{
+	while (*src != '\0') {
+		*dest = *src++;
+		/* Having just copied a '/', skip any that directly follow. */
+		while (*dest == '/' && *src == '/')
+			++src;
+		++dest;
+	}
+	*dest = '\0';
+}
+
+/*
+ * Given a full path to a file, translate into a dataset name and a relative
+ * path within the dataset. 'dataset' must be at least MAXNAMELEN characters,
+ * and 'relpath' must be at least MAXPATHLEN characters. We also pass a stat64
+ * buffer, which we need later to get the object ID.
+ */
+static int
+parse_pathname(const char *inpath, char *dataset, char *relpath,
+    struct stat64 *statbuf)
+{
+	struct extmnttab mp;
+	FILE *fp;
+	int match;
+	int ret = -1;
+	const char *rel;
+	char fullpath[MAXPATHLEN];
+
+	/*
+	 * Check the length before copying: compress_slashes() never makes the
+	 * string longer, but it does not bound its writes into 'fullpath'.
+	 */
+	if (strlen(inpath) >= MAXPATHLEN) {
+		(void) fprintf(stderr, "invalid object; pathname too long\n");
+		return (-1);
+	}
+
+	compress_slashes(inpath, fullpath);
+
+	if (fullpath[0] != '/') {
+		(void) fprintf(stderr, "invalid object '%s': must be full "
+		    "path\n", fullpath);
+		usage();
+		return (-1);
+	}
+
+	if (stat64(fullpath, statbuf) != 0) {
+		(void) fprintf(stderr, "cannot open '%s': %s\n",
+		    fullpath, strerror(errno));
+		return (-1);
+	}
+
+	if ((fp = fopen(MNTTAB, "r")) == NULL) {
+		(void) fprintf(stderr, "cannot open /etc/mnttab\n");
+		return (-1);
+	}
+
+	/* Find the mnttab entry whose device matches the file's device. */
+	match = 0;
+	while (getextmntent(fp, &mp, sizeof (mp)) == 0) {
+		if (makedev(mp.mnt_major, mp.mnt_minor) == statbuf->st_dev) {
+			match = 1;
+			break;
+		}
+	}
+
+	if (!match) {
+		(void) fprintf(stderr, "cannot find mountpoint for '%s'\n",
+		    fullpath);
+		goto out;
+	}
+
+	if (strcmp(mp.mnt_fstype, MNTTYPE_ZFS) != 0) {
+		(void) fprintf(stderr, "invalid path '%s': not a ZFS "
+		    "filesystem\n", fullpath);
+		goto out;
+	}
+
+	if (strncmp(fullpath, mp.mnt_mountp, strlen(mp.mnt_mountp)) != 0) {
+		(void) fprintf(stderr, "invalid path '%s': mountpoint "
+		    "doesn't match path\n", fullpath);
+		goto out;
+	}
+
+	(void) strcpy(dataset, mp.mnt_special);
+
+	rel = fullpath + strlen(mp.mnt_mountp);
+	if (rel[0] == '/')
+		rel++;
+	(void) strcpy(relpath, rel);
+
+	ret = 0;
+out:
+	/*
+	 * Every path that opened MNTTAB comes through here. The stream is
+	 * closed only after the last use of 'mp'.
+	 */
+	(void) fclose(fp);
+	return (ret);
+}
+
+/*
+ * Convert from a (dataset, path) pair into a (objset, object) pair. Note that
+ * we grab the object number from the inode number, since looking this up via
+ * libzpool is a real pain.
+ */
+/* ARGSUSED */
+static int
+object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
+    zinject_record_t *record)
+{
+	objset_t *os;
+	int err;
+
+	/*
+	 * Before doing any libzpool operations, call sync() to ensure that the
+	 * on-disk state is consistent with the in-core state.
+	 */
+	sync();
+
+	err = dmu_objset_own(dataset, DMU_OST_ZFS, B_TRUE, FTAG, &os);
+	if (err != 0) {
+		(void) fprintf(stderr, "cannot open dataset '%s': %s\n",
+		    dataset, strerror(err));
+		return (-1);
+	}
+
+	record->zi_objset = dmu_objset_id(os);
+	record->zi_object = statbuf->st_ino;
+
+	dmu_objset_disown(os, FTAG);
+
+	return (0);
+}
+
+/*
+ * Calculate the real range based on the type, level, and range given.
+ *
+ * On success returns 0 with zi_start, zi_end and zi_level filled in (and
+ * zi_object reset to 0 for TYPE_DNODE); on failure prints a message and
+ * returns -1.
+ */
+static int
+calculate_range(const char *dataset, err_type_t type, int level, char *range,
+    zinject_record_t *record)
+{
+	objset_t *os = NULL;
+	dnode_t *dn = NULL;
+	int err;
+	int ret = -1;
+
+	/*
+	 * Determine the numeric range from the string.
+	 */
+	if (range == NULL) {
+		/*
+		 * If range is unspecified, set the range to [0,-1], which
+		 * indicates that the whole object should be treated as an
+		 * error.
+		 */
+		record->zi_start = 0;
+		record->zi_end = -1ULL;
+	} else {
+		char *end;
+
+		/* XXX add support for suffixes */
+		record->zi_start = strtoull(range, &end, 10);
+
+
+		if (*end == '\0')
+			record->zi_end = record->zi_start + 1;
+		else if (*end == ',')
+			record->zi_end = strtoull(end + 1, &end, 10);
+
+		if (*end != '\0') {
+			(void) fprintf(stderr, "invalid range '%s': must be "
+			    "a numeric range of the form 'start[,end]'\n",
+			    range);
+			goto out;
+		}
+	}
+
+	switch (type) {
+	case TYPE_DATA:
+		break;
+
+	case TYPE_DNODE:
+		/*
+		 * If this is a request to inject faults into the dnode, then we
+		 * must translate the current (objset,object) pair into an
+		 * offset within the metadnode for the objset. Specifying any
+		 * kind of range with type 'dnode' is illegal.
+		 */
+		if (range != NULL) {
+			(void) fprintf(stderr, "range cannot be specified when "
+			    "type is 'dnode'\n");
+			goto out;
+		}
+
+		record->zi_start = record->zi_object * sizeof (dnode_phys_t);
+		record->zi_end = record->zi_start + sizeof (dnode_phys_t);
+		record->zi_object = 0;
+		break;
+	}
+
+	/*
+	 * Get the dnode associated with object, so we can calculate the block
+	 * size.
+	 */
+	if ((err = dmu_objset_own(dataset, DMU_OST_ANY,
+	    B_TRUE, FTAG, &os)) != 0) {
+		(void) fprintf(stderr, "cannot open dataset '%s': %s\n",
+		    dataset, strerror(err));
+		goto out;
+	}
+
+	if (record->zi_object == 0) {
+		dn = DMU_META_DNODE(os);
+	} else {
+		err = dnode_hold(os, record->zi_object, FTAG, &dn);
+		if (err != 0) {
+			(void) fprintf(stderr, "failed to hold dnode "
+			    "for object %llu\n",
+			    (u_longlong_t)record->zi_object);
+			goto out;
+		}
+	}
+
+
+	ziprintf("data shift: %d\n", (int)dn->dn_datablkshift);
+	ziprintf(" ind shift: %d\n", (int)dn->dn_indblkshift);
+
+	/*
+	 * Translate range into block IDs.
+	 */
+	if (record->zi_start != 0 || record->zi_end != -1ULL) {
+		record->zi_start >>= dn->dn_datablkshift;
+		record->zi_end >>= dn->dn_datablkshift;
+	}
+
+	/*
+	 * Check level, and then translate level 0 blkids into ranges
+	 * appropriate for level of indirection.
+	 */
+	record->zi_level = level;
+	if (level > 0) {
+		/* Cast for %llu, as the other printing code in this file does. */
+		ziprintf("level 0 blkid range: [%llu, %llu]\n",
+		    (u_longlong_t)record->zi_start,
+		    (u_longlong_t)record->zi_end);
+
+		if (level >= dn->dn_nlevels) {
+			(void) fprintf(stderr, "level %d exceeds max level "
+			    "of object (%d)\n", level, dn->dn_nlevels - 1);
+			goto out;
+		}
+
+		if (record->zi_start != 0 || record->zi_end != 0) {
+			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+			for (; level > 0; level--) {
+				record->zi_start >>= shift;
+				record->zi_end >>= shift;
+			}
+		}
+	}
+
+	ret = 0;
+out:
+	if (dn) {
+		/* The meta dnode was not obtained through dnode_hold(). */
+		if (dn != DMU_META_DNODE(os))
+			dnode_rele(dn, FTAG);
+	}
+	if (os)
+		dmu_objset_disown(os, FTAG);
+
+	return (ret);
+}
+
+/*
+ * Fill in 'record' for an injection of the given 'type' into 'object'.
+ *
+ * For MOS types, 'object' is taken to be the pool name: it is copied to
+ * 'poolname', 'dataset' is set to the empty string and only zi_type is set.
+ * For all other types, 'object' is a full path; it is resolved to a dataset
+ * (stored in 'dataset'), an (objset, object) pair and a block range, and
+ * 'poolname' receives the part of the dataset name before the first '/'.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int
+translate_record(err_type_t type, const char *object, const char *range,
+    int level, zinject_record_t *record, char *poolname, char *dataset)
+{
+	char path[MAXPATHLEN];
+	char *slash;
+	struct stat64 statbuf;
+	int ret = -1;
+
+	kernel_init(FREAD);
+
+	debug = (getenv("ZINJECT_DEBUG") != NULL);
+
+	ziprintf("translating: %s\n", object);
+
+	if (MOS_TYPE(type)) {
+		/*
+		 * MOS objects are treated specially.
+		 */
+		switch (type) {
+		case TYPE_MOS:
+			record->zi_type = 0;
+			break;
+		case TYPE_MOSDIR:
+			record->zi_type = DMU_OT_OBJECT_DIRECTORY;
+			break;
+		case TYPE_METASLAB:
+			record->zi_type = DMU_OT_OBJECT_ARRAY;
+			break;
+		case TYPE_CONFIG:
+			record->zi_type = DMU_OT_PACKED_NVLIST;
+			break;
+		case TYPE_BPOBJ:
+			record->zi_type = DMU_OT_BPOBJ;
+			break;
+		case TYPE_SPACEMAP:
+			record->zi_type = DMU_OT_SPACE_MAP;
+			break;
+		case TYPE_ERRLOG:
+			record->zi_type = DMU_OT_ERROR_LOG;
+			break;
+		}
+
+		dataset[0] = '\0';
+		(void) strcpy(poolname, object);
+
+		/*
+		 * Leave through the common exit so that kernel_fini() pairs
+		 * with the kernel_init() above, as on every other path.
+		 */
+		ret = 0;
+		goto err;
+	}
+
+	/*
+	 * Convert a full path into a (dataset, file) pair.
+	 */
+	if (parse_pathname(object, dataset, path, &statbuf) != 0)
+		goto err;
+
+	ziprintf(" dataset: %s\n", dataset);
+	ziprintf(" path: %s\n", path);
+
+	/*
+	 * Convert (dataset, file) into (objset, object)
+	 */
+	if (object_from_path(dataset, path, &statbuf, record) != 0)
+		goto err;
+
+	/* Cast for %llu, as the other printing code in this file does. */
+	ziprintf("raw objset: %llu\n", (u_longlong_t)record->zi_objset);
+	ziprintf("raw object: %llu\n", (u_longlong_t)record->zi_object);
+
+	/*
+	 * For the given object, calculate the real (type, level, range)
+	 */
+	if (calculate_range(dataset, type, level, (char *)range, record) != 0)
+		goto err;
+
+	ziprintf(" objset: %llu\n", (u_longlong_t)record->zi_objset);
+	ziprintf(" object: %llu\n", (u_longlong_t)record->zi_object);
+	if (record->zi_start == 0 &&
+	    record->zi_end == -1ULL)
+		ziprintf(" range: all\n");
+	else
+		ziprintf(" range: [%llu, %llu]\n",
+		    (u_longlong_t)record->zi_start,
+		    (u_longlong_t)record->zi_end);
+
+	/*
+	 * Copy the pool name
+	 */
+	(void) strcpy(poolname, dataset);
+	if ((slash = strchr(poolname, '/')) != NULL)
+		*slash = '\0';
+
+	ret = 0;
+
+err:
+	kernel_fini();
+	return (ret);
+}
+
+/*
+ * Parse a raw bookmark string into 'record'. Returns 0 on success, or -1
+ * (after printing a message) if 'str' does not contain all four fields.
+ */
+int
+translate_raw(const char *str, zinject_record_t *record)
+{
+	/*
+	 * A raw bookmark of the form objset:object:level:blkid, where each
+	 * number is a hexadecimal value.
+	 */
+	if (sscanf(str, "%llx:%llx:%x:%llx", (u_longlong_t *)&record->zi_objset,
+	    (u_longlong_t *)&record->zi_object, &record->zi_level,
+	    (u_longlong_t *)&record->zi_start) != 4) {
+		(void) fprintf(stderr, "bad raw spec '%s': must be of the form "
+		    "'objset:object:level:blkid'\n", str);
+		return (-1);
+	}
+
+	/* Only a single block can be named, so the range is one blkid wide. */
+	record->zi_end = record->zi_start;
+
+	return (0);
+}
+
+/*
+ * Fill in 'record' for a device-level injection. 'device' is first tried as
+ * a hexadecimal GUID; if it does not parse completely, or parses as 0, it is
+ * looked up as a vdev name in 'pool'. For the label types, zi_start and
+ * zi_end are set to the byte range of the selected region within the label.
+ *
+ * Returns 0 on success and -1 if the pool cannot be opened or the device
+ * cannot be found.
+ */
+int
+translate_device(const char *pool, const char *device, err_type_t label_type,
+    zinject_record_t *record)
+{
+	char *end;
+	zpool_handle_t *zhp;
+	nvlist_t *tgt;
+	boolean_t isspare, iscache;
+
+	/*
+	 * Given a device name or GUID, create an appropriate injection record
+	 * with zi_guid set.
+	 */
+	if ((zhp = zpool_open(g_zfs, pool)) == NULL)
+		return (-1);
+
+	record->zi_guid = strtoull(device, &end, 16);
+	if (record->zi_guid == 0 || *end != '\0') {
+		tgt = zpool_find_vdev(zhp, device, &isspare, &iscache, NULL);
+
+		if (tgt == NULL) {
+			(void) fprintf(stderr, "cannot find device '%s' in "
+			    "pool '%s'\n", device, pool);
+			zpool_close(zhp);
+			return (-1);
+		}
+
+		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
+		    &record->zi_guid) == 0);
+	}
+
+	/*
+	 * The pool handle was only needed for the vdev lookup. It is closed
+	 * after the last use of 'tgt', which was obtained through it.
+	 */
+	zpool_close(zhp);
+
+	switch (label_type) {
+	case TYPE_LABEL_UBERBLOCK:
+		record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]);
+		record->zi_end = record->zi_start + VDEV_UBERBLOCK_RING - 1;
+		break;
+	case TYPE_LABEL_NVLIST:
+		record->zi_start = offsetof(vdev_label_t, vl_vdev_phys);
+		record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1;
+		break;
+	case TYPE_LABEL_PAD1:
+		record->zi_start = offsetof(vdev_label_t, vl_pad1);
+		record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+		break;
+	case TYPE_LABEL_PAD2:
+		record->zi_start = offsetof(vdev_label_t, vl_pad2);
+		record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+		break;
+	}
+	return (0);
+}
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
new file mode 100644
index 0000000..60c53ce
--- /dev/null
+++ b/cmd/zinject/zinject.c
@@ -0,0 +1,972 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * ZFS Fault Injector
+ *
+ * This userland component takes a set of options and uses libzpool to translate
+ * from a user-visible object type and name to an internal representation.
+ * There are two basic types of faults: device faults and data faults.
+ *
+ *
+ * DEVICE FAULTS
+ *
+ * Errors can be injected into a particular vdev using the '-d' option. This
+ * option takes a path or vdev GUID to uniquely identify the device within a
+ * pool. There are two types of errors that can be injected, EIO and ENXIO,
+ * that can be controlled through the '-e' option. The default is ENXIO. For
+ * EIO failures, any attempt to read data from the device will return EIO, but
+ * subsequent attempts to reopen the device will succeed. For ENXIO failures,
+ * any attempt to read from the device will return EIO, but any attempt to
+ * reopen the device will also return ENXIO.
+ * For label faults, the -L option must be specified. This allows faults
+ * to be injected into either the nvlist, uberblock, pad1, or pad2 region
+ * of all the labels for the specified device.
+ *
+ * This form of the command looks like:
+ *
+ *	zinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] pool
+ *
+ *
+ * DATA FAULTS
+ *
+ * We begin with a tuple of the form:
+ *
+ *	<type,level,range,object>
+ *
+ *	type	A string describing the type of data to target. Each type
+ *		implicitly describes how to interpret 'object'. Currently,
+ *		the following values are supported:
+ *
+ *		data		User data for a file
+ *		dnode		Dnode for a file or directory
+ *
+ *		The following MOS objects are special. Instead of injecting
+ *		errors on a particular object or blkid, we inject errors across
+ *		all objects of the given type.
+ *
+ *		mos		Any data in the MOS
+ *		mosdir		object directory
+ *		config		pool configuration
+ *		bpobj		blkptr list
+ *		spacemap	spacemap
+ *		metaslab	metaslab
+ *		errlog		persistent error log
+ *
+ *	level	Object level. Defaults to '0', not applicable to all types. If
+ *		a range is given, this corresponds to the indirect block
+ *		corresponding to the specific range.
+ *
+ *	range	A numerical range [start,end) within the object. Defaults to
+ *		the full size of the file.
+ *
+ *	object	A string describing the logical location of the object. For
+ *		files and directories (currently the only supported types),
+ *		this is the path of the object on disk.
+ *
+ * This is translated, via libzpool, into the following internal representation:
+ *
+ *	<type,objset,object,level,range>
+ *
+ * These types should be self-explanatory. This tuple is then passed to the
+ * kernel via a special ioctl() to initiate fault injection for the given
+ * object. Note that 'type' is not strictly necessary for fault injection, but
+ * is used when translating existing faults into a human-readable string.
+ *
+ *
+ * The command itself takes one of the forms:
+ *
+ *	zinject
+ *	zinject <-a | -u pool>
+ *	zinject -c <id|all>
+ *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
+ *	    [-r range] <object>
+ *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:blkid pool
+ *
+ * With no arguments, the command prints all currently registered injection
+ * handlers, with their numeric identifiers.
+ *
+ * The '-c' option will clear the given handler, or all handlers if 'all' is
+ * specified.
+ *
+ * The '-e' option takes a string describing the errno to simulate. This must
+ * be either 'io' or 'checksum'. In most cases this will result in the same
+ * behavior, but RAID-Z will produce a different set of ereports for this
+ * situation.
+ *
+ * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is
+ * specified, then the ARC cache is flushed appropriately.
If '-u' is
+ * specified, then the underlying SPA is unloaded. Either of these flags can be
+ * specified independently of any other handlers. The '-m' flag automatically
+ * does an unmount and remount of the underlying dataset to aid in flushing the
+ * cache.
+ *
+ * The '-f' flag controls the frequency of errors injected, expressed as an
+ * integer percentage between 1 and 100. The default is 100.
+ *
+ * The fourth form is responsible for actually injecting the handler into the
+ * framework. It takes the arguments described above, translates them to the
+ * internal tuple using libzpool, and then issues an ioctl() to register the
+ * handler.
+ *
+ * The final form can target a specific bookmark, regardless of whether a
+ * human-readable interface has been designed. It allows developers to specify
+ * a particular block by number.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/fs/zfs.h>
+#include <sys/mount.h>
+
+#include <libzfs.h>
+
+#undef verify	/* both libzfs.h and zfs_context.h want to define this */
+
+#include "zinject.h"
+
+libzfs_handle_t *g_zfs;
+int zfs_fd;
+
+#define	ECKSUM	EBADE
+
+/*
+ * Names accepted for an error type, indexed by err_type_t value (see
+ * name_to_type()).
+ */
+static const char *errtable[TYPE_INVAL] = {
+	"data",
+	"dnode",
+	"mos",
+	"mosdir",
+	"metaslab",
+	"config",
+	"bpobj",
+	"spacemap",
+	"errlog",
+	"uber",
+	"nvlist",
+	"pad1",
+	"pad2"
+};
+
+/*
+ * Map a type name to its err_type_t value, which is the name's index in
+ * errtable. Returns TYPE_INVAL if 'arg' matches no entry.
+ */
+static err_type_t
+name_to_type(const char *arg)
+{
+	int i;
+	for (i = 0; i < TYPE_INVAL; i++)
+		if (strcmp(errtable[i], arg) == 0)
+			return (i);
+
+	return (TYPE_INVAL);
+}
+
+/*
+ * Map the DMU object type stored in a record's zi_type to the name shown in
+ * handler listings. Types without a name here are shown as "-".
+ */
+static const char *
+type_to_name(uint64_t type)
+{
+	switch (type) {
+	case DMU_OT_OBJECT_DIRECTORY:
+		return ("mosdir");
+	case DMU_OT_OBJECT_ARRAY:
+		return ("metaslab");
+	case DMU_OT_PACKED_NVLIST:
+		return ("config");
+	case DMU_OT_BPOBJ:
+		return ("bpobj");
+	case DMU_OT_SPACE_MAP:
+		return ("spacemap");
+	case DMU_OT_ERROR_LOG:
+		return ("errlog");
+	default:
+		return ("-");
+	}
+}
+
+
+/*
+ * Print usage message.
+ */
+void
+usage(void)
+{
+	(void) printf(
+	    "usage:\n"
+	    "\n"
+	    "\tzinject\n"
+	    "\n"
+	    "\t\tList all active injection records.\n"
+	    "\n"
+	    "\tzinject -c <id|all>\n"
+	    "\n"
+	    "\t\tClear the particular record (if given a numeric ID), or\n"
+	    "\t\tall records if 'all' is specificed.\n"
+	    "\n"
+	    "\tzinject -p <function name> pool\n"
+	    "\t\tInject a panic fault at the specified function. Only \n"
+	    "\t\tfunctions which call spa_vdev_config_exit(), or \n"
+	    "\t\tspa_vdev_exit() will trigger a panic.\n"
+	    "\n"
+	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
+	    "\t [-T <read|write|free|claim|all>] pool\n"
+	    "\t\tInject a fault into a particular device or the device's\n"
+	    "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n"
+	    "\t\t'pad1', or 'pad2'.\n"
+	    "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
+	    "\n"
+	    "\tzinject -d device -A <degrade|fault> pool\n"
+	    "\t\tPerform a specific action on a particular device\n"
+	    "\n"
+	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
+	    "\t\tCause the pool to stop writing blocks yet not\n"
+	    "\t\treport errors for a duration. Simulates buggy hardware\n"
+	    "\t\tthat fails to honor cache flush requests.\n"
+	    "\t\tDefault duration is 30 seconds. The machine is panicked\n"
+	    "\t\tat the end of the duration.\n"
+	    "\n"
+	    "\tzinject -b objset:object:level:blkid pool\n"
+	    "\n"
+	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
+	    "\t\tspecified by the remaining tuple. Each number is in\n"
+	    "\t\thexidecimal, and only one block can be specified.\n"
+	    "\n"
+	    "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n"
+	    "\t [-a] [-m] [-u] [-f freq] <object>\n"
+	    "\n"
+	    "\t\tInject an error into the object specified by the '-t' option\n"
+	    "\t\tand the object descriptor. The 'object' parameter is\n"
+	    "\t\tinterperted depending on the '-t' option.\n"
+	    "\n"
+	    "\t\t-q\tQuiet mode. Only print out the handler number added.\n"
+	    "\t\t-e\tInject a specific error. Must be either 'io' or\n"
+	    "\t\t\t'checksum'. Default is 'io'.\n"
+	    "\t\t-l\tInject error at a particular block level. Default is "
+	    "0.\n"
+	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
+	    "\t\t-r\tInject error over a particular logical range of an\n"
+	    "\t\t\tobject. Will be translated to the appropriate blkid\n"
+	    "\t\t\trange according to the object's properties.\n"
+	    "\t\t-a\tFlush the ARC cache. Can be specified without any\n"
+	    "\t\t\tassociated object.\n"
+	    "\t\t-u\tUnload the associated pool. Can be specified with only\n"
+	    "\t\t\ta pool object.\n"
+	    "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n"
+	    "\t\t\ta percentage between 1 and 100.\n"
+	    "\n"
+	    "\t-t data\t\tInject an error into the plain file contents of a\n"
+	    "\t\t\tfile. The object must be specified as a complete path\n"
+	    "\t\t\tto a file on a ZFS filesystem.\n"
+	    "\n"
+	    "\t-t dnode\tInject an error into the metadnode in the block\n"
+	    "\t\t\tcorresponding to the dnode for a file or directory. The\n"
+	    "\t\t\t'-r' option is incompatible with this mode. The object\n"
+	    "\t\t\tis specified as a complete path to a file or directory\n"
+	    "\t\t\ton a ZFS filesystem.\n"
+	    "\n"
+	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
+	    "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n"
+	    "\t\t\tspacemap, metaslab, errlog. The only valid <object> is\n"
+	    "\t\t\tthe poolname.\n");
+}
+
+/*
+ * Call 'func' once for every registered injection handler, passing the
+ * handler id, pool name, injection record and 'data'. Stops at, and returns,
+ * the first non-zero value 'func' returns. Returns 0 once the list is
+ * exhausted (ioctl fails with ENOENT) and -1 on any other ioctl failure.
+ */
+static int
+iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
+    void *data)
+{
+	/* Zero the whole command so no stack garbage reaches the ioctl. */
+	zfs_cmd_t zc = { 0 };
+	int ret;
+
+	zc.zc_guid = 0;
+
+	while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
+		if ((ret = func((int)zc.zc_guid, zc.zc_name,
+		    &zc.zc_inject_record, data)) != 0)
+			return (ret);
+
+	if (errno != ENOENT) {
+		(void) fprintf(stderr, "Unable to list handlers: %s\n",
+		    strerror(errno));
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * iter_handlers() callback: print one data handler, i.e. a record with
+ * neither a vdev GUID nor a panic function. 'data' points to the running
+ * count; the column header is printed before the first row.
+ */
+static int
+print_data_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *count = data;
+
+	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
+		return (0);
+
+	if (*count == 0) {
+		(void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n",
+		    "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE");
+		(void) printf("--- --------------- ------ "
+		    "------ -------- --- ---------------\n");
+	}
+
+	*count += 1;
+
+	(void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool,
+	    (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object,
+	    type_to_name(record->zi_type), record->zi_level);
+
+	if (record->zi_start == 0 &&
+	    record->zi_end == -1ULL)
+		(void) printf("all\n");
+	else
+		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
+		    (u_longlong_t)record->zi_end);
+
+	return (0);
+}
+
+/*
+ * iter_handlers() callback: print one device handler, i.e. a record with a
+ * vdev GUID and no panic function. 'data' points to the running count.
+ */
+static int
+print_device_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *count = data;
+
+	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
+		return (0);
+
+	if (*count == 0) {
+		(void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID");
+		(void) printf("--- --------------- ----------------\n");
+	}
+
+	*count += 1;
+
+	(void) printf("%3d %-15s %llx\n", id, pool,
+	    (u_longlong_t)record->zi_guid);
+
+	return (0);
+}
+
+/*
+ * iter_handlers() callback: print one panic handler, i.e. a record with a
+ * panic function set. 'data' points to the running count.
+ */
+static int
+print_panic_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *count = data;
+
+	if (record->zi_func[0] == '\0')
+		return (0);
+
+	if (*count == 0) {
+		(void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION");
+		(void) printf("--- --------------- ----------------\n");
+	}
+
+	*count += 1;
+
+	(void) printf("%3d %-15s %s\n", id, pool, record->zi_func);
+
+	return (0);
+}
+
+/*
+ * Print all registered error handlers. Returns the number of handlers
+ * registered.
+ */
+static int
+print_all_handlers(void)
+{
+	int count = 0, total = 0;
+
+	(void) iter_handlers(print_device_handler, &count);
+	if (count > 0) {
+		total += count;
+		(void) printf("\n");
+		count = 0;
+	}
+
+	(void) iter_handlers(print_data_handler, &count);
+	if (count > 0) {
+		total += count;
+		(void) printf("\n");
+		count = 0;
+	}
+
+	(void) iter_handlers(print_panic_handler, &count);
+
+	return (count + total);
+}
+
+/*
+ * iter_handlers() callback: remove the handler with the given id. Returns 1
+ * (which stops the iteration) if the ioctl fails.
+ */
+/* ARGSUSED */
+static int
+cancel_one_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	/* Zero the whole command so no stack garbage reaches the ioctl. */
+	zfs_cmd_t zc = { 0 };
+
+	zc.zc_guid = (uint64_t)id;
+
+	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
+		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
+		    id, strerror(errno));
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Remove all fault injection handlers.
+ */
+static int
+cancel_all_handlers(void)
+{
+	int ret = iter_handlers(cancel_one_handler, NULL);
+
+	if (ret == 0)
+		(void) printf("removed all registered handlers\n");
+
+	return (ret);
+}
+
+/*
+ * Remove a specific fault injection handler.
+ */
+static int
+cancel_handler(int id)
+{
+	/* Zero the whole command so no stack garbage reaches the ioctl. */
+	zfs_cmd_t zc = { 0 };
+
+	zc.zc_guid = (uint64_t)id;
+
+	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
+		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
+		    id, strerror(errno));
+		return (1);
+	}
+
+	(void) printf("removed handler %d\n", id);
+
+	return (0);
+}
+
+/*
+ * Register a new fault injection handler.
+ *
+ * 'flags' is handed to the kernel in zc_guid; on return zc_guid holds the id
+ * of the new handler. Unless ZINJECT_NULL is set in 'flags', the id (quiet
+ * mode) or a description of the handler is printed. Returns 0 on success
+ * and 1 if the ioctl fails.
+ */
+static int
+register_handler(const char *pool, int flags, zinject_record_t *record,
+    int quiet)
+{
+	/* Zero the whole command so no stack garbage reaches the ioctl. */
+	zfs_cmd_t zc = { 0 };
+
+	/* Bounded copy, as in perform_action(). */
+	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+	zc.zc_inject_record = *record;
+	zc.zc_guid = flags;
+
+	if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
+		(void) fprintf(stderr, "failed to add handler: %s\n",
+		    strerror(errno));
+		return (1);
+	}
+
+	if (flags & ZINJECT_NULL)
+		return (0);
+
+	if (quiet) {
+		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
+	} else {
+		(void) printf("Added handler %llu with the following "
+		    "properties:\n", (u_longlong_t)zc.zc_guid);
+		(void) printf(" pool: %s\n", pool);
+		if (record->zi_guid) {
+			(void) printf(" vdev: %llx\n",
+			    (u_longlong_t)record->zi_guid);
+		} else if (record->zi_func[0] != '\0') {
+			(void) printf(" panic function: %s\n",
+			    record->zi_func);
+		} else if (record->zi_duration > 0) {
+			/* %llu to match the unsigned argument. */
+			(void) printf(" time: %llu seconds\n",
+			    (u_longlong_t)record->zi_duration);
+		} else if (record->zi_duration < 0) {
+			/* A txg count is stored as a negative duration. */
+			(void) printf(" txgs: %llu \n",
+			    (u_longlong_t)-record->zi_duration);
+		} else {
+			(void) printf("objset: %llu\n",
+			    (u_longlong_t)record->zi_objset);
+			(void) printf("object: %llu\n",
+			    (u_longlong_t)record->zi_object);
+			(void) printf(" type: %llu\n",
+			    (u_longlong_t)record->zi_type);
+			(void) printf(" level: %d\n", record->zi_level);
+			if (record->zi_start == 0 &&
+			    record->zi_end == -1ULL)
+				(void) printf(" range: all\n");
+			else
+				(void) printf(" range: [%llu, %llu)\n",
+				    (u_longlong_t)record->zi_start,
+				    (u_longlong_t)record->zi_end);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Ask the kernel to put the vdev named by record->zi_guid in 'pool' into
+ * state 'cmd' (VDEV_STATE_DEGRADED or VDEV_STATE_FAULTED). Returns 0 on
+ * success and 1 if the ioctl fails.
+ */
+int
+perform_action(const char *pool, zinject_record_t *record, int cmd)
+{
+	/* Zero the whole command so no stack garbage reaches the ioctl. */
+	zfs_cmd_t zc = { 0 };
+
+	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
+	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+	zc.zc_guid = record->zi_guid;
+	zc.zc_cookie = cmd;
+
+	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+
+	return (1);
+}
+
+int
+main(int argc, char **argv)
+{
+	int c;
+	char *range = NULL;
+
char *cancel = NULL; + char *end; + char *raw = NULL; + char *device = NULL; + int level = 0; + int quiet = 0; + int error = 0; + int domount = 0; + int io_type = ZIO_TYPES; + int action = VDEV_STATE_UNKNOWN; + err_type_t type = TYPE_INVAL; + err_type_t label = TYPE_INVAL; + zinject_record_t record = { 0 }; + char pool[MAXNAMELEN]; + char dataset[MAXNAMELEN]; + zfs_handle_t *zhp; + int nowrites = 0; + int dur_txg = 0; + int dur_secs = 0; + int ret; + int flags = 0; + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "internal error: failed to " + "initialize ZFS library\n"); + return (1); + } + + libzfs_print_on_error(g_zfs, B_TRUE); + + if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { + (void) fprintf(stderr, "failed to open ZFS device\n"); + return (1); + } + + if (argc == 1) { + /* + * No arguments. Print the available handlers. If there are no + * available handlers, direct the user to '-h' for help + * information. + */ + if (print_all_handlers() == 0) { + (void) printf("No handlers registered.\n"); + (void) printf("Run 'zinject -h' for usage " + "information.\n"); + } + + return (0); + } + + while ((c = getopt(argc, argv, + ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + switch (c) { + case 'a': + flags |= ZINJECT_FLUSH_ARC; + break; + case 'A': + if (strcasecmp(optarg, "degrade") == 0) { + action = VDEV_STATE_DEGRADED; + } else if (strcasecmp(optarg, "fault") == 0) { + action = VDEV_STATE_FAULTED; + } else { + (void) fprintf(stderr, "invalid action '%s': " + "must be 'degrade' or 'fault'\n", optarg); + usage(); + return (1); + } + break; + case 'b': + raw = optarg; + break; + case 'c': + cancel = optarg; + break; + case 'd': + device = optarg; + break; + case 'e': + if (strcasecmp(optarg, "io") == 0) { + error = EIO; + } else if (strcasecmp(optarg, "checksum") == 0) { + error = ECKSUM; + } else if (strcasecmp(optarg, "nxio") == 0) { + error = ENXIO; + } else if (strcasecmp(optarg, "dtl") == 0) { + error = ECHILD; + } else { + (void) 
fprintf(stderr, "invalid error type " + "'%s': must be 'io', 'checksum' or " + "'nxio'\n", optarg); + usage(); + return (1); + } + break; + case 'f': + record.zi_freq = atoi(optarg); + if (record.zi_freq < 1 || record.zi_freq > 100) { + (void) fprintf(stderr, "frequency range must " + "be in the range (0, 100]\n"); + return (1); + } + break; + case 'F': + record.zi_failfast = B_TRUE; + break; + case 'g': + dur_txg = 1; + record.zi_duration = (int)strtol(optarg, &end, 10); + if (record.zi_duration <= 0 || *end != '\0') { + (void) fprintf(stderr, "invalid duration '%s': " + "must be a positive integer\n", optarg); + usage(); + return (1); + } + /* store duration of txgs as its negative */ + record.zi_duration *= -1; + break; + case 'h': + usage(); + return (0); + case 'I': + /* default duration, if one hasn't yet been defined */ + nowrites = 1; + if (dur_secs == 0 && dur_txg == 0) + record.zi_duration = 30; + break; + case 'l': + level = (int)strtol(optarg, &end, 10); + if (*end != '\0') { + (void) fprintf(stderr, "invalid level '%s': " + "must be an integer\n", optarg); + usage(); + return (1); + } + break; + case 'm': + domount = 1; + break; + case 'p': + (void) strlcpy(record.zi_func, optarg, + sizeof (record.zi_func)); + break; + case 'q': + quiet = 1; + break; + case 'r': + range = optarg; + break; + case 's': + dur_secs = 1; + record.zi_duration = (int)strtol(optarg, &end, 10); + if (record.zi_duration <= 0 || *end != '\0') { + (void) fprintf(stderr, "invalid duration '%s': " + "must be a positive integer\n", optarg); + usage(); + return (1); + } + break; + case 'T': + if (strcasecmp(optarg, "read") == 0) { + io_type = ZIO_TYPE_READ; + } else if (strcasecmp(optarg, "write") == 0) { + io_type = ZIO_TYPE_WRITE; + } else if (strcasecmp(optarg, "free") == 0) { + io_type = ZIO_TYPE_FREE; + } else if (strcasecmp(optarg, "claim") == 0) { + io_type = ZIO_TYPE_CLAIM; + } else if (strcasecmp(optarg, "all") == 0) { + io_type = ZIO_TYPES; + } else { + (void) 
fprintf(stderr, "invalid I/O type " + "'%s': must be 'read', 'write', 'free', " + "'claim' or 'all'\n", optarg); + usage(); + return (1); + } + break; + case 't': + if ((type = name_to_type(optarg)) == TYPE_INVAL && + !MOS_TYPE(type)) { + (void) fprintf(stderr, "invalid type '%s'\n", + optarg); + usage(); + return (1); + } + break; + case 'u': + flags |= ZINJECT_UNLOAD_SPA; + break; + case 'L': + if ((label = name_to_type(optarg)) == TYPE_INVAL && + !LABEL_TYPE(type)) { + (void) fprintf(stderr, "invalid label type " + "'%s'\n", optarg); + usage(); + return (1); + } + break; + case ':': + (void) fprintf(stderr, "option -%c requires an " + "operand\n", optopt); + usage(); + return (1); + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + usage(); + return (2); + } + } + + argc -= optind; + argv += optind; + + if (cancel != NULL) { + /* + * '-c' is invalid with any other options. + */ + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_func[0] != '\0' || + record.zi_duration != 0) { + (void) fprintf(stderr, "cancel (-c) incompatible with " + "any other options\n"); + usage(); + return (2); + } + if (argc != 0) { + (void) fprintf(stderr, "extraneous argument to '-c'\n"); + usage(); + return (2); + } + + if (strcmp(cancel, "all") == 0) { + return (cancel_all_handlers()); + } else { + int id = (int)strtol(cancel, &end, 10); + if (*end != '\0') { + (void) fprintf(stderr, "invalid handle id '%s':" + " must be an integer or 'all'\n", cancel); + usage(); + return (1); + } + return (cancel_handler(id)); + } + } + + if (device != NULL) { + /* + * Device (-d) injection uses a completely different mechanism + * for doing injection, so handle it separately here. 
+ */ + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_func[0] != '\0' || + record.zi_duration != 0) { + (void) fprintf(stderr, "device (-d) incompatible with " + "data error injection\n"); + usage(); + return (2); + } + + if (argc != 1) { + (void) fprintf(stderr, "device (-d) injection requires " + "a single pool name\n"); + usage(); + return (2); + } + + (void) strcpy(pool, argv[0]); + dataset[0] = '\0'; + + if (error == ECKSUM) { + (void) fprintf(stderr, "device error type must be " + "'io' or 'nxio'\n"); + return (1); + } + + record.zi_iotype = io_type; + if (translate_device(pool, device, label, &record) != 0) + return (1); + if (!error) + error = ENXIO; + + if (action != VDEV_STATE_UNKNOWN) + return (perform_action(pool, &record, action)); + + } else if (raw != NULL) { + if (range != NULL || type != TYPE_INVAL || level != 0 || + record.zi_func[0] != '\0' || record.zi_duration != 0) { + (void) fprintf(stderr, "raw (-b) format with " + "any other options\n"); + usage(); + return (2); + } + + if (argc != 1) { + (void) fprintf(stderr, "raw (-b) format expects a " + "single pool name\n"); + usage(); + return (2); + } + + (void) strcpy(pool, argv[0]); + dataset[0] = '\0'; + + if (error == ENXIO) { + (void) fprintf(stderr, "data error type must be " + "'checksum' or 'io'\n"); + return (1); + } + + if (translate_raw(raw, &record) != 0) + return (1); + if (!error) + error = EIO; + } else if (record.zi_func[0] != '\0') { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || device != NULL || record.zi_duration != 0) { + (void) fprintf(stderr, "panic (-p) incompatible with " + "other options\n"); + usage(); + return (2); + } + + if (argc < 1 || argc > 2) { + (void) fprintf(stderr, "panic (-p) injection requires " + "a single pool name and an optional id\n"); + usage(); + return (2); + } + + (void) strcpy(pool, argv[0]); + if (argv[1] != NULL) + record.zi_type = atoi(argv[1]); + dataset[0] = '\0'; + } else if 
(record.zi_duration != 0) { + if (nowrites == 0) { + (void) fprintf(stderr, "-s or -g meaningless " + "without -I (ignore writes)\n"); + usage(); + return (2); + } else if (dur_secs && dur_txg) { + (void) fprintf(stderr, "choose a duration either " + "in seconds (-s) or a number of txgs (-g) " + "but not both\n"); + usage(); + return (2); + } else if (argc != 1) { + (void) fprintf(stderr, "ignore writes (-I) " + "injection requires a single pool name\n"); + usage(); + return (2); + } + + (void) strcpy(pool, argv[0]); + dataset[0] = '\0'; + } else if (type == TYPE_INVAL) { + if (flags == 0) { + (void) fprintf(stderr, "at least one of '-b', '-d', " + "'-t', '-a', '-p', '-I' or '-u' " + "must be specified\n"); + usage(); + return (2); + } + + if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) { + (void) strcpy(pool, argv[0]); + dataset[0] = '\0'; + } else if (argc != 0) { + (void) fprintf(stderr, "extraneous argument for " + "'-f'\n"); + usage(); + return (2); + } + + flags |= ZINJECT_NULL; + } else { + if (argc != 1) { + (void) fprintf(stderr, "missing object\n"); + usage(); + return (2); + } + + if (error == ENXIO) { + (void) fprintf(stderr, "data error type must be " + "'checksum' or 'io'\n"); + return (1); + } + + if (translate_record(type, argv[0], range, level, &record, pool, + dataset) != 0) + return (1); + if (!error) + error = EIO; + } + + /* + * If this is pool-wide metadata, unmount everything. The ioctl() will + * unload the pool, so that we trigger spa-wide reopen of metadata next + * time we access the pool. 
+ */ + if (dataset[0] != '\0' && domount) { + if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) + return (1); + + if (zfs_unmount(zhp, NULL, 0) != 0) + return (1); + } + + record.zi_error = error; + + ret = register_handler(pool, flags, &record, quiet); + + if (dataset[0] != '\0' && domount) + ret = (zfs_mount(zhp, NULL, 0) != 0); + + libzfs_fini(g_zfs); + + return (ret); +} diff --git a/cmd/zinject/zinject.h b/cmd/zinject/zinject.h new file mode 100644 index 0000000..46fdcad --- /dev/null +++ b/cmd/zinject/zinject.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _ZINJECT_H +#define _ZINJECT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + TYPE_DATA, /* plain file contents */ + TYPE_DNODE, /* metadnode contents */ + TYPE_MOS, /* all MOS data */ + TYPE_MOSDIR, /* MOS object directory */ + TYPE_METASLAB, /* metaslab objects */ + TYPE_CONFIG, /* MOS config */ + TYPE_BPOBJ, /* block pointer list */ + TYPE_SPACEMAP, /* space map objects */ + TYPE_ERRLOG, /* persistent error log */ + TYPE_LABEL_UBERBLOCK, /* label specific uberblock */ + TYPE_LABEL_NVLIST, /* label specific nvlist */ + TYPE_LABEL_PAD1, /* label specific 8K pad1 area */ + TYPE_LABEL_PAD2, /* label specific 8K pad2 area */ + TYPE_INVAL +} err_type_t; + +#define MOS_TYPE(t) \ + ((t) >= TYPE_MOS && (t) < TYPE_LABEL_UBERBLOCK) + +#define LABEL_TYPE(t) \ + ((t) >= TYPE_LABEL_UBERBLOCK && (t) < TYPE_INVAL) + +int translate_record(err_type_t type, const char *object, const char *range, + int level, zinject_record_t *record, char *poolname, char *dataset); +int translate_raw(const char *raw, zinject_record_t *record); +int translate_device(const char *pool, const char *device, + err_type_t label_type, zinject_record_t *record); +void usage(void); + +extern libzfs_handle_t *g_zfs; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZINJECT_H */ diff --git a/cmd/zlook/zlook.c b/cmd/zlook/zlook.c new file mode 100644 index 0000000..29a6559 --- /dev/null +++ b/cmd/zlook/zlook.c @@ -0,0 +1,411 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * This is a test program that uses ioctls to the ZFS Unit Test driver + * to perform readdirs or lookups using flags not normally available + * to user-land programs. This allows testing of the flags' + * behavior outside of a complicated consumer, such as the SMB driver. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define _KERNEL + +#include +#include + +#undef _KERNEL + +#define MAXBUF (64 * 1024) +#define BIGBUF 4096 +#define LILBUF (sizeof (dirent_t)) + +#define DIRENT_NAMELEN(reclen) \ + ((reclen) - (offsetof(dirent_t, d_name[0]))) +
/*
 * Print the usage message to stderr and exit with EINVAL.  Does not return.
 */
static void
usage(char *pnam)
{
	(void) fprintf(stderr, "Usage:\n %s -l [-is] dir-to-look-in "
	    "file-in-dir [xfile-on-file]\n", pnam);
	(void) fprintf(stderr, " %s -i [-ls] dir-to-look-in "
	    "file-in-dir [xfile-on-file]\n", pnam);
	(void) fprintf(stderr, " %s -s [-il] dir-to-look-in "
	    "file-in-dir [xfile-on-file]\n", pnam);
	(void) fprintf(stderr, "\t Perform a lookup\n");
	(void) fprintf(stderr, "\t -l == lookup\n");
	(void) fprintf(stderr, "\t -i == request FIGNORECASE\n");
	(void) fprintf(stderr, "\t -s == request stat(2) and xvattr info\n");
	(void) fprintf(stderr, " %s -r [-ea] [-b buffer-size-in-bytes] "
	    "dir-to-look-in [file-in-dir]\n", pnam);
	(void) fprintf(stderr, " %s -e [-ra] [-b buffer-size-in-bytes] "
	    "dir-to-look-in [file-in-dir]\n", pnam);
	(void) fprintf(stderr, " %s -a [-re] [-b buffer-size-in-bytes] "
	    "dir-to-look-in [file-in-dir]\n", pnam);
	(void) fprintf(stderr, "\t Perform a readdir\n");
	(void) fprintf(stderr, "\t -r == readdir\n");
	(void) fprintf(stderr, "\t -e == request extended entries\n");
	(void) fprintf(stderr, "\t -a == request access filtering\n");
	(void) fprintf(stderr, "\t -b == buffer size (default 4K)\n");
	(void) fprintf(stderr, " %s -A path\n", pnam);
	(void) fprintf(stderr, "\t Look up _PC_ACCESS_FILTERING "
	    "for path with pathconf(2)\n");
	(void) fprintf(stderr, " %s -E path\n", pnam);
	(void) fprintf(stderr, "\t Look up _PC_SATTR_EXISTS "
	    "for path with pathconf(2)\n");
	(void) fprintf(stderr, " %s -S path\n", pnam);
	/* -S checks _PC_SATTR_ENABLED in main(); the text used to say EXISTS */
	(void) fprintf(stderr, "\t Look up _PC_SATTR_ENABLED "
	    "for path with pathconf(2)\n");
	exit(EINVAL);
}

/*
 * Print every extended directory entry held in r->zr_buf (r->zr_bytes bytes
 * of it): the name padded to 16 columns, a tab, then the entry flags in hex.
 */
static void
print_extd_entries(zut_readdir_t *r)
{
	struct edirent *eodp;
	char *bufstart;

	eodp = (edirent_t *)(uintptr_t)r->zr_buf;
	bufstart = (char *)eodp;
	while ((char *)eodp < bufstart + r->zr_bytes) {
		int i = 0;

		/* a zero record length would never advance the cursor */
		if (eodp->ed_reclen == 0)
			break;

		while (i < EDIRENT_NAMELEN(eodp->ed_reclen)) {
			if (!eodp->ed_name[i])
				break;
			(void) printf("%c", eodp->ed_name[i++]);
		}
		/*
		 * Pad with a field width instead of slicing a literal string
		 * of blanks, which only works if the literal is long enough.
		 */
		if (i < 16)
			(void) printf("%*s", 16 - i, "");
		(void) printf("\t%x\n", eodp->ed_eflags);
		eodp = (edirent_t *)((intptr_t)eodp + eodp->ed_reclen);
	}
}

/*
 * Print the name of every plain directory entry held in r->zr_buf, one per
 * line.
 */
static void
print_entries(zut_readdir_t *r)
{
	dirent64_t *dp;
	char *bufstart;

	dp = (dirent64_t *)(intptr_t)r->zr_buf;
	bufstart = (char *)dp;
	while ((char *)dp < bufstart + r->zr_bytes) {
		int i = 0;

		/* a zero record length would never advance the cursor */
		if (dp->d_reclen == 0)
			break;

		while (i < DIRENT_NAMELEN(dp->d_reclen)) {
			if (!dp->d_name[i])
				break;
			(void) printf("%c", dp->d_name[i++]);
		}
		(void) printf("\n");
		dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
	}
}

/*
 * Print the fields of a stat64 structure, one "name value" pair per line.
 * A timestamp is omitted when ctime_r() cannot format it.
 */
static void
print_stats(struct stat64 *sb)
{
	char timebuf[512];

	(void) printf("st_mode\t\t\t%04lo\n", (unsigned long)sb->st_mode);
	(void) printf("st_ino\t\t\t%llu\n", (unsigned long long)sb->st_ino);
	(void) printf("st_nlink\t\t%lu\n", (unsigned long)sb->st_nlink);
	(void) printf("st_uid\t\t\t%d\n", sb->st_uid);
	(void) printf("st_gid\t\t\t%d\n", sb->st_gid);
	(void) printf("st_size\t\t\t%lld\n", (long long)sb->st_size);
	(void) printf("st_blksize\t\t%ld\n", (long)sb->st_blksize);
	(void) printf("st_blocks\t\t%lld\n", (long long)sb->st_blocks);

	timebuf[0] = 0;
	if (ctime_r(&sb->st_atime, timebuf, 512)) {
		(void) printf("st_atime\t\t");
		(void) printf("%s", timebuf);
	}
	timebuf[0] = 0;
	if (ctime_r(&sb->st_mtime, timebuf, 512)) {
		(void) printf("st_mtime\t\t");
		(void) printf("%s", timebuf);
	}
	timebuf[0] = 0;
	if (ctime_r(&sb->st_ctime, timebuf, 512)) {
		(void) printf("st_ctime\t\t");
		(void) printf("%s", timebuf);
	}
}

/*
 * Print a comma separated list naming each attribute bit set in 'xvs'.
 * Only bits below F_ATTR_ALL are considered.  Prints nothing when xvs is 0.
 */
static void
print_xvs(uint64_t xvs)
{
	uint64_t bits;
	int idx = 0;

	if (xvs == 0)
		return;

	(void) printf("-------------------\n");
	(void) printf("Attribute bit(s) set:\n");
	(void) printf("-------------------\n");

	/*
	 * Build the mask in 64 bits: the value is 64 bits wide, and a shift
	 * done in plain int overflows once F_ATTR_ALL reaches 31.
	 */
	bits = xvs & ((1ULL << F_ATTR_ALL) - 1);
	while (bits) {
		uint64_t rest = bits >> 1;
		if (bits & 1) {
			(void) printf("%s", attr_to_name((f_attr_t)idx));
			if (rest)
				(void) printf(", ");
		}
		idx++;
		bits = rest;
	}
	(void) printf("\n");
}

/*
 * Parse the options and run exactly one of three modes: a pathconf(2) check
 * (-A, -S, -E), a readdir through the ZUT driver (-r, -e, -a, -b) or a lookup
 * through the ZUT driver (-l, -i, -s).  Returns 0 on success; on failure the
 * return value is an errno-style code (for the pathconf mode, the pathconf
 * result itself).
 */
int
main(int argc, char **argv)
{
	zut_lookup_t lk = {0};
	zut_readdir_t rd = {0};
	boolean_t checking = B_FALSE;
	boolean_t looking = B_FALSE;
	boolean_t reading = B_FALSE;
	boolean_t bflag = B_FALSE;
	long rddir_bufsize = BIGBUF;
	int error = 0;
	int check = 0;	/* only meaningful when 'checking' is set */
	int fd;
	int c;

	while ((c = getopt(argc, argv, "lisaerb:ASE")) != -1) {
		switch (c) {
		case 'l':
			looking = B_TRUE;
			break;
		case 'i':
			lk.zl_reqflags |= ZUT_IGNORECASE;
			looking = B_TRUE;
			break;
		case 's':
			lk.zl_reqflags |= ZUT_GETSTAT;
			looking = B_TRUE;
			break;
		case 'a':
			rd.zr_reqflags |= ZUT_ACCFILTER;
			reading = B_TRUE;
			break;
		case 'e':
			rd.zr_reqflags |= ZUT_EXTRDDIR;
			reading = B_TRUE;
			break;
		case 'r':
			reading = B_TRUE;
			break;
		case 'b':
			reading = B_TRUE;
			bflag = B_TRUE;
			rddir_bufsize = strtol(optarg, NULL, 0);
			break;
		case 'A':
			checking = B_TRUE;
			check = _PC_ACCESS_FILTERING;
			break;
		case 'S':
			checking = B_TRUE;
			check = _PC_SATTR_ENABLED;
			break;
		case 'E':
			checking = B_TRUE;
			check = _PC_SATTR_EXISTS;
			break;
		case '?':
		default:
			usage(argv[0]);		/* no return */
		}
	}

	/* the three modes are mutually exclusive and one is required */
	if ((checking && looking) || (checking && reading) ||
	    (looking && reading) || (!reading && bflag) ||
	    (!checking && !reading && !looking))
		usage(argv[0]);		/* no return */

	if (rddir_bufsize < LILBUF || rddir_bufsize > MAXBUF) {
		(void) fprintf(stderr, "Sorry, buffer size "
		    "must be >= %d and less than or equal to %d bytes.\n",
		    (int)LILBUF, MAXBUF);
		exit(EINVAL);
	}

	if (checking) {
		char pathbuf[MAXPATHLEN];
		long result;

		if (argc - optind < 1)
			usage(argv[0]);		/* no return */
		(void) strlcpy(pathbuf, argv[optind], MAXPATHLEN);
		result = pathconf(pathbuf, check);
		(void) printf("pathconf(2) check for %s\n", pathbuf);
		switch (check) {
		case _PC_SATTR_ENABLED:
			(void) printf("System attributes ");
			if (result != 0)
				(void) printf("Enabled\n");
			else
				(void) printf("Not enabled\n");
			break;
		case _PC_SATTR_EXISTS:
			(void) printf("System attributes ");
			if (result != 0)
				(void) printf("Exist\n");
			else
				(void) printf("Do not exist\n");
			break;
		case _PC_ACCESS_FILTERING:
			(void) printf("Access filtering ");
			if (result != 0)
				(void) printf("Available\n");
			else
				(void) printf("Not available\n");
			break;
		}
		return (result);
	}

	if ((fd = open(ZUT_DEV, O_RDONLY)) < 0) {
		perror(ZUT_DEV);
		return (ENXIO);
	}

	if (reading) {
		char *buf;

		if (argc - optind < 1)
			usage(argv[0]);		/* no return */

		(void) strlcpy(rd.zr_dir, argv[optind], MAXPATHLEN);
		if (argc - optind > 1) {
			(void) strlcpy(rd.zr_file, argv[optind + 1],
			    MAXNAMELEN);
			rd.zr_reqflags |= ZUT_XATTR;
		}

		if ((buf = malloc(rddir_bufsize)) == NULL) {
			error = errno;
			perror("malloc");
			(void) close(fd);
			return (error);
		}

		rd.zr_buf = (uint64_t)(uintptr_t)buf;
		rd.zr_buflen = rddir_bufsize;

		while (!rd.zr_eof) {
			if (ioctl(fd, ZUT_IOC_READDIR, &rd) != 0) {
				/*
				 * ioctl() returns -1 on failure; the reason
				 * is in errno, not in the return value.
				 */
				int ierr = errno;

				(void) fprintf(stderr,
				    "IOCTL error: %s (%d)\n",
				    strerror(ierr), ierr);
				free(buf);
				(void) close(fd);
				return (ierr);
			}
			if (rd.zr_retcode) {
				(void) fprintf(stderr,
				    "readdir result: %s (%d)\n",
				    strerror(rd.zr_retcode), rd.zr_retcode);
				free(buf);
				(void) close(fd);
				return (rd.zr_retcode);
			}
			if (rd.zr_reqflags & ZUT_EXTRDDIR)
				print_extd_entries(&rd);
			else
				print_entries(&rd);
		}
		free(buf);
	} else {
		if (argc - optind < 2)
			usage(argv[0]);		/* no return */

		(void) strlcpy(lk.zl_dir, argv[optind], MAXPATHLEN);
		(void) strlcpy(lk.zl_file, argv[optind + 1], MAXNAMELEN);
		if (argc - optind > 2) {
			(void) strlcpy(lk.zl_xfile,
			    argv[optind + 2], MAXNAMELEN);
			lk.zl_reqflags |= ZUT_XATTR;
		}

		if (ioctl(fd, ZUT_IOC_LOOKUP, &lk) != 0) {
			/* as above: report errno, not the -1 return value */
			int ierr = errno;

			(void) fprintf(stderr,
			    "IOCTL error: %s (%d)\n",
			    strerror(ierr), ierr);
			(void) close(fd);
			return (ierr);
		}

		(void) printf("\nLookup of ");
		if (lk.zl_reqflags & ZUT_XATTR) {
			(void) printf("extended attribute \"%s\" of ",
			    lk.zl_xfile);
		}
		(void) printf("file \"%s\" ", lk.zl_file);
		(void) printf("in directory \"%s\" ", lk.zl_dir);
		if (lk.zl_retcode) {
			(void) printf("failed: %s (%d)\n",
			    strerror(lk.zl_retcode), lk.zl_retcode);
			(void) close(fd);
			return (lk.zl_retcode);
		}

		(void) printf("succeeded.\n");
		if (lk.zl_reqflags & ZUT_IGNORECASE) {
			(void) printf("----------------------------\n");
			(void) printf("dirent flags: 0x%0x\n", lk.zl_deflags);
			(void) printf("real name: %s\n", lk.zl_real);
		}
		if (lk.zl_reqflags & ZUT_GETSTAT) {
			(void) printf("----------------------------\n");
			print_stats(&lk.zl_statbuf);
			print_xvs(lk.zl_xvattrs);
		}
	}

	(void) close(fd);
	return (0);
}
diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c
new file mode 100644 index 0000000..2f0daef --- /dev/null +++ b/cmd/zpool/zpool_iter.c @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include +#include +#include +#include +#include +#include + +#include + +#include "zpool_util.h" + +/* + * Private interface for iterating over pools specified on the command line. + * Most consumers will call for_each_pool, but in order to support iostat, we + * allow fine-grained control through the zpool_list_t interface.
+ */ + +typedef struct zpool_node { + zpool_handle_t *zn_handle; + uu_avl_node_t zn_avlnode; + int zn_mark; +} zpool_node_t; + +struct zpool_list { + boolean_t zl_findall; + uu_avl_t *zl_avl; + uu_avl_pool_t *zl_pool; + zprop_list_t **zl_proplist; +}; + +/* ARGSUSED */ +static int +zpool_compare(const void *larg, const void *rarg, void *unused) +{ + zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle; + zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle; + const char *lname = zpool_get_name(l); + const char *rname = zpool_get_name(r); + + return (strcmp(lname, rname)); +} + +/* + * Callback function for pool_list_get(). Adds the given pool to the AVL tree + * of known pools. + */ +static int +add_pool(zpool_handle_t *zhp, void *data) +{ + zpool_list_t *zlp = data; + zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); + uu_avl_index_t idx; + + node->zn_handle = zhp; + uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); + if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { + if (zlp->zl_proplist && + zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) { + zpool_close(zhp); + free(node); + return (-1); + } + uu_avl_insert(zlp->zl_avl, node, idx); + } else { + zpool_close(zhp); + free(node); + return (-1); + } + + return (0); +} + +/* + * Create a list of pools based on the given arguments. If we're given no + * arguments, then iterate over all pools in the system and add them to the AVL + * tree. Otherwise, add only those pool explicitly specified on the command + * line. 
+ */ +zpool_list_t * +pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) +{ + zpool_list_t *zlp; + + zlp = safe_malloc(sizeof (zpool_list_t)); + + zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t), + offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT); + + if (zlp->zl_pool == NULL) + zpool_no_memory(); + + if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL, + UU_DEFAULT)) == NULL) + zpool_no_memory(); + + zlp->zl_proplist = proplist; + + if (argc == 0) { + (void) zpool_iter(g_zfs, add_pool, zlp); + zlp->zl_findall = B_TRUE; + } else { + int i; + + for (i = 0; i < argc; i++) { + zpool_handle_t *zhp; + + if (zhp = zpool_open_canfail(g_zfs, argv[i])) { + if (add_pool(zhp, zlp) != 0) + *err = B_TRUE; + } else { + *err = B_TRUE; + } + } + } + + return (zlp); +} + +/* + * Search for any new pools, adding them to the list. We only add pools when no + * options were given on the command line. Otherwise, we keep the list fixed as + * those that were explicitly specified. + */ +void +pool_list_update(zpool_list_t *zlp) +{ + if (zlp->zl_findall) + (void) zpool_iter(g_zfs, add_pool, zlp); +} + +/* + * Iterate over all pools in the list, executing the callback for each + */ +int +pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func, + void *data) +{ + zpool_node_t *node, *next_node; + int ret = 0; + + for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) { + next_node = uu_avl_next(zlp->zl_avl, node); + if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL || + unavail) + ret |= func(node->zn_handle, data); + } + + return (ret); +} + +/* + * Remove the given pool from the list. When running iostat, we want to remove + * those pools that no longer exist. 
+ */ +void +pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) +{ + zpool_node_t search, *node; + + search.zn_handle = zhp; + if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { + uu_avl_remove(zlp->zl_avl, node); + zpool_close(node->zn_handle); + free(node); + } +} + +/* + * Free all the handles associated with this list. + */ +void +pool_list_free(zpool_list_t *zlp) +{ + uu_avl_walk_t *walk; + zpool_node_t *node; + + if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) { + (void) fprintf(stderr, + gettext("internal error: out of memory")); + exit(1); + } + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(zlp->zl_avl, node); + zpool_close(node->zn_handle); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(zlp->zl_avl); + uu_avl_pool_destroy(zlp->zl_pool); + + free(zlp); +} + +/* + * Returns the number of elements in the pool list. + */ +int +pool_list_count(zpool_list_t *zlp) +{ + return (uu_avl_numnodes(zlp->zl_avl)); +} + +/* + * High level function which iterates over all pools given on the command line, + * using the pool_list_* interfaces. + */ +int +for_each_pool(int argc, char **argv, boolean_t unavail, + zprop_list_t **proplist, zpool_iter_f func, void *data) +{ + zpool_list_t *list; + int ret = 0; + + if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL) + return (1); + + if (pool_list_iter(list, unavail, func, data) != 0) + ret = 1; + + pool_list_free(list); + + return (ret); +} diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c new file mode 100644 index 0000000..8aa985b --- /dev/null +++ b/cmd/zpool/zpool_main.c @@ -0,0 +1,4467 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "zpool_util.h" +#include "zfs_comutil.h" + +#include "statcommon.h" + +static int zpool_do_create(int, char **); +static int zpool_do_destroy(int, char **); + +static int zpool_do_add(int, char **); +static int zpool_do_remove(int, char **); + +static int zpool_do_list(int, char **); +static int zpool_do_iostat(int, char **); +static int zpool_do_status(int, char **); + +static int zpool_do_online(int, char **); +static int zpool_do_offline(int, char **); +static int zpool_do_clear(int, char **); + +static int zpool_do_attach(int, char **); +static int zpool_do_detach(int, char **); +static int zpool_do_replace(int, char **); +static int zpool_do_split(int, char **); + +static int zpool_do_scrub(int, char **); + +static int zpool_do_import(int, char **); +static int zpool_do_export(int, char **); + +static int zpool_do_upgrade(int, char **); + +static int zpool_do_history(int, char **); + +static int zpool_do_get(int, char **); +static int zpool_do_set(int, char **); + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * 
debugging facilities. + */

#ifdef DEBUG
/* Compiled only in DEBUG builds; returns the $UMEM_DEBUG default string. */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

/* Compiled only in DEBUG builds; returns the $UMEM_LOGGING default string. */
const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}
#endif

/*
 * Index of a subcommand's usage message.  Each value is the 'usage' member of
 * one command_table entry and selects one case in get_usage().
 */
typedef enum {
	HELP_ADD,
	HELP_ATTACH,
	HELP_CLEAR,
	HELP_CREATE,
	HELP_DESTROY,
	HELP_DETACH,
	HELP_EXPORT,
	HELP_HISTORY,
	HELP_IMPORT,
	HELP_IOSTAT,
	HELP_LIST,
	HELP_OFFLINE,
	HELP_ONLINE,
	HELP_REPLACE,
	HELP_REMOVE,
	HELP_SCRUB,
	HELP_STATUS,
	HELP_UPGRADE,
	HELP_GET,
	HELP_SET,
	HELP_SPLIT
} zpool_help_t;


/* One subcommand: its name, its handler and the index of its usage text. */
typedef struct zpool_command {
	const char	*name;
	int		(*func)(int, char **);
	zpool_help_t	usage;
} zpool_command_t;

/* + * Master command table. Each ZFS command has a name, associated function, and + * usage message. The usage messages need to be internationalized, so we have + * to have a function to return the usage message based on a command index. + * + * These commands are organized according to how they are displayed in the usage + * message. An empty command (one with a NULL name) indicates an empty line in + * the generic usage message.
+ */ +static zpool_command_t command_table[] = { + { "create", zpool_do_create, HELP_CREATE }, + { "destroy", zpool_do_destroy, HELP_DESTROY }, + { NULL }, + { "add", zpool_do_add, HELP_ADD }, + { "remove", zpool_do_remove, HELP_REMOVE }, + { NULL }, + { "list", zpool_do_list, HELP_LIST }, + { "iostat", zpool_do_iostat, HELP_IOSTAT }, + { "status", zpool_do_status, HELP_STATUS }, + { NULL }, + { "online", zpool_do_online, HELP_ONLINE }, + { "offline", zpool_do_offline, HELP_OFFLINE }, + { "clear", zpool_do_clear, HELP_CLEAR }, + { NULL }, + { "attach", zpool_do_attach, HELP_ATTACH }, + { "detach", zpool_do_detach, HELP_DETACH }, + { "replace", zpool_do_replace, HELP_REPLACE }, + { "split", zpool_do_split, HELP_SPLIT }, + { NULL }, + { "scrub", zpool_do_scrub, HELP_SCRUB }, + { NULL }, + { "import", zpool_do_import, HELP_IMPORT }, + { "export", zpool_do_export, HELP_EXPORT }, + { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, + { NULL }, + { "history", zpool_do_history, HELP_HISTORY }, + { "get", zpool_do_get, HELP_GET }, + { "set", zpool_do_set, HELP_SET }, +}; + +#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) + +zpool_command_t *current_command; +static char history_str[HIS_MAX_RECORD_LEN]; + +static uint_t timestamp_fmt = NODATE; + +static const char * +get_usage(zpool_help_t idx) { + switch (idx) { + case HELP_ADD: + return (gettext("\tadd [-fn] ...\n")); + case HELP_ATTACH: + return (gettext("\tattach [-f] " + "\n")); + case HELP_CLEAR: + return (gettext("\tclear [-nF] [device]\n")); + case HELP_CREATE: + return (gettext("\tcreate [-fn] [-o property=value] ... \n" + "\t [-O file-system-property=value] ... 
\n" + "\t [-m mountpoint] [-R root] ...\n")); + case HELP_DESTROY: + return (gettext("\tdestroy [-f] \n")); + case HELP_DETACH: + return (gettext("\tdetach \n")); + case HELP_EXPORT: + return (gettext("\texport [-f] ...\n")); + case HELP_HISTORY: + return (gettext("\thistory [-il] [] ...\n")); + case HELP_IMPORT: + return (gettext("\timport [-d dir] [-D]\n" + "\timport [-d dir | -c cachefile] [-F [-n]] \n" + "\timport [-o mntopts] [-o property=value] ... \n" + "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " + "[-R root] [-F [-n]] -a\n" + "\timport [-o mntopts] [-o property=value] ... \n" + "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " + "[-R root] [-F [-n]]\n" + "\t [newpool]\n")); + case HELP_IOSTAT: + return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval " + "[count]]\n")); + case HELP_LIST: + return (gettext("\tlist [-H] [-o property[,...]] " + "[-T d|u] [pool] ... [interval [count]]\n")); + case HELP_OFFLINE: + return (gettext("\toffline [-t] ...\n")); + case HELP_ONLINE: + return (gettext("\tonline ...\n")); + case HELP_REPLACE: + return (gettext("\treplace [-f] " + "[new-device]\n")); + case HELP_REMOVE: + return (gettext("\tremove ...\n")); + case HELP_SCRUB: + return (gettext("\tscrub [-s] ...\n")); + case HELP_STATUS: + return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval " + "[count]]\n")); + case HELP_UPGRADE: + return (gettext("\tupgrade\n" + "\tupgrade -v\n" + "\tupgrade [-V version] <-a | pool ...>\n")); + case HELP_GET: + return (gettext("\tget <\"all\" | property[,...]> " + " ...\n")); + case HELP_SET: + return (gettext("\tset \n")); + case HELP_SPLIT: + return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n" + "\t [-o property=value] " + "[ ...]\n")); + } + + abort(); + /* NOTREACHED */ +} + + +/* + * Callback routine that will print out a pool property value. 
+ */ +static int +print_prop_cb(int prop, void *cb) +{ + FILE *fp = cb; + + (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop)); + + if (zpool_prop_readonly(prop)) + (void) fprintf(fp, " NO "); + else + (void) fprintf(fp, " YES "); + + if (zpool_prop_values(prop) == NULL) + (void) fprintf(fp, "-\n"); + else + (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); + + return (ZPROP_CONT); +} + +/* + * Display usage message. If we're inside a command, display only the usage for + * that command. Otherwise, iterate over the entire command table and display + * a complete usage message. + */ +void +usage(boolean_t requested) +{ + FILE *fp = requested ? stdout : stderr; + + if (current_command == NULL) { + int i; + + (void) fprintf(fp, gettext("usage: zpool command args ...\n")); + (void) fprintf(fp, + gettext("where 'command' is one of the following:\n\n")); + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + (void) fprintf(fp, "\n"); + else + (void) fprintf(fp, "%s", + get_usage(command_table[i].usage)); + } + } else { + (void) fprintf(fp, gettext("usage:\n")); + (void) fprintf(fp, "%s", get_usage(current_command->usage)); + } + + if (current_command != NULL && + ((strcmp(current_command->name, "set") == 0) || + (strcmp(current_command->name, "get") == 0) || + (strcmp(current_command->name, "list") == 0))) { + + (void) fprintf(fp, + gettext("\nthe following properties are supported:\n")); + + (void) fprintf(fp, "\n\t%-15s %s %s\n\n", + "PROPERTY", "EDIT", "VALUES"); + + /* Iterate over all properties */ + (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, + ZFS_TYPE_POOL); + } + + /* + * See comments at end of main(). + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + exit(requested ? 
0 : 2); +} + +void +print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, + boolean_t print_logs) +{ + nvlist_t **child; + uint_t c, children; + char *vname; + + if (name != NULL) + (void) printf("\t%*s%s\n", indent, "", name); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if ((is_log && !print_logs) || (!is_log && print_logs)) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + print_vdev_tree(zhp, vname, child[c], indent + 2, + B_FALSE); + free(vname); + } +} + +/* + * Add a property pair (name, string-value) into a property nvlist. + */ +static int +add_prop_list(const char *propname, char *propval, nvlist_t **props, + boolean_t poolprop) +{ + zpool_prop_t prop = ZPROP_INVAL; + zfs_prop_t fprop; + nvlist_t *proplist; + const char *normnm; + char *strval; + + if (*props == NULL && + nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { + (void) fprintf(stderr, + gettext("internal error: out of memory\n")); + return (1); + } + + proplist = *props; + + if (poolprop) { + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) { + (void) fprintf(stderr, gettext("property '%s' is " + "not a valid pool property\n"), propname); + return (2); + } + normnm = zpool_prop_to_name(prop); + } else { + if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { + normnm = zfs_prop_to_name(fprop); + } else { + normnm = propname; + } + } + + if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && + prop != ZPOOL_PROP_CACHEFILE) { + (void) fprintf(stderr, gettext("property '%s' " + "specified multiple times\n"), propname); + return (2); + } + + if (nvlist_add_string(proplist, normnm, propval) != 0) { + (void) fprintf(stderr, gettext("internal " + "error: out of memory\n")); + return (1); + } + + return (0); +} + +/* + * zpool add [-fn] ... 
+ * + * -f Force addition of devices, even if they appear in use + * -n Do not add the devices, but display the resulting layout if + * they were to be added. + * + * Adds the given vdevs to 'pool'. As with create, the bulk of this work is + * handled by get_vdev_spec(), which constructs the nvlist needed to pass to + * libzfs. + */ +int +zpool_do_add(int argc, char **argv) +{ + boolean_t force = B_FALSE; + boolean_t dryrun = B_FALSE; + int c; + nvlist_t *nvroot; + char *poolname; + int ret; + zpool_handle_t *zhp; + nvlist_t *config; + + /* check options */ + while ((c = getopt(argc, argv, "fn")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing vdev specification\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + argc--; + argv++; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), + poolname); + zpool_close(zhp); + return (1); + } + + /* pass off to get_vdev_spec for processing */ + nvroot = make_root_vdev(zhp, force, !force, B_FALSE, dryrun, + argc, argv); + if (nvroot == NULL) { + zpool_close(zhp); + return (1); + } + + if (dryrun) { + nvlist_t *poolnvroot; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &poolnvroot) == 0); + + (void) printf(gettext("would update '%s' to the following " + "configuration:\n"), zpool_get_name(zhp)); + + /* print original main pool and new tree */ + print_vdev_tree(zhp, poolname, poolnvroot, 0, B_FALSE); + print_vdev_tree(zhp, NULL, nvroot, 0, B_FALSE); + + /* 
Do the same for the logs */ + if (num_logs(poolnvroot) > 0) { + print_vdev_tree(zhp, "logs", poolnvroot, 0, B_TRUE); + print_vdev_tree(zhp, NULL, nvroot, 0, B_TRUE); + } else if (num_logs(nvroot) > 0) { + print_vdev_tree(zhp, "logs", nvroot, 0, B_TRUE); + } + + ret = 0; + } else { + ret = (zpool_add(zhp, nvroot) != 0); + } + + nvlist_free(nvroot); + zpool_close(zhp); + + return (ret); +} + +/* + * zpool remove ... + * + * Removes the given vdev from the pool. Currently, this supports removing + * spares, cache, and log devices from the pool. + */ +int +zpool_do_remove(int argc, char **argv) +{ + char *poolname; + int i, ret = 0; + zpool_handle_t *zhp; + + argc--; + argv++; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing device\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + for (i = 1; i < argc; i++) { + if (zpool_vdev_remove(zhp, argv[i]) != 0) + ret = 1; + } + + return (ret); +} + +/* + * zpool create [-fn] [-o property=value] ... + * [-O file-system-property=value] ... + * [-R root] [-m mountpoint] ... + * + * -f Force creation, even if devices appear in use + * -n Do not create the pool, but display the resulting layout if it + * were to be created. + * -R Create a pool under an alternate root + * -m Set default mountpoint for the root dataset. By default it's + * '/' + * -o Set property=value. + * -O Set fsproperty=value in the pool's root file system + * + * Creates the named pool according to the given vdev specification. The + * bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. Once + * we get the nvlist back from get_vdev_spec(), we either print out the contents + * (if '-n' was specified), or pass it to libzfs to do the creation. 
+ */ +int +zpool_do_create(int argc, char **argv) +{ + boolean_t force = B_FALSE; + boolean_t dryrun = B_FALSE; + int c; + nvlist_t *nvroot = NULL; + char *poolname; + int ret = 1; + char *altroot = NULL; + char *mountpoint = NULL; + nvlist_t *fsprops = NULL; + nvlist_t *props = NULL; + char *propval; + + /* check options */ + while ((c = getopt(argc, argv, ":fnR:m:o:O:")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'R': + altroot = optarg; + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) + goto errout; + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), + &propval) == 0) + break; + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto errout; + break; + case 'm': + mountpoint = optarg; + break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + goto errout; + } + *propval = '\0'; + propval++; + + if (add_prop_list(optarg, propval, &props, B_TRUE)) + goto errout; + break; + case 'O': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -O option\n")); + goto errout; + } + *propval = '\0'; + propval++; + + if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) + goto errout; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + goto badusage; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto badusage; + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + goto badusage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing vdev specification\n")); + goto badusage; + } + + poolname = argv[0]; + + /* + * As a special case, check for use 
of '/' in the name, and direct the + * user to use 'zfs create' instead. + */ + if (strchr(poolname, '/') != NULL) { + (void) fprintf(stderr, gettext("cannot create '%s': invalid " + "character '/' in pool name\n"), poolname); + (void) fprintf(stderr, gettext("use 'zfs create' to " + "create a dataset\n")); + goto errout; + } + + /* pass off to get_vdev_spec for bulk processing */ + nvroot = make_root_vdev(NULL, force, !force, B_FALSE, dryrun, + argc - 1, argv + 1); + if (nvroot == NULL) + goto errout; + + /* make_root_vdev() allows 0 toplevel children if there are spares */ + if (!zfs_allocatable_devs(nvroot)) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: at least one toplevel vdev must be " + "specified\n")); + goto errout; + } + + + if (altroot != NULL && altroot[0] != '/') { + (void) fprintf(stderr, gettext("invalid alternate root '%s': " + "must be an absolute path\n"), altroot); + goto errout; + } + + /* + * Check the validity of the mountpoint and direct the user to use the + * '-m' mountpoint option if it looks like its in use. 
+ */ + if (mountpoint == NULL || + (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && + strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { + char buf[MAXPATHLEN]; + DIR *dirp; + + if (mountpoint && mountpoint[0] != '/') { + (void) fprintf(stderr, gettext("invalid mountpoint " + "'%s': must be an absolute path, 'legacy', or " + "'none'\n"), mountpoint); + goto errout; + } + + if (mountpoint == NULL) { + if (altroot != NULL) + (void) snprintf(buf, sizeof (buf), "%s/%s", + altroot, poolname); + else + (void) snprintf(buf, sizeof (buf), "/%s", + poolname); + } else { + if (altroot != NULL) + (void) snprintf(buf, sizeof (buf), "%s%s", + altroot, mountpoint); + else + (void) snprintf(buf, sizeof (buf), "%s", + mountpoint); + } + + if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { + (void) fprintf(stderr, gettext("mountpoint '%s' : " + "%s\n"), buf, strerror(errno)); + (void) fprintf(stderr, gettext("use '-m' " + "option to provide a different default\n")); + goto errout; + } else if (dirp) { + int count = 0; + + while (count < 3 && readdir(dirp) != NULL) + count++; + (void) closedir(dirp); + + if (count > 2) { + (void) fprintf(stderr, gettext("mountpoint " + "'%s' exists and is not empty\n"), buf); + (void) fprintf(stderr, gettext("use '-m' " + "option to provide a " + "different default\n")); + goto errout; + } + } + } + + if (dryrun) { + /* + * For a dry run invocation, print out a basic message and run + * through all the vdevs in the list and print out in an + * appropriate hierarchy. + */ + (void) printf(gettext("would create '%s' with the " + "following layout:\n\n"), poolname); + + print_vdev_tree(NULL, poolname, nvroot, 0, B_FALSE); + if (num_logs(nvroot) > 0) + print_vdev_tree(NULL, "logs", nvroot, 0, B_TRUE); + + ret = 0; + } else { + /* + * Hand off to libzfs. 
+ */ + if (zpool_create(g_zfs, poolname, + nvroot, props, fsprops) == 0) { + zfs_handle_t *pool = zfs_open(g_zfs, poolname, + ZFS_TYPE_FILESYSTEM); + if (pool != NULL) { + if (mountpoint != NULL) + verify(zfs_prop_set(pool, + zfs_prop_to_name( + ZFS_PROP_MOUNTPOINT), + mountpoint) == 0); + if (zfs_mount(pool, NULL, 0) == 0) + ret = zfs_shareall(pool); + zfs_close(pool); + } + } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { + (void) fprintf(stderr, gettext("pool name may have " + "been omitted\n")); + } + } + +errout: + nvlist_free(nvroot); + nvlist_free(fsprops); + nvlist_free(props); + return (ret); +badusage: + nvlist_free(fsprops); + nvlist_free(props); + usage(B_FALSE); + return (2); +} + +/* + * zpool destroy + * + * -f Forcefully unmount any datasets + * + * Destroy the given pool. Automatically unmounts any datasets in the pool. + */ +int +zpool_do_destroy(int argc, char **argv) +{ + boolean_t force = B_FALSE; + int c; + char *pool; + zpool_handle_t *zhp; + int ret; + + /* check options */ + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + pool = argv[0]; + + if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { + /* + * As a special case, check for use of '/' in the name, and + * direct the user to use 'zfs destroy' instead. 
+ */ + if (strchr(pool, '/') != NULL) + (void) fprintf(stderr, gettext("use 'zfs destroy' to " + "destroy a dataset\n")); + return (1); + } + + if (zpool_disable_datasets(zhp, force) != 0) { + (void) fprintf(stderr, gettext("could not destroy '%s': " + "could not unmount datasets\n"), zpool_get_name(zhp)); + return (1); + } + + ret = (zpool_destroy(zhp) != 0); + + zpool_close(zhp); + + return (ret); +} + +/* + * zpool export [-f] ... + * + * -f Forcefully unmount datasets + * + * Export the given pools. By default, the command will attempt to cleanly + * unmount any active datasets within the pool. If the '-f' flag is specified, + * then the datasets will be forcefully unmounted. + */ +int +zpool_do_export(int argc, char **argv) +{ + boolean_t force = B_FALSE; + boolean_t hardforce = B_FALSE; + int c; + zpool_handle_t *zhp; + int ret; + int i; + + /* check options */ + while ((c = getopt(argc, argv, "fF")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case 'F': + hardforce = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } + + ret = 0; + for (i = 0; i < argc; i++) { + if ((zhp = zpool_open_canfail(g_zfs, argv[i])) == NULL) { + ret = 1; + continue; + } + + if (zpool_disable_datasets(zhp, force) != 0) { + ret = 1; + zpool_close(zhp); + continue; + } + + if (hardforce) { + if (zpool_export_force(zhp) != 0) + ret = 1; + } else if (zpool_export(zhp, force) != 0) { + ret = 1; + } + + zpool_close(zhp); + } + + return (ret); +} + +/* + * Given a vdev configuration, determine the maximum width needed for the device + * name column. 
+ */ +static int +max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) +{ + char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE); + nvlist_t **child; + uint_t c, children; + int ret; + + if (strlen(name) + depth > max) + max = strlen(name) + depth; + + free(name); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max)) > max) + max = ret; + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max)) > max) + max = ret; + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max)) > max) + max = ret; + } + + + return (max); +} + +typedef struct spare_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; +} spare_cbdata_t; + +static boolean_t +find_vdev(nvlist_t *nv, uint64_t search) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + search == guid) + return (B_TRUE); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev(child[c], search)) + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +find_spare(zpool_handle_t *zhp, void *data) +{ + spare_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + if (find_vdev(nvroot, cbp->cb_guid)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Print out configuration state as requested by status_callback. 
+ */ +void +print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, + int namewidth, int depth, boolean_t isspare) +{ + nvlist_t **child; + uint_t c, children; + pool_scan_stat_t *ps = NULL; + vdev_stat_t *vs; + char rbuf[6], wbuf[6], cbuf[6]; + char *vname; + uint64_t notpresent; + spare_cbdata_t cb; + char *state; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + if (isspare) { + /* + * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for + * online drives. + */ + if (vs->vs_aux == VDEV_AUX_SPARED) + state = "INUSE"; + else if (vs->vs_state == VDEV_STATE_HEALTHY) + state = "AVAIL"; + } + + (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, + name, state); + + if (!isspare) { + zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); + zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); + zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); + (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + ¬present) == 0) { + char *path; + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + (void) printf(" was %s", path); + } else if (vs->vs_aux != 0) { + (void) printf(" "); + + switch (vs->vs_aux) { + case VDEV_AUX_OPEN_FAILED: + (void) printf(gettext("cannot open")); + break; + + case VDEV_AUX_BAD_GUID_SUM: + (void) printf(gettext("missing device")); + break; + + case VDEV_AUX_NO_REPLICAS: + (void) printf(gettext("insufficient replicas")); + break; + + case VDEV_AUX_VERSION_NEWER: + (void) printf(gettext("newer version")); + break; + + case VDEV_AUX_SPARED: + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, + &cb.cb_guid) == 0); + if (zpool_iter(g_zfs, find_spare, &cb) == 1) { + if (strcmp(zpool_get_name(cb.cb_zhp), + zpool_get_name(zhp)) == 0) + 
(void) printf(gettext("currently in " + "use")); + else + (void) printf(gettext("in use by " + "pool '%s'"), + zpool_get_name(cb.cb_zhp)); + zpool_close(cb.cb_zhp); + } else { + (void) printf(gettext("currently in use")); + } + break; + + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + + case VDEV_AUX_IO_FAILURE: + (void) printf(gettext("experienced I/O failures")); + break; + + case VDEV_AUX_BAD_LOG: + (void) printf(gettext("bad intent log")); + break; + + case VDEV_AUX_EXTERNAL: + (void) printf(gettext("external device fault")); + break; + + case VDEV_AUX_SPLIT_POOL: + (void) printf(gettext("split into new pool")); + break; + + default: + (void) printf(gettext("corrupted data")); + break; + } + } + + (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c); + + if (ps && ps->pss_state == DSS_SCANNING && + vs->vs_scan_processed != 0 && children == 0) { + (void) printf(gettext(" (%s)"), + (ps->pss_func == POOL_SCAN_RESILVER) ? + "resilvering" : "repairing"); + } + + (void) printf("\n"); + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE, ishole = B_FALSE; + + /* Don't print logs or holes here */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &islog); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + if (islog || ishole) + continue; + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); + print_status_config(zhp, vname, child[c], + namewidth, depth + 2, isspare); + free(vname); + } +} + + +/* + * Print the configuration of an exported pool. Iterate over all vdevs in the + * pool, printing out the name and status for each one. 
+ */ +void +print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) +{ + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + char *type, *vname; + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + if (strcmp(type, VDEV_TYPE_MISSING) == 0 || + strcmp(type, VDEV_TYPE_HOLE) == 0) + return; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + (void) printf("\t%*s%-*s", depth, "", namewidth - depth, name); + (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); + + if (vs->vs_aux != 0) { + (void) printf(" "); + + switch (vs->vs_aux) { + case VDEV_AUX_OPEN_FAILED: + (void) printf(gettext("cannot open")); + break; + + case VDEV_AUX_BAD_GUID_SUM: + (void) printf(gettext("missing device")); + break; + + case VDEV_AUX_NO_REPLICAS: + (void) printf(gettext("insufficient replicas")); + break; + + case VDEV_AUX_VERSION_NEWER: + (void) printf(gettext("newer version")); + break; + + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + + default: + (void) printf(gettext("corrupted data")); + break; + } + } + (void) printf("\n"); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + continue; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE); + print_import_config(vname, child[c], namewidth, depth + 2); + free(vname); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + (void) printf(gettext("\tcache\n")); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); + (void) printf("\t %s\n", vname); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + (void) printf(gettext("\tspares\n")); + for (c 
= 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); + (void) printf("\t %s\n", vname); + free(vname); + } + } +} + +/* + * Print log vdevs. + * Logs are recorded as top level vdevs in the main pool child array + * but with "is_log" set to 1. We use either print_status_config() or + * print_import_config() to print the top level logs then any log + * children (eg mirrored slogs) are printed recursively - which + * works because only the top level vdev is marked "is_log" + */ +static void +print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose) +{ + uint_t c, children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return; + + (void) printf(gettext("\tlogs\n")); + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + char *name; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (!is_log) + continue; + name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); + if (verbose) + print_status_config(zhp, name, child[c], namewidth, + 2, B_FALSE); + else + print_import_config(name, child[c], namewidth, 2); + free(name); + } +} + +/* + * Display the status for the given pool. 
+ */ +static void +show_import(nvlist_t *config) +{ + uint64_t pool_state; + vdev_stat_t *vs; + char *name; + uint64_t guid; + char *msgid; + nvlist_t *nvroot; + int reason; + const char *health; + uint_t vsc; + int namewidth; + + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + health = zpool_state_to_name(vs->vs_state, vs->vs_aux); + + reason = zpool_import_status(config, &msgid); + + (void) printf(gettext(" pool: %s\n"), name); + (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); + (void) printf(gettext(" state: %s"), health); + if (pool_state == POOL_STATE_DESTROYED) + (void) printf(gettext(" (DESTROYED)")); + (void) printf("\n"); + + switch (reason) { + case ZPOOL_STATUS_MISSING_DEV_R: + case ZPOOL_STATUS_MISSING_DEV_NR: + case ZPOOL_STATUS_BAD_GUID_SUM: + (void) printf(gettext("status: One or more devices are missing " + "from the system.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_R: + case ZPOOL_STATUS_CORRUPT_LABEL_NR: + (void) printf(gettext("status: One or more devices contains " + "corrupted data.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_DATA: + (void) printf(gettext("status: The pool data is corrupted.\n")); + break; + + case ZPOOL_STATUS_OFFLINE_DEV: + (void) printf(gettext("status: One or more devices " + "are offlined.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_POOL: + (void) printf(gettext("status: The pool metadata is " + "corrupted.\n")); + break; + + case ZPOOL_STATUS_VERSION_OLDER: + (void) printf(gettext("status: The pool is formatted using an " + "older on-disk version.\n")); + break; + + case ZPOOL_STATUS_VERSION_NEWER: + (void) printf(gettext("status: 
The pool is formatted using an " + "incompatible version.\n")); + break; + + case ZPOOL_STATUS_HOSTID_MISMATCH: + (void) printf(gettext("status: The pool was last accessed by " + "another system.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_R: + case ZPOOL_STATUS_FAULTED_DEV_NR: + (void) printf(gettext("status: One or more devices are " + "faulted.\n")); + break; + + case ZPOOL_STATUS_BAD_LOG: + (void) printf(gettext("status: An intent log record cannot be " + "read.\n")); + break; + + case ZPOOL_STATUS_RESILVERING: + (void) printf(gettext("status: One or more devices were being " + "resilvered.\n")); + break; + + default: + /* + * No other status can be seen when importing pools. + */ + assert(reason == ZPOOL_STATUS_OK); + } + + /* + * Print out an action according to the overall state of the pool. + */ + if (vs->vs_state == VDEV_STATE_HEALTHY) { + if (reason == ZPOOL_STATUS_VERSION_OLDER) + (void) printf(gettext("action: The pool can be " + "imported using its name or numeric identifier, " + "though\n\tsome features will not be available " + "without an explicit 'zpool upgrade'.\n")); + else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) + (void) printf(gettext("action: The pool can be " + "imported using its name or numeric " + "identifier and\n\tthe '-f' flag.\n")); + else + (void) printf(gettext("action: The pool can be " + "imported using its name or numeric " + "identifier.\n")); + } else if (vs->vs_state == VDEV_STATE_DEGRADED) { + (void) printf(gettext("action: The pool can be imported " + "despite missing or damaged devices. The\n\tfault " + "tolerance of the pool may be compromised if imported.\n")); + } else { + switch (reason) { + case ZPOOL_STATUS_VERSION_NEWER: + (void) printf(gettext("action: The pool cannot be " + "imported. 
Access the pool on a system running " + "newer\n\tsoftware, or recreate the pool from " + "backup.\n")); + break; + case ZPOOL_STATUS_MISSING_DEV_R: + case ZPOOL_STATUS_MISSING_DEV_NR: + case ZPOOL_STATUS_BAD_GUID_SUM: + (void) printf(gettext("action: The pool cannot be " + "imported. Attach the missing\n\tdevices and try " + "again.\n")); + break; + default: + (void) printf(gettext("action: The pool cannot be " + "imported due to damaged devices or data.\n")); + } + } + + /* + * If the state is "closed" or "can't open", and the aux state + * is "corrupt data": + */ + if (((vs->vs_state == VDEV_STATE_CLOSED) || + (vs->vs_state == VDEV_STATE_CANT_OPEN)) && + (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { + if (pool_state == POOL_STATE_DESTROYED) + (void) printf(gettext("\tThe pool was destroyed, " + "but can be imported using the '-Df' flags.\n")); + else if (pool_state != POOL_STATE_EXPORTED) + (void) printf(gettext("\tThe pool may be active on " + "another system, but can be imported using\n\t" + "the '-f' flag.\n")); + } + + if (msgid != NULL) + (void) printf(gettext(" see: http://www.sun.com/msg/%s\n"), + msgid); + + (void) printf(gettext("config:\n\n")); + + namewidth = max_width(NULL, nvroot, 0, 0); + if (namewidth < 10) + namewidth = 10; + + print_import_config(name, nvroot, namewidth, 0); + if (num_logs(nvroot) > 0) + print_logs(NULL, nvroot, namewidth, B_FALSE); + + if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { + (void) printf(gettext("\n\tAdditional devices are known to " + "be part of this pool, though their\n\texact " + "configuration cannot be determined.\n")); + } +} + +/* + * Perform the import for the given configuration. This passes the heavy + * lifting off to zpool_import_props(), and then mounts the datasets contained + * within the pool. 
+ */
+static int
+do_import(nvlist_t *config, const char *newname, const char *mntopts,
+    nvlist_t *props, int flags)
+{
+	zpool_handle_t *zhp;
+	char *name;
+	uint64_t state;
+	uint64_t version;
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &name) == 0);
+
+	verify(nvlist_lookup_uint64(config,
+	    ZPOOL_CONFIG_POOL_STATE, &state) == 0);
+	verify(nvlist_lookup_uint64(config,
+	    ZPOOL_CONFIG_VERSION, &version) == 0);
+	if (version > SPA_VERSION) {
+		/* The on-disk format is newer than this binary supports. */
+		(void) fprintf(stderr, gettext("cannot import '%s': pool "
+		    "is formatted using a newer ZFS version\n"), name);
+		return (1);
+	} else if (state != POOL_STATE_EXPORTED &&
+	    !(flags & ZFS_IMPORT_ANY_HOST)) {
+		uint64_t hostid;
+
+		/*
+		 * The pool was not cleanly exported and the caller did not
+		 * set ZFS_IMPORT_ANY_HOST: refuse unless the recorded hostid
+		 * matches this machine.
+		 */
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID,
+		    &hostid) == 0) {
+			if ((unsigned long)hostid != gethostid()) {
+				char *hostname;
+				uint64_t timestamp;
+				time_t t;
+
+				verify(nvlist_lookup_string(config,
+				    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
+				verify(nvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_TIMESTAMP, &timestamp) == 0);
+				t = timestamp;
+				/* asctime() supplies the trailing newline. */
+				(void) fprintf(stderr, gettext("cannot import "
+				    "'%s': pool may be in use from other "
+				    "system, it was last accessed by %s "
+				    "(hostid: 0x%lx) on %s"), name, hostname,
+				    (unsigned long)hostid,
+				    asctime(localtime(&t)));
+				(void) fprintf(stderr, gettext("use '-f' to "
+				    "import anyway\n"));
+				return (1);
+			}
+		} else {
+			/* No hostid recorded: ownership cannot be checked. */
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "pool may be in use from other system\n"), name);
+			(void) fprintf(stderr, gettext("use '-f' to import "
+			    "anyway\n"));
+			return (1);
+		}
+	}
+
+	if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
+		return (1);
+
+	/* From here on the pool is known by its new name, if one was given. */
+	if (newname != NULL)
+		name = (char *)newname;
+
+	if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
+		return (1);
+
+	/*
+	 * Enable the datasets unless the pool is unavailable or the caller
+	 * set ZFS_IMPORT_ONLY.
+	 */
+	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+	    !(flags & ZFS_IMPORT_ONLY) &&
+	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+		zpool_close(zhp);
+		return (1);
+	}
+
+	zpool_close(zhp);
+	return (0);
+}
+
+/*
+ * zpool import [-d dir] [-D]
+ *       import [-o mntopts] [-o prop=value] ... [-R root] [-D]
+ *              [-d dir | -c cachefile] [-f] -a
+ *       import [-o mntopts] [-o prop=value] ... [-R root] [-D]
+ *              [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool]
+ *
+ *	-c	Read pool information from a cachefile instead of searching
+ *		devices.
+ *
+ *	-d	Scan in a specific directory, other than /dev/dsk.  More than
+ *		one directory can be specified using multiple '-d' options.
+ *
+ *	-D	Scan for previously destroyed pools or import all or only
+ *		specified destroyed pools.
+ *
+ *	-R	Temporarily import the pool, with all mountpoints relative to
+ *		the given root.  The pool will remain exported when the machine
+ *		is rebooted.
+ *
+ *	-V	Import even in the presence of faulted vdevs.  This is an
+ *		intentionally undocumented option for testing purposes, and
+ *		treats the pool configuration as complete, leaving any bad
+ *		vdevs in the FAULTED state.  In other words, it does verbatim
+ *		import.
+ *
+ *	-f	Force import, even if it appears that the pool is active.
+ *
+ *	-F	Attempt rewind if necessary.
+ *
+ *	-n	See if rewind would work, but don't actually rewind.
+ *
+ *	-N	Import the pool but don't mount datasets.
+ *
+ *	-T	Specify a starting txg to use for import.  This is an
+ *		intentionally undocumented option for testing purposes.
+ *
+ *	-a	Import all pools found.
+ *
+ *	-o	Set property=value and/or temporary mount options (without '=').
+ *
+ * The import command scans for pools to import, and imports pools based on
+ * pool name and GUID.  The pool can also be renamed as part of the import
+ * process.
+ */ +int +zpool_do_import(int argc, char **argv) +{ + char **searchdirs = NULL; + int nsearch = 0; + int c; + int err = 0; + nvlist_t *pools = NULL; + boolean_t do_all = B_FALSE; + boolean_t do_destroyed = B_FALSE; + char *mntopts = NULL; + nvpair_t *elem; + nvlist_t *config; + uint64_t searchguid = 0; + char *searchname = NULL; + char *propval; + nvlist_t *found_config; + nvlist_t *policy = NULL; + nvlist_t *props = NULL; + boolean_t first; + int flags = ZFS_IMPORT_NORMAL; + uint32_t rewind_policy = ZPOOL_NO_REWIND; + boolean_t dryrun = B_FALSE; + boolean_t do_rewind = B_FALSE; + boolean_t xtreme_rewind = B_FALSE; + uint64_t pool_state, txg = -1ULL; + char *cachefile = NULL; + importargs_t idata = { 0 }; + char *endptr; + + /* check options */ + while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX")) != -1) { + switch (c) { + case 'a': + do_all = B_TRUE; + break; + case 'c': + cachefile = optarg; + break; + case 'd': + if (searchdirs == NULL) { + searchdirs = safe_malloc(sizeof (char *)); + } else { + char **tmp = safe_malloc((nsearch + 1) * + sizeof (char *)); + bcopy(searchdirs, tmp, nsearch * + sizeof (char *)); + free(searchdirs); + searchdirs = tmp; + } + searchdirs[nsearch++] = optarg; + break; + case 'D': + do_destroyed = B_TRUE; + break; + case 'f': + flags |= ZFS_IMPORT_ANY_HOST; + break; + case 'F': + do_rewind = B_TRUE; + break; + case 'm': + flags |= ZFS_IMPORT_MISSING_LOG; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'N': + flags |= ZFS_IMPORT_ONLY; + break; + case 'o': + if ((propval = strchr(optarg, '=')) != NULL) { + *propval = '\0'; + propval++; + if (add_prop_list(optarg, propval, + &props, B_TRUE)) + goto error; + } else { + mntopts = optarg; + } + break; + case 'R': + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) + goto error; + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), + &propval) == 0) + break; + if (add_prop_list(zpool_prop_to_name( + 
ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto error; + break; + case 'T': + errno = 0; + txg = strtoull(optarg, &endptr, 10); + if (errno != 0 || *endptr != '\0') { + (void) fprintf(stderr, + gettext("invalid txg value\n")); + usage(B_FALSE); + } + rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; + break; + case 'V': + flags |= ZFS_IMPORT_VERBATIM; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (cachefile && nsearch != 0) { + (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); + usage(B_FALSE); + } + + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In the future, we can capture further policy and include it here */ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) + goto error; + + if (searchdirs == NULL) { + searchdirs = safe_malloc(sizeof (char *)); + searchdirs[0] = "/dev/dsk"; + nsearch = 1; + } + + /* check argument count */ + if (do_all) { + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + } else { + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* + * Check for the SYS_CONFIG privilege. We do this explicitly + * here because otherwise any attempt to discover pools will + * silently fail. 
+ */ + if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) { + (void) fprintf(stderr, gettext("cannot " + "discover pools: permission denied\n")); + free(searchdirs); + nvlist_free(policy); + return (1); + } + } + + /* + * Depending on the arguments given, we do one of the following: + * + * Iterate through all pools and display information about + * each one. + * + * -a Iterate through all pools and try to import each one. + * + * Find the pool that corresponds to the given GUID/pool + * name and import that one. + * + * -D Above options applies only to destroyed pools. + */ + if (argc != 0) { + char *endptr; + + errno = 0; + searchguid = strtoull(argv[0], &endptr, 10); + if (errno != 0 || *endptr != '\0') + searchname = argv[0]; + found_config = NULL; + + /* + * User specified a name or guid. Ensure it's unique. + */ + idata.unique = B_TRUE; + } + + + idata.path = searchdirs; + idata.paths = nsearch; + idata.poolname = searchname; + idata.guid = searchguid; + idata.cachefile = cachefile; + + pools = zpool_search_import(g_zfs, &idata); + + if (pools != NULL && idata.exists && + (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "a pool with that name already exists\n"), + argv[0]); + (void) fprintf(stderr, gettext("use the form '%s " + " ' to give it a new name\n"), + "zpool import"); + err = 1; + } else if (pools == NULL && idata.exists) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "a pool with that name is already created/imported,\n"), + argv[0]); + (void) fprintf(stderr, gettext("and no additional pools " + "with that name were found\n")); + err = 1; + } else if (pools == NULL) { + if (argc != 0) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "no such pool available\n"), argv[0]); + } + err = 1; + } + + if (err == 1) { + free(searchdirs); + nvlist_free(policy); + return (1); + } + + /* + * At this point we have a list of import candidate configs. 
Even if + * we were searching by pool name or guid, we still need to + * post-process the list to deal with pool state and possible + * duplicate names. + */ + err = 0; + elem = NULL; + first = B_TRUE; + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + + verify(nvpair_value_nvlist(elem, &config) == 0); + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) + continue; + if (do_destroyed && pool_state != POOL_STATE_DESTROYED) + continue; + + verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY, + policy) == 0); + + if (argc == 0) { + if (first) + first = B_FALSE; + else if (!do_all) + (void) printf("\n"); + + if (do_all) { + err |= do_import(config, NULL, mntopts, + props, flags); + } else { + show_import(config); + } + } else if (searchname != NULL) { + char *name; + + /* + * We are searching for a pool based on name. + */ + verify(nvlist_lookup_string(config, + ZPOOL_CONFIG_POOL_NAME, &name) == 0); + + if (strcmp(name, searchname) == 0) { + if (found_config != NULL) { + (void) fprintf(stderr, gettext( + "cannot import '%s': more than " + "one matching pool\n"), searchname); + (void) fprintf(stderr, gettext( + "import by numeric ID instead\n")); + err = B_TRUE; + } + found_config = config; + } + } else { + uint64_t guid; + + /* + * Search for a pool by guid. + */ + verify(nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_GUID, &guid) == 0); + + if (guid == searchguid) + found_config = config; + } + } + + /* + * If we were searching for a specific pool, verify that we found a + * pool, and then do the import. + */ + if (argc != 0 && err == 0) { + if (found_config == NULL) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "no such pool available\n"), argv[0]); + err = B_TRUE; + } else { + err |= do_import(found_config, argc == 1 ? 
NULL : + argv[1], mntopts, props, flags); + } + } + + /* + * If we were just looking for pools, report an error if none were + * found. + */ + if (argc == 0 && first) + (void) fprintf(stderr, + gettext("no pools available to import\n")); + +error: + nvlist_free(props); + nvlist_free(pools); + nvlist_free(policy); + free(searchdirs); + + return (err ? 1 : 0); +} + +typedef struct iostat_cbdata { + zpool_list_t *cb_list; + int cb_verbose; + int cb_iteration; + int cb_namewidth; +} iostat_cbdata_t; + +static void +print_iostat_separator(iostat_cbdata_t *cb) +{ + int i = 0; + + for (i = 0; i < cb->cb_namewidth; i++) + (void) printf("-"); + (void) printf(" ----- ----- ----- ----- ----- -----\n"); +} + +static void +print_iostat_header(iostat_cbdata_t *cb) +{ + (void) printf("%*s capacity operations bandwidth\n", + cb->cb_namewidth, ""); + (void) printf("%-*s alloc free read write read write\n", + cb->cb_namewidth, "pool"); + print_iostat_separator(cb); +} + +/* + * Display a single statistic. + */ +static void +print_one_stat(uint64_t value) +{ + char buf[64]; + + zfs_nicenum(value, buf, sizeof (buf)); + (void) printf(" %5s", buf); +} + +/* + * Print out all the statistics for the given vdev. This can either be the + * toplevel configuration, or called recursively. If 'name' is NULL, then this + * is a verbose output, and we don't want to display the toplevel pool stats. 
+ */ +void +print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, + nvlist_t *newnv, iostat_cbdata_t *cb, int depth) +{ + nvlist_t **oldchild, **newchild; + uint_t c, children; + vdev_stat_t *oldvs, *newvs; + vdev_stat_t zerovs = { 0 }; + uint64_t tdelta; + double scale; + char *vname; + + if (oldnv != NULL) { + verify(nvlist_lookup_uint64_array(oldnv, + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); + } else { + oldvs = &zerovs; + } + + verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&newvs, &c) == 0); + + if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - depth), ""); + + tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; + + if (tdelta == 0) + scale = 1.0; + else + scale = (double)NANOSEC / tdelta; + + /* only toplevel vdevs have capacity stats */ + if (newvs->vs_space == 0) { + (void) printf(" - -"); + } else { + print_one_stat(newvs->vs_alloc); + print_one_stat(newvs->vs_space - newvs->vs_alloc); + } + + print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] - + oldvs->vs_ops[ZIO_TYPE_READ]))); + + print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] - + oldvs->vs_ops[ZIO_TYPE_WRITE]))); + + print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] - + oldvs->vs_bytes[ZIO_TYPE_READ]))); + + print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] - + oldvs->vs_bytes[ZIO_TYPE_WRITE]))); + + (void) printf("\n"); + + if (!cb->cb_verbose) + return; + + if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, + &newchild, &children) != 0) + return; + + if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, + &oldchild, &c) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t ishole = B_FALSE; + + if (nvlist_lookup_uint64(newchild[c], + ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) + continue; + + vname = 
zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); + print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, + newchild[c], cb, depth + 2); + free(vname); + } + + /* + * Include level 2 ARC devices in iostat output + */ + if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, + &newchild, &children) != 0) + return; + + if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, + &oldchild, &c) != 0) + return; + + if (children > 0) { + (void) printf("%-*s - - - - - " + "-\n", cb->cb_namewidth, "cache"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + B_FALSE); + print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, + newchild[c], cb, depth + 2); + free(vname); + } + } +} + +static int +refresh_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + boolean_t missing; + + /* + * If the pool has disappeared, remove it from the list and continue. + */ + if (zpool_refresh_stats(zhp, &missing) != 0) + return (-1); + + if (missing) + pool_list_remove(cb->cb_list, zhp); + + return (0); +} + +/* + * Callback to print out the iostats for the given pool. + */ +int +print_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + nvlist_t *oldconfig, *newconfig; + nvlist_t *oldnvroot, *newnvroot; + + newconfig = zpool_get_config(zhp, &oldconfig); + + if (cb->cb_iteration == 1) + oldconfig = NULL; + + verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, + &newnvroot) == 0); + + if (oldconfig == NULL) + oldnvroot = NULL; + else + verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, + &oldnvroot) == 0); + + /* + * Print out the statistics for the pool. 
+ */ + print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); + + if (cb->cb_verbose) + print_iostat_separator(cb); + + return (0); +} + +int +get_namewidth(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + nvlist_t *config, *nvroot; + + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (!cb->cb_verbose) + cb->cb_namewidth = strlen(zpool_get_name(zhp)); + else + cb->cb_namewidth = max_width(zhp, nvroot, 0, 0); + } + + /* + * The width must fall into the range [10,38]. The upper limit is the + * maximum we can have and still fit in 80 columns. + */ + if (cb->cb_namewidth < 10) + cb->cb_namewidth = 10; + if (cb->cb_namewidth > 38) + cb->cb_namewidth = 38; + + return (0); +} + +/* + * Parse the input string, get the 'interval' and 'count' value if there is one. + */ +static void +get_interval_count(int *argcp, char **argv, unsigned long *iv, + unsigned long *cnt) +{ + unsigned long interval = 0, count = 0; + int argc = *argcp, errno; + + /* + * Determine if the last argument is an integer or a pool name + */ + if (argc > 0 && isdigit(argv[argc - 1][0])) { + char *end; + + errno = 0; + interval = strtoul(argv[argc - 1], &end, 10); + + if (*end == '\0' && errno == 0) { + if (interval == 0) { + (void) fprintf(stderr, gettext("interval " + "cannot be zero\n")); + usage(B_FALSE); + } + /* + * Ignore the last parameter + */ + argc--; + } else { + /* + * If this is not a valid number, just plow on. The + * user will get a more informative error message later + * on. + */ + interval = 0; + } + } + + /* + * If the last argument is also an integer, then we have both a count + * and an interval. 
+ */ + if (argc > 0 && isdigit(argv[argc - 1][0])) { + char *end; + + errno = 0; + count = interval; + interval = strtoul(argv[argc - 1], &end, 10); + + if (*end == '\0' && errno == 0) { + if (interval == 0) { + (void) fprintf(stderr, gettext("interval " + "cannot be zero\n")); + usage(B_FALSE); + } + + /* + * Ignore the last parameter + */ + argc--; + } else { + interval = 0; + } + } + + *iv = interval; + *cnt = count; + *argcp = argc; +} + +static void +get_timestamp_arg(char c) +{ + if (c == 'u') + timestamp_fmt = UDATE; + else if (c == 'd') + timestamp_fmt = DDATE; + else + usage(B_FALSE); +} + +/* + * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]] + * + * -v Display statistics for individual vdevs + * -T Display a timestamp in date(1) or Unix format + * + * This command can be tricky because we want to be able to deal with pool + * creation/destruction as well as vdev configuration changes. The bulk of this + * processing is handled by the pool_list_* routines in zpool_iter.c. We rely + * on pool_list_update() to detect the addition of new pools. Configuration + * changes are all handled within libzfs. + */ +int +zpool_do_iostat(int argc, char **argv) +{ + int c; + int ret; + int npools; + unsigned long interval = 0, count = 0; + zpool_list_t *list; + boolean_t verbose = B_FALSE; + iostat_cbdata_t cb; + + /* check options */ + while ((c = getopt(argc, argv, "T:v")) != -1) { + switch (c) { + case 'T': + get_timestamp_arg(*optarg); + break; + case 'v': + verbose = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + + /* + * Construct the list of all interesting pools. 
+ */ + ret = 0; + if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) + return (1); + + if (pool_list_count(list) == 0 && argc != 0) { + pool_list_free(list); + return (1); + } + + if (pool_list_count(list) == 0 && interval == 0) { + pool_list_free(list); + (void) fprintf(stderr, gettext("no pools available\n")); + return (1); + } + + /* + * Enter the main iostat loop. + */ + cb.cb_list = list; + cb.cb_verbose = verbose; + cb.cb_iteration = 0; + cb.cb_namewidth = 0; + + for (;;) { + pool_list_update(list); + + if ((npools = pool_list_count(list)) == 0) + break; + + /* + * Refresh all statistics. This is done as an explicit step + * before calculating the maximum name width, so that any + * configuration changes are properly accounted for. + */ + (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); + + /* + * Iterate over all pools to determine the maximum width + * for the pool / device name column across all pools. + */ + cb.cb_namewidth = 0; + (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + /* + * If it's the first time, or verbose mode, print the header. + */ + if (++cb.cb_iteration == 1 || verbose) + print_iostat_header(&cb); + + (void) pool_list_iter(list, B_FALSE, print_iostat, &cb); + + /* + * If there's more than one pool, and we're not in verbose mode + * (which prints a separator for us), then print a separator. + */ + if (npools > 1 && !verbose) + print_iostat_separator(&cb); + + if (verbose) + (void) printf("\n"); + + /* + * Flush the output so that redirection to a file isn't buffered + * indefinitely. 
+ */ + (void) fflush(stdout); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) sleep(interval); + } + + pool_list_free(list); + + return (ret); +} + +typedef struct list_cbdata { + boolean_t cb_scripted; + boolean_t cb_first; + zprop_list_t *cb_proplist; +} list_cbdata_t; + +/* + * Given a list of columns to display, output appropriate headers for each one. + */ +static void +print_header(zprop_list_t *pl) +{ + const char *header; + boolean_t first = B_TRUE; + boolean_t right_justify; + + for (; pl != NULL; pl = pl->pl_next) { + if (pl->pl_prop == ZPROP_INVAL) + continue; + + if (!first) + (void) printf(" "); + else + first = B_FALSE; + + header = zpool_prop_column_name(pl->pl_prop); + right_justify = zpool_prop_align_right(pl->pl_prop); + + if (pl->pl_next == NULL && !right_justify) + (void) printf("%s", header); + else if (right_justify) + (void) printf("%*s", pl->pl_width, header); + else + (void) printf("%-*s", pl->pl_width, header); + } + + (void) printf("\n"); +} + +/* + * Given a pool and a list of properties, print out all the properties according + * to the described layout. + */ +static void +print_pool(zpool_handle_t *zhp, zprop_list_t *pl, int scripted) +{ + boolean_t first = B_TRUE; + char property[ZPOOL_MAXPROPLEN]; + char *propstr; + boolean_t right_justify; + int width; + + for (; pl != NULL; pl = pl->pl_next) { + if (!first) { + if (scripted) + (void) printf("\t"); + else + (void) printf(" "); + } else { + first = B_FALSE; + } + + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + if (zpool_get_prop(zhp, pl->pl_prop, property, + sizeof (property), NULL) != 0) + propstr = "-"; + else + propstr = property; + + right_justify = zpool_prop_align_right(pl->pl_prop); + } else { + propstr = "-"; + } + + width = pl->pl_width; + + /* + * If this is being called in scripted mode, or if this is the + * last column and it is left-justified, don't include a width + * format specifier. 
+ */ + if (scripted || (pl->pl_next == NULL && !right_justify)) + (void) printf("%s", propstr); + else if (right_justify) + (void) printf("%*s", width, propstr); + else + (void) printf("%-*s", width, propstr); + } + + (void) printf("\n"); +} + +/* + * Generic callback function to list a pool. + */ +int +list_callback(zpool_handle_t *zhp, void *data) +{ + list_cbdata_t *cbp = data; + + if (cbp->cb_first) { + if (!cbp->cb_scripted) + print_header(cbp->cb_proplist); + cbp->cb_first = B_FALSE; + } + + print_pool(zhp, cbp->cb_proplist, cbp->cb_scripted); + + return (0); +} + +/* + * zpool list [-H] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] + * + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -o List of properties to display. Defaults to + * "name,size,allocated,free,capacity,health,altroot" + * -T Display a timestamp in date(1) or Unix format + * + * List all pools in the system, whether or not they're healthy. Output space + * statistics for each one, as well as health status summary. 
+ */ +int +zpool_do_list(int argc, char **argv) +{ + int c; + int ret; + list_cbdata_t cb = { 0 }; + static char default_props[] = + "name,size,allocated,free,capacity,dedupratio,health,altroot"; + char *props = default_props; + unsigned long interval = 0, count = 0; + + /* check options */ + while ((c = getopt(argc, argv, ":Ho:T:")) != -1) { + switch (c) { + case 'H': + cb.cb_scripted = B_TRUE; + break; + case 'o': + props = optarg; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + + if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) + usage(B_FALSE); + + cb.cb_first = B_TRUE; + + for (;;) { + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, + list_callback, &cb); + + if (argc == 0 && cb.cb_first && !cb.cb_scripted) { + (void) printf(gettext("no pools available\n")); + zprop_free_list(cb.cb_proplist); + return (0); + } + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) sleep(interval); + } + + zprop_free_list(cb.cb_proplist); + return (ret); +} + +static nvlist_t * +zpool_get_vdev_by_name(nvlist_t *nv, char *name) +{ + nvlist_t **child; + uint_t c, children; + nvlist_t *match; + char *path; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + if (strncmp(name, "/dev/dsk/", 9) == 0) + name += 9; + if (strncmp(path, "/dev/dsk/", 9) == 0) + path += 9; + if (strcmp(name, path) == 0) + return (nv); + return (NULL); + } + + for (c = 0; c < children; c++) + if ((match = zpool_get_vdev_by_name(child[c], 
name)) != NULL) + return (match); + + return (NULL); +} + +static int +zpool_do_attach_or_replace(int argc, char **argv, int replacing) +{ + boolean_t force = B_FALSE; + int c; + nvlist_t *nvroot; + char *poolname, *old_disk, *new_disk; + zpool_handle_t *zhp; + int ret; + + /* check options */ + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if (argc < 2) { + (void) fprintf(stderr, + gettext("missing specification\n")); + usage(B_FALSE); + } + + old_disk = argv[1]; + + if (argc < 3) { + if (!replacing) { + (void) fprintf(stderr, + gettext("missing specification\n")); + usage(B_FALSE); + } + new_disk = old_disk; + argc -= 1; + argv += 1; + } else { + new_disk = argv[2]; + argc -= 2; + argv += 2; + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + if (zpool_get_config(zhp, NULL) == NULL) { + (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), + poolname); + zpool_close(zhp); + return (1); + } + + nvroot = make_root_vdev(zhp, force, B_FALSE, replacing, B_FALSE, + argc, argv); + if (nvroot == NULL) { + zpool_close(zhp); + return (1); + } + + ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing); + + nvlist_free(nvroot); + zpool_close(zhp); + + return (ret); +} + +/* + * zpool replace [-f] + * + * -f Force attach, even if appears to be in use. + * + * Replace with . 
+ */ +/* ARGSUSED */ +int +zpool_do_replace(int argc, char **argv) +{ + return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); +} + +/* + * zpool attach [-f] + * + * -f Force attach, even if appears to be in use. + * + * Attach to the mirror containing . If is not + * part of a mirror, then will be transformed into a mirror of + * and . In either case, will begin life + * with a DTL of [0, now], and will immediately begin to resilver itself. + */ +int +zpool_do_attach(int argc, char **argv) +{ + return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); +} + +/* + * zpool detach [-f] + * + * -f Force detach of , even if DTLs argue against it + * (not supported yet) + * + * Detach a device from a mirror. The operation will be refused if + * is the last device in the mirror, or if the DTLs indicate that this device + * has the only valid copy of some data. + */ +/* ARGSUSED */ +int +zpool_do_detach(int argc, char **argv) +{ + int c; + char *poolname, *path; + zpool_handle_t *zhp; + int ret; + + /* check options */ + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + if (argc < 2) { + (void) fprintf(stderr, + gettext("missing specification\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + path = argv[1]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + ret = zpool_vdev_detach(zhp, path); + + zpool_close(zhp); + + return (ret); +} + +/* + * zpool split [-n] [-o prop=val] ... + * [-o mntopt] ... + * [-R altroot] [ ...] + * + * -n Do not split the pool, but display the resulting layout if + * it were to be split. + * -o Set property=value, or set mount options. + * -R Mount the split-off pool under an alternate root. 
+ * + * Splits the named pool and gives it the new pool name. Devices to be split + * off may be listed, provided that no more than one device is specified + * per top-level vdev mirror. The newly split pool is left in an exported + * state unless -R is specified. + * + * Restrictions: the top-level of the pool pool must only be made up of + * mirrors; all devices in the pool must be healthy; no device may be + * undergoing a resilvering operation. + */ +int +zpool_do_split(int argc, char **argv) +{ + char *srcpool, *newpool, *propval; + char *mntopts = NULL; + splitflags_t flags; + int c, ret = 0; + zpool_handle_t *zhp; + nvlist_t *config, *props = NULL; + + flags.dryrun = B_FALSE; + flags.import = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, ":R:no:")) != -1) { + switch (c) { + case 'R': + flags.import = B_TRUE; + if (add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, + &props, B_TRUE) != 0) { + if (props) + nvlist_free(props); + usage(B_FALSE); + } + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'o': + if ((propval = strchr(optarg, '=')) != NULL) { + *propval = '\0'; + propval++; + if (add_prop_list(optarg, propval, + &props, B_TRUE) != 0) { + if (props) + nvlist_free(props); + usage(B_FALSE); + } + } else { + mntopts = optarg; + } + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + break; + } + } + + if (!flags.import && mntopts != NULL) { + (void) fprintf(stderr, gettext("setting mntopts is only " + "valid when importing the pool\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("Missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("Missing new pool name\n")); + usage(B_FALSE); + } + + srcpool = argv[0]; + newpool = 
argv[1]; + + argc -= 2; + argv += 2; + + if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) + return (1); + + config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); + if (config == NULL) { + ret = 1; + } else { + if (flags.dryrun) { + (void) printf(gettext("would create '%s' with the " + "following layout:\n\n"), newpool); + print_vdev_tree(NULL, newpool, config, 0, B_FALSE); + } + nvlist_free(config); + } + + zpool_close(zhp); + + if (ret != 0 || flags.dryrun || !flags.import) + return (ret); + + /* + * The split was successful. Now we need to open the new + * pool and import it. + */ + if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) + return (1); + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + zpool_enable_datasets(zhp, mntopts, 0) != 0) { + ret = 1; + (void) fprintf(stderr, gettext("Split was succssful, but " + "the datasets could not all be mounted\n")); + (void) fprintf(stderr, gettext("Try doing '%s' with a " + "different altroot\n"), "zpool import"); + } + zpool_close(zhp); + + return (ret); +} + + + +/* + * zpool online ... 
+ */ +int +zpool_do_online(int argc, char **argv) +{ + int c, i; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; + vdev_state_t newstate; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, "et")) != -1) { + switch (c) { + case 'e': + flags |= ZFS_ONLINE_EXPAND; + break; + case 't': + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing device name\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + for (i = 1; i < argc; i++) { + if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { + if (newstate != VDEV_STATE_HEALTHY) { + (void) printf(gettext("warning: device '%s' " + "onlined, but remains in faulted state\n"), + argv[i]); + if (newstate == VDEV_STATE_FAULTED) + (void) printf(gettext("use 'zpool " + "clear' to restore a faulted " + "device\n")); + else + (void) printf(gettext("use 'zpool " + "replace' to replace devices " + "that are no longer present\n")); + } + } else { + ret = 1; + } + } + + zpool_close(zhp); + + return (ret); +} + +/* + * zpool offline [-ft] ... + * + * -f Force the device into the offline state, even if doing + * so would appear to compromise pool availability. + * (not supported yet) + * + * -t Only take the device off-line temporarily. The offline + * state will not be persistent across reboots. 
+ */ +/* ARGSUSED */ +int +zpool_do_offline(int argc, char **argv) +{ + int c, i; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; + boolean_t istmp = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "ft")) != -1) { + switch (c) { + case 't': + istmp = B_TRUE; + break; + case 'f': + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing device name\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + for (i = 1; i < argc; i++) { + if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) + ret = 1; + } + + zpool_close(zhp); + + return (ret); +} + +/* + * zpool clear [device] + * + * Clear all errors associated with a pool or a particular device. 
+ */ +int +zpool_do_clear(int argc, char **argv) +{ + int c; + int ret = 0; + boolean_t dryrun = B_FALSE; + boolean_t do_rewind = B_FALSE; + boolean_t xtreme_rewind = B_FALSE; + uint32_t rewind_policy = ZPOOL_NO_REWIND; + nvlist_t *policy = NULL; + zpool_handle_t *zhp; + char *pool, *device; + + /* check options */ + while ((c = getopt(argc, argv, "FnX")) != -1) { + switch (c) { + case 'F': + do_rewind = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In future, further rewind policy choices can be passed along here */ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) + return (1); + + pool = argv[0]; + device = argc == 2 ? argv[1] : NULL; + + if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { + nvlist_free(policy); + return (1); + } + + if (zpool_clear(zhp, device, policy) != 0) + ret = 1; + + zpool_close(zhp); + + nvlist_free(policy); + + return (ret); +} + +typedef struct scrub_cbdata { + int cb_type; + int cb_argc; + char **cb_argv; +} scrub_cbdata_t; + +int +scrub_callback(zpool_handle_t *zhp, void *data) +{ + scrub_cbdata_t *cb = data; + int err; + + /* + * Ignore faulted pools. 
+ */ + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + (void) fprintf(stderr, gettext("cannot scrub '%s': pool is " + "currently unavailable\n"), zpool_get_name(zhp)); + return (1); + } + + err = zpool_scan(zhp, cb->cb_type); + + return (err != 0); +} + +/* + * zpool scrub [-s] ... + * + * -s Stop. Stops any in-progress scrub. + */ +int +zpool_do_scrub(int argc, char **argv) +{ + int c; + scrub_cbdata_t cb; + + cb.cb_type = POOL_SCAN_SCRUB; + + /* check options */ + while ((c = getopt(argc, argv, "s")) != -1) { + switch (c) { + case 's': + cb.cb_type = POOL_SCAN_NONE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + cb.cb_argc = argc; + cb.cb_argv = argv; + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); +} + +typedef struct status_cbdata { + int cb_count; + boolean_t cb_allpools; + boolean_t cb_verbose; + boolean_t cb_explain; + boolean_t cb_first; + boolean_t cb_dedup_stats; +} status_cbdata_t; + +/* + * Print out detailed scrub status. + */ +void +print_scan_status(pool_scan_stat_t *ps) +{ + time_t start, end; + uint64_t elapsed, mins_left, hours_left; + uint64_t pass_exam, examined, total; + uint_t rate; + double fraction_done; + char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; + + (void) printf(gettext(" scan: ")); + + /* If there's never been a scan, there's not much to say. */ + if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || + ps->pss_func >= POOL_SCAN_FUNCS) { + (void) printf(gettext("none requested\n")); + return; + } + + start = ps->pss_start_time; + end = ps->pss_end_time; + zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); + + assert(ps->pss_func == POOL_SCAN_SCRUB || + ps->pss_func == POOL_SCAN_RESILVER); + /* + * Scan is finished or canceled. 
+ */ + if (ps->pss_state == DSS_FINISHED) { + uint64_t minutes_taken = (end - start) / 60; + char *fmt; + + if (ps->pss_func == POOL_SCAN_SCRUB) { + fmt = gettext("scrub repaired %s in %lluh%um with " + "%llu errors on %s"); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + fmt = gettext("resilvered %s in %lluh%um with " + "%llu errors on %s"); + } + /* LINTED */ + (void) printf(fmt, processed_buf, + (u_longlong_t)(minutes_taken / 60), + (uint_t)(minutes_taken % 60), + (u_longlong_t)ps->pss_errors, + ctime((time_t *)&end)); + return; + } else if (ps->pss_state == DSS_CANCELED) { + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub canceled on %s"), + ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver canceled on %s"), + ctime(&end)); + } + return; + } + + assert(ps->pss_state == DSS_SCANNING); + + /* + * Scan is in progress. + */ + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub in progress since %s"), + ctime(&start)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver in progress since %s"), + ctime(&start)); + } + + examined = ps->pss_examined ? ps->pss_examined : 1; + total = ps->pss_to_examine; + fraction_done = (double)examined / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - ps->pss_pass_start; + elapsed = elapsed ? elapsed : 1; + pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; + rate = pass_exam / elapsed; + rate = rate ? 
rate : 1; + mins_left = ((total - examined) / rate) / 60; + hours_left = mins_left / 60; + + zfs_nicenum(examined, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than 30 days + */ + (void) printf(gettext(" %s scanned out of %s at %s/s"), + examined_buf, total_buf, rate_buf); + if (hours_left < (30 * 24)) { + (void) printf(gettext(", %lluh%um to go\n"), + (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); + } else { + (void) printf(gettext( + ", (scan is slow, no estimated time)\n")); + } + + if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext(" %s resilvered, %.2f%% done\n"), + processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext(" %s repaired, %.2f%% done\n"), + processed_buf, 100 * fraction_done); + } +} + +static void +print_error_log(zpool_handle_t *zhp) +{ + nvlist_t *nverrlist = NULL; + nvpair_t *elem; + char *pathname; + size_t len = MAXPATHLEN * 2; + + if (zpool_get_errlog(zhp, &nverrlist) != 0) { + (void) printf("errors: List of errors unavailable " + "(insufficient privileges)\n"); + return; + } + + (void) printf("errors: Permanent errors have been " + "detected in the following files:\n\n"); + + pathname = safe_malloc(len); + elem = NULL; + while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { + nvlist_t *nv; + uint64_t dsobj, obj; + + verify(nvpair_value_nvlist(elem, &nv) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, + &dsobj) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, + &obj) == 0); + zpool_obj_to_path(zhp, dsobj, obj, pathname, len); + (void) printf("%7s %s\n", "", pathname); + } + free(pathname); + nvlist_free(nverrlist); +} + +static void +print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares, + int namewidth) +{ + uint_t i; + char *name; + + if (nspares == 0) + return; + + 
(void) printf(gettext("\tspares\n")); + + for (i = 0; i < nspares; i++) { + name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE); + print_status_config(zhp, name, spares[i], + namewidth, 2, B_TRUE); + free(name); + } +} + +static void +print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache, + int namewidth) +{ + uint_t i; + char *name; + + if (nl2cache == 0) + return; + + (void) printf(gettext("\tcache\n")); + + for (i = 0; i < nl2cache; i++) { + name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE); + print_status_config(zhp, name, l2cache[i], + namewidth, 2, B_FALSE); + free(name); + } +} + +static void +print_dedup_stats(nvlist_t *config) +{ + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + uint_t c; + + /* + * If the pool was faulted then we may not have been able to + * obtain the config. Otherwise, if have anything in the dedup + * table continue processing the stats. + */ + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t **)&ddo, &c) != 0 || ddo->ddo_count == 0) + return; + + (void) printf("\n"); + (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n", + (u_longlong_t)ddo->ddo_count, + (u_longlong_t)ddo->ddo_dspace, + (u_longlong_t)ddo->ddo_mspace); + + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, + (uint64_t **)&dds, &c) == 0); + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t **)&ddh, &c) == 0); + zpool_dump_ddt(dds, ddh); +} + +/* + * Display a summary of pool status. Displays a summary such as: + * + * pool: tank + * status: DEGRADED + * reason: One or more devices ... + * see: http://www.sun.com/msg/ZFS-xxxx-01 + * config: + * mirror DEGRADED + * c1t0d0 OK + * c2t0d0 UNAVAIL + * + * When given the '-v' option, we print out the complete config. If the '-e' + * option is specified, then we print out error rate information as well. 
+ */ +int +status_callback(zpool_handle_t *zhp, void *data) +{ + status_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + char *msgid; + int reason; + const char *health; + uint_t c; + vdev_stat_t *vs; + + config = zpool_get_config(zhp, NULL); + reason = zpool_get_status(zhp, &msgid); + + cbp->cb_count++; + + /* + * If we were given 'zpool status -x', only report those pools with + * problems. + */ + if (reason == ZPOOL_STATUS_OK && cbp->cb_explain) { + if (!cbp->cb_allpools) { + (void) printf(gettext("pool '%s' is healthy\n"), + zpool_get_name(zhp)); + if (cbp->cb_first) + cbp->cb_first = B_FALSE; + } + return (0); + } + + if (cbp->cb_first) + cbp->cb_first = B_FALSE; + else + (void) printf("\n"); + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + health = zpool_state_to_name(vs->vs_state, vs->vs_aux); + + (void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp)); + (void) printf(gettext(" state: %s\n"), health); + + switch (reason) { + case ZPOOL_STATUS_MISSING_DEV_R: + (void) printf(gettext("status: One or more devices could not " + "be opened. Sufficient replicas exist for\n\tthe pool to " + "continue functioning in a degraded state.\n")); + (void) printf(gettext("action: Attach the missing device and " + "online it using 'zpool online'.\n")); + break; + + case ZPOOL_STATUS_MISSING_DEV_NR: + (void) printf(gettext("status: One or more devices could not " + "be opened. There are insufficient\n\treplicas for the " + "pool to continue functioning.\n")); + (void) printf(gettext("action: Attach the missing device and " + "online it using 'zpool online'.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_R: + (void) printf(gettext("status: One or more devices could not " + "be used because the label is missing or\n\tinvalid. 
" + "Sufficient replicas exist for the pool to continue\n\t" + "functioning in a degraded state.\n")); + (void) printf(gettext("action: Replace the device using " + "'zpool replace'.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_NR: + (void) printf(gettext("status: One or more devices could not " + "be used because the label is missing \n\tor invalid. " + "There are insufficient replicas for the pool to " + "continue\n\tfunctioning.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); + break; + + case ZPOOL_STATUS_FAILING_DEV: + (void) printf(gettext("status: One or more devices has " + "experienced an unrecoverable error. An\n\tattempt was " + "made to correct the error. Applications are " + "unaffected.\n")); + (void) printf(gettext("action: Determine if the device needs " + "to be replaced, and clear the errors\n\tusing " + "'zpool clear' or replace the device with 'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_OFFLINE_DEV: + (void) printf(gettext("status: One or more devices has " + "been taken offline by the administrator.\n\tSufficient " + "replicas exist for the pool to continue functioning in " + "a\n\tdegraded state.\n")); + (void) printf(gettext("action: Online the device using " + "'zpool online' or replace the device with\n\t'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_REMOVED_DEV: + (void) printf(gettext("status: One or more devices has " + "been removed by the administrator.\n\tSufficient " + "replicas exist for the pool to continue functioning in " + "a\n\tdegraded state.\n")); + (void) printf(gettext("action: Online the device using " + "'zpool online' or replace the device with\n\t'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_RESILVERING: + (void) printf(gettext("status: One or more devices is " + "currently being resilvered. 
The pool will\n\tcontinue " + "to function, possibly in a degraded state.\n")); + (void) printf(gettext("action: Wait for the resilver to " + "complete.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_DATA: + (void) printf(gettext("status: One or more devices has " + "experienced an error resulting in data\n\tcorruption. " + "Applications may be affected.\n")); + (void) printf(gettext("action: Restore the file in question " + "if possible. Otherwise restore the\n\tentire pool from " + "backup.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_POOL: + (void) printf(gettext("status: The pool metadata is corrupted " + "and the pool cannot be opened.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); + break; + + case ZPOOL_STATUS_VERSION_OLDER: + (void) printf(gettext("status: The pool is formatted using an " + "older on-disk format. The pool can\n\tstill be used, but " + "some features are unavailable.\n")); + (void) printf(gettext("action: Upgrade the pool using 'zpool " + "upgrade'. 
Once this is done, the\n\tpool will no longer " + "be accessible on older software versions.\n")); + break; + + case ZPOOL_STATUS_VERSION_NEWER: + (void) printf(gettext("status: The pool has been upgraded to a " + "newer, incompatible on-disk version.\n\tThe pool cannot " + "be accessed on this system.\n")); + (void) printf(gettext("action: Access the pool from a system " + "running more recent software, or\n\trestore the pool from " + "backup.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_R: + (void) printf(gettext("status: One or more devices are " + "faulted in response to persistent errors.\n\tSufficient " + "replicas exist for the pool to continue functioning " + "in a\n\tdegraded state.\n")); + (void) printf(gettext("action: Replace the faulted device, " + "or use 'zpool clear' to mark the device\n\trepaired.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_NR: + (void) printf(gettext("status: One or more devices are " + "faulted in response to persistent errors. There are " + "insufficient replicas for the pool to\n\tcontinue " + "functioning.\n")); + (void) printf(gettext("action: Destroy and re-create the pool " + "from a backup source. 
Manually marking the device\n" + "\trepaired using 'zpool clear' may allow some data " + "to be recovered.\n")); + break; + + case ZPOOL_STATUS_IO_FAILURE_WAIT: + case ZPOOL_STATUS_IO_FAILURE_CONTINUE: + (void) printf(gettext("status: One or more devices are " + "faulted in response to IO failures.\n")); + (void) printf(gettext("action: Make sure the affected devices " + "are connected, then run 'zpool clear'.\n")); + break; + + case ZPOOL_STATUS_BAD_LOG: + (void) printf(gettext("status: An intent log record " + "could not be read.\n" + "\tWaiting for adminstrator intervention to fix the " + "faulted pool.\n")); + (void) printf(gettext("action: Either restore the affected " + "device(s) and run 'zpool online',\n" + "\tor ignore the intent log records by running " + "'zpool clear'.\n")); + break; + + default: + /* + * The remaining errors can't actually be generated, yet. + */ + assert(reason == ZPOOL_STATUS_OK); + } + + if (msgid != NULL) + (void) printf(gettext(" see: http://www.sun.com/msg/%s\n"), + msgid); + + if (config != NULL) { + int namewidth; + uint64_t nerr; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + pool_scan_stat_t *ps = NULL; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); + print_scan_status(ps); + + namewidth = max_width(zhp, nvroot, 0, 0); + if (namewidth < 10) + namewidth = 10; + + (void) printf(gettext("config:\n\n")); + (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth, + "NAME", "STATE", "READ", "WRITE", "CKSUM"); + print_status_config(zhp, zpool_get_name(zhp), nvroot, + namewidth, 0, B_FALSE); + + if (num_logs(nvroot) > 0) + print_logs(zhp, nvroot, namewidth, B_TRUE); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) + print_l2cache(zhp, l2cache, nl2cache, namewidth); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) + print_spares(zhp, spares, nspares, namewidth); + + if 
(nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, + &nerr) == 0) { + nvlist_t *nverrlist = NULL; + + /* + * If the approximate error count is small, get a + * precise count by fetching the entire log and + * uniquifying the results. + */ + if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && + zpool_get_errlog(zhp, &nverrlist) == 0) { + nvpair_t *elem; + + elem = NULL; + nerr = 0; + while ((elem = nvlist_next_nvpair(nverrlist, + elem)) != NULL) { + nerr++; + } + } + nvlist_free(nverrlist); + + (void) printf("\n"); + + if (nerr == 0) + (void) printf(gettext("errors: No known data " + "errors\n")); + else if (!cbp->cb_verbose) + (void) printf(gettext("errors: %llu data " + "errors, use '-v' for a list\n"), + (u_longlong_t)nerr); + else + print_error_log(zhp); + } + + if (cbp->cb_dedup_stats) + print_dedup_stats(config); + } else { + (void) printf(gettext("config: The configuration cannot be " + "determined.\n")); + } + + return (0); +} + +/* + * zpool status [-vx] [-T d|u] [pool] ... [interval [count]] + * + * -v Display complete error logs + * -x Display only pools with potential problems + * -D Display dedup status (undocumented) + * -T Display a timestamp in date(1) or Unix format + * + * Describes the health status of all pools or some subset. 
+ */ +int +zpool_do_status(int argc, char **argv) +{ + int c; + int ret; + unsigned long interval = 0, count = 0; + status_cbdata_t cb = { 0 }; + + /* check options */ + while ((c = getopt(argc, argv, "vxDT:")) != -1) { + switch (c) { + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'x': + cb.cb_explain = B_TRUE; + break; + case 'D': + cb.cb_dedup_stats = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + + if (argc == 0) + cb.cb_allpools = B_TRUE; + + cb.cb_first = B_TRUE; + + for (;;) { + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + ret = for_each_pool(argc, argv, B_TRUE, NULL, + status_callback, &cb); + + if (argc == 0 && cb.cb_count == 0) + (void) printf(gettext("no pools available\n")); + else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) + (void) printf(gettext("all pools are healthy\n")); + + if (ret != 0) + return (ret); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) sleep(interval); + } + + return (0); +} + +typedef struct upgrade_cbdata { + int cb_all; + int cb_first; + int cb_newer; + int cb_argc; + uint64_t cb_version; + char **cb_argv; +} upgrade_cbdata_t; + +static int +upgrade_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + int ret = 0; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + if (!cbp->cb_newer && version < SPA_VERSION) { + if (!cbp->cb_all) { + if (cbp->cb_first) { + (void) printf(gettext("The following pools are " + "out of date, and can be upgraded. 
After " + "being\nupgraded, these pools will no " + "longer be accessible by older software " + "versions.\n\n")); + (void) printf(gettext("VER POOL\n")); + (void) printf(gettext("--- ------------\n")); + cbp->cb_first = B_FALSE; + } + + (void) printf("%2llu %s\n", (u_longlong_t)version, + zpool_get_name(zhp)); + } else { + cbp->cb_first = B_FALSE; + ret = zpool_upgrade(zhp, cbp->cb_version); + if (!ret) { + (void) printf(gettext("Successfully upgraded " + "'%s'\n\n"), zpool_get_name(zhp)); + } + } + } else if (cbp->cb_newer && version > SPA_VERSION) { + assert(!cbp->cb_all); + + if (cbp->cb_first) { + (void) printf(gettext("The following pools are " + "formatted using a newer software version and\n" + "cannot be accessed on the current system.\n\n")); + (void) printf(gettext("VER POOL\n")); + (void) printf(gettext("--- ------------\n")); + cbp->cb_first = B_FALSE; + } + + (void) printf("%2llu %s\n", (u_longlong_t)version, + zpool_get_name(zhp)); + } + + zpool_close(zhp); + return (ret); +} + +/* ARGSUSED */ +static int +upgrade_one(zpool_handle_t *zhp, void *data) +{ + upgrade_cbdata_t *cbp = data; + uint64_t cur_version; + int ret; + + if (strcmp("log", zpool_get_name(zhp)) == 0) { + (void) printf(gettext("'log' is now a reserved word\n" + "Pool 'log' must be renamed using export and import" + " to upgrade.\n")); + return (1); + } + + cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); + if (cur_version > cbp->cb_version) { + (void) printf(gettext("Pool '%s' is already formatted " + "using more current version '%llu'.\n"), + zpool_get_name(zhp), cur_version); + return (0); + } + if (cur_version == cbp->cb_version) { + (void) printf(gettext("Pool '%s' is already formatted " + "using the current version.\n"), zpool_get_name(zhp)); + return (0); + } + + ret = zpool_upgrade(zhp, cbp->cb_version); + + if (!ret) { + (void) printf(gettext("Successfully upgraded '%s' " + "from version %llu to version %llu\n\n"), + zpool_get_name(zhp), 
(u_longlong_t)cur_version, + (u_longlong_t)cbp->cb_version); + } + + return (ret != 0); +} + +/* + * zpool upgrade + * zpool upgrade -v + * zpool upgrade [-V version] <-a | pool ...> + * + * With no arguments, display downrev'd ZFS pool available for upgrade. + * Individual pools can be upgraded by specifying the pool, and '-a' will + * upgrade all pools. + */ +int +zpool_do_upgrade(int argc, char **argv) +{ + int c; + upgrade_cbdata_t cb = { 0 }; + int ret = 0; + boolean_t showversions = B_FALSE; + char *end; + + + /* check options */ + while ((c = getopt(argc, argv, ":avV:")) != -1) { + switch (c) { + case 'a': + cb.cb_all = B_TRUE; + break; + case 'v': + showversions = B_TRUE; + break; + case 'V': + cb.cb_version = strtoll(optarg, &end, 10); + if (*end != '\0' || cb.cb_version > SPA_VERSION || + cb.cb_version < SPA_VERSION_1) { + (void) fprintf(stderr, + gettext("invalid version '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + cb.cb_argc = argc; + cb.cb_argv = argv; + argc -= optind; + argv += optind; + + if (cb.cb_version == 0) { + cb.cb_version = SPA_VERSION; + } else if (!cb.cb_all && argc == 0) { + (void) fprintf(stderr, gettext("-V option is " + "incompatible with other arguments\n")); + usage(B_FALSE); + } + + if (showversions) { + if (cb.cb_all || argc != 0) { + (void) fprintf(stderr, gettext("-v option is " + "incompatible with other arguments\n")); + usage(B_FALSE); + } + } else if (cb.cb_all) { + if (argc != 0) { + (void) fprintf(stderr, gettext("-a option should not " + "be used along with a pool name\n")); + usage(B_FALSE); + } + } + + (void) printf(gettext("This system is currently running " + "ZFS pool version %llu.\n\n"), SPA_VERSION); + cb.cb_first = B_TRUE; + if (showversions) { + (void) printf(gettext("The 
following versions are " + "supported:\n\n")); + (void) printf(gettext("VER DESCRIPTION\n")); + (void) printf("--- -----------------------------------------" + "---------------\n"); + (void) printf(gettext(" 1 Initial ZFS version\n")); + (void) printf(gettext(" 2 Ditto blocks " + "(replicated metadata)\n")); + (void) printf(gettext(" 3 Hot spares and double parity " + "RAID-Z\n")); + (void) printf(gettext(" 4 zpool history\n")); + (void) printf(gettext(" 5 Compression using the gzip " + "algorithm\n")); + (void) printf(gettext(" 6 bootfs pool property\n")); + (void) printf(gettext(" 7 Separate intent log devices\n")); + (void) printf(gettext(" 8 Delegated administration\n")); + (void) printf(gettext(" 9 refquota and refreservation " + "properties\n")); + (void) printf(gettext(" 10 Cache devices\n")); + (void) printf(gettext(" 11 Improved scrub performance\n")); + (void) printf(gettext(" 12 Snapshot properties\n")); + (void) printf(gettext(" 13 snapused property\n")); + (void) printf(gettext(" 14 passthrough-x aclinherit\n")); + (void) printf(gettext(" 15 user/group space accounting\n")); + (void) printf(gettext(" 16 stmf property support\n")); + (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); + (void) printf(gettext(" 18 Snapshot user holds\n")); + (void) printf(gettext(" 19 Log device removal\n")); + (void) printf(gettext(" 20 Compression using zle " + "(zero-length encoding)\n")); + (void) printf(gettext(" 21 Deduplication\n")); + (void) printf(gettext(" 22 Received properties\n")); + (void) printf(gettext(" 23 Slim ZIL\n")); + (void) printf(gettext(" 24 System attributes\n")); + (void) printf(gettext(" 25 Improved scrub stats\n")); + (void) printf(gettext(" 26 Improved snapshot deletion " + "performance\n")); + (void) printf(gettext(" 27 Improved snapshot creation " + "performance\n")); + (void) printf(gettext(" 28 Multiple vdev replacements\n")); + (void) printf(gettext("\nFor more information on a particular " + "version, including supported 
releases,\n")); + (void) printf(gettext("see the ZFS Administration Guide.\n\n")); + } else if (argc == 0) { + int notfound; + + ret = zpool_iter(g_zfs, upgrade_cb, &cb); + notfound = cb.cb_first; + + if (!cb.cb_all && ret == 0) { + if (!cb.cb_first) + (void) printf("\n"); + cb.cb_first = B_TRUE; + cb.cb_newer = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_cb, &cb); + if (!cb.cb_first) { + notfound = B_FALSE; + (void) printf("\n"); + } + } + + if (ret == 0) { + if (notfound) + (void) printf(gettext("All pools are formatted " + "using this version.\n")); + else if (!cb.cb_all) + (void) printf(gettext("Use 'zpool upgrade -v' " + "for a list of available versions and " + "their associated\nfeatures.\n")); + } + } else { + ret = for_each_pool(argc, argv, B_FALSE, NULL, + upgrade_one, &cb); + } + + return (ret); +} + +typedef struct hist_cbdata { + boolean_t first; + int longfmt; + int internal; +} hist_cbdata_t; + +/* + * Print out the command history for a specific pool. + */ +static int +get_history_one(zpool_handle_t *zhp, void *data) +{ + nvlist_t *nvhis; + nvlist_t **records; + uint_t numrecords; + char *cmdstr; + char *pathstr; + uint64_t dst_time; + time_t tsec; + struct tm t; + char tbuf[30]; + int ret, i; + uint64_t who; + struct passwd *pwd; + char *hostname; + char *zonename; + char internalstr[MAXPATHLEN]; + hist_cbdata_t *cb = (hist_cbdata_t *)data; + uint64_t txg; + uint64_t ievent; + + cb->first = B_FALSE; + + (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); + + if ((ret = zpool_get_history(zhp, &nvhis)) != 0) + return (ret); + + verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, + &records, &numrecords) == 0); + for (i = 0; i < numrecords; i++) { + if (nvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME, + &dst_time) != 0) + continue; + + /* is it an internal event or a standard event? 
*/ + if (nvlist_lookup_string(records[i], ZPOOL_HIST_CMD, + &cmdstr) != 0) { + if (cb->internal == 0) + continue; + + if (nvlist_lookup_uint64(records[i], + ZPOOL_HIST_INT_EVENT, &ievent) != 0) + continue; + verify(nvlist_lookup_uint64(records[i], + ZPOOL_HIST_TXG, &txg) == 0); + verify(nvlist_lookup_string(records[i], + ZPOOL_HIST_INT_STR, &pathstr) == 0); + if (ievent >= LOG_END) + continue; + (void) snprintf(internalstr, + sizeof (internalstr), + "[internal %s txg:%lld] %s", + zfs_history_event_names[ievent], txg, + pathstr); + cmdstr = internalstr; + } + tsec = dst_time; + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + (void) printf("%s %s", tbuf, cmdstr); + + if (!cb->longfmt) { + (void) printf("\n"); + continue; + } + (void) printf(" ["); + if (nvlist_lookup_uint64(records[i], + ZPOOL_HIST_WHO, &who) == 0) { + pwd = getpwuid((uid_t)who); + if (pwd) + (void) printf("user %s on", + pwd->pw_name); + else + (void) printf("user %d on", + (int)who); + } else { + (void) printf(gettext("no info]\n")); + continue; + } + if (nvlist_lookup_string(records[i], + ZPOOL_HIST_HOST, &hostname) == 0) { + (void) printf(" %s", hostname); + } + if (nvlist_lookup_string(records[i], + ZPOOL_HIST_ZONE, &zonename) == 0) { + (void) printf(":%s", zonename); + } + + (void) printf("]"); + (void) printf("\n"); + } + (void) printf("\n"); + nvlist_free(nvhis); + + return (ret); +} + +/* + * zpool history + * + * Displays the history of commands that modified pools. 
+ */ + + +int +zpool_do_history(int argc, char **argv) +{ + hist_cbdata_t cbdata = { 0 }; + int ret; + int c; + + cbdata.first = B_TRUE; + /* check options */ + while ((c = getopt(argc, argv, "li")) != -1) { + switch (c) { + case 'l': + cbdata.longfmt = 1; + break; + case 'i': + cbdata.internal = 1; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, + &cbdata); + + if (argc == 0 && cbdata.first == B_TRUE) { + (void) printf(gettext("no pools available\n")); + return (0); + } + + return (ret); +} + +static int +get_callback(zpool_handle_t *zhp, void *data) +{ + zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; + char value[MAXNAMELEN]; + zprop_source_t srctype; + zprop_list_t *pl; + + for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { + + /* + * Skip the special fake placeholder. This will also skip + * over the name property when 'all' is specified. 
+ */ + if (pl->pl_prop == ZPOOL_PROP_NAME && + pl == cbp->cb_proplist) + continue; + + if (zpool_get_prop(zhp, pl->pl_prop, + value, sizeof (value), &srctype) != 0) + continue; + + zprop_print_one_property(zpool_get_name(zhp), cbp, + zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, + NULL); + } + return (0); +} + +int +zpool_do_get(int argc, char **argv) +{ + zprop_get_cbdata_t cb = { 0 }; + zprop_list_t fake_name = { 0 }; + int ret; + + if (argc < 3) + usage(B_FALSE); + + cb.cb_first = B_TRUE; + cb.cb_sources = ZPROP_SRC_ALL; + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + cb.cb_type = ZFS_TYPE_POOL; + + if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist, + ZFS_TYPE_POOL) != 0) + usage(B_FALSE); + + if (cb.cb_proplist != NULL) { + fake_name.pl_prop = ZPOOL_PROP_NAME; + fake_name.pl_width = strlen(gettext("NAME")); + fake_name.pl_next = cb.cb_proplist; + cb.cb_proplist = &fake_name; + } + + ret = for_each_pool(argc - 2, argv + 2, B_TRUE, &cb.cb_proplist, + get_callback, &cb); + + if (cb.cb_proplist == &fake_name) + zprop_free_list(fake_name.pl_next); + else + zprop_free_list(cb.cb_proplist); + + return (ret); +} + +typedef struct set_cbdata { + char *cb_propname; + char *cb_value; + boolean_t cb_any_successful; +} set_cbdata_t; + +int +set_callback(zpool_handle_t *zhp, void *data) +{ + int error; + set_cbdata_t *cb = (set_cbdata_t *)data; + + error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); + + if (!error) + cb->cb_any_successful = B_TRUE; + + return (error); +} + +int +zpool_do_set(int argc, char **argv) +{ + set_cbdata_t cb = { 0 }; + int error; + + if (argc > 1 && argv[1][0] == '-') { + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + argv[1][1]); + usage(B_FALSE); + } + + if (argc < 2) { + (void) fprintf(stderr, gettext("missing property=value " + "argument\n")); + usage(B_FALSE); + } + + if (argc < 3) { + (void) fprintf(stderr, 
gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 3) { + (void) fprintf(stderr, gettext("too many pool names\n")); + usage(B_FALSE); + } + + cb.cb_propname = argv[1]; + cb.cb_value = strchr(cb.cb_propname, '='); + if (cb.cb_value == NULL) { + (void) fprintf(stderr, gettext("missing value in " + "property=value argument\n")); + usage(B_FALSE); + } + + *(cb.cb_value) = '\0'; + cb.cb_value++; + + error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, + set_callback, &cb); + + return (error); +} + +static int +find_command_idx(char *command, int *idx) +{ + int i; + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + continue; + + if (strcmp(command, command_table[i].name) == 0) { + *idx = i; + return (0); + } + } + return (1); +} + +int +main(int argc, char **argv) +{ + int ret; + int i; + char *cmdname; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, gettext("internal error: failed to " + "initialize ZFS library\n")); + return (1); + } + + libzfs_print_on_error(g_zfs, B_TRUE); + + opterr = 0; + + /* + * Make sure the user has specified some command. + */ + if (argc < 2) { + (void) fprintf(stderr, gettext("missing command\n")); + usage(B_FALSE); + } + + cmdname = argv[1]; + + /* + * Special case '-?' + */ + if (strcmp(cmdname, "-?") == 0) + usage(B_TRUE); + + zpool_set_history_str("zpool", argc, argv, history_str); + verify(zpool_stage_history(g_zfs, history_str) == 0); + + /* + * Run the appropriate command. 
+ */ + if (find_command_idx(cmdname, &i) == 0) { + current_command = &command_table[i]; + ret = command_table[i].func(argc - 1, argv + 1); + } else if (strchr(cmdname, '=')) { + verify(find_command_idx("set", &i) == 0); + current_command = &command_table[i]; + ret = command_table[i].func(argc, argv); + } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { + /* + * 'freeze' is a vile debugging abomination, so we treat + * it as such. + */ + char buf[16384]; + int fd = open(ZFS_DEV, O_RDWR); + (void) strcpy((void *)buf, argv[2]); + return (!!ioctl(fd, ZFS_IOC_POOL_FREEZE, buf)); + } else { + (void) fprintf(stderr, gettext("unrecognized " + "command '%s'\n"), cmdname); + usage(B_FALSE); + } + + libzfs_fini(g_zfs); + + /* + * The 'ZFS_ABORT' environment variable causes us to dump core on exit + * for the purposes of running ::findleaks. + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + return (ret); +} diff --git a/cmd/zpool/zpool_util.c b/cmd/zpool/zpool_util.c new file mode 100644 index 0000000..c7a002e --- /dev/null +++ b/cmd/zpool/zpool_util.c @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include + +#include "zpool_util.h" + +/* + * Utility function to guarantee malloc() success. + */ +void * +safe_malloc(size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) { + (void) fprintf(stderr, "internal error: out of memory\n"); + exit(1); + } + + return (data); +} + +/* + * Display an out of memory error message and abort the current program. + */ +void +zpool_no_memory(void) +{ + assert(errno == ENOMEM); + (void) fprintf(stderr, + gettext("internal error: out of memory\n")); + exit(1); +} + +/* + * Return the number of logs in supplied nvlist + */ +uint_t +num_logs(nvlist_t *nv) +{ + uint_t nlogs = 0; + uint_t c, children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (0); + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + nlogs++; + } + return (nlogs); +} diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h new file mode 100644 index 0000000..134c730 --- /dev/null +++ b/cmd/zpool/zpool_util.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef ZPOOL_UTIL_H +#define ZPOOL_UTIL_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Basic utility functions + */ +void *safe_malloc(size_t); +void zpool_no_memory(void); +uint_t num_logs(nvlist_t *nv); + +/* + * Virtual device functions + */ + +nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, + boolean_t replacing, boolean_t dryrun, int argc, char **argv); +nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, + nvlist_t *props, splitflags_t flags, int argc, char **argv); + +/* + * Pool list functions + */ +int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, + zpool_iter_f, void *); + +typedef struct zpool_list zpool_list_t; + +zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); +void pool_list_update(zpool_list_t *); +int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); +void pool_list_free(zpool_list_t *); +int pool_list_count(zpool_list_t *); +void pool_list_remove(zpool_list_t *, zpool_handle_t *); + +libzfs_handle_t *g_zfs; + +#ifdef __cplusplus +} +#endif + +#endif /* ZPOOL_UTIL_H */ diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c new file mode 100644 index 0000000..53c2e60 --- /dev/null +++ b/cmd/zpool/zpool_vdev.c @@ -0,0 +1,1469 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) + * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. + * 2. Check for devices in use. Using libdiskmgt, makes sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. + * validates that the replication level is consistent across the + * entire pool. + * 4. 
Call libzfs to label any whole disks with an EFI label. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zpool_util.h" + +#define DISK_ROOT "/dev/dsk" +#define RDISK_ROOT "/dev/rdsk" +#define BACKUP_SLICE "s2" + +/* + * For any given vdev specification, we can have multiple errors. The + * vdev_error() function keeps track of whether we have seen an error yet, and + * prints out a header if its the first error we've seen. + */ +boolean_t error_seen; +boolean_t is_force; + +/*PRINTFLIKE1*/ +static void +vdev_error(const char *fmt, ...) +{ + va_list ap; + + if (!error_seen) { + (void) fprintf(stderr, gettext("invalid vdev specification\n")); + if (!is_force) + (void) fprintf(stderr, gettext("use '-f' to override " + "the following errors:\n")); + else + (void) fprintf(stderr, gettext("the following errors " + "must be manually repaired:\n")); + error_seen = B_TRUE; + } + + va_start(ap, fmt); + (void) vfprintf(stderr, fmt, ap); + va_end(ap); +} + +static void +libdiskmgt_error(int error) +{ + /* + * ENXIO/ENODEV is a valid error message if the device doesn't live in + * /dev/dsk. Don't bother printing an error message in this case. + */ + if (error == ENXIO || error == ENODEV) + return; + + (void) fprintf(stderr, gettext("warning: device in use checking " + "failed: %s\n"), strerror(error)); +} + +/* + * Validate a device, passing the bulk of the work off to libdiskmgt. 
+ */ +static int +check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) +{ + char *msg; + int error = 0; + dm_who_type_t who; + + if (force) + who = DM_WHO_ZPOOL_FORCE; + else if (isspare) + who = DM_WHO_ZPOOL_SPARE; + else + who = DM_WHO_ZPOOL; + + if (dm_inuse((char *)path, &msg, who, &error) || error) { + if (error != 0) { + libdiskmgt_error(error); + return (0); + } else { + vdev_error("%s", msg); + free(msg); + return (-1); + } + } + + /* + * If we're given a whole disk, ignore overlapping slices since we're + * about to label it anyway. + */ + error = 0; + if (!wholedisk && !force && + (dm_isoverlapping((char *)path, &msg, &error) || error)) { + if (error == 0) { + /* dm_isoverlapping returned -1 */ + vdev_error(gettext("%s overlaps with %s\n"), path, msg); + free(msg); + return (-1); + } else if (error != ENODEV) { + /* libdiskmgt's devcache only handles physical drives */ + libdiskmgt_error(error); + return (0); + } + } + + return (0); +} + + +/* + * Validate a whole disk. Iterate over all slices on the disk and make sure + * that none is in use by calling check_slice(). + */ +static int +check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) +{ + dm_descriptor_t *drive, *media, *slice; + int err = 0; + int i; + int ret; + + /* + * Get the drive associated with this disk. This should never fail, + * because we already have an alias handle open for the device. + */ + if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, + &err)) == NULL || *drive == NULL) { + if (err) + libdiskmgt_error(err); + return (0); + } + + if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, + &err)) == NULL) { + dm_free_descriptors(drive); + if (err) + libdiskmgt_error(err); + return (0); + } + + dm_free_descriptors(drive); + + /* + * It is possible that the user has specified a removable media drive, + * and the media is not present. 
+ */ + if (*media == NULL) { + dm_free_descriptors(media); + vdev_error(gettext("'%s' has no media in drive\n"), name); + return (-1); + } + + if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, + &err)) == NULL) { + dm_free_descriptors(media); + if (err) + libdiskmgt_error(err); + return (0); + } + + dm_free_descriptors(media); + + ret = 0; + + /* + * Iterate over all slices and report any errors. We don't care about + * overlapping slices because we are using the whole disk. + */ + for (i = 0; slice[i] != NULL; i++) { + char *name = dm_get_name(slice[i], &err); + + if (check_slice(name, force, B_TRUE, isspare) != 0) + ret = -1; + + dm_free_name(name); + } + + dm_free_descriptors(slice); + return (ret); +} + +/* + * Validate a device. + */ +static int +check_device(const char *path, boolean_t force, boolean_t isspare) +{ + dm_descriptor_t desc; + int err; + char *dev; + + /* + * For whole disks, libdiskmgt does not include the leading dev path. + */ + dev = strrchr(path, '/'); + assert(dev != NULL); + dev++; + if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { + err = check_disk(path, desc, force, isspare); + dm_free_descriptor(desc); + return (err); + } + + return (check_slice(path, force, B_FALSE, isspare)); +} + +/* + * Check that a file is valid. All we can do in this case is check that it's + * not in use by another pool, and not in use by swap. + */ +static int +check_file(const char *file, boolean_t force, boolean_t isspare) +{ + char *name; + int fd; + int ret = 0; + int err; + pool_state_t state; + boolean_t inuse; + + if (dm_inuse_swap(file, &err)) { + if (err) + libdiskmgt_error(err); + else + vdev_error(gettext("%s is currently used by swap. 
" + "Please see swap(1M).\n"), file); + return (-1); + } + + if ((fd = open(file, O_RDONLY)) < 0) + return (0); + + if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { + const char *desc; + + switch (state) { + case POOL_STATE_ACTIVE: + desc = gettext("active"); + break; + + case POOL_STATE_EXPORTED: + desc = gettext("exported"); + break; + + case POOL_STATE_POTENTIALLY_ACTIVE: + desc = gettext("potentially active"); + break; + + default: + desc = gettext("unknown"); + break; + } + + /* + * Allow hot spares to be shared between pools. + */ + if (state == POOL_STATE_SPARE && isspare) + return (0); + + if (state == POOL_STATE_ACTIVE || + state == POOL_STATE_SPARE || !force) { + switch (state) { + case POOL_STATE_SPARE: + vdev_error(gettext("%s is reserved as a hot " + "spare for pool %s\n"), file, name); + break; + default: + vdev_error(gettext("%s is part of %s pool " + "'%s'\n"), file, desc, name); + break; + } + ret = -1; + } + + free(name); + } + + (void) close(fd); + return (ret); +} + + +/* + * By "whole disk" we mean an entire physical disk (something we can + * label, toggle the write cache on, etc.) as opposed to the full + * capacity of a pseudo-device such as lofi or did. We act as if we + * are labeling the disk, which should be a pretty good test of whether + * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if + * it isn't. + */ +static boolean_t +is_whole_disk(const char *arg) +{ + struct dk_gpt *label; + int fd; + char path[MAXPATHLEN]; + + (void) snprintf(path, sizeof (path), "%s%s%s", + RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); + if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) + return (B_FALSE); + if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { + (void) close(fd); + return (B_FALSE); + } + efi_free(label); + (void) close(fd); + return (B_TRUE); +} + +/* + * Create a leaf vdev. Determine if this is a file or a device. If it's a + * device, fill in the device id to make a complete nvlist. 
Valid forms for a + * leaf vdev are: + * + * /dev/dsk/xxx Complete disk path + * /xxx Full path to file + * xxx Shorthand for /dev/dsk/xxx + */ +static nvlist_t * +make_leaf_vdev(const char *arg, uint64_t is_log) +{ + char path[MAXPATHLEN]; + struct stat64 statbuf; + nvlist_t *vdev = NULL; + char *type = NULL; + boolean_t wholedisk = B_FALSE; + + /* + * Determine what type of vdev this is, and put the full path into + * 'path'. We detect whether this is a device of file afterwards by + * checking the st_mode of the file. + */ + if (arg[0] == '/') { + /* + * Complete device or file path. Exact type is determined by + * examining the file descriptor afterwards. + */ + wholedisk = is_whole_disk(arg); + if (!wholedisk && (stat64(arg, &statbuf) != 0)) { + (void) fprintf(stderr, + gettext("cannot open '%s': %s\n"), + arg, strerror(errno)); + return (NULL); + } + + (void) strlcpy(path, arg, sizeof (path)); + } else { + /* + * This may be a short path for a device, or it could be total + * gibberish. Check to see if it's a known device in + * /dev/dsk/. As part of this check, see if we've been given a + * an entire disk (minus the slice number). + */ + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, + arg); + wholedisk = is_whole_disk(path); + if (!wholedisk && (stat64(path, &statbuf) != 0)) { + /* + * If we got ENOENT, then the user gave us + * gibberish, so try to direct them with a + * reasonable error message. Otherwise, + * regurgitate strerror() since it's the best we + * can do. + */ + if (errno == ENOENT) { + (void) fprintf(stderr, + gettext("cannot open '%s': no such " + "device in %s\n"), arg, DISK_ROOT); + (void) fprintf(stderr, + gettext("must be a full path or " + "shorthand device name\n")); + return (NULL); + } else { + (void) fprintf(stderr, + gettext("cannot open '%s': %s\n"), + path, strerror(errno)); + return (NULL); + } + } + } + + /* + * Determine whether this is a device or a file. 
+ */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + (void) fprintf(stderr, gettext("cannot use '%s': must be a " + "block device or regular file\n"), path); + return (NULL); + } + + /* + * Finally, we have the complete device or file, and we know that it is + * acceptable to use. Construct the nvlist to describe this vdev. All + * vdevs have a 'path' element, and devices also have a 'devid' element. + */ + verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); + verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); + verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); + verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (strcmp(type, VDEV_TYPE_DISK) == 0) + verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, + (uint64_t)wholedisk) == 0); + + /* + * For a whole disk, defer getting its devid until after labeling it. + */ + if (S_ISBLK(statbuf.st_mode) && !wholedisk) { + /* + * Get the devid for the device. + */ + int fd; + ddi_devid_t devid; + char *minor = NULL, *devid_str = NULL; + + if ((fd = open(path, O_RDONLY)) < 0) { + (void) fprintf(stderr, gettext("cannot open '%s': " + "%s\n"), path, strerror(errno)); + nvlist_free(vdev); + return (NULL); + } + + if (devid_get(fd, &devid) == 0) { + if (devid_get_minor_name(fd, &minor) == 0 && + (devid_str = devid_str_encode(devid, minor)) != + NULL) { + verify(nvlist_add_string(vdev, + ZPOOL_CONFIG_DEVID, devid_str) == 0); + } + if (devid_str != NULL) + devid_str_free(devid_str); + if (minor != NULL) + devid_str_free(minor); + devid_free(devid); + } + + (void) close(fd); + } + + return (vdev); +} + +/* + * Go through and verify the replication level of the pool is consistent. + * Performs the following checks: + * + * For the new spec, verifies that devices in mirrors and raidz are the + * same size. 
+ * + * If the current configuration already has inconsistent replication + * levels, ignore any other potential problems in the new spec. + * + * Otherwise, make sure that the current spec (if there is one) and the new + * spec have consistent replication levels. + */ +typedef struct replication_level { + char *zprl_type; + uint64_t zprl_children; + uint64_t zprl_parity; +} replication_level_t; + +#define ZPOOL_FUZZ (16 * 1024 * 1024) + +/* + * Given a list of toplevel vdevs, return the current replication level. If + * the config is inconsistent, then NULL is returned. If 'fatal' is set, then + * an error message will be displayed for each self-inconsistent vdev. + */ +static replication_level_t * +get_replication(nvlist_t *nvroot, boolean_t fatal) +{ + nvlist_t **top; + uint_t t, toplevels; + nvlist_t **child; + uint_t c, children; + nvlist_t *nv; + char *type; + replication_level_t lastrep, rep, *ret; + boolean_t dontreport; + + ret = safe_malloc(sizeof (replication_level_t)); + + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &top, &toplevels) == 0); + + lastrep.zprl_type = NULL; + for (t = 0; t < toplevels; t++) { + uint64_t is_log = B_FALSE; + + nv = top[t]; + + /* + * For separate logs we ignore the top level vdev replication + * constraints. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); + if (is_log) + continue; + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, + &type) == 0); + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + /* + * This is a 'file' or 'disk' vdev. + */ + rep.zprl_type = type; + rep.zprl_children = 1; + rep.zprl_parity = 0; + } else { + uint64_t vdev_size; + + /* + * This is a mirror or RAID-Z vdev. Go through and make + * sure the contents are all the same (files vs. disks), + * keeping track of the number of elements in the + * process. + * + * We also check that the size of each vdev (if it can + * be determined) is the same. 
+ */ + rep.zprl_type = type; + rep.zprl_children = 0; + + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_NPARITY, + &rep.zprl_parity) == 0); + assert(rep.zprl_parity != 0); + } else { + rep.zprl_parity = 0; + } + + /* + * The 'dontreport' variable indicates that we've + * already reported an error for this spec, so don't + * bother doing it again. + */ + type = NULL; + dontreport = 0; + vdev_size = -1ULL; + for (c = 0; c < children; c++) { + nvlist_t *cnv = child[c]; + char *path; + struct stat64 statbuf; + uint64_t size = -1ULL; + char *childtype; + int fd, err; + + rep.zprl_children++; + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_TYPE, &childtype) == 0); + + /* + * If this is a replacing or spare vdev, then + * get the real first child of the vdev. + */ + if (strcmp(childtype, + VDEV_TYPE_REPLACING) == 0 || + strcmp(childtype, VDEV_TYPE_SPARE) == 0) { + nvlist_t **rchild; + uint_t rchildren; + + verify(nvlist_lookup_nvlist_array(cnv, + ZPOOL_CONFIG_CHILDREN, &rchild, + &rchildren) == 0); + assert(rchildren == 2); + cnv = rchild[0]; + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_TYPE, + &childtype) == 0); + } + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_PATH, &path) == 0); + + /* + * If we have a raidz/mirror that combines disks + * with files, report it as an error. + */ + if (!dontreport && type != NULL && + strcmp(type, childtype) != 0) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: %s contains both " + "files and devices\n"), + rep.zprl_type); + else + return (NULL); + dontreport = B_TRUE; + } + + /* + * According to stat(2), the value of 'st_size' + * is undefined for block devices and character + * devices. But there is no effective way to + * determine the real size in userland. + * + * Instead, we'll take advantage of an + * implementation detail of spec_size(). 
If the + * device is currently open, then we (should) + * return a valid size. + * + * If we still don't get a valid size (indicated + * by a size of 0 or MAXOFFSET_T), then ignore + * this device altogether. + */ + if ((fd = open(path, O_RDONLY)) >= 0) { + err = fstat64(fd, &statbuf); + (void) close(fd); + } else { + err = stat64(path, &statbuf); + } + + if (err != 0 || + statbuf.st_size == 0 || + statbuf.st_size == MAXOFFSET_T) + continue; + + size = statbuf.st_size; + + /* + * Also make sure that devices and + * slices have a consistent size. If + * they differ by a significant amount + * (~16MB) then report an error. + */ + if (!dontreport && + (vdev_size != -1ULL && + (labs(size - vdev_size) > + ZPOOL_FUZZ))) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "%s contains devices of " + "different sizes\n"), + rep.zprl_type); + else + return (NULL); + dontreport = B_TRUE; + } + + type = childtype; + vdev_size = size; + } + } + + /* + * At this point, we have the replication of the last toplevel + * vdev in 'rep'. Compare it to 'lastrep' to see if its + * different. 
+ */ + if (lastrep.zprl_type != NULL) { + if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %s and %s vdevs are " + "present\n"), + lastrep.zprl_type, rep.zprl_type); + else + return (NULL); + } else if (lastrep.zprl_parity != rep.zprl_parity) { + if (ret) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %llu and %llu device parity " + "%s vdevs are present\n"), + lastrep.zprl_parity, + rep.zprl_parity, + rep.zprl_type); + else + return (NULL); + } else if (lastrep.zprl_children != rep.zprl_children) { + if (ret) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %llu-way and %llu-way %s " + "vdevs are present\n"), + lastrep.zprl_children, + rep.zprl_children, + rep.zprl_type); + else + return (NULL); + } + } + lastrep = rep; + } + + if (ret != NULL) + *ret = rep; + + return (ret); +} + +/* + * Check the replication level of the vdev spec against the current pool. Calls + * get_replication() to make sure the new spec is self-consistent. If the pool + * has a consistent replication level, then we ignore any errors. Otherwise, + * report any difference between the two. + */ +static int +check_replication(nvlist_t *config, nvlist_t *newroot) +{ + nvlist_t **child; + uint_t children; + replication_level_t *current = NULL, *new; + int ret; + + /* + * If we have a current pool configuration, check to see if it's + * self-consistent. If not, simply return success. 
+ */ + if (config != NULL) { + nvlist_t *nvroot; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if ((current = get_replication(nvroot, B_FALSE)) == NULL) + return (0); + } + /* + * for spares there may be no children, and therefore no + * replication level to check + */ + if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) || (children == 0)) { + free(current); + return (0); + } + + /* + * If all we have is logs then there's no replication level to check. + */ + if (num_logs(newroot) == children) { + free(current); + return (0); + } + + /* + * Get the replication level of the new vdev spec, reporting any + * inconsistencies found. + */ + if ((new = get_replication(newroot, B_TRUE)) == NULL) { + free(current); + return (-1); + } + + /* + * Check to see if the new vdev spec matches the replication level of + * the current pool. + */ + ret = 0; + if (current != NULL) { + if (strcmp(current->zprl_type, new->zprl_type) != 0) { + vdev_error(gettext( + "mismatched replication level: pool uses %s " + "and new vdev is %s\n"), + current->zprl_type, new->zprl_type); + ret = -1; + } else if (current->zprl_parity != new->zprl_parity) { + vdev_error(gettext( + "mismatched replication level: pool uses %llu " + "device parity and new vdev uses %llu\n"), + current->zprl_parity, new->zprl_parity); + ret = -1; + } else if (current->zprl_children != new->zprl_children) { + vdev_error(gettext( + "mismatched replication level: pool uses %llu-way " + "%s and new vdev uses %llu-way %s\n"), + current->zprl_children, current->zprl_type, + new->zprl_children, new->zprl_type); + ret = -1; + } + } + + free(new); + if (current != NULL) + free(current); + + return (ret); +} + +/* + * Go through and find any whole disks in the vdev specification, labelling them + * as appropriate. When constructing the vdev spec, we were unable to open this + * device in order to provide a devid. 
Now that we have labelled the disk and + * know that slice 0 is valid, we can construct the devid now. + * + * If the disk was already labeled with an EFI label, we will have gotten the + * devid already (because we were able to open the whole disk). Otherwise, we + * need to get the devid after we label the disk. + */ +static int +make_disks(zpool_handle_t *zhp, nvlist_t *nv) +{ + nvlist_t **child; + uint_t c, children; + char *type, *path, *diskname; + char buf[MAXPATHLEN]; + uint64_t wholedisk; + int fd; + int ret; + ddi_devid_t devid; + char *minor = NULL, *devid_str = NULL; + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + + if (strcmp(type, VDEV_TYPE_DISK) != 0) + return (0); + + /* + * We have a disk device. Get the path to the device + * and see if it's a whole disk by appending the backup + * slice and stat()ing the device. + */ + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) != 0 || !wholedisk) + return (0); + + diskname = strrchr(path, '/'); + assert(diskname != NULL); + diskname++; + if (zpool_label_disk(g_zfs, zhp, diskname) == -1) + return (-1); + + /* + * Fill in the devid, now that we've labeled the disk. + */ + (void) snprintf(buf, sizeof (buf), "%ss0", path); + if ((fd = open(buf, O_RDONLY)) < 0) { + (void) fprintf(stderr, + gettext("cannot open '%s': %s\n"), + buf, strerror(errno)); + return (-1); + } + + if (devid_get(fd, &devid) == 0) { + if (devid_get_minor_name(fd, &minor) == 0 && + (devid_str = devid_str_encode(devid, minor)) != + NULL) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_DEVID, devid_str) == 0); + } + if (devid_str != NULL) + devid_str_free(devid_str); + if (minor != NULL) + devid_str_free(minor); + devid_free(devid); + } + + /* + * Update the path to refer to the 's0' slice. 
The presence of + * the 'whole_disk' field indicates to the CLI that we should + * chop off the slice number when displaying the device in + * future output. + */ + verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); + + (void) close(fd); + + return (0); + } + + for (c = 0; c < children; c++) + if ((ret = make_disks(zhp, child[c])) != 0) + return (ret); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) + for (c = 0; c < children; c++) + if ((ret = make_disks(zhp, child[c])) != 0) + return (ret); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) + for (c = 0; c < children; c++) + if ((ret = make_disks(zhp, child[c])) != 0) + return (ret); + + return (0); +} + +/* + * Determine if the given path is a hot spare within the given configuration. + */ +static boolean_t +is_spare(nvlist_t *config, const char *path) +{ + int fd; + pool_state_t state; + char *name = NULL; + nvlist_t *label; + uint64_t guid, spareguid; + nvlist_t *nvroot; + nvlist_t **spares; + uint_t i, nspares; + boolean_t inuse; + + if ((fd = open(path, O_RDONLY)) < 0) + return (B_FALSE); + + if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || + !inuse || + state != POOL_STATE_SPARE || + zpool_read_label(fd, &label) != 0) { + free(name); + (void) close(fd); + return (B_FALSE); + } + free(name); + (void) close(fd); + + verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); + nvlist_free(label); + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + verify(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &spareguid) == 0); + if (spareguid == guid) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* + * Go through and find any devices that are in use. We rely on libdiskmgt for + * the majority of this task. 
+ */ +static int +check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, + boolean_t replacing, boolean_t isspare) +{ + nvlist_t **child; + uint_t c, children; + char *type, *path; + int ret; + char buf[MAXPATHLEN]; + uint64_t wholedisk; + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + + /* + * As a generic check, we look to see if this is a replace of a + * hot spare within the same pool. If so, we allow it + * regardless of what libdiskmgt or zpool_in_use() says. + */ + if (replacing) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) == 0 && wholedisk) + (void) snprintf(buf, sizeof (buf), "%ss0", + path); + else + (void) strlcpy(buf, path, sizeof (buf)); + + if (is_spare(config, buf)) + return (0); + } + + if (strcmp(type, VDEV_TYPE_DISK) == 0) + ret = check_device(path, force, isspare); + + if (strcmp(type, VDEV_TYPE_FILE) == 0) + ret = check_file(path, force, isspare); + + return (ret); + } + + for (c = 0; c < children; c++) + if ((ret = check_in_use(config, child[c], force, + replacing, B_FALSE)) != 0) + return (ret); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) + for (c = 0; c < children; c++) + if ((ret = check_in_use(config, child[c], force, + replacing, B_TRUE)) != 0) + return (ret); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) + for (c = 0; c < children; c++) + if ((ret = check_in_use(config, child[c], force, + replacing, B_FALSE)) != 0) + return (ret); + + return (0); +} + +static const char * +is_grouping(const char *type, int *mindev, int *maxdev) +{ + if (strncmp(type, "raidz", 5) == 0) { + const char *p = type + 5; + char *end; + long nparity; + + if (*p == '\0') { + nparity = 1; + } else if (*p == '0') { + return (NULL); /* no zero prefixes allowed */ + } 
else { + errno = 0; + nparity = strtol(p, &end, 10); + if (errno != 0 || nparity < 1 || nparity >= 255 || + *end != '\0') + return (NULL); + } + + if (mindev != NULL) + *mindev = nparity + 1; + if (maxdev != NULL) + *maxdev = 255; + return (VDEV_TYPE_RAIDZ); + } + + if (maxdev != NULL) + *maxdev = INT_MAX; + + if (strcmp(type, "mirror") == 0) { + if (mindev != NULL) + *mindev = 2; + return (VDEV_TYPE_MIRROR); + } + + if (strcmp(type, "spare") == 0) { + if (mindev != NULL) + *mindev = 1; + return (VDEV_TYPE_SPARE); + } + + if (strcmp(type, "log") == 0) { + if (mindev != NULL) + *mindev = 1; + return (VDEV_TYPE_LOG); + } + + if (strcmp(type, "cache") == 0) { + if (mindev != NULL) + *mindev = 1; + return (VDEV_TYPE_L2CACHE); + } + + return (NULL); +} + +/* + * Construct a syntactically valid vdev specification, + * and ensure that all devices and files exist and can be opened. + * Note: we don't bother freeing anything in the error paths + * because the program is just going to exit anyway. + */ +nvlist_t * +construct_spec(int argc, char **argv) +{ + nvlist_t *nvroot, *nv, **top, **spares, **l2cache; + int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; + const char *type; + uint64_t is_log; + boolean_t seen_logs; + + top = NULL; + toplevels = 0; + spares = NULL; + l2cache = NULL; + nspares = 0; + nlogs = 0; + nl2cache = 0; + is_log = B_FALSE; + seen_logs = B_FALSE; + + while (argc > 0) { + nv = NULL; + + /* + * If it's a mirror or raidz, the subsequent arguments are + * its leaves -- until we encounter the next mirror or raidz. 
+ */ + if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { + nvlist_t **child = NULL; + int c, children = 0; + + if (strcmp(type, VDEV_TYPE_SPARE) == 0) { + if (spares != NULL) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: 'spare' can be " + "specified only once\n")); + return (NULL); + } + is_log = B_FALSE; + } + + if (strcmp(type, VDEV_TYPE_LOG) == 0) { + if (seen_logs) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: 'log' can be " + "specified only once\n")); + return (NULL); + } + seen_logs = B_TRUE; + is_log = B_TRUE; + argc--; + argv++; + /* + * A log is not a real grouping device. + * We just set is_log and continue. + */ + continue; + } + + if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { + if (l2cache != NULL) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: 'cache' can be " + "specified only once\n")); + return (NULL); + } + is_log = B_FALSE; + } + + if (is_log) { + if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: unsupported 'log' " + "device: %s\n"), type); + return (NULL); + } + nlogs++; + } + + for (c = 1; c < argc; c++) { + if (is_grouping(argv[c], NULL, NULL) != NULL) + break; + children++; + child = realloc(child, + children * sizeof (nvlist_t *)); + if (child == NULL) + zpool_no_memory(); + if ((nv = make_leaf_vdev(argv[c], B_FALSE)) + == NULL) + return (NULL); + child[children - 1] = nv; + } + + if (children < mindev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s requires at least %d " + "devices\n"), argv[0], mindev); + return (NULL); + } + + if (children > maxdev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s supports no more than " + "%d devices\n"), argv[0], maxdev); + return (NULL); + } + + argc -= c; + argv += c; + + if (strcmp(type, VDEV_TYPE_SPARE) == 0) { + spares = child; + nspares = children; + continue; + } else if (strcmp(type, VDEV_TYPE_L2CACHE) 
== 0) { + l2cache = child; + nl2cache = children; + continue; + } else { + verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, + 0) == 0); + verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, + type) == 0); + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_NPARITY, + mindev - 1) == 0); + } + verify(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, + children) == 0); + + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + } + } else { + /* + * We have a device. Pass off to make_leaf_vdev() to + * construct the appropriate nvlist describing the vdev. + */ + if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) + return (NULL); + if (is_log) + nlogs++; + argc--; + argv++; + } + + toplevels++; + top = realloc(top, toplevels * sizeof (nvlist_t *)); + if (top == NULL) + zpool_no_memory(); + top[toplevels - 1] = nv; + } + + if (toplevels == 0 && nspares == 0 && nl2cache == 0) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: at least one toplevel vdev must be " + "specified\n")); + return (NULL); + } + + if (seen_logs && nlogs == 0) { + (void) fprintf(stderr, gettext("invalid vdev specification: " + "log requires at least 1 device\n")); + return (NULL); + } + + /* + * Finally, create nvroot and add all top-level vdevs to it. 
+ */ + verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); + verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + top, toplevels) == 0); + if (nspares != 0) + verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + spares, nspares) == 0); + if (nl2cache != 0) + verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + l2cache, nl2cache) == 0); + + for (t = 0; t < toplevels; t++) + nvlist_free(top[t]); + for (t = 0; t < nspares; t++) + nvlist_free(spares[t]); + for (t = 0; t < nl2cache; t++) + nvlist_free(l2cache[t]); + if (spares) + free(spares); + if (l2cache) + free(l2cache); + free(top); + + return (nvroot); +} + +nvlist_t * +split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, + splitflags_t flags, int argc, char **argv) +{ + nvlist_t *newroot = NULL, **child; + uint_t c, children; + + if (argc > 0) { + if ((newroot = construct_spec(argc, argv)) == NULL) { + (void) fprintf(stderr, gettext("Unable to build a " + "pool from the specified devices\n")); + return (NULL); + } + + if (!flags.dryrun && make_disks(zhp, newroot) != 0) { + nvlist_free(newroot); + return (NULL); + } + + /* avoid any tricks in the spec */ + verify(nvlist_lookup_nvlist_array(newroot, + ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + for (c = 0; c < children; c++) { + char *path; + const char *type; + int min, max; + + verify(nvlist_lookup_string(child[c], + ZPOOL_CONFIG_PATH, &path) == 0); + if ((type = is_grouping(path, &min, &max)) != NULL) { + (void) fprintf(stderr, gettext("Cannot use " + "'%s' as a device for splitting\n"), type); + nvlist_free(newroot); + return (NULL); + } + } + } + + if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { + if (newroot != NULL) + nvlist_free(newroot); + return (NULL); + } + + return (newroot); +} + +/* + * Get and validate the contents of the given vdev specification. 
This ensures + * that the nvlist returned is well-formed, that all the devices exist, and that + * they are not currently in use by any other known consumer. The 'poolconfig' + * parameter is the current configuration of the pool when adding devices + * existing pool, and is used to perform additional checks, such as changing the + * replication level of the pool. It can be 'NULL' to indicate that this is a + * new pool. The 'force' flag controls whether devices should be forcefully + * added, even if they appear in use. + */ +nvlist_t * +make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, + boolean_t replacing, boolean_t dryrun, int argc, char **argv) +{ + nvlist_t *newroot; + nvlist_t *poolconfig = NULL; + is_force = force; + + /* + * Construct the vdev specification. If this is successful, we know + * that we have a valid specification, and that all devices can be + * opened. + */ + if ((newroot = construct_spec(argc, argv)) == NULL) + return (NULL); + + if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) + return (NULL); + + /* + * Validate each device to make sure that its not shared with another + * subsystem. We do this even if 'force' is set, because there are some + * uses (such as a dedicated dump device) that even '-f' cannot + * override. + */ + if (check_in_use(poolconfig, newroot, force, replacing, B_FALSE) != 0) { + nvlist_free(newroot); + return (NULL); + } + + /* + * Check the replication level of the given vdevs and report any errors + * found. We include the existing pool spec, if any, as we need to + * catch changes against the existing replication level. + */ + if (check_rep && check_replication(poolconfig, newroot) != 0) { + nvlist_free(newroot); + return (NULL); + } + + /* + * Run through the vdev specification and label any whole disks found. 
+ */ + if (!dryrun && make_disks(zhp, newroot) != 0) { + nvlist_free(newroot); + return (NULL); + } + + return (newroot); +} diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c new file mode 100644 index 0000000..df23cc1 --- /dev/null +++ b/cmd/zstreamdump/zstreamdump.c @@ -0,0 +1,429 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +uint64_t drr_record_count[DRR_NUMTYPES]; +uint64_t total_write_size = 0; +uint64_t total_stream_len = 0; +FILE *send_stream = 0; +boolean_t do_byteswap = B_FALSE; +boolean_t do_cksum = B_TRUE; +#define INITIAL_BUFLEN (1<<20) + +static void +usage(void) +{ + (void) fprintf(stderr, "usage: zstreamdump [-v] [-C] < file\n"); + (void) fprintf(stderr, "\t -v -- verbose\n"); + (void) fprintf(stderr, "\t -C -- suppress checksum verification\n"); + exit(1); +} + +/* + * ssread - send stream read. 
+ * + * Read while computing incremental checksum + */ + +static size_t +ssread(void *buf, size_t len, zio_cksum_t *cksum) +{ + size_t outlen; + + if ((outlen = fread(buf, len, 1, send_stream)) == 0) + return (0); + + if (do_cksum && cksum) { + if (do_byteswap) + fletcher_4_incremental_byteswap(buf, len, cksum); + else + fletcher_4_incremental_native(buf, len, cksum); + } + total_stream_len += len; + return (outlen); +} + +int +main(int argc, char *argv[]) +{ + char *buf = malloc(INITIAL_BUFLEN); + dmu_replay_record_t thedrr; + dmu_replay_record_t *drr = &thedrr; + struct drr_begin *drrb = &thedrr.drr_u.drr_begin; + struct drr_end *drre = &thedrr.drr_u.drr_end; + struct drr_object *drro = &thedrr.drr_u.drr_object; + struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects; + struct drr_write *drrw = &thedrr.drr_u.drr_write; + struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref; + struct drr_free *drrf = &thedrr.drr_u.drr_free; + struct drr_spill *drrs = &thedrr.drr_u.drr_spill; + char c; + boolean_t verbose = B_FALSE; + boolean_t first = B_TRUE; + int err; + zio_cksum_t zc = { 0 }; + zio_cksum_t pcksum = { 0 }; + + while ((c = getopt(argc, argv, ":vC")) != -1) { + switch (c) { + case 'C': + do_cksum = B_FALSE; + break; + case 'v': + verbose = B_TRUE; + break; + case ':': + (void) fprintf(stderr, + "missing argument for '%c' option\n", optopt); + usage(); + break; + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + usage(); + } + } + + if (isatty(STDIN_FILENO)) { + (void) fprintf(stderr, + "Error: Backup stream can not be read " + "from a terminal.\n" + "You must redirect standard input.\n"); + exit(1); + } + + send_stream = stdin; + pcksum = zc; + while (ssread(drr, sizeof (dmu_replay_record_t), &zc)) { + + if (first) { + if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + do_byteswap = B_TRUE; + if (do_cksum) { + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + /* + * recalculate header checksum now + * that we know it needs to be + * 
byteswapped. + */ + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &zc); + } + } else if (drrb->drr_magic != DMU_BACKUP_MAGIC) { + (void) fprintf(stderr, "Invalid stream " + "(bad magic number)\n"); + exit(1); + } + first = B_FALSE; + } + if (do_byteswap) { + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = + BSWAP_32(drr->drr_payloadlen); + } + + /* + * At this point, the leading fields of the replay record + * (drr_type and drr_payloadlen) have been byte-swapped if + * necessary, but the rest of the data structure (the + * union of type-specific structures) is still in its + * original state. + */ + if (drr->drr_type >= DRR_NUMTYPES) { + (void) printf("INVALID record found: type 0x%x\n", + drr->drr_type); + (void) printf("Aborting.\n"); + exit(1); + } + + drr_record_count[drr->drr_type]++; + + switch (drr->drr_type) { + case DRR_BEGIN: + if (do_byteswap) { + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_versioninfo = + BSWAP_64(drrb->drr_versioninfo); + drrb->drr_creation_time = + BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_flags = BSWAP_32(drrb->drr_flags); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = + BSWAP_64(drrb->drr_fromguid); + } + + (void) printf("BEGIN record\n"); + (void) printf("\thdrtype = %lld\n", + DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo)); + (void) printf("\tfeatures = %llx\n", + DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo)); + (void) printf("\tmagic = %llx\n", + (u_longlong_t)drrb->drr_magic); + (void) printf("\tcreation_time = %llx\n", + (u_longlong_t)drrb->drr_creation_time); + (void) printf("\ttype = %u\n", drrb->drr_type); + (void) printf("\tflags = 0x%x\n", drrb->drr_flags); + (void) printf("\ttoguid = %llx\n", + (u_longlong_t)drrb->drr_toguid); + (void) printf("\tfromguid = %llx\n", + (u_longlong_t)drrb->drr_fromguid); + (void) printf("\ttoname = %s\n", drrb->drr_toname); + if (verbose) + (void) printf("\n"); + + 
if ((DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM) && drr->drr_payloadlen != 0) { + nvlist_t *nv; + int sz = drr->drr_payloadlen; + + if (sz > 1<<20) { + free(buf); + buf = malloc(sz); + } + (void) ssread(buf, sz, &zc); + if (ferror(send_stream)) + perror("fread"); + err = nvlist_unpack(buf, sz, &nv, 0); + if (err) + perror(strerror(err)); + nvlist_print(stdout, nv); + nvlist_free(nv); + } + break; + + case DRR_END: + if (do_byteswap) { + drre->drr_checksum.zc_word[0] = + BSWAP_64(drre->drr_checksum.zc_word[0]); + drre->drr_checksum.zc_word[1] = + BSWAP_64(drre->drr_checksum.zc_word[1]); + drre->drr_checksum.zc_word[2] = + BSWAP_64(drre->drr_checksum.zc_word[2]); + drre->drr_checksum.zc_word[3] = + BSWAP_64(drre->drr_checksum.zc_word[3]); + } + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. + */ + if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum, + pcksum)) { + (void) printf("Expected checksum differs from " + "checksum in stream.\n"); + (void) printf("Expected checksum = " + "%llx/%llx/%llx/%llx\n", + pcksum.zc_word[0], + pcksum.zc_word[1], + pcksum.zc_word[2], + pcksum.zc_word[3]); + } + (void) printf("END checksum = %llx/%llx/%llx/%llx\n", + drre->drr_checksum.zc_word[0], + drre->drr_checksum.zc_word[1], + drre->drr_checksum.zc_word[2], + drre->drr_checksum.zc_word[3]); + + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + break; + + case DRR_OBJECT: + if (do_byteswap) { + drro->drr_object = BSWAP_64(drro->drr_object); + drro->drr_type = BSWAP_32(drro->drr_type); + drro->drr_bonustype = + BSWAP_32(drro->drr_bonustype); + drro->drr_blksz = BSWAP_32(drro->drr_blksz); + drro->drr_bonuslen = + BSWAP_32(drro->drr_bonuslen); + drro->drr_toguid = BSWAP_64(drro->drr_toguid); + } + if (verbose) { + (void) printf("OBJECT object = %llu type = %u " + "bonustype = %u blksz = %u bonuslen = %u\n", + (u_longlong_t)drro->drr_object, + drro->drr_type, + drro->drr_bonustype, 
+ drro->drr_blksz, + drro->drr_bonuslen); + } + if (drro->drr_bonuslen > 0) { + (void) ssread(buf, P2ROUNDUP(drro->drr_bonuslen, + 8), &zc); + } + break; + + case DRR_FREEOBJECTS: + if (do_byteswap) { + drrfo->drr_firstobj = + BSWAP_64(drrfo->drr_firstobj); + drrfo->drr_numobjs = + BSWAP_64(drrfo->drr_numobjs); + drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid); + } + if (verbose) { + (void) printf("FREEOBJECTS firstobj = %llu " + "numobjs = %llu\n", + (u_longlong_t)drrfo->drr_firstobj, + (u_longlong_t)drrfo->drr_numobjs); + } + break; + + case DRR_WRITE: + if (do_byteswap) { + drrw->drr_object = BSWAP_64(drrw->drr_object); + drrw->drr_type = BSWAP_32(drrw->drr_type); + drrw->drr_offset = BSWAP_64(drrw->drr_offset); + drrw->drr_length = BSWAP_64(drrw->drr_length); + drrw->drr_toguid = BSWAP_64(drrw->drr_toguid); + drrw->drr_key.ddk_prop = + BSWAP_64(drrw->drr_key.ddk_prop); + } + if (verbose) { + (void) printf("WRITE object = %llu type = %u " + "checksum type = %u\n" + "offset = %llu length = %llu " + "props = %llx\n", + (u_longlong_t)drrw->drr_object, + drrw->drr_type, + drrw->drr_checksumtype, + (u_longlong_t)drrw->drr_offset, + (u_longlong_t)drrw->drr_length, + (u_longlong_t)drrw->drr_key.ddk_prop); + } + (void) ssread(buf, drrw->drr_length, &zc); + total_write_size += drrw->drr_length; + break; + + case DRR_WRITE_BYREF: + if (do_byteswap) { + drrwbr->drr_object = + BSWAP_64(drrwbr->drr_object); + drrwbr->drr_offset = + BSWAP_64(drrwbr->drr_offset); + drrwbr->drr_length = + BSWAP_64(drrwbr->drr_length); + drrwbr->drr_toguid = + BSWAP_64(drrwbr->drr_toguid); + drrwbr->drr_refguid = + BSWAP_64(drrwbr->drr_refguid); + drrwbr->drr_refobject = + BSWAP_64(drrwbr->drr_refobject); + drrwbr->drr_refoffset = + BSWAP_64(drrwbr->drr_refoffset); + drrwbr->drr_key.ddk_prop = + BSWAP_64(drrwbr->drr_key.ddk_prop); + } + if (verbose) { + (void) printf("WRITE_BYREF object = %llu " + "checksum type = %u props = %llx\n" + "offset = %llu length = %llu\n" + "toguid = %llx refguid = 
%llx\n" + "refobject = %llu refoffset = %llu\n", + (u_longlong_t)drrwbr->drr_object, + drrwbr->drr_checksumtype, + (u_longlong_t)drrwbr->drr_key.ddk_prop, + (u_longlong_t)drrwbr->drr_offset, + (u_longlong_t)drrwbr->drr_length, + (u_longlong_t)drrwbr->drr_toguid, + (u_longlong_t)drrwbr->drr_refguid, + (u_longlong_t)drrwbr->drr_refobject, + (u_longlong_t)drrwbr->drr_refoffset); + } + break; + + case DRR_FREE: + if (do_byteswap) { + drrf->drr_object = BSWAP_64(drrf->drr_object); + drrf->drr_offset = BSWAP_64(drrf->drr_offset); + drrf->drr_length = BSWAP_64(drrf->drr_length); + } + if (verbose) { + (void) printf("FREE object = %llu " + "offset = %llu length = %lld\n", + (u_longlong_t)drrf->drr_object, + (u_longlong_t)drrf->drr_offset, + (longlong_t)drrf->drr_length); + } + break; + case DRR_SPILL: + if (do_byteswap) { + drrs->drr_object = BSWAP_64(drrs->drr_object); + drrs->drr_length = BSWAP_64(drrs->drr_length); + } + if (verbose) { + (void) printf("SPILL block for object = %llu " + "length = %llu\n", drrs->drr_object, + drrs->drr_length); + } + (void) ssread(buf, drrs->drr_length, &zc); + break; + } + pcksum = zc; + } + free(buf); + + /* Print final summary */ + + (void) printf("SUMMARY:\n"); + (void) printf("\tTotal DRR_BEGIN records = %lld\n", + (u_longlong_t)drr_record_count[DRR_BEGIN]); + (void) printf("\tTotal DRR_END records = %lld\n", + (u_longlong_t)drr_record_count[DRR_END]); + (void) printf("\tTotal DRR_OBJECT records = %lld\n", + (u_longlong_t)drr_record_count[DRR_OBJECT]); + (void) printf("\tTotal DRR_FREEOBJECTS records = %lld\n", + (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]); + (void) printf("\tTotal DRR_WRITE records = %lld\n", + (u_longlong_t)drr_record_count[DRR_WRITE]); + (void) printf("\tTotal DRR_FREE records = %lld\n", + (u_longlong_t)drr_record_count[DRR_FREE]); + (void) printf("\tTotal DRR_SPILL records = %lld\n", + (u_longlong_t)drr_record_count[DRR_SPILL]); + (void) printf("\tTotal records = %lld\n", + 
(u_longlong_t)(drr_record_count[DRR_BEGIN] + + drr_record_count[DRR_OBJECT] + + drr_record_count[DRR_FREEOBJECTS] + + drr_record_count[DRR_WRITE] + + drr_record_count[DRR_FREE] + + drr_record_count[DRR_SPILL] + + drr_record_count[DRR_END])); + (void) printf("\tTotal write size = %lld (0x%llx)\n", + (u_longlong_t)total_write_size, (u_longlong_t)total_write_size); + (void) printf("\tTotal stream length = %lld (0x%llx)\n", + (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len); + return (0); +} diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c new file mode 100644 index 0000000..b2d81b5 --- /dev/null +++ b/cmd/ztest/ztest.c @@ -0,0 +1,5604 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * The objective of this program is to provide a DMU/ZAP/SPA stress test + * that runs entirely in userland, is easy to use, and easy to extend. + * + * The overall design of the ztest program is as follows: + * + * (1) For each major functional area (e.g. 
adding vdevs to a pool, + * creating and destroying datasets, reading and writing objects, etc) + * we have a simple routine to test that functionality. These + * individual routines do not have to do anything "stressful". + * + * (2) We turn these simple functionality tests into a stress test by + * running them all in parallel, with as many threads as desired, + * and spread across as many datasets, objects, and vdevs as desired. + * + * (3) While all this is happening, we inject faults into the pool to + * verify that self-healing data really works. + * + * (4) Every time we open a dataset, we change its checksum and compression + * functions. Thus even individual objects vary from block to block + * in which checksum they use and whether they're compressed. + * + * (5) To verify that we never lose on-disk consistency after a crash, + * we run the entire test in a child of the main process. + * At random times, the child self-immolates with a SIGKILL. + * This is the software equivalent of pulling the power cord. + * The parent then runs the test again, using the existing + * storage pool, as many times as desired. + * + * (6) To verify that we don't have future leaks or temporal incursions, + * many of the functional tests record the transaction group number + * as part of their data. When reading old data, they verify that + * the transaction group number is less than the current, open txg. + * If you add a new test, please do this if applicable. + * + * When run with no arguments, ztest runs for about five minutes and + * produces no output if successful. To get a little bit of information, + * specify -V. To get more information, specify -VV, and so on. + * + * To turn this into an overnight stress test, use -T to specify run time. + * + * You can ask for more vdevs [-v], datasets [-d], or threads [-t] + * to increase the pool capacity, fanout, and overall stress level. + * + * The -N(okill) option will suppress kills, so each child runs to completion.
+ * This can be useful when you're trying to distinguish temporal incursions + * from plain old race conditions. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static char cmdname[] = "ztest"; +static char *zopt_pool = cmdname; + +static uint64_t zopt_vdevs = 5; +static uint64_t zopt_vdevtime; +static int zopt_ashift = SPA_MINBLOCKSHIFT; +static int zopt_mirrors = 2; +static int zopt_raidz = 4; +static int zopt_raidz_parity = 1; +static size_t zopt_vdev_size = SPA_MINDEVSIZE; +static int zopt_datasets = 7; +static int zopt_threads = 23; +static uint64_t zopt_passtime = 60; /* 60 seconds */ +static uint64_t zopt_killrate = 70; /* 70% kill rate */ +static int zopt_verbose = 0; +static int zopt_init = 1; +static char *zopt_dir = "/tmp"; +static uint64_t zopt_time = 300; /* 5 minutes */ +static uint64_t zopt_maxloops = 50; /* max loops during spa_freeze() */ + +#define BT_MAGIC 0x123456789abcdefULL +#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1) + +enum ztest_io_type { + ZTEST_IO_WRITE_TAG, + ZTEST_IO_WRITE_PATTERN, + ZTEST_IO_WRITE_ZEROES, + ZTEST_IO_TRUNCATE, + ZTEST_IO_SETATTR, + ZTEST_IO_TYPES +}; + +typedef struct ztest_block_tag { + uint64_t bt_magic; + uint64_t bt_objset; + uint64_t bt_object; + uint64_t bt_offset; + uint64_t bt_gen; + uint64_t bt_txg; + uint64_t bt_crtxg; +} ztest_block_tag_t; + +typedef struct bufwad { + uint64_t bw_index; + uint64_t bw_txg; + uint64_t bw_data; +} bufwad_t; + +/* + * XXX -- fix zfs range locks to be generic so we can use them here. 
+ */ +typedef enum { + RL_READER, + RL_WRITER, + RL_APPEND +} rl_type_t; + +typedef struct rll { + void *rll_writer; /* thread holding the lock as writer, or NULL */ + int rll_readers; /* number of readers currently holding the lock */ + mutex_t rll_lock; /* held while rll_writer/rll_readers are examined or changed */ + cond_t rll_cv; /* broadcast when no writer and no readers remain */ +} rll_t; + +typedef struct rl { + uint64_t rl_object; + uint64_t rl_offset; + uint64_t rl_size; + rll_t *rl_lock; /* lock slot acquired on behalf of this range */ +} rl_t; + +#define ZTEST_RANGE_LOCKS 64 +#define ZTEST_OBJECT_LOCKS 64 + +/* + * Object descriptor. Used as a template for object lookup/create/remove. + */ +typedef struct ztest_od { + uint64_t od_dir; + uint64_t od_object; + dmu_object_type_t od_type; + dmu_object_type_t od_crtype; + uint64_t od_blocksize; + uint64_t od_crblocksize; + uint64_t od_gen; + uint64_t od_crgen; + char od_name[MAXNAMELEN]; +} ztest_od_t; + +/* + * Per-dataset state. + */ +typedef struct ztest_ds { + objset_t *zd_os; + zilog_t *zd_zilog; + uint64_t zd_seq; + ztest_od_t *zd_od; /* debugging aid */ + char zd_name[MAXNAMELEN]; + mutex_t zd_dirobj_lock; + rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; + rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; +} ztest_ds_t; + +/* + * Per-iteration state. + */ +typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); + +typedef struct ztest_info { + ztest_func_t *zi_func; /* test function */ + uint64_t zi_iters; /* iterations per execution */ + uint64_t *zi_interval; /* execute every *zi_interval nanoseconds */ + uint64_t zi_call_count; /* per-pass count */ + uint64_t zi_call_time; /* per-pass time */ + uint64_t zi_call_next; /* next time to call this function */ +} ztest_info_t; + +/* + * Note: these aren't static because we want dladdr() to work. 
+ */ +ztest_func_t ztest_dmu_read_write; +ztest_func_t ztest_dmu_write_parallel; +ztest_func_t ztest_dmu_object_alloc_free; +ztest_func_t ztest_dmu_commit_callbacks; +ztest_func_t ztest_zap; +ztest_func_t ztest_zap_parallel; +ztest_func_t ztest_zil_commit; +ztest_func_t ztest_dmu_read_write_zcopy; +ztest_func_t ztest_dmu_objset_create_destroy; +ztest_func_t ztest_dmu_prealloc; +ztest_func_t ztest_fzap; +ztest_func_t ztest_dmu_snapshot_create_destroy; +ztest_func_t ztest_dsl_prop_get_set; +ztest_func_t ztest_spa_prop_get_set; +ztest_func_t ztest_spa_create_destroy; +ztest_func_t ztest_fault_inject; +ztest_func_t ztest_ddt_repair; +ztest_func_t ztest_dmu_snapshot_hold; +ztest_func_t ztest_spa_rename; +ztest_func_t ztest_scrub; +ztest_func_t ztest_dsl_dataset_promote_busy; +ztest_func_t ztest_vdev_attach_detach; +ztest_func_t ztest_vdev_LUN_growth; +ztest_func_t ztest_vdev_add_remove; +ztest_func_t ztest_vdev_aux_add_remove; +ztest_func_t ztest_split_pool; + +uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ +uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ +uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ +uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ +uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ + +ztest_info_t ztest_info[] = { + { ztest_dmu_read_write, 1, &zopt_always }, + { ztest_dmu_write_parallel, 10, &zopt_always }, + { ztest_dmu_object_alloc_free, 1, &zopt_always }, + { ztest_dmu_commit_callbacks, 1, &zopt_always }, + { ztest_zap, 30, &zopt_always }, + { ztest_zap_parallel, 100, &zopt_always }, + { ztest_split_pool, 1, &zopt_always }, + { ztest_zil_commit, 1, &zopt_incessant }, + { ztest_dmu_read_write_zcopy, 1, &zopt_often }, + { ztest_dmu_objset_create_destroy, 1, &zopt_often }, + { ztest_dsl_prop_get_set, 1, &zopt_often }, + { ztest_spa_prop_get_set, 1, &zopt_sometimes }, +#if 0 + { ztest_dmu_prealloc, 1, &zopt_sometimes }, +#endif + { ztest_fzap, 1, &zopt_sometimes }, + { 
ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, + { ztest_spa_create_destroy, 1, &zopt_sometimes }, + { ztest_fault_inject, 1, &zopt_sometimes }, + { ztest_ddt_repair, 1, &zopt_sometimes }, + { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, + { ztest_spa_rename, 1, &zopt_rarely }, + { ztest_scrub, 1, &zopt_rarely }, + { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, + { ztest_vdev_attach_detach, 1, &zopt_rarely }, + { ztest_vdev_LUN_growth, 1, &zopt_rarely }, + { ztest_vdev_add_remove, 1, &zopt_vdevtime }, + { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, +}; + +#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) + +/* + * The following struct is used to hold a list of uncalled commit callbacks. + * The callbacks are ordered by txg number. + */ +typedef struct ztest_cb_list { + mutex_t zcl_callbacks_lock; + list_t zcl_callbacks; +} ztest_cb_list_t; + +/* + * Stuff we need to share writably between parent and child. + */ +typedef struct ztest_shared { + char *zs_pool; + spa_t *zs_spa; + hrtime_t zs_proc_start; + hrtime_t zs_proc_stop; + hrtime_t zs_thread_start; + hrtime_t zs_thread_stop; + hrtime_t zs_thread_kill; + uint64_t zs_enospc_count; + uint64_t zs_vdev_next_leaf; + uint64_t zs_vdev_aux; + uint64_t zs_alloc; + uint64_t zs_space; + mutex_t zs_vdev_lock; + rwlock_t zs_name_lock; + ztest_info_t zs_info[ZTEST_FUNCS]; + uint64_t zs_splits; + uint64_t zs_mirrors; + ztest_ds_t zs_zd[]; +} ztest_shared_t; + +#define ID_PARALLEL -1ULL + +static char ztest_dev_template[] = "%s/%s.%llua"; +static char ztest_aux_template[] = "%s/%s.%s.%llu"; +ztest_shared_t *ztest_shared; +uint64_t *ztest_seq; + +static int ztest_random_fd; +static int ztest_dump_core = 1; + +static boolean_t ztest_exiting; + +/* Global commit callback list */ +static ztest_cb_list_t zcl; + +extern uint64_t metaslab_gang_bang; +extern uint64_t metaslab_df_alloc_threshold; +static uint64_t metaslab_sz; + +enum ztest_object { + ZTEST_META_DNODE = 0, + ZTEST_DIROBJ, + 
ZTEST_OBJECTS +}; + +static void usage(boolean_t) __NORETURN; + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * debugging facilities. + */ +const char * +_umem_debug_init() +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} + +#define FATAL_MSG_SZ 1024 + +char *fatal_msg; + +static void +fatal(int do_perror, char *message, ...) +{ + va_list args; + int save_errno = errno; + char buf[FATAL_MSG_SZ]; + + (void) fflush(stdout); + + va_start(args, message); + (void) sprintf(buf, "ztest: "); + /* LINTED */ + (void) vsprintf(buf + strlen(buf), message, args); + va_end(args); + if (do_perror) { + (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), + ": %s", strerror(save_errno)); + } + (void) fprintf(stderr, "%s\n", buf); + fatal_msg = buf; /* to ease debugging */ + if (ztest_dump_core) + abort(); + exit(3); +} + +static int +str2shift(const char *buf) +{ + const char *ends = "BKMGTPEZ"; + int i; + + if (buf[0] == '\0') + return (0); + for (i = 0; i < strlen(ends); i++) { + if (toupper(buf[0]) == ends[i]) + break; + } + if (i == strlen(ends)) { + (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", + buf); + usage(B_FALSE); + } + if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { + return (10*i); + } + (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); + usage(B_FALSE); + /* NOTREACHED */ +} + +static uint64_t +nicenumtoull(const char *buf) +{ + char *end; + uint64_t val; + + val = strtoull(buf, &end, 0); + if (end == buf) { + (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); + usage(B_FALSE); + } else if (end[0] == '.') { + double fval = strtod(buf, &end); + fval *= pow(2, str2shift(end)); + if (fval > UINT64_MAX) { + (void) fprintf(stderr, "ztest: value too large: %s\n", + buf); + usage(B_FALSE); + } + val = (uint64_t)fval; + } else { + int shift = 
str2shift(end); + if (shift >= 64 || (val << shift) >> shift != val) { + (void) fprintf(stderr, "ztest: value too large: %s\n", + buf); + usage(B_FALSE); + } + val <<= shift; + } + return (val); +} + +static void +usage(boolean_t requested) +{ + char nice_vdev_size[10]; + char nice_gang_bang[10]; + FILE *fp = requested ? stdout : stderr; + + nicenum(zopt_vdev_size, nice_vdev_size); + nicenum(metaslab_gang_bang, nice_gang_bang); + + (void) fprintf(fp, "Usage: %s\n" + "\t[-v vdevs (default: %llu)]\n" + "\t[-s size_of_each_vdev (default: %s)]\n" + "\t[-a alignment_shift (default: %d)] use 0 for random\n" + "\t[-m mirror_copies (default: %d)]\n" + "\t[-r raidz_disks (default: %d)]\n" + "\t[-R raidz_parity (default: %d)]\n" + "\t[-d datasets (default: %d)]\n" + "\t[-t threads (default: %d)]\n" + "\t[-g gang_block_threshold (default: %s)]\n" + "\t[-i init_count (default: %d)] initialize pool i times\n" + "\t[-k kill_percentage (default: %llu%%)]\n" + "\t[-p pool_name (default: %s)]\n" + "\t[-f dir (default: %s)] file directory for vdev files\n" + "\t[-V] verbose (use multiple times for ever more blather)\n" + "\t[-E] use existing pool instead of creating new one\n" + "\t[-T time (default: %llu sec)] total run time\n" + "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" + "\t[-P passtime (default: %llu sec)] time per pass\n" + "\t[-h] (print help)\n" + "", + cmdname, + (u_longlong_t)zopt_vdevs, /* -v */ + nice_vdev_size, /* -s */ + zopt_ashift, /* -a */ + zopt_mirrors, /* -m */ + zopt_raidz, /* -r */ + zopt_raidz_parity, /* -R */ + zopt_datasets, /* -d */ + zopt_threads, /* -t */ + nice_gang_bang, /* -g */ + zopt_init, /* -i */ + (u_longlong_t)zopt_killrate, /* -k */ + zopt_pool, /* -p */ + zopt_dir, /* -f */ + (u_longlong_t)zopt_time, /* -T */ + (u_longlong_t)zopt_maxloops, /* -F */ + (u_longlong_t)zopt_passtime); /* -P */ + exit(requested ? 
0 : 1); +} + +static void +process_options(int argc, char **argv) +{ + int opt; + uint64_t value; + + /* By default, test gang blocks for blocks 32K and greater */ + metaslab_gang_bang = 32 << 10; + + while ((opt = getopt(argc, argv, + "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:")) != EOF) { + value = 0; + switch (opt) { + case 'v': + case 's': + case 'a': + case 'm': + case 'r': + case 'R': + case 'd': + case 't': + case 'g': + case 'i': + case 'k': + case 'T': + case 'P': + case 'F': + value = nicenumtoull(optarg); + } + switch (opt) { + case 'v': + zopt_vdevs = value; + break; + case 's': + zopt_vdev_size = MAX(SPA_MINDEVSIZE, value); + break; + case 'a': + zopt_ashift = value; + break; + case 'm': + zopt_mirrors = value; + break; + case 'r': + zopt_raidz = MAX(1, value); + break; + case 'R': + zopt_raidz_parity = MIN(MAX(value, 1), 3); + break; + case 'd': + zopt_datasets = MAX(1, value); + break; + case 't': + zopt_threads = MAX(1, value); + break; + case 'g': + metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value); + break; + case 'i': + zopt_init = value; + break; + case 'k': + zopt_killrate = value; + break; + case 'p': + zopt_pool = strdup(optarg); + break; + case 'f': + zopt_dir = strdup(optarg); + break; + case 'V': + zopt_verbose++; + break; + case 'E': + zopt_init = 0; + break; + case 'T': + zopt_time = value; + break; + case 'P': + zopt_passtime = MAX(1, value); + break; + case 'F': + zopt_maxloops = MAX(1, value); + break; + case 'h': + usage(B_TRUE); + break; + case '?': + default: + usage(B_FALSE); + break; + } + } + + zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1); + + zopt_vdevtime = (zopt_vdevs > 0 ? 
zopt_time * NANOSEC / zopt_vdevs : + UINT64_MAX >> 2); +} + +static void +ztest_kill(ztest_shared_t *zs) +{ + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa)); + (void) kill(getpid(), SIGKILL); +} + +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + if (range == 0) + return (0); + + if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} + +/* ARGSUSED */ +static void +ztest_record_enospc(const char *s) +{ + ztest_shared->zs_enospc_count++; +} + +static uint64_t +ztest_get_ashift(void) +{ + if (zopt_ashift == 0) + return (SPA_MINBLOCKSHIFT + ztest_random(3)); + return (zopt_ashift); +} + +static nvlist_t * +make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) +{ + char pathbuf[MAXPATHLEN]; + uint64_t vdev; + nvlist_t *file; + + if (ashift == 0) + ashift = ztest_get_ashift(); + + if (path == NULL) { + path = pathbuf; + + if (aux != NULL) { + vdev = ztest_shared->zs_vdev_aux; + (void) sprintf(path, ztest_aux_template, + zopt_dir, zopt_pool, aux, vdev); + } else { + vdev = ztest_shared->zs_vdev_next_leaf++; + (void) sprintf(path, ztest_dev_template, + zopt_dir, zopt_pool, vdev); + } + } + + if (size != 0) { + int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); + if (fd == -1) + fatal(1, "can't open %s", path); + if (ftruncate(fd, size) != 0) + fatal(1, "can't ftruncate %s", path); + (void) close(fd); + } + + VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); + VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); + VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); + + return (file); +} + +static nvlist_t * +make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r) +{ + nvlist_t *raidz, **child; + int c; + + if (r < 2) + return (make_vdev_file(path, 
aux, size, ashift)); + child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < r; c++) + child[c] = make_vdev_file(path, aux, size, ashift); + + VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_RAIDZ) == 0); + VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, + zopt_raidz_parity) == 0); + VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, + child, r) == 0); + + for (c = 0; c < r; c++) + nvlist_free(child[c]); + + umem_free(child, r * sizeof (nvlist_t *)); + + return (raidz); +} + +static nvlist_t * +make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift, + int r, int m) +{ + nvlist_t *mirror, **child; + int c; + + if (m < 1) + return (make_vdev_raidz(path, aux, size, ashift, r)); + + child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < m; c++) + child[c] = make_vdev_raidz(path, aux, size, ashift, r); + + VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MIRROR) == 0); + VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, + child, m) == 0); + + for (c = 0; c < m; c++) + nvlist_free(child[c]); + + umem_free(child, m * sizeof (nvlist_t *)); + + return (mirror); +} + +static nvlist_t * +make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift, + int log, int r, int m, int t) +{ + nvlist_t *root, **child; + int c; + + ASSERT(t > 0); + + child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < t; c++) { + child[c] = make_vdev_mirror(path, aux, size, ashift, r, m); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + log) == 0); + } + + VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(root, aux ? 
aux : ZPOOL_CONFIG_CHILDREN, + child, t) == 0); + + for (c = 0; c < t; c++) + nvlist_free(child[c]); + + umem_free(child, t * sizeof (nvlist_t *)); + + return (root); +} + +static int +ztest_random_blocksize(void) +{ + return (1 << (SPA_MINBLOCKSHIFT + + ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); +} + +static int +ztest_random_ibshift(void) +{ + return (DN_MIN_INDBLKSHIFT + + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); +} + +static uint64_t +ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) +{ + uint64_t top; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + do { + top = ztest_random(rvd->vdev_children); + tvd = rvd->vdev_child[top]; + } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) || + tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); + + return (top); +} + +static uint64_t +ztest_random_dsl_prop(zfs_prop_t prop) +{ + uint64_t value; + + do { + value = zfs_prop_random_value(prop, ztest_random(-1ULL)); + } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); + + return (value); +} + +static int +ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, + boolean_t inherit) +{ + const char *propname = zfs_prop_to_name(prop); + const char *valname; + char setpoint[MAXPATHLEN]; + uint64_t curval; + int error; + + error = dsl_prop_set(osname, propname, + (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), + sizeof (value), 1, &value); + + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (error); + } + ASSERT3U(error, ==, 0); + + VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval), + 1, &curval, setpoint), ==, 0); + + if (zopt_verbose >= 6) { + VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); + (void) printf("%s %s = %s at '%s'\n", + osname, propname, valname, setpoint); + } + + return (error); +} + +static int +ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value) +{ + spa_t *spa = zs->zs_spa; + nvlist_t *props = NULL; + int error; + + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); + + error = spa_prop_set(spa, props); + + nvlist_free(props); + + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (error); + } + ASSERT3U(error, ==, 0); + + return (error); +} + +static void +ztest_rll_init(rll_t *rll) +{ + rll->rll_writer = NULL; + rll->rll_readers = 0; + VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0); + VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0); +} + +static void +ztest_rll_destroy(rll_t *rll) +{ + ASSERT(rll->rll_writer == NULL); + ASSERT(rll->rll_readers == 0); + VERIFY(_mutex_destroy(&rll->rll_lock) == 0); + VERIFY(cond_destroy(&rll->rll_cv) == 0); +} + +static void +ztest_rll_lock(rll_t *rll, rl_type_t type) +{ + VERIFY(mutex_lock(&rll->rll_lock) == 0); + + if (type == RL_READER) { + while (rll->rll_writer != NULL) + (void) cond_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_readers++; + } else { + while (rll->rll_writer != NULL || rll->rll_readers) + (void) cond_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_writer = curthread; + } + + VERIFY(mutex_unlock(&rll->rll_lock) == 0); +} + +static void +ztest_rll_unlock(rll_t *rll) +{ + VERIFY(mutex_lock(&rll->rll_lock) == 0); + + if (rll->rll_writer) { + ASSERT(rll->rll_readers == 0); + rll->rll_writer = NULL; 
+ } else { + ASSERT(rll->rll_readers != 0); + ASSERT(rll->rll_writer == NULL); + rll->rll_readers--; + } + + if (rll->rll_writer == NULL && rll->rll_readers == 0) + VERIFY(cond_broadcast(&rll->rll_cv) == 0); + + VERIFY(mutex_unlock(&rll->rll_lock) == 0); +} +/* + * Acquire the dataset's object lock for 'object' in the requested mode. + * The slot is chosen by masking the object number with + * ZTEST_OBJECT_LOCKS - 1, so different objects may share a slot. + */ +static void +ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) +{ + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; + + ztest_rll_lock(rll, type); +} +/* + * Release the object lock slot selected by 'object'; the slot is computed + * exactly as in ztest_object_lock(). + */ +static void +ztest_object_unlock(ztest_ds_t *zd, uint64_t object) +{ + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; + + ztest_rll_unlock(rll); +} +/* + * Acquire a range lock and return a handle describing it. + * + * The handle is allocated with umem_alloc() and records object, offset and + * size. The lock actually taken is one of the ZTEST_RANGE_LOCKS slots, + * selected by hashing object and offset; size is only recorded and plays + * no part in choosing the slot. Release with ztest_range_unlock(). + */ +static rl_t * +ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, + uint64_t size, rl_type_t type) +{ + uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); + rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; + rl_t *rl; + + rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); + rl->rl_object = object; + rl->rl_offset = offset; + rl->rl_size = size; + rl->rl_lock = rll; + + ztest_rll_lock(rll, type); + + return (rl); +} +/* + * Release the slot held through 'rl' and free the handle itself; 'rl' must + * not be used after this returns. + */ +static void +ztest_range_unlock(rl_t *rl) +{ + rll_t *rll = rl->rl_lock; + + ztest_rll_unlock(rll); + + umem_free(rl, sizeof (*rl)); +} +/* + * Initialize per-dataset state for 'os': record the objset, its ZIL and its + * name, reset zd_seq, and set up the directory-object mutex and every + * object and range lock slot. + */ +static void +ztest_zd_init(ztest_ds_t *zd, objset_t *os) +{ + zd->zd_os = os; + zd->zd_zilog = dmu_objset_zil(os); + zd->zd_seq = 0; + dmu_objset_name(os, zd->zd_name); + + VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0); + + for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_init(&zd->zd_object_lock[l]); + + for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_init(&zd->zd_range_lock[l]); +} +/* + * Destroy the mutex and lock slots set up by ztest_zd_init(). + * ztest_rll_destroy() asserts that each slot has no writer and no readers. + */ +static void +ztest_zd_fini(ztest_ds_t *zd) +{ + VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0); + + for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_destroy(&zd->zd_object_lock[l]); + + for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_destroy(&zd->zd_range_lock[l]); +} + +#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? 
TXG_NOWAIT : TXG_WAIT) + +static uint64_t +ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) +{ + uint64_t txg; + int error; + + /* + * Attempt to assign tx to some transaction group. + */ + error = dmu_tx_assign(tx, txg_how); + if (error) { + if (error == ERESTART) { + ASSERT(txg_how == TXG_NOWAIT); + dmu_tx_wait(tx); + } else { + ASSERT3U(error, ==, ENOSPC); + ztest_record_enospc(tag); + } + dmu_tx_abort(tx); + return (0); + } + txg = dmu_tx_get_txg(tx); + ASSERT(txg != 0); + return (txg); +} + +static void +ztest_pattern_set(void *buf, uint64_t size, uint64_t value) +{ + uint64_t *ip = buf; + uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); + + while (ip < ip_end) + *ip++ = value; +} + +static boolean_t +ztest_pattern_match(void *buf, uint64_t size, uint64_t value) +{ + uint64_t *ip = buf; + uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); + uint64_t diff = 0; + + while (ip < ip_end) + diff |= (value - *ip++); + + return (diff == 0); +} + +static void +ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) +{ + bt->bt_magic = BT_MAGIC; + bt->bt_objset = dmu_objset_id(os); + bt->bt_object = object; + bt->bt_offset = offset; + bt->bt_gen = gen; + bt->bt_txg = txg; + bt->bt_crtxg = crtxg; +} + +static void +ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) +{ + ASSERT(bt->bt_magic == BT_MAGIC); + ASSERT(bt->bt_objset == dmu_objset_id(os)); + ASSERT(bt->bt_object == object); + ASSERT(bt->bt_offset == offset); + ASSERT(bt->bt_gen <= gen); + ASSERT(bt->bt_txg <= txg); + ASSERT(bt->bt_crtxg == crtxg); +} + +static ztest_block_tag_t * +ztest_bt_bonus(dmu_buf_t *db) +{ + dmu_object_info_t doi; + ztest_block_tag_t *bt; + + dmu_object_info_from_db(db, &doi); + ASSERT3U(doi.doi_bonus_size, <=, db->db_size); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); + bt = 
(void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + + return (bt); +} + +/* + * ZIL logging ops + */ + +#define lrz_type lr_mode +#define lrz_blocksize lr_uid +#define lrz_ibshift lr_gid +#define lrz_bonustype lr_rdev +#define lrz_bonuslen lr_crtime[1] + +static void +ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + itx->itx_oid = object; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) +{ + itx_t *itx; + itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + if (lr->lr_length > ZIL_MAX_LOG_DATA) + write_state = WR_INDIRECT; + + itx = zil_itx_create(TX_WRITE, + sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); + + if (write_state == WR_COPIED && + dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, + ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + write_state = WR_NEED_COPY; + } + itx->itx_private = zd; + itx->itx_wr_state = write_state; + itx->itx_sync = (ztest_random(8) == 0); + itx->itx_sod += (write_state == WR_NEED_COPY ? 
lr->lr_length : 0); + + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + itx->itx_sync = B_FALSE; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + itx->itx_sync = B_FALSE; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +/* + * ZIL replay ops + */ +static int +ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + ztest_block_tag_t *bbt; + dmu_buf_t *db; + dmu_tx_t *tx; + uint64_t txg; + int error = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } else { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + } + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) + return (ENOSPC); + + ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + if (lr->lr_foid == 0) { + lr->lr_foid = zap_create(os, + lr->lrz_type, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } else { + error = zap_create_claim(os, lr->lr_foid, + lr->lrz_type, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } + } else { + if (lr->lr_foid == 0) { + lr->lr_foid = 
dmu_object_alloc(os, + lr->lrz_type, 0, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } else { + error = dmu_object_claim(os, lr->lr_foid, + lr->lrz_type, 0, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } + } + + if (error) { + ASSERT3U(error, ==, EEXIST); + ASSERT(zd->zd_zilog->zl_replay); + dmu_tx_commit(tx); + return (error); + } + + ASSERT(lr->lr_foid != 0); + + if (lr->lrz_type != DMU_OT_ZAP_OTHER) + VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, + lr->lrz_blocksize, lr->lrz_ibshift, tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + bbt = ztest_bt_bonus(db); + dmu_buf_will_dirty(db, tx); + ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg); + dmu_buf_rele(db, FTAG); + + VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, + &lr->lr_foid, tx)); + + (void) ztest_log_create(zd, tx, lr); + + dmu_tx_commit(tx); + + return (0); +} + +static int +ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + dmu_object_info_t doi; + dmu_tx_t *tx; + uint64_t object, txg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + VERIFY3U(0, ==, + zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); + ASSERT(object != 0); + + ztest_object_lock(zd, object, RL_WRITER); + + VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_object_unlock(zd, object); + return (ENOSPC); + } + + if (doi.doi_type == DMU_OT_ZAP_OTHER) { + VERIFY3U(0, ==, zap_destroy(os, object, tx)); + } else { + VERIFY3U(0, ==, dmu_object_free(os, object, tx)); + } + + VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); + + (void) ztest_log_remove(zd, 
tx, lr, object);
+
+	dmu_tx_commit(tx);
+
+	ztest_object_unlock(zd, object);
+
+	return (0);
+}
+/*
+ * Apply a TX_WRITE record.  The payload immediately follows the lr header
+ * and is written to object lr_foid at lr_offset.  If the payload starts
+ * with a block tag, the tag is regenerated from the bonus buffer's gen
+ * and the assigned txg before the data is written.
+ * Returns 0, or ENOSPC when ztest_tx_assign() yields a txg of 0.
+ */
+static int
+ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	void *data = lr + 1;			/* data follows lr */
+	uint64_t offset, length;
+	ztest_block_tag_t *bt = data;
+	ztest_block_tag_t *bbt;
+	uint64_t gen, txg, lrtxg, crtxg;
+	dmu_object_info_t doi;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	arc_buf_t *abuf = NULL;
+	rl_t *rl;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	offset = lr->lr_offset;
+	length = lr->lr_length;
+
+	/* If it's a dmu_sync() block, write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+		if (length < blocksize) {
+			offset -= offset % blocksize;
+			length = blocksize;
+		}
+	}
+
+	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+		byteswap_uint64_array(bt, sizeof (*bt));
+
+	if (bt->bt_magic != BT_MAGIC)
+		bt = NULL;	/* payload does not start with a block tag */
+
+	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+	dmu_object_info_from_db(db, &doi);
+
+	bbt = ztest_bt_bonus(db);
+	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+	gen = bbt->bt_gen;
+	crtxg = bbt->bt_crtxg;
+	lrtxg = lr->lr_common.lrc_txg;
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+	/* One time in eight, write a whole aligned block via a loaned buffer */
+	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+	    P2PHASE(offset, length) == 0)
+		abuf = dmu_request_arcbuf(db, length);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		if (abuf != NULL)
+			dmu_return_arcbuf(abuf);
+		dmu_buf_rele(db, FTAG);
+		ztest_range_unlock(rl);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	if (bt != NULL) {
+		/*
+		 * Usually, verify the old data before writing new data --
+		 * but not always, because we also want to verify correct
+		 * behavior when the data was not recently read into cache.
+		 */
+		ASSERT(offset % doi.doi_data_block_size == 0);
+		if (ztest_random(4) != 0) {
+			int prefetch = ztest_random(2) ?
+			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+			ztest_block_tag_t rbt;
+
+			VERIFY(dmu_read(os, lr->lr_foid, offset,
+			    sizeof (rbt), &rbt, prefetch) == 0);
+			if (rbt.bt_magic == BT_MAGIC) {
+				ztest_bt_verify(&rbt, os, lr->lr_foid,
+				    offset, gen, txg, crtxg);
+			}
+		}
+
+		/*
+		 * Writes can appear to be newer than the bonus buffer because
+		 * the ztest_get_data() callback does a dmu_read() of the
+		 * open-context data, which may be different than the data
+		 * as it was when the write was generated.
+		 */
+		if (zd->zd_zilog->zl_replay) {
+			ztest_bt_verify(bt, os, lr->lr_foid, offset,
+			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+			    bt->bt_crtxg);
+		}
+
+		/*
+		 * Set the bt's gen/txg to the bonus buffer's gen/txg
+		 * so that all of the usual ASSERTs will work.
+		 */
+		ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+	}
+
+	if (abuf == NULL) {
+		dmu_write(os, lr->lr_foid, offset, length, data, tx);
+	} else {
+		bcopy(data, abuf->b_data, length);
+		dmu_assign_arcbuf(db, offset, abuf, tx);
+	}
+
+	(void) ztest_log_write(zd, tx, lr);
+
+	dmu_buf_rele(db, FTAG);
+
+	dmu_tx_commit(tx);
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+/*
+ * Apply a TX_TRUNCATE record: free lr_length bytes at lr_offset of object
+ * lr_foid while holding the object lock and a writer range lock.
+ * Returns 0, or ENOSPC when ztest_tx_assign() yields a txg of 0.
+ */
+static int
+ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	rl_t *rl;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+	    RL_WRITER);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		ztest_range_unlock(rl);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+	    lr->lr_length, tx) == 0);
+
+	(void) ztest_log_truncate(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+/*
+ * Apply a TX_SETATTR record: resize the bonus buffer of object lr_foid to
+ * lr_size and stamp it with a block tag whose generation is lr_mode.
+ * When the log is not being replayed, lr_size and lr_mode are chosen here.
+ * Returns 0, or ENOSPC when ztest_tx_assign() yields a txg of 0.
+ */
+static int
+ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	ztest_block_tag_t *bbt;
+	uint64_t txg, lrtxg, crtxg;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		dmu_buf_rele(db, FTAG);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	bbt = ztest_bt_bonus(db);
+	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+	crtxg = bbt->bt_crtxg;
+	lrtxg = lr->lr_common.lrc_txg;
+
+	if (zd->zd_zilog->zl_replay) {
+		ASSERT(lr->lr_size != 0);
+		ASSERT(lr->lr_mode != 0);
+		ASSERT(lrtxg != 0);
+	} else {
+		/*
+		 * Randomly change the size and increment the generation.
+		 * The size is a whole number of block tags, at least one.
+		 */
+		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+		    sizeof (*bbt);
+		lr->lr_mode = bbt->bt_gen + 1;
+		ASSERT(lrtxg == 0);
+	}
+
+	/*
+	 * Verify that the current bonus buffer is not newer than our txg.
+	 */
+	ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+	    MAX(txg, lrtxg), crtxg);
+
+	dmu_buf_will_dirty(db, tx);
+
+	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+	ASSERT3U(lr->lr_size, <=, db->db_size);
+	VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+	bbt = ztest_bt_bonus(db);	/* re-fetch after the resize */
+
+	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+
+	dmu_buf_rele(db, FTAG);
+
+	(void) ztest_log_setattr(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+/*
+ * Replay handlers, one slot per transaction type as labelled on each
+ * entry.  NULL slots have no ztest handler.
+ */
+zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+	NULL,			/* 0 no such transaction type */
+	ztest_replay_create,	/* TX_CREATE */
+	NULL,			/* TX_MKDIR */
+	NULL,			/* TX_MKXATTR */
+	NULL,			/* TX_SYMLINK */
+	ztest_replay_remove,	/* TX_REMOVE */
+	NULL,			/* TX_RMDIR */
+	NULL,			/* TX_LINK */
+	NULL,			/* TX_RENAME */
+	ztest_replay_write,	/* TX_WRITE */
+	ztest_replay_truncate,	/* TX_TRUNCATE */
+	ztest_replay_setattr,	/* TX_SETATTR */
+	NULL,			/* TX_ACL */
+	NULL,			/* TX_CREATE_ACL */
+	NULL,			/* TX_CREATE_ATTR */
+	NULL,			/* TX_CREATE_ACL_ATTR */
+	NULL,			/* TX_MKDIR_ACL */
+	NULL,			/* TX_MKDIR_ATTR */
+	NULL,			/* TX_MKDIR_ACL_ATTR */
+	NULL,			/* TX_WRITE2 */
+};
+
+/*
+ * ZIL get_data callbacks
+ */
+/*
+ * Finish a ztest_get_data() request: release the dbuf hold (if any), drop
+ * the range and object locks, call zil_add_block() when error is 0 and a
+ * block pointer was recorded, and free the zgd.
+ */
+static void
+ztest_get_done(zgd_t *zgd, int error)
+{
+	ztest_ds_t *zd = zgd->zgd_private;
+	uint64_t object = zgd->zgd_rl->rl_object;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	ztest_range_unlock(zgd->zgd_rl);
+	ztest_object_unlock(zd, object);
+
+	if (error == 0 && zgd->zgd_bp)
+		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
+	umem_free(zgd, sizeof (*zgd));
+}
+/*
+ * Supply the data for a logged write.  With a non-NULL buf the bytes are
+ * read straight into buf; otherwise the block containing lr_offset is held
+ * and passed to dmu_sync() with ztest_get_done() as its completion routine.
+ * Returns ENOENT when the object's creation txg is 0 or is later than the
+ * record's txg, otherwise 0 or the error from the DMU call that failed.
+ */
+static int
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+	ztest_ds_t *zd = arg;
+	objset_t *os = zd->zd_os;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	blkptr_t *bp = &lr->lr_blkptr;
+	uint64_t txg = lr->lr_common.lrc_txg;
+	uint64_t crtxg;
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error;
+
+	ztest_object_lock(zd, object, RL_READER);
+	error = dmu_bonus_hold(os, object, FTAG, &db);
+	if (error) {
+		ztest_object_unlock(zd, object);
+		return (error);
+	}
+
+	crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+	if (crtxg == 0 || crtxg > txg) {
+		dmu_buf_rele(db, FTAG);
+		ztest_object_unlock(zd, object);
+		return (ENOENT);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	dmu_buf_rele(db, FTAG);
+	db = NULL;
+
+	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+	zgd->zgd_zilog = zd->zd_zilog;
+	zgd->zgd_private = zd;
+
+	if (buf != NULL) {	/* immediate write */
+		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+		    RL_READER);
+
+		error = dmu_read(os, object, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+		ASSERT(error == 0);
+	} else {
+		size = doi.doi_data_block_size;
+		if (ISP2(size)) {
+			offset = P2ALIGN(offset, size);
+		} else {
+			ASSERT(offset < size);
+			offset = 0;
+		}
+
+		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+		    RL_READER);
+
+		error = dmu_buf_hold(os, object, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    ztest_get_done, zgd);
+
+			/* on success ztest_get_done() was handed to dmu_sync() */
+			if (error == 0)
+				return (0);
+		}
+	}
+
+	ztest_get_done(zgd, error);
+
+	return (error);
+}
+/*
+ * Allocate a zeroed log record of lrsize bytes.  When name is non-NULL a
+ * copy of it, including the terminating NUL, is stored right after the
+ * record.
+ */
+static void *
+ztest_lr_alloc(size_t lrsize, char *name)
+{
+	char *lr;
+	size_t namesize = name ? strlen(name) + 1 : 0;
+
+	lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+	if (name)
+		bcopy(name, lr + lrsize, namesize);
+
+	return (lr);
+}
+/*
+ * Free a record from ztest_lr_alloc().  The size handed to umem_free() is
+ * recomputed from lrsize and name, so both must match the allocation.
+ */
+void
+ztest_lr_free(void *lr, size_t lrsize, char *name)
+{
+	size_t namesize = name ? strlen(name) + 1 : 0;
+
+	umem_free(lr, lrsize + namesize);
+}
+
+/*
+ * Lookup a bunch of objects.  Returns the number of objects not found.
+ */ +static int +ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + for (int i = 0; i < count; i++, od++) { + od->od_object = 0; + error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, + sizeof (uint64_t), 1, &od->od_object); + if (error) { + ASSERT(error == ENOENT); + ASSERT(od->od_object == 0); + missing++; + } else { + dmu_buf_t *db; + ztest_block_tag_t *bbt; + dmu_object_info_t doi; + + ASSERT(od->od_object != 0); + ASSERT(missing == 0); /* there should be no gaps */ + + ztest_object_lock(zd, od->od_object, RL_READER); + VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, + od->od_object, FTAG, &db)); + dmu_object_info_from_db(db, &doi); + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + od->od_type = doi.doi_type; + od->od_blocksize = doi.doi_data_block_size; + od->od_gen = bbt->bt_gen; + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, od->od_object); + } + } + + return (missing); +} + +static int +ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + for (int i = 0; i < count; i++, od++) { + if (missing) { + od->od_object = 0; + missing++; + continue; + } + + lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ + lr->lrz_type = od->od_crtype; + lr->lrz_blocksize = od->od_crblocksize; + lr->lrz_ibshift = ztest_random_ibshift(); + lr->lrz_bonustype = DMU_OT_UINT64_OTHER; + lr->lrz_bonuslen = dmu_bonus_max(); + lr->lr_gen = od->od_crgen; + lr->lr_crtime[0] = time(NULL); + + if (ztest_replay_create(zd, lr, B_FALSE) != 0) { + ASSERT(missing == 0); + od->od_object = 0; + missing++; + } else { + od->od_object = lr->lr_foid; + od->od_type = od->od_crtype; + od->od_blocksize = od->od_crblocksize; + od->od_gen = od->od_crgen; + ASSERT(od->od_object != 0); + } + + ztest_lr_free(lr, sizeof (*lr), 
od->od_name); + } + + return (missing); +} + +static int +ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + od += count - 1; + + for (int i = count - 1; i >= 0; i--, od--) { + if (missing) { + missing++; + continue; + } + + if (od->od_object == 0) + continue; + + lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + + if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { + ASSERT3U(error, ==, ENOSPC); + missing++; + } else { + od->od_object = 0; + } + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, + void *data) +{ + lr_write_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + bcopy(data, lr + 1, size); + + error = ztest_replay_write(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr) + size, NULL); + + return (error); +} + +static int +ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) +{ + lr_truncate_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + + error = ztest_replay_truncate(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static int +ztest_setattr(ztest_ds_t *zd, uint64_t object) +{ + lr_setattr_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_size = 0; + lr->lr_mode = 0; + + error = ztest_replay_setattr(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static void +ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t 
*rl; + + txg_wait_synced(dmu_objset_pool(os), 0); + + ztest_object_lock(zd, object, RL_READER); + rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, object, offset, size); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + + if (txg != 0) { + dmu_prealloc(os, object, offset, size, tx); + dmu_tx_commit(tx); + txg_wait_synced(dmu_objset_pool(os), txg); + } else { + (void) dmu_free_long_range(os, object, offset, size); + } + + ztest_range_unlock(rl); + ztest_object_unlock(zd, object); +} + +static void +ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) +{ + ztest_block_tag_t wbt; + dmu_object_info_t doi; + enum ztest_io_type io_type; + uint64_t blocksize; + void *data; + + VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); + blocksize = doi.doi_data_block_size; + data = umem_alloc(blocksize, UMEM_NOFAIL); + + /* + * Pick an i/o type at random, biased toward writing block tags. + */ + io_type = ztest_random(ZTEST_IO_TYPES); + if (ztest_random(2) == 0) + io_type = ZTEST_IO_WRITE_TAG; + + switch (io_type) { + + case ZTEST_IO_WRITE_TAG: + ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0); + (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); + break; + + case ZTEST_IO_WRITE_PATTERN: + (void) memset(data, 'a' + (object + offset) % 5, blocksize); + if (ztest_random(2) == 0) { + /* + * Induce fletcher2 collisions to ensure that + * zio_ddt_collision() detects and resolves them + * when using fletcher2-verify for deduplication. 
+ */ + ((uint64_t *)data)[0] ^= 1ULL << 63; + ((uint64_t *)data)[4] ^= 1ULL << 63; + } + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_WRITE_ZEROES: + bzero(data, blocksize); + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_TRUNCATE: + (void) ztest_truncate(zd, object, offset, blocksize); + break; + + case ZTEST_IO_SETATTR: + (void) ztest_setattr(zd, object); + break; + } + + umem_free(data, blocksize); +} + +/* + * Initialize an object description template. + */ +static void +ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, + dmu_object_type_t type, uint64_t blocksize, uint64_t gen) +{ + od->od_dir = ZTEST_DIROBJ; + od->od_object = 0; + + od->od_crtype = type; + od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); + od->od_crgen = gen; + + od->od_type = DMU_OT_NONE; + od->od_blocksize = 0; + od->od_gen = 0; + + (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", + tag, (int64_t)id, index); +} + +/* + * Lookup or create the objects for a test using the od template. + * If the objects do not all exist, or if 'remove' is specified, + * remove any existing objects and create new ones. Otherwise, + * use the existing objects. + */ +static int +ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) +{ + int count = size / sizeof (*od); + int rv = 0; + + VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0); + if ((ztest_lookup(zd, od, count) != 0 || remove) && + (ztest_remove(zd, od, count) != 0 || + ztest_create(zd, od, count) != 0)) + rv = -1; + zd->zd_od = od; + VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0); + + return (rv); +} + +/* ARGSUSED */ +void +ztest_zil_commit(ztest_ds_t *zd, uint64_t id) +{ + zilog_t *zilog = zd->zd_zilog; + + zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); + + /* + * Remember the committed values in zd, which is in parent/child + * shared memory. 
If we die, the next iteration of ztest_run() + * will verify that the log really does contain this record. + */ + mutex_enter(&zilog->zl_lock); + ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq); + zd->zd_seq = zilog->zl_commit_lr_seq; + mutex_exit(&zilog->zl_lock); +} + +/* + * Verify that we can't destroy an active pool, create an existing pool, + * or create a pool with a bad vdev spec. + */ +/* ARGSUSED */ +void +ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa; + nvlist_t *nvroot; + + /* + * Attempt to create using a bad file. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create using a bad mirror. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create an existing pool. It shouldn't matter + * what's in the nvroot; we should fail with EEXIST. + */ + (void) rw_rdlock(&zs->zs_name_lock); + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool)); + spa_close(spa, FTAG); + + (void) rw_unlock(&zs->zs_name_lock); +} + +static vdev_t * +vdev_lookup_by_path(vdev_t *vd, const char *path) +{ + vdev_t *mvd; + + if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) + return (vd); + + for (int c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != + NULL) + return (mvd); + + return (NULL); +} + +/* + * Find the first available hole which can be used as a top-level. 
+ */ +int +find_vdev_hole(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + int c; + + ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); + + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + + if (cvd->vdev_ishole) + break; + } + return (c); +} + +/* + * Verify that vdev_add() works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + uint64_t leaves; + uint64_t guid; + nvlist_t *nvroot; + int error; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + + /* + * If we have slogs then remove them 1/4 of the time. + */ + if (spa_has_slogs(spa) && ztest_random(4) == 0) { + /* + * Grab the guid from the head of the log class rotor. + */ + guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; + + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between removing a slog (dmu_objset_find) + * and destroying a dataset. Removing the slog will + * grab a reference on the dataset which may cause + * dmu_objset_destroy() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0); + error = spa_vdev_remove(spa, guid, B_FALSE); + VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0); + + if (error && error != EEXIST) + fatal(0, "spa_vdev_remove() = %d", error); + } else { + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * Make 1/4 of the devices be log devices. 
+ */ + nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, + ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); + else if (error != 0) + fatal(0, "spa_vdev_add() = %d", error); + } + + VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0); +} + +/* + * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *rvd = spa->spa_root_vdev; + spa_aux_vdev_t *sav; + char *aux; + uint64_t guid = 0; + int error; + + if (ztest_random(2) == 0) { + sav = &spa->spa_spares; + aux = ZPOOL_CONFIG_SPARES; + } else { + sav = &spa->spa_l2cache; + aux = ZPOOL_CONFIG_L2CACHE; + } + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + if (sav->sav_count != 0 && ztest_random(4) == 0) { + /* + * Pick a random device to remove. + */ + guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + } else { + /* + * Find an unused device we can add. + */ + zs->zs_vdev_aux = 0; + for (;;) { + char path[MAXPATHLEN]; + int c; + (void) sprintf(path, ztest_aux_template, zopt_dir, + zopt_pool, aux, zs->zs_vdev_aux); + for (c = 0; c < sav->sav_count; c++) + if (strcmp(sav->sav_vdevs[c]->vdev_path, + path) == 0) + break; + if (c == sav->sav_count && + vdev_lookup_by_path(rvd, path) == NULL) + break; + zs->zs_vdev_aux++; + } + } + + spa_config_exit(spa, SCL_VDEV, FTAG); + + if (guid == 0) { + /* + * Add a new device. + */ + nvlist_t *nvroot = make_vdev_root(NULL, aux, + (zopt_vdev_size * 5) / 4, 0, 0, 0, 0, 1); + error = spa_vdev_add(spa, nvroot); + if (error != 0) + fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); + nvlist_free(nvroot); + } else { + /* + * Remove an existing device. 
Sometimes, dirty its + * vdev state first to make sure we handle removal + * of devices that have pending state changes. + */ + if (ztest_random(2) == 0) + (void) vdev_online(spa, guid, 0, NULL); + + error = spa_vdev_remove(spa, guid, B_FALSE); + if (error != 0 && error != EBUSY) + fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); + } + + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); +} + +/* + * split a pool if it has mirror tlvdevs + */ +/* ARGSUSED */ +void +ztest_split_pool(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *tree, **child, *config, *split, **schild; + uint_t c, children, schildren = 0, lastlogid = 0; + int error = 0; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + + /* ensure we have a useable config; mirrors of raidz aren't supported */ + if (zs->zs_mirrors < 3 || zopt_raidz > 1) { + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + + /* clean up the old pool, if any */ + (void) spa_destroy("splitp"); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* generate a config from the existing config */ + mutex_enter(&spa->spa_props_lock); + VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, + &tree) == 0); + mutex_exit(&spa->spa_props_lock); + + VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0); + + schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); + for (c = 0; c < children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + nvlist_t **mchild; + uint_t mchildren; + + if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { + VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, + 0) == 0); + VERIFY(nvlist_add_string(schild[schildren], + ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(schild[schildren], + ZPOOL_CONFIG_IS_HOLE, 1) == 0); + if (lastlogid == 0) + lastlogid = schildren; + ++schildren; + continue; + } + lastlogid = 0; + 
VERIFY(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); + } + + /* OK, create a config that can be used to split */ + VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, + lastlogid != 0 ? lastlogid : schildren) == 0); + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); + + for (c = 0; c < schildren; c++) + nvlist_free(schild[c]); + free(schild); + nvlist_free(split); + + spa_config_exit(spa, SCL_VDEV, FTAG); + + (void) rw_wrlock(&zs->zs_name_lock); + error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); + (void) rw_unlock(&zs->zs_name_lock); + + nvlist_free(config); + + if (error == 0) { + (void) printf("successful split - results:\n"); + mutex_enter(&spa_namespace_lock); + show_pool_stats(spa); + show_pool_stats(spa_lookup("splitp")); + mutex_exit(&spa_namespace_lock); + ++zs->zs_splits; + --zs->zs_mirrors; + } + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + +} + +/* + * Verify that we can attach and detach devices. 
+ */
+/* ARGSUSED */
+void
+ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
+	spa_aux_vdev_t *sav = &spa->spa_spares;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *oldvd, *newvd, *pvd;
+	nvlist_t *root;
+	uint64_t leaves;	/* leaves per top-level: mirrors * raidz */
+	uint64_t leaf, top;
+	uint64_t ashift = ztest_get_ashift();
+	uint64_t oldguid, pguid;
+	size_t oldsize, newsize;
+	char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
+	int replacing;		/* nonzero: replace oldvd; zero: attach */
+	int oldvd_has_siblings = B_FALSE;
+	int newvd_is_spare = B_FALSE;
+	int oldvd_is_log;
+	int error, expected_error;
+
+	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+	leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	/*
+	 * Decide whether to do an attach or a replace.
+	 */
+	replacing = ztest_random(2);
+
+	/*
+	 * Pick a random top-level vdev.
+	 */
+	top = ztest_random_vdev_top(spa, B_TRUE);
+
+	/*
+	 * Pick a random leaf within it.
+	 */
+	leaf = ztest_random(leaves);
+
+	/*
+	 * Locate this vdev.  Descend through the mirror level (if the pool
+	 * has mirrors) using leaf / zopt_raidz, then through the raidz
+	 * level (if any) using leaf % zopt_raidz.
+	 */
+	oldvd = rvd->vdev_child[top];
+	if (zs->zs_mirrors >= 1) {
+		ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
+		ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
+		oldvd = oldvd->vdev_child[leaf / zopt_raidz];
+	}
+	if (zopt_raidz > 1) {
+		ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
+		ASSERT(oldvd->vdev_children == zopt_raidz);
+		oldvd = oldvd->vdev_child[leaf % zopt_raidz];
+	}
+
+	/*
+	 * If we're already doing an attach or replace, oldvd may be a
+	 * mirror vdev -- in which case, pick a random child.
+	 */
+	while (oldvd->vdev_children != 0) {
+		oldvd_has_siblings = B_TRUE;
+		ASSERT(oldvd->vdev_children >= 2);
+		oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
+	}
+
+	oldguid = oldvd->vdev_guid;
+	oldsize = vdev_get_min_asize(oldvd);
+	oldvd_is_log = oldvd->vdev_top->vdev_islog;
+	(void) strcpy(oldpath, oldvd->vdev_path);
+	pvd = oldvd->vdev_parent;
+	pguid = pvd->vdev_guid;
+
+	/*
+	 * If oldvd has siblings, then half of the time, detach it.
+	 * ENODEV, EBUSY and ENOTSUP from the detach are tolerated; any
+	 * other error is fatal.  This path returns without attaching.
+	 */
+	if (oldvd_has_siblings && ztest_random(2) == 0) {
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+		error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
+		if (error != 0 && error != ENODEV && error != EBUSY &&
+		    error != ENOTSUP)
+			fatal(0, "detach (%s) returned %d", oldpath, error);
+		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		return;
+	}
+
+	/*
+	 * For the new vdev, choose with equal probability between the two
+	 * standard paths (ending in either 'a' or 'b') or a random hot spare.
+	 */
+	if (sav->sav_count != 0 && ztest_random(3) == 0) {
+		newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
+		newvd_is_spare = B_TRUE;
+		(void) strcpy(newpath, newvd->vdev_path);
+	} else {
+		(void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
+		    zopt_dir, zopt_pool, top * leaves + leaf);
+		if (ztest_random(2) == 0)
+			newpath[strlen(newpath) - 1] = 'b';
+		newvd = vdev_lookup_by_path(rvd, newpath);
+	}
+
+	if (newvd) {
+		newsize = vdev_get_min_asize(newvd);
+	} else {
+		/*
+		 * Make newsize a little bigger or smaller than oldsize.
+		 * If it's smaller, the attach should fail.
+		 * If it's larger, and we're doing a replace,
+		 * we should get dynamic LUN growth when we're done.
+		 * The divisor below is 9, 10 or 11.
+		 */
+		newsize = 10 * oldsize / (9 + ztest_random(3));
+	}
+
+	/*
+	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
+	 * unless it's a replace; in that case any non-replacing parent is OK.
+	 *
+	 * If newvd is already part of the pool, it should fail with EBUSY.
+	 *
+	 * If newvd is too small, it should fail with EOVERFLOW.
+	 *
+	 * The checks are evaluated in the order written; the first one
+	 * that matches determines expected_error.
+	 */
+	if (pvd->vdev_ops != &vdev_mirror_ops &&
+	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
+	    pvd->vdev_ops == &vdev_replacing_ops ||
+	    pvd->vdev_ops == &vdev_spare_ops))
+		expected_error = ENOTSUP;
+	else if (newvd_is_spare && (!replacing || oldvd_is_log))
+		expected_error = ENOTSUP;
+	else if (newvd == oldvd)
+		expected_error = replacing ? 0 : EBUSY;
+	else if (vdev_lookup_by_path(rvd, newpath) != NULL)
+		expected_error = EBUSY;
+	else if (newsize < oldsize)
+		expected_error = EOVERFLOW;
+	else if (ashift > oldvd->vdev_top->vdev_ashift)
+		expected_error = EDOM;
+	else
+		expected_error = 0;
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	/*
+	 * Build the nvlist describing newpath.  A size is passed only when
+	 * newpath does not name an existing vdev.
+	 */
+	root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0,
+	    ashift, 0, 0, 0, 1);
+
+	error = spa_vdev_attach(spa, oldguid, root, replacing);
+
+	nvlist_free(root);
+
+	/*
+	 * If our parent was the replacing vdev, but the replace completed,
+	 * then instead of failing with ENOTSUP we may either succeed,
+	 * fail with ENODEV, or fail with EOVERFLOW.
+	 */
+	if (expected_error == ENOTSUP &&
+	    (error == 0 || error == ENODEV || error == EOVERFLOW))
+		expected_error = error;
+
+	/*
+	 * If someone grew the LUN, the replacement may be too small.
+	 */
+	if (error == EOVERFLOW || error == EBUSY)
+		expected_error = error;
+
+	/*
+	 * XXX workaround 6690467
+	 * A mismatch is not reported when EBUSY was the expected error.
+	 */
+	if (error != expected_error && expected_error != EBUSY) {
+		fatal(0, "attach (%s %llu, %s %llu, %d) "
+		    "returned %d, expected %d",
+		    oldpath, (longlong_t)oldsize, newpath,
+		    (longlong_t)newsize, replacing, error, expected_error);
+	}
+
+	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+}
+
+/*
+ * Callback function which expands the physical size of the vdev.
+ */
+vdev_t *
+grow_vdev(vdev_t *vd, void *arg)
+{
+	spa_t *spa = vd->vdev_spa;
+	size_t *newsize = arg;
+	size_t fsize;
+	int fd;
+
+	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	/*
+	 * vdev_walk_tree() stops at the first non-NULL return, so handing
+	 * back the vdev we could not open aborts the walk.
+	 */
+	if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
+		return (vd);
+
+	/* The old size is only used by the verbose message below. */
+	fsize = lseek(fd, 0, SEEK_END);
+	(void) ftruncate(fd, *newsize);
+
+	if (zopt_verbose >= 6) {
+		(void) printf("%s grew from %lu to %lu bytes\n",
+		    vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
+	}
+	(void) close(fd);
+	return (NULL);
+}
+
+/*
+ * Callback function which expands a given vdev by calling vdev_online().
+ * Returns NULL on success so that vdev_walk_tree() keeps walking, or the
+ * vdev itself when the expansion failed or the configuration changed.
+ */
+/* ARGSUSED */
+vdev_t *
+online_vdev(vdev_t *vd, void *arg)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *tvd = vd->vdev_top;
+	uint64_t guid = vd->vdev_guid;
+	/* The generation we expect to see once our own online completes. */
+	uint64_t generation = spa->spa_config_generation + 1;
+	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+	int error;
+
+	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	/* Calling vdev_online will initialize the new metaslabs */
+	spa_config_exit(spa, SCL_STATE, spa);
+	error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	/*
+	 * If vdev_online returned an error or the underlying vdev_open
+	 * failed then we abort the expand. The only way to know that
+	 * vdev_open fails is by checking the returned newstate.
+	 */
+	if (error || newstate != VDEV_STATE_HEALTHY) {
+		if (zopt_verbose >= 5) {
+			(void) printf("Unable to expand vdev, state %llu, "
+			    "error %d\n", (u_longlong_t)newstate, error);
+		}
+		return (vd);
+	}
+	ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
+
+	/*
+	 * Since we dropped the lock we need to ensure that we're
+	 * still talking to the original vdev. It's possible this
+	 * vdev may have been detached/replaced while we were
+	 * trying to online it.
+	 */
+	if (generation != spa->spa_config_generation) {
+		if (zopt_verbose >= 5) {
+			(void) printf("vdev configuration has changed, "
+			    "guid %llu, state %llu, expected gen %llu, "
+			    "got gen %llu\n",
+			    (u_longlong_t)guid,
+			    (u_longlong_t)tvd->vdev_state,
+			    (u_longlong_t)generation,
+			    (u_longlong_t)spa->spa_config_generation);
+		}
+		return (vd);
+	}
+	return (NULL);
+}
+
+/*
+ * Traverse the vdev tree calling the supplied function.
+ * We continue to walk the tree until we either have walked all
+ * children or we receive a non-NULL return from the callback.
+ * If a NULL callback is passed, then we just return back the first
+ * leaf vdev we encounter.
+ */
+vdev_t *
+vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
+{
+	if (vd->vdev_ops->vdev_op_leaf) {
+		if (func == NULL)
+			return (vd);
+		else
+			return (func(vd, arg));
+	}
+
+	for (uint_t c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+		if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
+			return (cvd);
+	}
+	return (NULL);
+}
+
+/*
+ * Verify that dynamic LUN growth works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = zs->zs_spa;
+	vdev_t *vd, *tvd;
+	metaslab_class_t *mc;
+	metaslab_group_t *mg;
+	size_t psize, newsize;
+	uint64_t top;
+	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
+
+	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	top = ztest_random_vdev_top(spa, B_TRUE);
+
+	/* Remember the pre-expansion state so it can be compared later. */
+	tvd = spa->spa_root_vdev->vdev_child[top];
+	mg = tvd->vdev_mg;
+	mc = mg->mg_class;
+	old_ms_count = tvd->vdev_ms_count;
+	old_class_space = metaslab_class_get_space(mc);
+
+	/*
+	 * Determine the size of the first leaf vdev associated with
+	 * our top-level device.
+	 */
+	vd = vdev_walk_tree(tvd, NULL, NULL);
+	ASSERT3P(vd, !=, NULL);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	psize = vd->vdev_psize;
+
+	/*
+	 * We only try to expand the vdev if it's healthy, less than 4x its
+	 * original size, and it has a valid psize.
+	 */
+	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
+	    psize == 0 || psize >= 4 * zopt_vdev_size) {
+		spa_config_exit(spa, SCL_STATE, spa);
+		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		return;
+	}
+	ASSERT(psize > 0);
+	newsize = psize + psize / 8;
+	ASSERT3U(newsize, >, psize);
+
+	if (zopt_verbose >= 6) {
+		(void) printf("Expanding LUN %s from %lu to %lu\n",
+		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
+	}
+
+	/*
+	 * Growing the vdev is a two step process:
+	 *	1). expand the physical size (i.e. relabel)
+	 *	2). online the vdev to create the new metaslabs
+	 */
+	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
+	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
+	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
+		if (zopt_verbose >= 5) {
+			(void) printf("Could not expand LUN because "
+			    "the vdev configuration changed.\n");
+		}
+		spa_config_exit(spa, SCL_STATE, spa);
+		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		return;
+	}
+
+	spa_config_exit(spa, SCL_STATE, spa);
+
+	/*
+	 * Expanding the LUN will update the config asynchronously,
+	 * thus we must wait for the async thread to complete any
+	 * pending tasks before proceeding.
+	 */
+	for (;;) {
+		boolean_t done;
+		mutex_enter(&spa->spa_async_lock);
+		done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
+		mutex_exit(&spa->spa_async_lock);
+		if (done)
+			break;
+		txg_wait_synced(spa_get_dsl(spa), 0);
+		(void) poll(NULL, 0, 100);
+	}
+
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	tvd = spa->spa_root_vdev->vdev_child[top];
+	new_ms_count = tvd->vdev_ms_count;
+	new_class_space = metaslab_class_get_space(mc);
+
+	if (tvd->vdev_mg != mg || mg->mg_class != mc) {
+		if (zopt_verbose >= 5) {
+			(void) printf("Could not verify LUN expansion due to "
+			    "intervening vdev offline or remove.\n");
+		}
+		spa_config_exit(spa, SCL_STATE, spa);
+		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		return;
+	}
+
+	/*
+	 * Make sure we were able to grow the vdev.  The new value is
+	 * printed first so the message reads the same way as the
+	 * comparison that failed (new <= old).
+	 */
+	if (new_ms_count <= old_ms_count)
+		fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
+		    (u_longlong_t)new_ms_count, (u_longlong_t)old_ms_count);
+
+	/*
+	 * Make sure we were able to grow the pool.
+	 */
+	if (new_class_space <= old_class_space)
+		fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
+		    (u_longlong_t)new_class_space,
+		    (u_longlong_t)old_class_space);
+
+	if (zopt_verbose >= 5) {
+		char oldnumbuf[6], newnumbuf[6];
+
+		nicenum(old_class_space, oldnumbuf);
+		nicenum(new_class_space, newnumbuf);
+		(void) printf("%s grew from %s to %s\n",
+		    spa->spa_name, oldnumbuf, newnumbuf);
+	}
+
+	spa_config_exit(spa, SCL_STATE, spa);
+	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+}
+
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
+/* ARGSUSED */
+static void
+ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+	/*
+	 * Create the objects common to all ztest datasets.
+	 */
+	VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
+	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+}
+
+/*
+ * Create the dataset 'dsname' with ztest_objset_create_cb() as its creation
+ * callback.  When creation succeeds and the ztest_random(100) draw is 80 or
+ * above, the dataset's sync property is additionally set to ZFS_SYNC_ALWAYS.
+ * Returns the dmu_objset_create() error, or the result of the property set.
+ */
+static int
+ztest_dataset_create(char *dsname)
+{
+	uint64_t zilset = ztest_random(100);
+	int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
+	    ztest_objset_create_cb, NULL);
+
+	if (err || zilset < 80)
+		return (err);
+
+	(void) printf("Setting dataset %s to sync always\n", dsname);
+	return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
+	    ZFS_SYNC_ALWAYS, B_FALSE));
+}
+
+/*
+ * dmu_objset_find() callback: sanity check the dataset 'name' and then
+ * destroy it.  Always returns 0; a failed destroy trips the VERIFY instead.
+ */
+/* ARGSUSED */
+static int
+ztest_objset_destroy_cb(const char *name, void *arg)
+{
+	objset_t *os;
+	dmu_object_info_t doi;
+	int error;
+
+	/*
+	 * Verify that the dataset contains a directory object.
+	 */
+	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
+	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
+	if (error != ENOENT) {
+		/* We could have crashed in the middle of destroying it */
+		ASSERT3U(error, ==, 0);
+		ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
+		ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
+	}
+	dmu_objset_rele(os, FTAG);
+
+	/*
+	 * Destroy the dataset.
+	 */
+	VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
+	return (0);
+}
+
+/*
+ * Take the snapshot "<osname>@<id>".  Returns B_FALSE (after recording the
+ * event) when the pool is out of space and B_TRUE when the snapshot was
+ * created or already existed; any other error is fatal.
+ */
+static boolean_t
+ztest_snapshot_create(char *osname, uint64_t id)
+{
+	char snapname[MAXNAMELEN];
+	int error;
+
+	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+	    (u_longlong_t)id);
+
+	/* dmu_objset_snapshot() wants only the part after the '@'. */
+	error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
+	    NULL, NULL, B_FALSE, B_FALSE, -1);
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+		return (B_FALSE);
+	}
+	if (error != 0 && error != EEXIST)
+		fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+	return (B_TRUE);
+}
+
+/*
+ * Destroy the snapshot "<osname>@<id>".  A snapshot that does not exist
+ * (ENOENT) is not an error; any other failure is fatal.  Always returns
+ * B_TRUE.
+ */
+static boolean_t
+ztest_snapshot_destroy(char *osname, uint64_t id)
+{
+	char snapname[MAXNAMELEN];
+	int error;
+
+	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+	    (u_longlong_t)id);
+
+	error = dmu_objset_destroy(snapname, B_FALSE);
+	if (error != 0 && error != ENOENT)
+		fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+	return (B_TRUE);
+}
+
+/*
+ * Create, populate and exercise the temporary dataset "<pool>/temp_<id>",
+ * first destroying any instance left over from a previous run.  The whole
+ * test runs with zs_name_lock held as a reader.
+ */
+/* ARGSUSED */
+void
+ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	ztest_ds_t zdtmp;
+	int iters;
+	int error;
+	objset_t *os, *os2;
+	char name[MAXNAMELEN];
+	zilog_t *zilog;
+
+	(void) rw_rdlock(&zs->zs_name_lock);
+
+	(void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+	    zs->zs_pool, (u_longlong_t)id);
+
+	/*
+	 * If this dataset exists from a previous run, process its replay log
+	 * half of the time.  If we don't replay it, then dmu_objset_destroy()
+	 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
+	 */
+	if (ztest_random(2) == 0 &&
+	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
+		ztest_zd_init(&zdtmp, os);
+		zil_replay(os, &zdtmp, ztest_replay_vector);
+		ztest_zd_fini(&zdtmp);
+		dmu_objset_disown(os, FTAG);
+	}
+
+	/*
+	 * There may be an old instance of the dataset we're about to
+	 * create lying around from a previous run.  If so, destroy it
+	 * and all of its snapshots.
+	 */
+	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+
+	/*
+	 * Verify that the destroyed dataset is no longer in the namespace.
+	 */
+	VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
+
+	/*
+	 * Verify that we can create a new dataset.
+	 */
+	error = ztest_dataset_create(name);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			(void) rw_unlock(&zs->zs_name_lock);
+			return;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", name, error);
+	}
+
+	VERIFY3U(0, ==,
+	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+
+	ztest_zd_init(&zdtmp, os);
+
+	/*
+	 * Open the intent log for it.
+	 */
+	zilog = zil_open(os, ztest_get_data);
+
+	/*
+	 * Put some objects in there, do a little I/O to them,
+	 * and randomly take a couple of snapshots along the way.
+	 */
+	iters = ztest_random(5);
+	for (int i = 0; i < iters; i++) {
+		ztest_dmu_object_alloc_free(&zdtmp, id);
+		if (ztest_random(iters) == 0)
+			(void) ztest_snapshot_create(name, i);
+	}
+
+	/*
+	 * Verify that we cannot create an existing dataset.
+	 */
+	VERIFY3U(EEXIST, ==,
+	    dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
+
+	/*
+	 * Verify that we can hold an objset that is also owned.
+	 */
+	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
+	dmu_objset_rele(os2, FTAG);
+
+	/*
+	 * Verify that we cannot own an objset that is already owned.
+	 */
+	VERIFY3U(EBUSY, ==,
+	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
+
+	zil_close(zilog);
+	dmu_objset_disown(os, FTAG);
+	ztest_zd_fini(&zdtmp);
+
+	(void) rw_unlock(&zs->zs_name_lock);
+}
+
+/*
+ * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
+ */
+void
+ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+
+	/* Destroy first so that the create normally starts from scratch. */
+	(void) rw_rdlock(&zs->zs_name_lock);
+	(void) ztest_snapshot_destroy(zd->zd_name, id);
+	(void) ztest_snapshot_create(zd->zd_name, id);
+	(void) rw_unlock(&zs->zs_name_lock);
+}
+
+/*
+ * Cleanup non-standard snapshots and clones.
+ *
+ * Destroys the snapshots and clones that ztest_dsl_dataset_promote_busy()
+ * creates for (osname, id).  A dataset that does not exist (ENOENT) is
+ * skipped; any other destroy error is fatal.
+ */
+void
+ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
+{
+	char snap1name[MAXNAMELEN];
+	char clone1name[MAXNAMELEN];
+	char snap2name[MAXNAMELEN];
+	char clone2name[MAXNAMELEN];
+	char snap3name[MAXNAMELEN];
+	int error;
+
+	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname,
+	    (u_longlong_t)id);
+	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname,
+	    (u_longlong_t)id);
+	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name,
+	    (u_longlong_t)id);
+	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname,
+	    (u_longlong_t)id);
+	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name,
+	    (u_longlong_t)id);
+
+	/* Each clone is destroyed before the snapshot it was made from. */
+	error = dmu_objset_destroy(clone2name, B_FALSE);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
+	error = dmu_objset_destroy(snap3name, B_FALSE);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
+	error = dmu_objset_destroy(snap2name, B_FALSE);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
+	error = dmu_objset_destroy(clone1name, B_FALSE);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
+	error = dmu_objset_destroy(snap1name, B_FALSE);
+	if (error && error != ENOENT)
+		fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
+}
+
+/*
+ * Verify dsl_dataset_promote handles EBUSY
+ *
+ * Builds osname@s1 -> clone1 -> clone1@s2, clone1@s3 -> clone2, owns
+ * clone1@s2 and then expects promoting clone2 to fail with EBUSY.
+ * Running out of space at any step ends the test early.
+ */
+void
+ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	objset_t *clone;
+	dsl_dataset_t *ds;
+	char snap1name[MAXNAMELEN];
+	char clone1name[MAXNAMELEN];
+	char snap2name[MAXNAMELEN];
+	char clone2name[MAXNAMELEN];
+	char snap3name[MAXNAMELEN];
+	char *osname = zd->zd_name;
+	int error;
+
+	(void) rw_rdlock(&zs->zs_name_lock);
+
+	ztest_dsl_dataset_cleanup(osname, id);
+
+	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname,
+	    (u_longlong_t)id);
+	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname,
+	    (u_longlong_t)id);
+	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name,
+	    (u_longlong_t)id);
+	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname,
+	    (u_longlong_t)id);
+	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name,
+	    (u_longlong_t)id);
+
+	error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
+	    NULL, NULL, B_FALSE, B_FALSE, -1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot(%s) = %d", snap1name, error);
+	}
+
+	error = dmu_objset_hold(snap1name, FTAG, &clone);
+	if (error)
+		fatal(0, "dmu_objset_hold(%s) = %d", snap1name, error);
+
+	error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
+	dmu_objset_rele(clone, FTAG);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_clone(%s) = %d", clone1name, error);
+	}
+
+	error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
+	    NULL, NULL, B_FALSE, B_FALSE, -1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot(%s) = %d", snap2name, error);
+	}
+
+	error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
+	    NULL, NULL, B_FALSE, B_FALSE, -1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot(%s) = %d", snap3name, error);
+	}
+
+	error = dmu_objset_hold(snap3name, FTAG, &clone);
+	if (error)
+		fatal(0, "dmu_objset_hold(%s) = %d", snap3name, error);
+
+	error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
+	dmu_objset_rele(clone, FTAG);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_clone(%s) = %d", clone2name, error);
+	}
+
+	/* Own snap2 for the duration of the promote attempt. */
+	error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds);
+	if (error)
+		fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error);
+	error = dsl_dataset_promote(clone2name, NULL);
+	if (error != EBUSY)
+		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
+		    error);
+	dsl_dataset_disown(ds, FTAG);
+
+out:
+	ztest_dsl_dataset_cleanup(osname, id);
+
+	(void) rw_unlock(&zs->zs_name_lock);
+}
+
+/*
+ * Verify that dmu_object_{alloc,free} work as expected.
+ */
+void
+ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t od[4];
+	int batchsize = sizeof (od) / sizeof (od[0]);
+
+	for (int b = 0; b < batchsize; b++)
+		ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
+
+	/*
+	 * Destroy the previous batch of objects, create a new batch,
+	 * and do some I/O on the new objects.
+	 */
+	if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
+		return;
+
+	while (ztest_random(4 * batchsize) != 0)
+		ztest_io(zd, od[ztest_random(batchsize)].od_object,
+		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+}
+
+/*
+ * Verify that dmu_{read,write} work as expected.
+ */
+void
+ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[2];
+	dmu_tx_t *tx;
+	int i, freeit, error;
+	uint64_t n, s, txg;
+	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
+	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+	uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 40;
+	int free_percent = 5;
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is arbitrary.  It doesn't have to be a power of two,
+	 * and it doesn't have any relation to the object blocksize.
+	 * The only requirement is that it can hold at least two bufwads.
+	 *
+	 * Normally, we write the bufwad to each of these locations.
+	 * However, free_percent of the time we instead write zeroes to
+	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
+	 * bigobj to packobj, we can verify that the DMU is correctly
+	 * tracking which parts of an object are allocated and free,
+	 * and that the contents of the allocated blocks are correct.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 */
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
+	ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
+	bigobj = od[0].od_object;
+	packobj = od[1].od_object;
+	/* Use the chunk size recorded with the object, not today's draw. */
+	chunksize = od[0].od_gen;
+	ASSERT(chunksize == od[1].od_gen);
+
+	/*
+	 * Prefetch a random chunk of the big object.
+	 * Our aim here is to get some async reads in flight
+	 * for blocks that we may free below; the DMU should
+	 * handle this race correctly.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(2 * width - 1);
+	dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * chunksize;
+	bigsize = s * chunksize;
+
+	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
+
+	/*
+	 * free_percent of the time, free a range of bigobj rather than
+	 * overwriting it.
+	 */
+	freeit = (ztest_random(100) < free_percent);
+
+	/*
+	 * Read the current contents of our objects.
+	 */
+	error = dmu_read(os, packobj, packoff, packsize, packbuf,
+	    DMU_READ_PREFETCH);
+	ASSERT3U(error, ==, 0);
+	error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
+	    DMU_READ_PREFETCH);
+	ASSERT3U(error, ==, 0);
+
+	/*
+	 * Get a tx for the mods to both packobj and bigobj.
+	 */
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, packobj, packoff, packsize);
+
+	if (freeit)
+		dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
+	else
+		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0) {
+		umem_free(packbuf, packsize);
+		umem_free(bigbuf, bigsize);
+		return;
+	}
+
+	dmu_object_set_checksum(os, bigobj,
+	    (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
+
+	dmu_object_set_compress(os, bigobj,
+	    (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
+
+	/*
+	 * For each index from n to n + s, verify that the existing bufwad
+	 * in packobj matches the bufwads at the head and tail of the
+	 * corresponding chunk in bigobj.  Then update all three bufwads
+	 * with the new values we want to write out.
+	 */
+	for (i = 0; i < s; i++) {
+		/* LINTED */
+		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+		/* LINTED */
+		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
+		/* LINTED */
+		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		if (pack->bw_txg > txg)
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    (u_longlong_t)pack->bw_txg, (u_longlong_t)txg);
+
+		/*
+		 * 'i' is an int here, so it must be widened explicitly
+		 * before being handed to a %llx conversion.
+		 */
+		if (pack->bw_data != 0 && pack->bw_index != n + i)
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    (u_longlong_t)pack->bw_index, (u_longlong_t)n,
+			    (u_longlong_t)i);
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+		if (freeit) {
+			bzero(pack, sizeof (bufwad_t));
+		} else {
+			pack->bw_index = n + i;
+			pack->bw_txg = txg;
+			pack->bw_data = 1 + ztest_random(-2ULL);
+		}
+		*bigH = *pack;
+		*bigT = *pack;
+	}
+
+	/*
+	 * We've verified all the old bufwads, and made new ones.
+	 * Now write them out.
+	 */
+	dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+
+	if (freeit) {
+		if (zopt_verbose >= 7) {
+			(void) printf("freeing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
+	} else {
+		if (zopt_verbose >= 7) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
+	}
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Sanity check the stuff we just wrote.
+	 */
+	{
+		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+		VERIFY(0 == dmu_read(os, packobj, packoff,
+		    packsize, packcheck, DMU_READ_PREFETCH));
+		VERIFY(0 == dmu_read(os, bigobj, bigoff,
+		    bigsize, bigcheck, DMU_READ_PREFETCH));
+
+		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+		umem_free(packcheck, packsize);
+		umem_free(bigcheck, bigsize);
+	}
+
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+}
+
+/*
+ * Check 's' consecutive bufwads, starting at index 'n', in the in-memory
+ * copies of packobj (packbuf) and bigobj (bigbuf, 'bigsize' bytes made of
+ * 'chunksize'-byte chunks), then stamp all three copies of each bufwad
+ * with new contents for 'txg'.  Any inconsistency is fatal.  This is the
+ * same loop ztest_dmu_read_write() runs, minus its free (zeroing) case.
+ */
+void
+compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
+    uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
+{
+	uint64_t i;
+	bufwad_t *pack;
+	bufwad_t *bigH;
+	bufwad_t *bigT;
+
+	/*
+	 * For each index from n to n + s, verify that the existing bufwad
+	 * in packobj matches the bufwads at the head and tail of the
+	 * corresponding chunk in bigobj.  Then update all three bufwads
+	 * with the new values we want to write out.
+	 */
+	for (i = 0; i < s; i++) {
+		/* LINTED */
+		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+		/* LINTED */
+		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
+		/* LINTED */
+		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		if (pack->bw_txg > txg)
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    (u_longlong_t)pack->bw_txg, (u_longlong_t)txg);
+
+		if (pack->bw_data != 0 && pack->bw_index != n + i)
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    (u_longlong_t)pack->bw_index, (u_longlong_t)n,
+			    (u_longlong_t)i);
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+		pack->bw_index = n + i;
+		pack->bw_txg = txg;
+		pack->bw_data = 1 + ztest_random(-2ULL);
+
+		*bigH = *pack;
+		*bigT = *pack;
+	}
+}
+
+/*
+ * Variant of ztest_dmu_read_write() in which bigobj is written with
+ * dmu_assign_arcbuf(), using buffers from dmu_request_arcbuf(), instead
+ * of dmu_write().
+ */
+void
+ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t od[2];
+	dmu_tx_t *tx;
+	uint64_t i;
+	int error;
+	uint64_t n, s, txg;
+	bufwad_t *packbuf, *bigbuf;
+	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+	uint64_t blocksize = ztest_random_blocksize();
+	uint64_t chunksize = blocksize;
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 9;
+	dmu_buf_t *bonus_db;
+	arc_buf_t **bigbuf_arcbufs;
+	dmu_object_info_t doi;
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is set equal to bigobj block size so that
+	 * dmu_assign_arcbuf() can be tested for object updates.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 */
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+	ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
+	bigobj = od[0].od_object;
+	packobj = od[1].od_object;
+	blocksize = od[0].od_blocksize;
+	chunksize = blocksize;
+	ASSERT(chunksize == od[1].od_gen);
+
+	VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
+	VERIFY(ISP2(doi.doi_data_block_size));
+	VERIFY(chunksize == doi.doi_data_block_size);
+	VERIFY(chunksize >= 2 * sizeof (bufwad_t));
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * chunksize;
+	bigsize = s * chunksize;
+
+	packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
+
+	/* Iteration 5 uses two half-size buffers per chunk, hence 2 * s. */
+	bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
+
+	/*
+	 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
+	 * Iteration 1 test zcopy to already referenced dbufs.
+	 * Iteration 2 test zcopy to dirty dbuf in the same txg.
+	 * Iteration 3 test zcopy to dbuf dirty in previous txg.
+	 * Iteration 4 test zcopy when dbuf is no longer dirty.
+	 * Iteration 5 test zcopy when it can't be done.
+	 * Iteration 6 one more zcopy write.
+	 */
+	for (i = 0; i < 7; i++) {
+		uint64_t j;
+		uint64_t off;
+
+		/*
+		 * In iteration 5 (i == 5) use arcbufs
+		 * that don't match bigobj blksz to test
+		 * dmu_assign_arcbuf() when it can't directly
+		 * assign an arcbuf to a dbuf.
+		 */
+		for (j = 0; j < s; j++) {
+			if (i != 5) {
+				bigbuf_arcbufs[j] =
+				    dmu_request_arcbuf(bonus_db, chunksize);
+			} else {
+				bigbuf_arcbufs[2 * j] =
+				    dmu_request_arcbuf(bonus_db, chunksize / 2);
+				bigbuf_arcbufs[2 * j + 1] =
+				    dmu_request_arcbuf(bonus_db, chunksize / 2);
+			}
+		}
+
+		/*
+		 * Get a tx for the mods to both packobj and bigobj.
+		 */
+		tx = dmu_tx_create(os);
+
+		dmu_tx_hold_write(tx, packobj, packoff, packsize);
+		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0) {
+			/* Give back everything acquired so far and bail. */
+			umem_free(packbuf, packsize);
+			umem_free(bigbuf, bigsize);
+			for (j = 0; j < s; j++) {
+				if (i != 5) {
+					dmu_return_arcbuf(bigbuf_arcbufs[j]);
+				} else {
+					dmu_return_arcbuf(
+					    bigbuf_arcbufs[2 * j]);
+					dmu_return_arcbuf(
+					    bigbuf_arcbufs[2 * j + 1]);
+				}
+			}
+			umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+			dmu_buf_rele(bonus_db, FTAG);
+			return;
+		}
+
+		/*
+		 * 50% of the time don't read objects in the 1st iteration to
+		 * test dmu_assign_arcbuf() for the case when there're no
+		 * existing dbufs for the specified offsets.
+		 */
+		if (i != 0 || ztest_random(2) != 0) {
+			error = dmu_read(os, packobj, packoff,
+			    packsize, packbuf, DMU_READ_PREFETCH);
+			ASSERT3U(error, ==, 0);
+			error = dmu_read(os, bigobj, bigoff, bigsize,
+			    bigbuf, DMU_READ_PREFETCH);
+			ASSERT3U(error, ==, 0);
+		}
+		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
+		    n, chunksize, txg);
+
+		/*
+		 * We've verified all the old bufwads, and made new ones.
+		 * Now write them out.
+		 */
+		dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+		if (zopt_verbose >= 7) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
+			dmu_buf_t *dbt;
+			if (i != 5) {
+				bcopy((caddr_t)bigbuf + (off - bigoff),
+				    bigbuf_arcbufs[j]->b_data, chunksize);
+			} else {
+				bcopy((caddr_t)bigbuf + (off - bigoff),
+				    bigbuf_arcbufs[2 * j]->b_data,
+				    chunksize / 2);
+				bcopy((caddr_t)bigbuf + (off - bigoff) +
+				    chunksize / 2,
+				    bigbuf_arcbufs[2 * j + 1]->b_data,
+				    chunksize / 2);
+			}
+
+			/* Iteration 1: hold the dbuf across the assign. */
+			if (i == 1) {
+				VERIFY(dmu_buf_hold(os, bigobj, off,
+				    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
+			}
+			if (i != 5) {
+				dmu_assign_arcbuf(bonus_db, off,
+				    bigbuf_arcbufs[j], tx);
+			} else {
+				dmu_assign_arcbuf(bonus_db, off,
+				    bigbuf_arcbufs[2 * j], tx);
+				dmu_assign_arcbuf(bonus_db,
+				    off + chunksize / 2,
+				    bigbuf_arcbufs[2 * j + 1], tx);
+			}
+			if (i == 1) {
+				dmu_buf_rele(dbt, FTAG);
+			}
+		}
+		dmu_tx_commit(tx);
+
+		/*
+		 * Sanity check the stuff we just wrote.
+		 */
+		{
+			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+			VERIFY(0 == dmu_read(os, packobj, packoff,
+			    packsize, packcheck, DMU_READ_PREFETCH));
+			VERIFY(0 == dmu_read(os, bigobj, bigoff,
+			    bigsize, bigcheck, DMU_READ_PREFETCH));
+
+			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+			ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+			umem_free(packcheck, packsize);
+			umem_free(bigcheck, bigsize);
+		}
+		if (i == 2) {
+			txg_wait_open(dmu_objset_pool(os), 0);
+		} else if (i == 3) {
+			txg_wait_synced(dmu_objset_pool(os), 0);
+		}
+	}
+
+	dmu_buf_rele(bonus_db, FTAG);
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+	umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+}
+
+/*
+ * Issue a random number of ztest_io() calls, all at one randomly chosen
+ * large offset, against the object registered under ID_PARALLEL rather
+ * than under the caller's id.
+ */
+/* ARGSUSED */
+void
+ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t od[1];
+	uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
+	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+
+	/*
+	 * Have multiple threads write to large offsets in an object
+	 * to verify that parallel writes to an object -- even to the
+	 * same blocks within the object -- doesn't cause any trouble.
+	 */
+	ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+		return;
+
+	while (ztest_random(10) != 0)
+		ztest_io(zd, od[0].od_object, offset);
+}
+
+/*
+ * Truncate and then preallocate 'count' blocks of an object at a random
+ * offset, then write zero-filled blocks at random block offsets inside
+ * that range, following each write with a few ztest_io() calls at the
+ * same offset.
+ */
+void
+ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t od[1];
+	uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
+	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+	uint64_t count = ztest_random(20) + 1;
+	uint64_t blocksize = ztest_random_blocksize();
+	void *data;
+
+	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+
+	if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+		return;
+
+	if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
+		return;
+
+	ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);
+
+	data = umem_zalloc(blocksize, UMEM_NOFAIL);
+
+	while (ztest_random(count) != 0) {
+		uint64_t randoff = offset + (ztest_random(count) * blocksize);
+		if (ztest_write(zd, od[0].od_object, randoff, blocksize,
+		    data) != 0)
+			break;
+		while (ztest_random(4) != 0)
+			ztest_io(zd, od[0].od_object, randoff);
+	}
+
+	umem_free(data, blocksize);
+}
+
+/*
+ * Verify that zap_{create,destroy,add,remove,update} work as expected.
+ */ +#define ZTEST_ZAP_MIN_INTS 1 +#define ZTEST_ZAP_MAX_INTS 4 +#define ZTEST_ZAP_MAX_PROPS 1000 + +void +ztest_zap(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object; + uint64_t txg, last_txg; + uint64_t value[ZTEST_ZAP_MAX_INTS]; + uint64_t zl_ints, zl_intsize, prop; + int i, ints; + dmu_tx_t *tx; + char propname[100], txgname[100]; + int error; + char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) + return; + + object = od[0].od_object; + + /* + * Generate a known hash collision, and verify that + * we can lookup and remove both entries. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + for (i = 0; i < 2; i++) { + value[i] = i; + VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), + 1, &value[i], tx)); + } + for (i = 0; i < 2; i++) { + VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], + sizeof (uint64_t), 1, &value[i], tx)); + VERIFY3U(0, ==, + zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + } + for (i = 0; i < 2; i++) { + VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); + } + dmu_tx_commit(tx); + + /* + * Generate a buch of random entries. + */ + ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); + + prop = ztest_random(ZTEST_ZAP_MAX_PROPS); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + bzero(value, sizeof (value)); + last_txg = 0; + + /* + * If these zap entries already exist, validate their contents. 
+ */ + error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); + if (error == 0) { + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + + VERIFY(zap_lookup(os, object, txgname, zl_intsize, + zl_ints, &last_txg) == 0); + + VERIFY(zap_length(os, object, propname, &zl_intsize, + &zl_ints) == 0); + + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, ints); + + VERIFY(zap_lookup(os, object, propname, zl_intsize, + zl_ints, value) == 0); + + for (i = 0; i < ints; i++) { + ASSERT3U(value[i], ==, last_txg + object + i); + } + } else { + ASSERT3U(error, ==, ENOENT); + } + + /* + * Atomically update two entries in our zap object. + * The first is named txg_%llu, and contains the txg + * in which the property was last updated. The second + * is named prop_%llu, and the nth element of its value + * should be txg + object + n. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + + if (last_txg > txg) + fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); + + for (i = 0; i < ints; i++) + value[i] = txg + object + i; + + VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), + 1, &txg, tx)); + VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), + ints, value, tx)); + + dmu_tx_commit(tx); + + /* + * Remove a random pair of entries. 
+ */ + prop = ztest_random(ZTEST_ZAP_MAX_PROPS); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + + error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); + + if (error == ENOENT) + return; + + ASSERT3U(error, ==, 0); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); + VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); + dmu_tx_commit(tx); +} + +/* + * Testcase to test the upgrading of a microzap to fatzap. + */ +void +ztest_fzap(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object, txg; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) + return; + + object = od[0].od_object; + + /* + * Add entries to this ZAP and make sure it spills over + * and gets upgraded to a fatzap. Also, since we are adding + * 2050 entries we should see ptrtbl growth and leaf-block split. 
+ */ + for (int i = 0; i < 2050; i++) { + char name[MAXNAMELEN]; + uint64_t value = i; + dmu_tx_t *tx; + int error; + + (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", + id, value); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, name); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + error = zap_add(os, object, name, sizeof (uint64_t), 1, + &value, tx); + ASSERT(error == 0 || error == EEXIST); + dmu_tx_commit(tx); + } +} + +/* ARGSUSED */ +void +ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; + dmu_tx_t *tx; + int i, namelen, error; + int micro = ztest_random(2); + char name[20], string_value[20]; + void *data; + + ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; + + object = od[0].od_object; + + /* + * Generate a random name of the form 'xxx.....' where each + * x is a random printable character and the dots are dots. + * There are 94 such characters, and the name length goes from + * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + */ + namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + + for (i = 0; i < 3; i++) + name[i] = '!' + ztest_random('~' - '!' + 1); + for (; i < namelen - 1; i++) + name[i] = '.'; + name[i] = '\0'; + + if ((namelen & 1) || micro) { + wsize = sizeof (txg); + wc = 1; + data = &txg; + } else { + wsize = 1; + wc = namelen; + data = string_value; + } + + count = -1ULL; + VERIFY(zap_count(os, object, &count) == 0); + ASSERT(count != -1ULL); + + /* + * Select an operation: length, lookup, add, update, remove. 
+ */ + i = ztest_random(5); + + if (i >= 2) { + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + bcopy(name, string_value, namelen); + } else { + tx = NULL; + txg = 0; + bzero(string_value, namelen); + } + + switch (i) { + + case 0: + error = zap_length(os, object, name, &zl_wsize, &zl_wc); + if (error == 0) { + ASSERT3U(wsize, ==, zl_wsize); + ASSERT3U(wc, ==, zl_wc); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 1: + error = zap_lookup(os, object, name, wsize, wc, data); + if (error == 0) { + if (data == string_value && + bcmp(name, data, namelen) != 0) + fatal(0, "name '%s' != val '%s' len %d", + name, data, namelen); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 2: + error = zap_add(os, object, name, wsize, wc, data, tx); + ASSERT(error == 0 || error == EEXIST); + break; + + case 3: + VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); + break; + + case 4: + error = zap_remove(os, object, name, tx); + ASSERT(error == 0 || error == ENOENT); + break; + } + + if (tx != NULL) + dmu_tx_commit(tx); +} + +/* + * Commit callback data. 
+ */ +typedef struct ztest_cb_data { + list_node_t zcd_node; + uint64_t zcd_txg; + int zcd_expected_err; + boolean_t zcd_added; + boolean_t zcd_called; + spa_t *zcd_spa; +} ztest_cb_data_t; + +/* This is the actual commit callback function */ +static void +ztest_commit_callback(void *arg, int error) +{ + ztest_cb_data_t *data = arg; + uint64_t synced_txg; + + VERIFY(data != NULL); + VERIFY3S(data->zcd_expected_err, ==, error); + VERIFY(!data->zcd_called); + + synced_txg = spa_last_synced_txg(data->zcd_spa); + if (data->zcd_txg > synced_txg) + fatal(0, "commit callback of txg %" PRIu64 " called prematurely" + ", last synced txg = %" PRIu64 "\n", data->zcd_txg, + synced_txg); + + data->zcd_called = B_TRUE; + + if (error == ECANCELED) { + ASSERT3U(data->zcd_txg, ==, 0); + ASSERT(!data->zcd_added); + + /* + * The private callback data should be destroyed here, but + * since we are going to check the zcd_called field after + * dmu_tx_abort(), we will destroy it there. + */ + return; + } + + /* Was this callback added to the global callback list? */ + if (!data->zcd_added) + goto out; + + ASSERT3U(data->zcd_txg, !=, 0); + + /* Remove our callback from the list */ + (void) mutex_lock(&zcl.zcl_callbacks_lock); + list_remove(&zcl.zcl_callbacks, data); + (void) mutex_unlock(&zcl.zcl_callbacks_lock); + +out: + umem_free(data, sizeof (ztest_cb_data_t)); +} + +/* Allocate and initialize callback data structure */ +static ztest_cb_data_t * +ztest_create_cb_data(objset_t *os, uint64_t txg) +{ + ztest_cb_data_t *cb_data; + + cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); + + cb_data->zcd_txg = txg; + cb_data->zcd_spa = dmu_objset_spa(os); + + return (cb_data); +} + +/* + * If a number of txgs equal to this threshold have been created after a commit + * callback has been registered but not called, then we assume there is an + * implementation bug. + */ +#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) + +/* + * Commit callback test. 
+ */ +void +ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + dmu_tx_t *tx; + ztest_cb_data_t *cb_data[3], *tmp_cb; + uint64_t old_txg, txg; + int i, error; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; + + tx = dmu_tx_create(os); + + cb_data[0] = ztest_create_cb_data(os, 0); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); + + dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); + + /* Every once in a while, abort the transaction on purpose */ + if (ztest_random(100) == 0) + error = -1; + + if (!error) + error = dmu_tx_assign(tx, TXG_NOWAIT); + + txg = error ? 0 : dmu_tx_get_txg(tx); + + cb_data[0]->zcd_txg = txg; + cb_data[1] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); + + if (error) { + /* + * It's not a strict requirement to call the registered + * callbacks from inside dmu_tx_abort(), but that's what + * it's supposed to happen in the current implementation + * so we will check for that. + */ + for (i = 0; i < 2; i++) { + cb_data[i]->zcd_expected_err = ECANCELED; + VERIFY(!cb_data[i]->zcd_called); + } + + dmu_tx_abort(tx); + + for (i = 0; i < 2; i++) { + VERIFY(cb_data[i]->zcd_called); + umem_free(cb_data[i], sizeof (ztest_cb_data_t)); + } + + return; + } + + cb_data[2] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); + + /* + * Read existing data to make sure there isn't a future leak. 
+ */ + VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), + &old_txg, DMU_READ_PREFETCH)); + + if (old_txg > txg) + fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, + old_txg, txg); + + dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); + + (void) mutex_lock(&zcl.zcl_callbacks_lock); + + /* + * Since commit callbacks don't have any ordering requirement and since + * it is theoretically possible for a commit callback to be called + * after an arbitrary amount of time has elapsed since its txg has been + * synced, it is difficult to reliably determine whether a commit + * callback hasn't been called due to high load or due to a flawed + * implementation. + * + * In practice, we will assume that if after a certain number of txgs a + * commit callback hasn't been called, then most likely there's an + * implementation bug.. + */ + tmp_cb = list_head(&zcl.zcl_callbacks); + if (tmp_cb != NULL && + tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) { + fatal(0, "Commit callback threshold exceeded, oldest txg: %" + PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); + } + + /* + * Let's find the place to insert our callbacks. + * + * Even though the list is ordered by txg, it is possible for the + * insertion point to not be the end because our txg may already be + * quiescing at this point and other callbacks in the open txg + * (from other objsets) may have sneaked in. 
+ */ + tmp_cb = list_tail(&zcl.zcl_callbacks); + while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) + tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); + + /* Add the 3 callbacks to the list */ + for (i = 0; i < 3; i++) { + if (tmp_cb == NULL) + list_insert_head(&zcl.zcl_callbacks, cb_data[i]); + else + list_insert_after(&zcl.zcl_callbacks, tmp_cb, + cb_data[i]); + + cb_data[i]->zcd_added = B_TRUE; + VERIFY(!cb_data[i]->zcd_called); + + tmp_cb = cb_data[i]; + } + + (void) mutex_unlock(&zcl.zcl_callbacks_lock); + + dmu_tx_commit(tx); +} + +/* ARGSUSED */ +void +ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + zfs_prop_t proplist[] = { + ZFS_PROP_CHECKSUM, + ZFS_PROP_COMPRESSION, + ZFS_PROP_COPIES, + ZFS_PROP_DEDUP + }; + ztest_shared_t *zs = ztest_shared; + + (void) rw_rdlock(&zs->zs_name_lock); + + for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) + (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], + ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); + + (void) rw_unlock(&zs->zs_name_lock); +} + +/* ARGSUSED */ +void +ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + nvlist_t *props = NULL; + + (void) rw_rdlock(&zs->zs_name_lock); + + (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO, + ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); + + VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0); + + if (zopt_verbose >= 6) + dump_nvlist(props, 4); + + nvlist_free(props); + + (void) rw_unlock(&zs->zs_name_lock); +} + +/* + * Test snapshot hold/release and deferred destroy. 
+ */ +void +ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) +{ + int error; + objset_t *os = zd->zd_os; + objset_t *origin; + char snapname[100]; + char fullname[100]; + char clonename[100]; + char tag[100]; + char osname[MAXNAMELEN]; + + (void) rw_rdlock(&ztest_shared->zs_name_lock); + + dmu_objset_name(os, osname); + + (void) snprintf(snapname, 100, "sh1_%llu", id); + (void) snprintf(fullname, 100, "%s@%s", osname, snapname); + (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id); + (void) snprintf(tag, 100, "%tag_%llu", id); + + /* + * Clean up from any previous run. + */ + (void) dmu_objset_destroy(clonename, B_FALSE); + (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE); + (void) dmu_objset_destroy(fullname, B_FALSE); + + /* + * Create snapshot, clone it, mark snap for deferred destroy, + * destroy clone, verify snap was also destroyed. + */ + error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, + FALSE, -1); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; + } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } + + error = dmu_objset_hold(fullname, FTAG, &origin); + if (error) + fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); + + error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0); + dmu_objset_rele(origin, FTAG); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_clone"); + goto out; + } + fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); + } + + error = dmu_objset_destroy(fullname, B_TRUE); + if (error) { + fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", + fullname, error); + } + + error = dmu_objset_destroy(clonename, B_FALSE); + if (error) + fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error); + + error = dmu_objset_hold(fullname, FTAG, &origin); + if (error != ENOENT) + fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); + + /* + * Create snapshot, add temporary hold, verify that we can't + * destroy a 
held snapshot, mark for deferred destroy, + * release hold, verify snapshot was destroyed. + */ + error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, + FALSE, -1); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; + } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } + + error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, + B_TRUE, -1); + if (error) + fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag); + + error = dmu_objset_destroy(fullname, B_FALSE); + if (error != EBUSY) { + fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d", + fullname, error); + } + + error = dmu_objset_destroy(fullname, B_TRUE); + if (error) { + fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", + fullname, error); + } + + error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE); + if (error) + fatal(0, "dsl_dataset_user_release(%s)", fullname, tag); + + VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT); + +out: + (void) rw_unlock(&ztest_shared->zs_name_lock); +} + +/* + * Inject random faults into the on-disk data. + */ +/* ARGSUSED */ +void +ztest_fault_inject(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + int fd; + uint64_t offset; + uint64_t leaves; + uint64_t bad = 0x1990c0ffeedecade; + uint64_t top, leaf; + char path0[MAXPATHLEN]; + char pathrand[MAXPATHLEN]; + size_t fsize; + int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ + int iters = 1000; + int maxfaults; + int mirror_save; + vdev_t *vd0 = NULL; + uint64_t guid0 = 0; + boolean_t islog = B_FALSE; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + maxfaults = MAXFAULTS(); + leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; + mirror_save = zs->zs_mirrors; + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + + ASSERT(leaves >= 1); + + /* + * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 
+ */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + if (ztest_random(2) == 0) { + /* + * Inject errors on a normal data device or slog device. + */ + top = ztest_random_vdev_top(spa, B_TRUE); + leaf = ztest_random(leaves) + zs->zs_splits; + + /* + * Generate paths to the first leaf in this top-level vdev, + * and to the random leaf we selected. We'll induce transient + * write failures and random online/offline activity on leaf 0, + * and we'll write random garbage to the randomly chosen leaf. + */ + (void) snprintf(path0, sizeof (path0), ztest_dev_template, + zopt_dir, zopt_pool, top * leaves + zs->zs_splits); + (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, + zopt_dir, zopt_pool, top * leaves + leaf); + + vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); + if (vd0 != NULL && vd0->vdev_top->vdev_islog) + islog = B_TRUE; + + if (vd0 != NULL && maxfaults != 1) { + /* + * Make vd0 explicitly claim to be unreadable, + * or unwriteable, or reach behind its back + * and close the underlying fd. We can do this if + * maxfaults == 0 because we'll fail and reexecute, + * and we can do it if maxfaults >= 2 because we'll + * have enough redundancy. If maxfaults == 1, the + * combination of this with injection of random data + * corruption below exceeds the pool's fault tolerance. + */ + vdev_file_t *vf = vd0->vdev_tsd; + + if (vf != NULL && ztest_random(3) == 0) { + (void) close(vf->vf_vnode->v_fd); + vf->vf_vnode->v_fd = -1; + } else if (ztest_random(2) == 0) { + vd0->vdev_cant_read = B_TRUE; + } else { + vd0->vdev_cant_write = B_TRUE; + } + guid0 = vd0->vdev_guid; + } + } else { + /* + * Inject errors on an l2cache device. 
+ */ + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + if (sav->sav_count == 0) { + spa_config_exit(spa, SCL_STATE, FTAG); + return; + } + vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; + guid0 = vd0->vdev_guid; + (void) strcpy(path0, vd0->vdev_path); + (void) strcpy(pathrand, vd0->vdev_path); + + leaf = 0; + leaves = 1; + maxfaults = INT_MAX; /* no limit on cache devices */ + } + + spa_config_exit(spa, SCL_STATE, FTAG); + + /* + * If we can tolerate two or more faults, or we're dealing + * with a slog, randomly online/offline vd0. + */ + if ((maxfaults >= 2 || islog) && guid0 != 0) { + if (ztest_random(10) < 6) { + int flags = (ztest_random(2) == 0 ? + ZFS_OFFLINE_TEMPORARY : 0); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between offlining a slog and + * destroying a dataset. Offlining the slog will + * grab a reference on the dataset which may cause + * dmu_objset_destroy() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + if (islog) + (void) rw_wrlock(&ztest_shared->zs_name_lock); + + VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); + + if (islog) + (void) rw_unlock(&ztest_shared->zs_name_lock); + } else { + (void) vdev_online(spa, guid0, 0, NULL); + } + } + + if (maxfaults == 0) + return; + + /* + * We have at least single-fault tolerance, so inject data corruption. 
+ */ + fd = open(pathrand, O_RDWR); + + if (fd == -1) /* we hit a gap in the device namespace */ + return; + + fsize = lseek(fd, 0, SEEK_END); + + while (--iters != 0) { + offset = ztest_random(fsize / (leaves << bshift)) * + (leaves << bshift) + (leaf << bshift) + + (ztest_random(1ULL << (bshift - 1)) & -8ULL); + + if (offset >= fsize) + continue; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + if (mirror_save != zs->zs_mirrors) { + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + (void) close(fd); + return; + } + + if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) + fatal(1, "can't inject bad word at 0x%llx in %s", + offset, pathrand); + + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + + if (zopt_verbose >= 7) + (void) printf("injected bad word into %s," + " offset 0x%llx\n", pathrand, (u_longlong_t)offset); + } + + (void) close(fd); +} + +/* + * Verify that DDT repair works as expected. + */ +void +ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object, blocksize, txg, pattern, psize; + enum zio_checksum checksum = spa_dedup_checksum(spa); + dmu_buf_t *db; + dmu_tx_t *tx; + void *buf; + blkptr_t blk; + int copies = 2 * ZIO_DEDUPDITTO_MIN; + + blocksize = ztest_random_blocksize(); + blocksize = MIN(blocksize, 2048); /* because we write so many */ + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; + + /* + * Take the name lock as writer to prevent anyone else from changing + * the pool and dataset properies we need to maintain during this test. 
+ */ + (void) rw_wrlock(&zs->zs_name_lock); + + if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, + B_FALSE) != 0 || + ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, + B_FALSE) != 0) { + (void) rw_unlock(&zs->zs_name_lock); + return; + } + + object = od[0].od_object; + blocksize = od[0].od_blocksize; + pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os); + + ASSERT(object != 0); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, object, 0, copies * blocksize); + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + (void) rw_unlock(&zs->zs_name_lock); + return; + } + + /* + * Write all the copies of our block. + */ + for (int i = 0; i < copies; i++) { + uint64_t offset = i * blocksize; + VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db, + DMU_READ_NO_PREFETCH) == 0); + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == blocksize); + ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || + ztest_pattern_match(db->db_data, db->db_size, 0ULL)); + dmu_buf_will_fill(db, tx); + ztest_pattern_set(db->db_data, db->db_size, pattern); + dmu_buf_rele(db, FTAG); + } + + dmu_tx_commit(tx); + txg_wait_synced(spa_get_dsl(spa), txg); + + /* + * Find out what block we got. + */ + VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db, + DMU_READ_NO_PREFETCH) == 0); + blk = *((dmu_buf_impl_t *)db)->db_blkptr; + dmu_buf_rele(db, FTAG); + + /* + * Damage the block. Dedup-ditto will save us when we read it later. + */ + psize = BP_GET_PSIZE(&blk); + buf = zio_buf_alloc(psize); + ztest_pattern_set(buf, psize, ~pattern); + + (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, + buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); + + zio_buf_free(buf, psize); + + (void) rw_unlock(&zs->zs_name_lock); +} + +/* + * Scrub the pool. 
+ */ +/* ARGSUSED */ +void +ztest_scrub(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + + (void) spa_scan(spa, POOL_SCAN_SCRUB); + (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ + (void) spa_scan(spa, POOL_SCAN_SCRUB); +} + +/* + * Rename the pool to a different name and then rename it back. + */ +/* ARGSUSED */ +void +ztest_spa_rename(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + char *oldname, *newname; + spa_t *spa; + + (void) rw_wrlock(&zs->zs_name_lock); + + oldname = zs->zs_pool; + newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); + (void) strcpy(newname, oldname); + (void) strcat(newname, "_tmp"); + + /* + * Do the rename + */ + VERIFY3U(0, ==, spa_rename(oldname, newname)); + + /* + * Try to open it under the old name, which shouldn't exist + */ + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Open it under the new name and make sure it's still the same spa_t. + */ + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); + + ASSERT(spa == zs->zs_spa); + spa_close(spa, FTAG); + + /* + * Rename it back to the original + */ + VERIFY3U(0, ==, spa_rename(newname, oldname)); + + /* + * Make sure it can still be opened + */ + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); + + ASSERT(spa == zs->zs_spa); + spa_close(spa, FTAG); + + umem_free(newname, strlen(newname) + 1); + + (void) rw_unlock(&zs->zs_name_lock); +} + +/* + * Verify pool integrity by running zdb. 
+ */ +static void +ztest_run_zdb(char *pool) +{ + int status; + char zdb[MAXPATHLEN + MAXNAMELEN + 20]; + char zbuf[1024]; + char *bin; + char *ztest; + char *isa; + int isalen; + FILE *fp; + + (void) realpath(getexecname(), zdb); + + /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ + bin = strstr(zdb, "/usr/bin/"); + ztest = strstr(bin, "/ztest"); + isa = bin + 8; + isalen = ztest - isa; + isa = strdup(isa); + /* LINTED */ + (void) sprintf(bin, + "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s", + isalen, + isa, + zopt_verbose >= 3 ? "s" : "", + zopt_verbose >= 4 ? "v" : "", + spa_config_path, + pool); + free(isa); + + if (zopt_verbose >= 5) + (void) printf("Executing %s\n", strstr(zdb, "zdb ")); + + fp = popen(zdb, "r"); + + while (fgets(zbuf, sizeof (zbuf), fp) != NULL) + if (zopt_verbose >= 3) + (void) printf("%s", zbuf); + + status = pclose(fp); + + if (status == 0) + return; + + ztest_dump_core = 0; + if (WIFEXITED(status)) + fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); + else + fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); +} + +static void +ztest_walk_pool_directory(char *header) +{ + spa_t *spa = NULL; + + if (zopt_verbose >= 6) + (void) printf("%s\n", header); + + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + if (zopt_verbose >= 6) + (void) printf("\t%s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); +} + +static void +ztest_spa_import_export(char *oldname, char *newname) +{ + nvlist_t *config, *newconfig; + uint64_t pool_guid; + spa_t *spa; + + if (zopt_verbose >= 4) { + (void) printf("import/export: old = %s, new = %s\n", + oldname, newname); + } + + /* + * Clean up from previous runs. + */ + (void) spa_destroy(newname); + + /* + * Get the pool's configuration and guid. + */ + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Kick off a scrub to tickle scrub/export races. 
+ */ + if (ztest_random(2) == 0) + (void) spa_scan(spa, POOL_SCAN_SCRUB); + + pool_guid = spa_guid(spa); + spa_close(spa, FTAG); + + ztest_walk_pool_directory("pools before export"); + + /* + * Export it. + */ + VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); + + ztest_walk_pool_directory("pools after export"); + + /* + * Try to import it. + */ + newconfig = spa_tryimport(config); + ASSERT(newconfig != NULL); + nvlist_free(newconfig); + + /* + * Import it under the new name. + */ + VERIFY3U(0, ==, spa_import(newname, config, NULL, 0)); + + ztest_walk_pool_directory("pools after import"); + + /* + * Try to import it again -- should fail with EEXIST. + */ + VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); + + /* + * Try to import it under a different name -- should fail with EEXIST. + */ + VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); + + /* + * Verify that the pool is no longer visible under the old name. + */ + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Verify that we can open and close the pool using the new name. 
+ */ + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); + ASSERT(pool_guid == spa_guid(spa)); + spa_close(spa, FTAG); + + nvlist_free(config); +} + +static void +ztest_resume(spa_t *spa) +{ + if (spa_suspended(spa) && zopt_verbose >= 6) + (void) printf("resuming from suspended state\n"); + spa_vdev_state_enter(spa, SCL_NONE); + vdev_clear(spa, NULL); + (void) spa_vdev_state_exit(spa, NULL, 0); + (void) zio_resume(spa); +} + +static void * +ztest_resume_thread(void *arg) +{ + spa_t *spa = arg; + + while (!ztest_exiting) { + if (spa_suspended(spa)) + ztest_resume(spa); + (void) poll(NULL, 0, 100); + } + return (NULL); +} + +static void * +ztest_deadman_thread(void *arg) +{ + ztest_shared_t *zs = arg; + int grace = 300; + hrtime_t delta; + + delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; + + (void) poll(NULL, 0, (int)(1000 * delta)); + + fatal(0, "failed to complete within %d seconds of deadline", grace); + + return (NULL); +} + +static void +ztest_execute(ztest_info_t *zi, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets]; + hrtime_t functime = gethrtime(); + + for (int i = 0; i < zi->zi_iters; i++) + zi->zi_func(zd, id); + + functime = gethrtime() - functime; + + atomic_add_64(&zi->zi_call_count, 1); + atomic_add_64(&zi->zi_call_time, functime); + + if (zopt_verbose >= 4) { + Dl_info dli; + (void) dladdr((void *)zi->zi_func, &dli); + (void) printf("%6.2f sec in %s\n", + (double)functime / NANOSEC, dli.dli_sname); + } +} + +static void * +ztest_thread(void *arg) +{ + uint64_t id = (uintptr_t)arg; + ztest_shared_t *zs = ztest_shared; + uint64_t call_next; + hrtime_t now; + ztest_info_t *zi; + + while ((now = gethrtime()) < zs->zs_thread_stop) { + /* + * See if it's time to force a crash. + */ + if (now > zs->zs_thread_kill) + ztest_kill(zs); + + /* + * If we're getting ENOSPC with some regularity, stop. + */ + if (zs->zs_enospc_count > 10) + break; + + /* + * Pick a random function to execute. 
+ */ + zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)]; + call_next = zi->zi_call_next; + + if (now >= call_next && + atomic_cas_64(&zi->zi_call_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) + ztest_execute(zi, id); + } + + return (NULL); +} + +static void +ztest_dataset_name(char *dsname, char *pool, int d) +{ + (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d); +} + +static void +ztest_dataset_destroy(ztest_shared_t *zs, int d) +{ + char name[MAXNAMELEN]; + + ztest_dataset_name(name, zs->zs_pool, d); + + if (zopt_verbose >= 3) + (void) printf("Destroying %s to free up space\n", name); + + /* + * Cleanup any non-standard clones and snapshots. In general, + * ztest thread t operates on dataset (t % zopt_datasets), + * so there may be more than one thing to clean up. + */ + for (int t = d; t < zopt_threads; t += zopt_datasets) + ztest_dsl_dataset_cleanup(name, t); + + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, + DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); +} + +static void +ztest_dataset_dirobj_verify(ztest_ds_t *zd) +{ + uint64_t usedobjs, dirobjs, scratch; + + /* + * ZTEST_DIROBJ is the object directory for the entire dataset. + * Therefore, the number of objects in use should equal the + * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. + * If not, we have an object leak. + * + * Note that we can only check this in ztest_dataset_open(), + * when the open-context and syncing-context values agree. + * That's because zap_count() returns the open-context value, + * while dmu_objset_space() returns the rootbp fill count. 
+ */ + VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); + dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); + ASSERT3U(dirobjs + 1, ==, usedobjs); +} + +static int +ztest_dataset_open(ztest_shared_t *zs, int d) +{ + ztest_ds_t *zd = &zs->zs_zd[d]; + uint64_t committed_seq = zd->zd_seq; + objset_t *os; + zilog_t *zilog; + char name[MAXNAMELEN]; + int error; + + ztest_dataset_name(name, zs->zs_pool, d); + + (void) rw_rdlock(&zs->zs_name_lock); + + error = ztest_dataset_create(name); + if (error == ENOSPC) { + (void) rw_unlock(&zs->zs_name_lock); + ztest_record_enospc(FTAG); + return (error); + } + ASSERT(error == 0 || error == EEXIST); + + VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0); + (void) rw_unlock(&zs->zs_name_lock); + + ztest_zd_init(zd, os); + + zilog = zd->zd_zilog; + + if (zilog->zl_header->zh_claim_lr_seq != 0 && + zilog->zl_header->zh_claim_lr_seq < committed_seq) + fatal(0, "missing log records: claimed %llu < committed %llu", + zilog->zl_header->zh_claim_lr_seq, committed_seq); + + ztest_dataset_dirobj_verify(zd); + + zil_replay(os, zd, ztest_replay_vector); + + ztest_dataset_dirobj_verify(zd); + + if (zopt_verbose >= 6) + (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + zd->zd_name, + (u_longlong_t)zilog->zl_parse_blk_count, + (u_longlong_t)zilog->zl_parse_lr_count, + (u_longlong_t)zilog->zl_replaying_seq); + + zilog = zil_open(os, ztest_get_data); + + if (zilog->zl_replaying_seq != 0 && + zilog->zl_replaying_seq < committed_seq) + fatal(0, "missing log records: replayed %llu < committed %llu", + zilog->zl_replaying_seq, committed_seq); + + return (0); +} + +static void +ztest_dataset_close(ztest_shared_t *zs, int d) +{ + ztest_ds_t *zd = &zs->zs_zd[d]; + + zil_close(zd->zd_zilog); + dmu_objset_rele(zd->zd_os, zd); + + ztest_zd_fini(zd); +} + +/* + * Kick off threads to run tests on all datasets in parallel. 
+ */ +static void +ztest_run(ztest_shared_t *zs) +{ + thread_t *tid; + spa_t *spa; + thread_t resume_tid; + int error; + + ztest_exiting = B_FALSE; + + /* + * Initialize parent/child shared state. + */ + VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); + + zs->zs_thread_start = gethrtime(); + zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC; + zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); + zs->zs_thread_kill = zs->zs_thread_stop; + if (ztest_random(100) < zopt_killrate) + zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC); + + (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL); + + list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); + + /* + * Open our pool. + */ + kernel_init(FREAD | FWRITE); + VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0); + zs->zs_spa = spa; + + spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; + + /* + * We don't expect the pool to suspend unless maxfaults == 0, + * in which case ztest_fault_inject() temporarily takes away + * the only valid replica. + */ + if (MAXFAULTS() == 0) + spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; + else + spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; + + /* + * Create a thread to periodically resume suspended I/O. + */ + VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, + &resume_tid) == 0); + + /* + * Create a deadman thread to abort() if we hang. + */ + VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, + NULL) == 0); + + /* + * Verify that we can safely inquire about about any object, + * whether it's allocated or not. To make it interesting, + * we probe a 5-wide window around each power of two. + * This hits all edge cases, including zero and the max. 
+ */ + for (int t = 0; t < 64; t++) { + for (int d = -5; d <= 5; d++) { + error = dmu_object_info(spa->spa_meta_objset, + (1ULL << t) + d, NULL); + ASSERT(error == 0 || error == ENOENT || + error == EINVAL); + } + } + + /* + * If we got any ENOSPC errors on the previous run, destroy something. + */ + if (zs->zs_enospc_count != 0) { + int d = ztest_random(zopt_datasets); + ztest_dataset_destroy(zs, d); + } + zs->zs_enospc_count = 0; + + tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL); + + if (zopt_verbose >= 4) + (void) printf("starting main threads...\n"); + + /* + * Kick off all the tests that run in parallel. + */ + for (int t = 0; t < zopt_threads; t++) { + if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0) + return; + VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, + THR_BOUND, &tid[t]) == 0); + } + + /* + * Wait for all of the tests to complete. We go in reverse order + * so we don't close datasets while threads are still using them. + */ + for (int t = zopt_threads - 1; t >= 0; t--) { + VERIFY(thr_join(tid[t], NULL, NULL) == 0); + if (t < zopt_datasets) + ztest_dataset_close(zs, t); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(tid, zopt_threads * sizeof (thread_t)); + + /* Kill the resume thread */ + ztest_exiting = B_TRUE; + VERIFY(thr_join(resume_tid, NULL, NULL) == 0); + ztest_resume(spa); + + /* + * Right before closing the pool, kick off a bunch of async I/O; + * spa_close() should wait for it to complete. + */ + for (uint64_t object = 1; object < 50; object++) + dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); + + spa_close(spa, FTAG); + + /* + * Verify that we can loop over all pools. 
+ */ + mutex_enter(&spa_namespace_lock); + for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) + if (zopt_verbose > 3) + (void) printf("spa_next: found %s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * Verify that we can export the pool and reimport it under a + * different name. + */ + if (ztest_random(2) == 0) { + char name[MAXNAMELEN]; + (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool); + ztest_spa_import_export(zs->zs_pool, name); + ztest_spa_import_export(name, zs->zs_pool); + } + + kernel_fini(); + + list_destroy(&zcl.zcl_callbacks); + + (void) _mutex_destroy(&zcl.zcl_callbacks_lock); + + (void) rwlock_destroy(&zs->zs_name_lock); + (void) _mutex_destroy(&zs->zs_vdev_lock); +} + +static void +ztest_freeze(ztest_shared_t *zs) +{ + ztest_ds_t *zd = &zs->zs_zd[0]; + spa_t *spa; + int numloops = 0; + + if (zopt_verbose >= 3) + (void) printf("testing spa_freeze()...\n"); + + kernel_init(FREAD | FWRITE); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); + + /* + * Force the first log block to be transactionally allocated. + * We have to do this before we freeze the pool -- otherwise + * the log chain won't be anchored. + */ + while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { + ztest_dmu_object_alloc_free(zd, 0); + zil_commit(zd->zd_zilog, 0); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * Freeze the pool. This stops spa_sync() from doing anything, + * so that the only way to record changes from now on is the ZIL. + */ + spa_freeze(spa); + + /* + * Run tests that generate log records but don't alter the pool config + * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). + * We do a txg_wait_synced() after each iteration to force the txg + * to increase well beyond the last synced value in the uberblock. + * The ZIL should be OK with that. 
+ */ + while (ztest_random(10) != 0 && numloops++ < zopt_maxloops) { + ztest_dmu_write_parallel(zd, 0); + ztest_dmu_object_alloc_free(zd, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + } + + /* + * Commit all of the changes we just generated. + */ + zil_commit(zd->zd_zilog, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * Close our dataset and close the pool. + */ + ztest_dataset_close(zs, 0); + spa_close(spa, FTAG); + kernel_fini(); + + /* + * Open and close the pool and dataset to induce log replay. + */ + kernel_init(FREAD | FWRITE); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); + ztest_dataset_close(zs, 0); + spa_close(spa, FTAG); + kernel_fini(); +} + +void +print_time(hrtime_t t, char *timebuf) +{ + hrtime_t s = t / NANOSEC; + hrtime_t m = s / 60; + hrtime_t h = m / 60; + hrtime_t d = h / 24; + + s -= m * 60; + m -= h * 60; + h -= d * 24; + + timebuf[0] = '\0'; + + if (d) + (void) sprintf(timebuf, + "%llud%02lluh%02llum%02llus", d, h, m, s); + else if (h) + (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); + else if (m) + (void) sprintf(timebuf, "%llum%02llus", m, s); + else + (void) sprintf(timebuf, "%llus", s); +} + +static nvlist_t * +make_random_props() +{ + nvlist_t *props; + + if (ztest_random(2) == 0) + return (NULL); + + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); + + (void) printf("props:\n"); + dump_nvlist(props, 4); + + return (props); +} + +/* + * Create a storage pool with the given name and initial vdev size. + * Then test spa_freeze() functionality. + */ +static void +ztest_init(ztest_shared_t *zs) +{ + spa_t *spa; + nvlist_t *nvroot, *props; + + VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); + + kernel_init(FREAD | FWRITE); + + /* + * Create the storage pool. 
+ */ + (void) spa_destroy(zs->zs_pool); + ztest_shared->zs_vdev_next_leaf = 0; + zs->zs_splits = 0; + zs->zs_mirrors = zopt_mirrors; + nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, + 0, zopt_raidz, zs->zs_mirrors, 1); + props = make_random_props(); + VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL)); + nvlist_free(nvroot); + + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + spa_close(spa, FTAG); + + kernel_fini(); + + ztest_run_zdb(zs->zs_pool); + + ztest_freeze(zs); + + ztest_run_zdb(zs->zs_pool); + + (void) rwlock_destroy(&zs->zs_name_lock); + (void) _mutex_destroy(&zs->zs_vdev_lock); +} + +int +main(int argc, char **argv) +{ + int kills = 0; + int iters = 0; + ztest_shared_t *zs; + size_t shared_size; + ztest_info_t *zi; + char timebuf[100]; + char numbuf[6]; + spa_t *spa; + + (void) setvbuf(stdout, NULL, _IOLBF, 0); + + ztest_random_fd = open("/dev/urandom", O_RDONLY); + + process_options(argc, argv); + + /* Override location of zpool.cache */ + (void) asprintf((char **)&spa_config_path, "%s/zpool.cache", zopt_dir); + + /* + * Blow away any existing copy of zpool.cache + */ + if (zopt_init != 0) + (void) remove(spa_config_path); + + shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t); + + zs = ztest_shared = (void *)mmap(0, + P2ROUNDUP(shared_size, getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); + + if (zopt_verbose >= 1) { + (void) printf("%llu vdevs, %d datasets, %d threads," + " %llu seconds...\n", + (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads, + (u_longlong_t)zopt_time); + } + + /* + * Create and initialize our storage pool. 
+ */ + for (int i = 1; i <= zopt_init; i++) { + bzero(zs, sizeof (ztest_shared_t)); + if (zopt_verbose >= 3 && zopt_init != 1) + (void) printf("ztest_init(), pass %d\n", i); + zs->zs_pool = zopt_pool; + ztest_init(zs); + } + + zs->zs_pool = zopt_pool; + zs->zs_proc_start = gethrtime(); + zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC; + + for (int f = 0; f < ZTEST_FUNCS; f++) { + zi = &zs->zs_info[f]; + *zi = ztest_info[f]; + if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) + zi->zi_call_next = UINT64_MAX; + else + zi->zi_call_next = zs->zs_proc_start + + ztest_random(2 * zi->zi_interval[0] + 1); + } + + /* + * Run the tests in a loop. These tests include fault injection + * to verify that self-healing data works, and forced crashes + * to verify that we never lose on-disk consistency. + */ + while (gethrtime() < zs->zs_proc_stop) { + int status; + pid_t pid; + + /* + * Initialize the workload counters for each function. + */ + for (int f = 0; f < ZTEST_FUNCS; f++) { + zi = &zs->zs_info[f]; + zi->zi_call_count = 0; + zi->zi_call_time = 0; + } + + /* Set the allocation switch size */ + metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1; + + pid = fork(); + + if (pid == -1) + fatal(1, "fork failed"); + + if (pid == 0) { /* child */ + struct rlimit rl = { 1024, 1024 }; + (void) setrlimit(RLIMIT_NOFILE, &rl); + (void) enable_extended_FILE_stdio(-1, -1); + ztest_run(zs); + exit(0); + } + + while (waitpid(pid, &status, 0) != pid) + continue; + + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != 0) { + (void) fprintf(stderr, + "child exited with code %d\n", + WEXITSTATUS(status)); + exit(2); + } + } else if (WIFSIGNALED(status)) { + if (WTERMSIG(status) != SIGKILL) { + (void) fprintf(stderr, + "child died with signal %d\n", + WTERMSIG(status)); + exit(3); + } + kills++; + } else { + (void) fprintf(stderr, "something strange happened " + "to child\n"); + exit(4); + } + + iters++; + + if (zopt_verbose >= 1) { + hrtime_t now = 
gethrtime(); + + now = MIN(now, zs->zs_proc_stop); + print_time(zs->zs_proc_stop - now, timebuf); + nicenum(zs->zs_space, numbuf); + + (void) printf("Pass %3d, %8s, %3llu ENOSPC, " + "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", + iters, + WIFEXITED(status) ? "Complete" : "SIGKILL", + (u_longlong_t)zs->zs_enospc_count, + 100.0 * zs->zs_alloc / zs->zs_space, + numbuf, + 100.0 * (now - zs->zs_proc_start) / + (zopt_time * NANOSEC), timebuf); + } + + if (zopt_verbose >= 2) { + (void) printf("\nWorkload summary:\n\n"); + (void) printf("%7s %9s %s\n", + "Calls", "Time", "Function"); + (void) printf("%7s %9s %s\n", + "-----", "----", "--------"); + for (int f = 0; f < ZTEST_FUNCS; f++) { + Dl_info dli; + + zi = &zs->zs_info[f]; + print_time(zi->zi_call_time, timebuf); + (void) dladdr((void *)zi->zi_func, &dli); + (void) printf("%7llu %9s %s\n", + (u_longlong_t)zi->zi_call_count, timebuf, + dli.dli_sname); + } + (void) printf("\n"); + } + + /* + * It's possible that we killed a child during a rename test, + * in which case we'll have a 'ztest_tmp' pool lying around + * instead of 'ztest'. Do a blind rename in case this happened. + */ + kernel_init(FREAD); + if (spa_open(zopt_pool, &spa, FTAG) == 0) { + spa_close(spa, FTAG); + } else { + char tmpname[MAXNAMELEN]; + kernel_fini(); + kernel_init(FREAD | FWRITE); + (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp", + zopt_pool); + (void) spa_rename(tmpname, zopt_pool); + } + kernel_fini(); + + ztest_run_zdb(zopt_pool); + } + + if (zopt_verbose >= 1) { + (void) printf("%d killed, %d completed, %.0f%% kill rate\n", + kills, iters - kills, (100.0 * kills) / MAX(1, iters)); + } + + return (0); +} diff --git a/head/atomic.h b/head/atomic.h new file mode 100644 index 0000000..00c9476 --- /dev/null +++ b/head/atomic.h @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ATOMIC_H +#define _ATOMIC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#endif /* _ATOMIC_H */ diff --git a/head/libintl.h b/head/libintl.h new file mode 100644 index 0000000..e649668 --- /dev/null +++ b/head/libintl.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + +#ifndef _LIBINTL_H +#define _LIBINTL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * wchar_t is a built-in type in standard C++ and as such is not + * defined here when using standard C++. However, the GNU compiler + * fixincludes utility nonetheless creates its own version of this + * header for use by gcc and g++. In that version it adds a redundant + * guard for __cplusplus. To avoid the creation of a gcc/g++ specific + * header we need to include the following magic comment: + * + * we must use the C++ compiler's type + * + * The above comment should not be removed or changed until GNU + * gcc/fixinc/inclhack.def is updated to bypass this header. + */ +#if !defined(__cplusplus) || (__cplusplus < 199711L && !defined(__GNUG__)) +#ifndef _WCHAR_T +#define _WCHAR_T +#if defined(_LP64) +typedef int wchar_t; +#else +typedef long wchar_t; +#endif +#endif /* !_WCHAR_T */ +#endif /* !defined(__cplusplus) ... */ + +#define TEXTDOMAINMAX 256 + +#define __GNU_GETTEXT_SUPPORTED_REVISION(m) \ + ((((m) == 0) || ((m) == 1)) ? 1 : -1) + +#ifdef __STDC__ +extern char *dcgettext(const char *, const char *, const int); +extern char *dgettext(const char *, const char *); +extern char *gettext(const char *); +extern char *textdomain(const char *); +extern char *bindtextdomain(const char *, const char *); + +/* + * LI18NUX 2000 Globalization Specification Version 1.0 + * with Amendment 2 + */ +extern char *dcngettext(const char *, const char *, + const char *, unsigned long int, int); +extern char *dngettext(const char *, const char *, + const char *, unsigned long int); +extern char *ngettext(const char *, const char *, unsigned long int); +extern char *bind_textdomain_codeset(const char *, const char *); + +/* Word handling functions --- requires dynamic linking */ +/* Warning: these are experimental and subject to change. 
*/ +extern int wdinit(void); +extern int wdchkind(wchar_t); +extern int wdbindf(wchar_t, wchar_t, int); +extern wchar_t *wddelim(wchar_t, wchar_t, int); +extern wchar_t mcfiller(void); +extern int mcwrap(void); + +#else +extern char *dcgettext(); +extern char *dgettext(); +extern char *gettext(); +extern char *textdomain(); +extern char *bindtextdomain(); + +/* + * LI18NUX 2000 Globalization Specification Version 1.0 + * with Amendment 2 + */ +extern char *dcngettext(); +extern char *dngettext(); +extern char *ngettext(); +extern char *bind_textdomain_codeset(); + +/* Word handling functions --- requires dynamic linking */ +/* Warning: these are experimental and subject to change. */ +extern int wdinit(); +extern int wdchkind(); +extern int wdbindf(); +extern wchar_t *wddelim(); +extern wchar_t mcfiller(); +extern int mcwrap(); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBINTL_H */ diff --git a/head/stdio_ext.h b/head/stdio_ext.h new file mode 100644 index 0000000..77465c9 --- /dev/null +++ b/head/stdio_ext.h @@ -0,0 +1,88 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +/* + * Extensions to the stdio package + */ + +#ifndef _STDIO_EXT_H +#define _STDIO_EXT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Even though the contents of the stdio FILE structure have always been + * private to the stdio implementation, over the years some programs have + * needed to get information about a stdio stream that was not accessible + * through a supported interface. These programs have resorted to accessing + * fields of the FILE structure directly, rendering them possibly non-portable + * to new implementations of stdio, or more likely, preventing enhancements + * to stdio because those programs will break. + * + * In the 64-bit world, the FILE structure is opaque. The routines here + * are provided as a way to get the information that used to be retrieved + * directly from the FILE structure. They are based on the needs of + * existing programs (such as 'mh' and 'emacs'), so they may be extended + * as other programs are ported. Though they may still be non-portable to + * other operating systems, they will work from each Solaris release to + * the next. More portable interfaces are being developed. + */ + +#define FSETLOCKING_QUERY 0 +#define FSETLOCKING_INTERNAL 1 +#define FSETLOCKING_BYCALLER 2 + +extern size_t __fbufsize(FILE *stream); +extern int __freading(FILE *stream); +extern int __fwriting(FILE *stream); +extern int __freadable(FILE *stream); +extern int __fwritable(FILE *stream); +extern int __flbf(FILE *stream); +extern void __fpurge(FILE *stream); +extern size_t __fpending(FILE *stream); +extern void _flushlbf(void); +extern int __fsetlocking(FILE *stream, int type); + +/* + * Extended FILE enabling function. 
+ */ +#if defined(_LP64) && !defined(__lint) +#define enable_extended_FILE_stdio(fd, act) (0) +#else +extern int enable_extended_FILE_stdio(int, int); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _STDIO_EXT_H */ diff --git a/head/synch.h b/head/synch.h new file mode 100644 index 0000000..89efe9c --- /dev/null +++ b/head/synch.h @@ -0,0 +1,277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYNCH_H +#define _SYNCH_H + +/* + * synch.h: + * definitions needed to use the thread synchronization interface + */ + +#ifndef _ASM +#include +#include +#include +#endif /* _ASM */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM + +/* + * Semaphores + */ +typedef struct _sema { + /* this structure must be the same as sem_t in */ + uint32_t count; /* semaphore count */ + uint16_t type; + uint16_t magic; + upad64_t pad1[3]; /* reserved for a mutex_t */ + upad64_t pad2[2]; /* reserved for a cond_t */ +} sema_t; + +/* + * POSIX.1c Note: + * POSIX.1c requires that define the structures pthread_mutex_t + * and pthread_cond_t. 
These structures are identical to mutex_t (lwp_mutex_t) + * and cond_t (lwp_cond_t) which are defined in <synch.h>. A nested include + * of <synch.h> (to allow a "#typedef mutex_t pthread_mutex_t") would pull in + * non-posix symbols/constants violating the namespace restrictions. Hence, + * pthread_mutex_t/pthread_cond_t have been redefined in <pthread.h> (actually + * in <sys/types.h>). Any modifications done to mutex_t/lwp_mutex_t or + * cond_t/lwp_cond_t should also be done to pthread_mutex_t/pthread_cond_t. + */ +typedef lwp_mutex_t mutex_t; +typedef lwp_cond_t cond_t; + +/* + * Readers/writer locks + * + * NOTE: The layout of this structure should be kept in sync with the layout + * of the corresponding structure of pthread_rwlock_t in sys/types.h. + * Also, there is an identical structure for lwp_rwlock_t in <sys/synch.h>. + * Because we have to deal with C++, we cannot redefine this one as that one. + */ +typedef struct _rwlock { + int32_t readers; /* rwstate word */ + uint16_t type; + uint16_t magic; + mutex_t mutex; /* used with process-shared rwlocks */ + cond_t readercv; /* used only to indicate ownership */ + cond_t writercv; /* used only to indicate ownership */ +} rwlock_t; + +#ifdef __STDC__ +int _lwp_mutex_lock(lwp_mutex_t *); +int _lwp_mutex_unlock(lwp_mutex_t *); +int _lwp_mutex_trylock(lwp_mutex_t *); +int _lwp_cond_wait(lwp_cond_t *, lwp_mutex_t *); +int _lwp_cond_timedwait(lwp_cond_t *, lwp_mutex_t *, timespec_t *); +int _lwp_cond_reltimedwait(lwp_cond_t *, lwp_mutex_t *, timespec_t *); +int _lwp_cond_signal(lwp_cond_t *); +int _lwp_cond_broadcast(lwp_cond_t *); +int _lwp_sema_init(lwp_sema_t *, int); +int _lwp_sema_wait(lwp_sema_t *); +int _lwp_sema_trywait(lwp_sema_t *); +int _lwp_sema_post(lwp_sema_t *); +int cond_init(cond_t *, int, void *); +int cond_destroy(cond_t *); +int cond_wait(cond_t *, mutex_t *); +int cond_timedwait(cond_t *, mutex_t *, const timespec_t *); +int cond_reltimedwait(cond_t *, mutex_t *, const timespec_t *); +int cond_signal(cond_t *); +int cond_broadcast(cond_t *); +int
mutex_init(mutex_t *, int, void *); +int mutex_destroy(mutex_t *); +int mutex_consistent(mutex_t *); +int mutex_lock(mutex_t *); +int mutex_trylock(mutex_t *); +int mutex_unlock(mutex_t *); +int rwlock_init(rwlock_t *, int, void *); +int rwlock_destroy(rwlock_t *); +int rw_rdlock(rwlock_t *); +int rw_wrlock(rwlock_t *); +int rw_unlock(rwlock_t *); +int rw_tryrdlock(rwlock_t *); +int rw_trywrlock(rwlock_t *); +int sema_init(sema_t *, unsigned int, int, void *); +int sema_destroy(sema_t *); +int sema_wait(sema_t *); +int sema_timedwait(sema_t *, const timespec_t *); +int sema_reltimedwait(sema_t *, const timespec_t *); +int sema_post(sema_t *); +int sema_trywait(sema_t *); + +#else /* __STDC__ */ + +int _lwp_mutex_lock(); +int _lwp_mutex_unlock(); +int _lwp_mutex_trylock(); +int _lwp_cond_wait(); +int _lwp_cond_timedwait(); +int _lwp_cond_reltimedwait(); +int _lwp_cond_signal(); +int _lwp_cond_broadcast(); +int _lwp_sema_init(); +int _lwp_sema_wait(); +int _lwp_sema_trywait(); +int _lwp_sema_post(); +int cond_init(); +int cond_destroy(); +int cond_wait(); +int cond_timedwait(); +int cond_reltimedwait(); +int cond_signal(); +int cond_broadcast(); +int mutex_init(); +int mutex_destroy(); +int mutex_consistent(); +int mutex_lock(); +int mutex_trylock(); +int mutex_unlock(); +int rwlock_init(); +int rwlock_destroy(); +int rw_rdlock(); +int rw_wrlock(); +int rw_unlock(); +int rw_tryrdlock(); +int rw_trywrlock(); +int sema_init(); +int sema_destroy(); +int sema_wait(); +int sema_timedwait(); +int sema_reltimedwait(); +int sema_post(); +int sema_trywait(); + +#endif /* __STDC__ */ + +#endif /* _ASM */ + +/* "Magic numbers" tagging synchronization object types */ +#define MUTEX_MAGIC _MUTEX_MAGIC +#define SEMA_MAGIC _SEMA_MAGIC +#define COND_MAGIC _COND_MAGIC +#define RWL_MAGIC _RWL_MAGIC + +/* + * POSIX.1c Note: + * DEFAULTMUTEX is defined same as PTHREAD_MUTEX_INITIALIZER in . + * DEFAULTCV is defined same as PTHREAD_COND_INITIALIZER in . 
+ * DEFAULTRWLOCK is defined same as PTHREAD_RWLOCK_INITIALIZER in . + * Any changes to these macros should be reflected in + */ +#define DEFAULTMUTEX \ + {{0, 0, 0, {USYNC_THREAD}, MUTEX_MAGIC}, \ + {{{0, 0, 0, 0, 0, 0, 0, 0}}}, 0} +#define SHAREDMUTEX \ + {{0, 0, 0, {USYNC_PROCESS}, MUTEX_MAGIC}, \ + {{{0, 0, 0, 0, 0, 0, 0, 0}}}, 0} +#define RECURSIVEMUTEX \ + {{0, 0, 0, {USYNC_THREAD|LOCK_RECURSIVE}, MUTEX_MAGIC}, \ + {{{0, 0, 0, 0, 0, 0, 0, 0}}}, 0} +#define ERRORCHECKMUTEX \ + {{0, 0, 0, {USYNC_THREAD|LOCK_ERRORCHECK}, MUTEX_MAGIC}, \ + {{{0, 0, 0, 0, 0, 0, 0, 0}}}, 0} +#define RECURSIVE_ERRORCHECKMUTEX \ + {{0, 0, 0, {USYNC_THREAD|LOCK_RECURSIVE|LOCK_ERRORCHECK}, \ + MUTEX_MAGIC}, {{{0, 0, 0, 0, 0, 0, 0, 0}}}, 0} +#define DEFAULTCV \ + {{{0, 0, 0, 0}, USYNC_THREAD, COND_MAGIC}, 0} +#define SHAREDCV \ + {{{0, 0, 0, 0}, USYNC_PROCESS, COND_MAGIC}, 0} +#define DEFAULTSEMA \ + {0, USYNC_THREAD, SEMA_MAGIC, {0, 0, 0}, {0, 0}} +#define SHAREDSEMA \ + {0, USYNC_PROCESS, SEMA_MAGIC, {0, 0, 0}, {0, 0}} +#define DEFAULTRWLOCK \ + {0, USYNC_THREAD, RWL_MAGIC, DEFAULTMUTEX, DEFAULTCV, DEFAULTCV} +#define SHAREDRWLOCK \ + {0, USYNC_PROCESS, RWL_MAGIC, SHAREDMUTEX, SHAREDCV, SHAREDCV} + +/* + * Tests on lock states. + */ +#define SEMA_HELD(x) _sema_held(x) +#define RW_READ_HELD(x) _rw_read_held(x) +#define RW_WRITE_HELD(x) _rw_write_held(x) +#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) +#define MUTEX_HELD(x) _mutex_held(x) + +/* + * The following definitions are for assertions which can be checked + * statically by tools like lock_lint. You can also define your own + * run-time test for each. If you don't, we define them to 1 so that + * such assertions simply pass. 
+ */ +#ifndef NO_LOCKS_HELD +#define NO_LOCKS_HELD 1 +#endif +#ifndef NO_COMPETING_THREADS +#define NO_COMPETING_THREADS 1 +#endif + +#ifndef _ASM + +#ifdef __STDC__ + +/* + * The *_held() functions apply equally well to Solaris threads + * and to Posix threads synchronization objects, but the formal + * type declarations are different, so we just declare the argument + * to each *_held() function to be a void *, expecting that they will + * be called with the proper type of argument in each case. + */ +int _sema_held(void *); /* sema_t or sem_t */ +int _rw_read_held(void *); /* rwlock_t or pthread_rwlock_t */ +int _rw_write_held(void *); /* rwlock_t or pthread_rwlock_t */ +int _mutex_held(void *); /* mutex_t or pthread_mutex_t */ + +#else /* __STDC__ */ + +int _sema_held(); +int _rw_read_held(); +int _rw_write_held(); +int _mutex_held(); + +#endif /* __STDC__ */ + +/* Pause API */ +#ifdef __STDC__ +void smt_pause(void); +#else /* __STDC__ */ +void smt_pause(); +#endif /* __STDC__ */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYNCH_H */ diff --git a/head/thread.h b/head/thread.h new file mode 100644 index 0000000..63f0b71 --- /dev/null +++ b/head/thread.h @@ -0,0 +1,156 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _THREAD_H +#define _THREAD_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * thread.h: + * definitions needed to use the thread interface except synchronization. + * use for thread synchronization. + */ + +#ifndef _ASM +#include +#include +#include +#endif /* _ASM */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM +typedef unsigned int thread_t; +typedef unsigned int thread_key_t; +#endif /* _ASM */ + +#ifndef _ASM +#ifdef __STDC__ + +extern int thr_create(void *, size_t, void *(*)(void *), void *, long, + thread_t *); +extern int thr_join(thread_t, thread_t *, void **); +extern int thr_setconcurrency(int); +extern int thr_getconcurrency(void); +extern void thr_exit(void *) __NORETURN; +extern thread_t thr_self(void); + +/* + * the definition of thr_sigsetmask() is not strict ansi-c since sigset_t is + * not in the strict ansi-c name space. Hence, include the prototype for + * thr_sigsetmask() only if strict ansi-c conformance is not turned on. + */ +#if !defined(_STRICT_STDC) || defined(__EXTENSIONS__) +extern int thr_sigsetmask(int, const sigset_t *, sigset_t *); +#endif + +/* + * the definition of thr_stksegment() is not strict ansi-c since stack_t is + * not in the strict ansi-c name space. Hence, include the prototype for + * thr_stksegment() only if strict ansi-c conformance is not turned on. 
+ */ +#if !defined(_STRICT_STDC) || defined(__EXTENSIONS__) +extern int thr_stksegment(stack_t *); +#endif + +extern int thr_main(void); +extern int thr_kill(thread_t, int); +extern int thr_suspend(thread_t); +extern int thr_continue(thread_t); +extern void thr_yield(void); +extern int thr_setprio(thread_t, int); +extern int thr_getprio(thread_t, int *); +extern int thr_keycreate(thread_key_t *, void(*)(void *)); +extern int thr_keycreate_once(thread_key_t *, void(*)(void *)); +extern int thr_setspecific(thread_key_t, void *); +extern int thr_getspecific(thread_key_t, void **); +extern size_t thr_min_stack(void); + +#else /* __STDC */ + +extern int thr_create(); +extern int thr_join(); +extern int thr_setconcurrency(); +extern int thr_getconcurrency(); +extern void thr_exit(); +extern thread_t thr_self(); +extern int thr_sigsetmask(); +extern int thr_stksegment(); +extern int thr_main(); +extern int thr_kill(); +extern int thr_suspend(); +extern int thr_continue(); +extern void thr_yield(); +extern int thr_setprio(); +extern int thr_getprio(); +extern int thr_keycreate(); +extern int thr_keycreate_once(); +extern int thr_setspecific(); +extern int thr_getspecific(); +extern size_t thr_min_stack(); + +#endif /* __STDC */ +#endif /* _ASM */ + +#define THR_MIN_STACK thr_min_stack() +/* + * thread flags (one word bit mask) + */ +/* + * POSIX.1c Note: + * THR_BOUND is defined same as PTHREAD_SCOPE_SYSTEM in + * THR_DETACHED is defined same as PTHREAD_CREATE_DETACHED in + * Any changes in these definitions should be reflected in + */ +#define THR_BOUND 0x00000001 /* = PTHREAD_SCOPE_SYSTEM */ +#define THR_NEW_LWP 0x00000002 +#define THR_DETACHED 0x00000040 /* = PTHREAD_CREATE_DETACHED */ +#define THR_SUSPENDED 0x00000080 +#define THR_DAEMON 0x00000100 + +/* + * The key to be created by thr_keycreate_once() + * must be statically initialized with THR_ONCE_KEY. 
+ * This must be the same as PTHREAD_ONCE_KEY_NP in + */ +#define THR_ONCE_KEY (thread_key_t)(-1) + +/* + * The available register states returned by thr_getstate(). + */ +#define TRS_VALID 0 +#define TRS_NONVOLATILE 1 +#define TRS_LWPID 2 +#define TRS_INVALID 3 + +#ifdef __cplusplus +} +#endif + +#endif /* _THREAD_H */ diff --git a/lib/libdtrace/common/dt_decl.c b/lib/libdtrace/common/dt_decl.c index bb77984..d2a0b29 100644 --- a/lib/libdtrace/common/dt_decl.c +++ b/lib/libdtrace/common/dt_decl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,11 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -703,8 +700,7 @@ dt_decl_enumerator(char *s, dt_node_t *dnp) char *name; int value; - name = alloca(strlen(s) + 1); - (void) strcpy(name, s); + name = strdupa(s); free(s); if (dsp == NULL) diff --git a/lib/libdtrace/common/dt_ident.c b/lib/libdtrace/common/dt_ident.c index c437e0a..3dfa058 100644 --- a/lib/libdtrace/common/dt_ident.c +++ b/lib/libdtrace/common/dt_ident.c @@ -20,12 +20,9 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -181,8 +178,7 @@ dt_idcook_func(dt_node_t *dnp, dt_ident_t *idp, int argc, dt_node_t *args) int i = 0; assert(idp->di_iarg != NULL); - s = alloca(strlen(idp->di_iarg) + 1); - (void) strcpy(s, idp->di_iarg); + s = strdupa(idp->di_iarg); if ((p2 = strrchr(s, ')')) != NULL) *p2 = '\0'; /* mark end of parameter list string */ diff --git a/lib/libdtrace/common/dt_parser.c b/lib/libdtrace/common/dt_parser.c index 9aabc18..6ad30a9 100644 --- a/lib/libdtrace/common/dt_parser.c +++ b/lib/libdtrace/common/dt_parser.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,12 +20,9 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DTrace D Language Parser * @@ -472,9 +468,9 @@ dt_node_name(const dt_node_t *dnp, char *buf, size_t len) case DT_NODE_XLATOR: (void) snprintf(buf, len, "translator <%s> (%s)", dt_type_name(dnp->dn_xlator->dx_dst_ctfp, - dnp->dn_xlator->dx_dst_type, n1, sizeof (n1)), + dnp->dn_xlator->dx_dst_type, n1, sizeof (n1)), dt_type_name(dnp->dn_xlator->dx_src_ctfp, - dnp->dn_xlator->dx_src_type, n2, sizeof (n2))); + dnp->dn_xlator->dx_src_type, n2, sizeof (n2))); break; case DT_NODE_PROG: (void) snprintf(buf, len, "%s", "program"); @@ -1440,9 +1436,9 @@ dt_node_decl(void) "\t current: %s\n\tprevious: %s\n", dmp->dm_name, dsp->ds_ident, dt_type_name(dtt.dtt_ctfp, dtt.dtt_type, - n1, sizeof (n1)), + n1, sizeof (n1)), dt_type_name(ott.dtt_ctfp, ott.dtt_type, - n2, sizeof (n2))); + n2, sizeof (n2))); } else if (!exists && dt_module_extern(dtp, dmp, dsp->ds_ident, &dtt) == NULL) { xyerror(D_UNKNOWN, @@ -1452,7 +1448,7 @@ dt_node_decl(void) dt_dprintf("extern %s`%s type=<%s>\n", dmp->dm_name, dsp->ds_ident, dt_type_name(dtt.dtt_ctfp, dtt.dtt_type, - n1, sizeof (n1))); + n1, sizeof (n1))); } break; } @@ -1756,8 +1752,7 @@ dt_node_offsetof(dt_decl_t *ddp, char *s) ctf_id_t type; uint_t kind; - name = alloca(strlen(s) + 1); - (void) strcpy(name, s); + name = strdupa(s); free(s); err = dt_decl_type(ddp, &dtt); diff --git a/lib/libdtrace/common/dt_pragma.c b/lib/libdtrace/common/dt_pragma.c index a8bab85..9cb3c3b 100644 --- a/lib/libdtrace/common/dt_pragma.c +++ b/lib/libdtrace/common/dt_pragma.c @@ -20,12 +20,9 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -337,8 +334,7 @@ dt_pragma_option(const char *prname, dt_node_t *dnp) "superfluous arguments specified for #pragma %s\n", prname); } - opt = alloca(strlen(dnp->dn_string) + 1); - (void) strcpy(opt, dnp->dn_string); + opt = strdupa(dnp->dn_string); if ((val = strchr(opt, '=')) != NULL) *val++ = '\0'; diff --git a/lib/libdtrace/common/dt_string.c b/lib/libdtrace/common/dt_string.c index 3a5315e..782d66c 100644 --- a/lib/libdtrace/common/dt_string.c +++ b/lib/libdtrace/common/dt_string.c @@ -29,23 +29,6 @@ #include #include -#include - -/* - * Create a copy of string s, but only duplicate the first n bytes. - */ -char * -strndup(const char *s, size_t n) -{ - char *s2 = malloc(n + 1); - - if (s2 == NULL) - longjmp(yypcb->pcb_jmpbuf, EDT_NOMEM); - - (void) strncpy(s2, s, n); - s2[n] = '\0'; - return (s2); -} /* * Transform string s inline, converting each embedded C escape sequence string diff --git a/lib/libdtrace/common/dt_string.h b/lib/libdtrace/common/dt_string.h index 1fd412b..a9bb7a1 100644 --- a/lib/libdtrace/common/dt_string.h +++ b/lib/libdtrace/common/dt_string.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,16 +18,14 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _DT_STRING_H #define _DT_STRING_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -36,7 +33,6 @@ extern "C" { #endif -extern char *strndup(const char *, size_t); extern size_t stresc2chr(char *); extern char *strchr2esc(const char *, size_t); extern const char *strbasename(const char *); diff --git a/lib/libdtrace/common/dt_subr.c b/lib/libdtrace/common/dt_subr.c index 97221c84..f586504 100644 --- a/lib/libdtrace/common/dt_subr.c +++ b/lib/libdtrace/common/dt_subr.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -222,8 +221,7 @@ dtrace_str2attr(const char *str, dtrace_attribute_t *attr) return (-1); /* invalid function arguments */ *attr = _dtrace_maxattr; - p = alloca(strlen(str) + 1); - (void) strcpy(p, str); + p = strdupa(str); if ((p = dt_getstrattr(p, &q)) == NULL) return (0); diff --git a/lib/libnvpair/libnvpair.c b/lib/libnvpair/libnvpair.c new file mode 100644 index 0000000..16bce48 --- /dev/null +++ b/lib/libnvpair/libnvpair.c @@ -0,0 +1,1269 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "libnvpair.h" + +/* + * libnvpair - A tools library for manipulating <name, value> pairs. + * + * This library provides routines for packing and unpacking nv pairs + * for transporting data across process boundaries, transporting + * between kernel and userland, and possibly saving onto disk files. + */ + +/* + * Print control structure. + */ + +#define DEFINEOP(opname, vtype) \ + struct { \ + int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ + const char *, vtype); \ + void *arg; \ + } opname + +#define DEFINEARROP(opname, vtype) \ + struct { \ + int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ + const char *, vtype, uint_t); \ + void *arg; \ + } opname + +struct nvlist_printops { + DEFINEOP(print_boolean, int); + DEFINEOP(print_boolean_value, boolean_t); + DEFINEOP(print_byte, uchar_t); + DEFINEOP(print_int8, int8_t); + DEFINEOP(print_uint8, uint8_t); + DEFINEOP(print_int16, int16_t); + DEFINEOP(print_uint16, uint16_t); + DEFINEOP(print_int32, int32_t); + DEFINEOP(print_uint32, uint32_t); + DEFINEOP(print_int64, int64_t); + DEFINEOP(print_uint64, uint64_t); + DEFINEOP(print_double, double); + DEFINEOP(print_string, char *); + DEFINEOP(print_hrtime, hrtime_t); + DEFINEOP(print_nvlist, nvlist_t *); + DEFINEARROP(print_boolean_array, boolean_t *); + DEFINEARROP(print_byte_array, uchar_t *); + DEFINEARROP(print_int8_array, int8_t *); + DEFINEARROP(print_uint8_array, uint8_t *); + DEFINEARROP(print_int16_array, int16_t *); + DEFINEARROP(print_uint16_array, uint16_t *); + DEFINEARROP(print_int32_array, int32_t *); + DEFINEARROP(print_uint32_array, uint32_t *); +
DEFINEARROP(print_int64_array, int64_t *); + DEFINEARROP(print_uint64_array, uint64_t *); + DEFINEARROP(print_string_array, char **); + DEFINEARROP(print_nvlist_array, nvlist_t **); +}; + +struct nvlist_prtctl { + FILE *nvprt_fp; /* output destination */ + enum nvlist_indent_mode nvprt_indent_mode; /* see above */ + int nvprt_indent; /* absolute indent, or tab depth */ + int nvprt_indentinc; /* indent or tab increment */ + const char *nvprt_nmfmt; /* member name format, max one %s */ + const char *nvprt_eomfmt; /* after member format, e.g. "\n" */ + const char *nvprt_btwnarrfmt; /* between array members */ + int nvprt_btwnarrfmt_nl; /* nvprt_btwnarrfmt includes newline? */ + struct nvlist_printops *nvprt_dfltops; + struct nvlist_printops *nvprt_custops; +}; + +#define DFLTPRTOP(pctl, type) \ + ((pctl)->nvprt_dfltops->print_##type.op) + +#define DFLTPRTOPARG(pctl, type) \ + ((pctl)->nvprt_dfltops->print_##type.arg) + +#define CUSTPRTOP(pctl, type) \ + ((pctl)->nvprt_custops->print_##type.op) + +#define CUSTPRTOPARG(pctl, type) \ + ((pctl)->nvprt_custops->print_##type.arg) + +#define RENDER(pctl, type, nvl, name, val) \ + { \ + int done = 0; \ + if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ + done = CUSTPRTOP(pctl, type)(pctl, \ + CUSTPRTOPARG(pctl, type), nvl, name, val); \ + } \ + if (!done) { \ + (void) DFLTPRTOP(pctl, type)(pctl, \ + DFLTPRTOPARG(pctl, type), nvl, name, val); \ + } \ + (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \ + } + +#define ARENDER(pctl, type, nvl, name, arrp, count) \ + { \ + int done = 0; \ + if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ + done = CUSTPRTOP(pctl, type)(pctl, \ + CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \ + } \ + if (!done) { \ + (void) DFLTPRTOP(pctl, type)(pctl, \ + DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \ + } \ + (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \ + } + +static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t); + +/* + *
====================================================================== + * | | + * | Indentation | + * | | + * ====================================================================== + */ + +static void +indent(nvlist_prtctl_t pctl, int onemore) +{ + int depth; + + switch (pctl->nvprt_indent_mode) { + case NVLIST_INDENT_ABS: + (void) fprintf(pctl->nvprt_fp, "%*s", + pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, ""); + break; + + case NVLIST_INDENT_TABBED: + depth = pctl->nvprt_indent + onemore; + while (depth-- > 0) + (void) fprintf(pctl->nvprt_fp, "\t"); + } +} + +/* + * ====================================================================== + * | | + * | Default nvlist member rendering functions. | + * | | + * ====================================================================== + */ + +/* + * Generate functions to print single-valued nvlist members. + * + * type_and_variant - suffix to form function name + * vtype - C type for the member value + * ptype - C type to cast value to for printing + * vfmt - format string for pair value, e.g "%d" or "0x%llx" + */ + +#define NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \ +static int \ +nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ + nvlist_t *nvl, const char *name, vtype value) \ +{ \ + FILE *fp = pctl->nvprt_fp; \ + NOTE(ARGUNUSED(private)) \ + NOTE(ARGUNUSED(nvl)) \ + indent(pctl, 1); \ + (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ + (void) fprintf(fp, vfmt, (ptype)value); \ + return (1); \ +} + +NVLIST_PRTFUNC(boolean, int, int, "%d") +NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d") +NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x") +NVLIST_PRTFUNC(int8, int8_t, int, "%d") +NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x") +NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d") +NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x") +NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d") +NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x") +NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld") 
+NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx") +NVLIST_PRTFUNC(double, double, double, "0x%f") +NVLIST_PRTFUNC(string, char *, char *, "%s") +NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx") + +/* + * Generate functions to print array-valued nvlist members. + */ + +#define NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \ +static int \ +nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ + nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \ +{ \ + FILE *fp = pctl->nvprt_fp; \ + uint_t i; \ + NOTE(ARGUNUSED(private)) \ + NOTE(ARGUNUSED(nvl)) \ + for (i = 0; i < count; i++) { \ + if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \ + indent(pctl, 1); \ + (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ + if (pctl->nvprt_btwnarrfmt_nl) \ + (void) fprintf(fp, "[%d]: ", i); \ + } \ + if (i != 0) \ + (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \ + (void) fprintf(fp, vfmt, (ptype)valuep[i]); \ + } \ + return (1); \ +} + +NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d") +NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x") +NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d") +NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x") +NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d") +NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x") +NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d") +NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x") +NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld") +NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx") +NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s") + +/*ARGSUSED*/ +static int +nvprint_nvlist(nvlist_prtctl_t pctl, void *private, + nvlist_t *nvl, const char *name, nvlist_t *value) +{ + FILE *fp = pctl->nvprt_fp; + + indent(pctl, 1); + (void) fprintf(fp, "%s = (embedded nvlist)\n", name); + + pctl->nvprt_indent += pctl->nvprt_indentinc; + nvlist_print_with_indent(value, pctl); + pctl->nvprt_indent -= pctl->nvprt_indentinc; +
+ indent(pctl, 1); + (void) fprintf(fp, "(end %s)\n", name); + + return (1); +} + +/*ARGSUSED*/ +static int +nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private, + nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count) +{ + FILE *fp = pctl->nvprt_fp; + uint_t i; + + indent(pctl, 1); + (void) fprintf(fp, "%s = (array of embedded nvlists)\n", name); + + for (i = 0; i < count; i++) { + indent(pctl, 1); + (void) fprintf(fp, "(start %s[%d])\n", name, i); + + pctl->nvprt_indent += pctl->nvprt_indentinc; + nvlist_print_with_indent(valuep[i], pctl); + pctl->nvprt_indent -= pctl->nvprt_indentinc; + + indent(pctl, 1); + (void) fprintf(fp, "(end %s[%d])\n", name, i); + } + + return (1); +} + +/* + * ====================================================================== + * | | + * | Interfaces that allow control over formatting. | + * | | + * ====================================================================== + */ + +void +nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp) +{ + pctl->nvprt_fp = fp; +} + +FILE * +nvlist_prtctl_getdest(nvlist_prtctl_t pctl) +{ + return (pctl->nvprt_fp); +} + + +void +nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode, + int start, int inc) +{ + if (mode < NVLIST_INDENT_ABS || mode > NVLIST_INDENT_TABBED) + mode = NVLIST_INDENT_TABBED; + + if (start < 0) + start = 0; + + if (inc < 0) + inc = 1; + + pctl->nvprt_indent_mode = mode; + pctl->nvprt_indent = start; + pctl->nvprt_indentinc = inc; +} + +void +nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore) +{ + indent(pctl, onemore); +} + + +void +nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, + const char *fmt) +{ + switch (which) { + case NVLIST_FMT_MEMBER_NAME: + if (fmt == NULL) + fmt = "%s = "; + pctl->nvprt_nmfmt = fmt; + break; + + case NVLIST_FMT_MEMBER_POSTAMBLE: + if (fmt == NULL) + fmt = "\n"; + pctl->nvprt_eomfmt = fmt; + break; + + case NVLIST_FMT_BTWN_ARRAY: + if (fmt == NULL) { + pctl->nvprt_btwnarrfmt = " 
"; + pctl->nvprt_btwnarrfmt_nl = 0; + } else { + pctl->nvprt_btwnarrfmt = fmt; + pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL); + } + break; + + default: + break; + } +} + + +void +nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...) +{ + FILE *fp = pctl->nvprt_fp; + va_list ap; + char *name; + + va_start(ap, which); + + switch (which) { + case NVLIST_FMT_MEMBER_NAME: + name = va_arg(ap, char *); + (void) fprintf(fp, pctl->nvprt_nmfmt, name); + break; + + case NVLIST_FMT_MEMBER_POSTAMBLE: + (void) fprintf(fp, pctl->nvprt_eomfmt); + break; + + case NVLIST_FMT_BTWN_ARRAY: + (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \ + break; + + default: + break; + } + + va_end(ap); +} + +/* + * ====================================================================== + * | | + * | Interfaces to allow appointment of replacement rendering functions.| + * | | + * ====================================================================== + */ + +#define NVLIST_PRINTCTL_REPLACE(type, vtype) \ +void \ +nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ + int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \ + void *private) \ +{ \ + CUSTPRTOP(pctl, type) = func; \ + CUSTPRTOPARG(pctl, type) = private; \ +} + +NVLIST_PRINTCTL_REPLACE(boolean, int) +NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t) +NVLIST_PRINTCTL_REPLACE(byte, uchar_t) +NVLIST_PRINTCTL_REPLACE(int8, int8_t) +NVLIST_PRINTCTL_REPLACE(uint8, uint8_t) +NVLIST_PRINTCTL_REPLACE(int16, int16_t) +NVLIST_PRINTCTL_REPLACE(uint16, uint16_t) +NVLIST_PRINTCTL_REPLACE(int32, int32_t) +NVLIST_PRINTCTL_REPLACE(uint32, uint32_t) +NVLIST_PRINTCTL_REPLACE(int64, int64_t) +NVLIST_PRINTCTL_REPLACE(uint64, uint64_t) +NVLIST_PRINTCTL_REPLACE(double, double) +NVLIST_PRINTCTL_REPLACE(string, char *) +NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t) +NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *) + +#define NVLIST_PRINTCTL_AREPLACE(type, vtype) \ +void \ +nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ + int 
(*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \ + uint_t), void *private) \ +{ \ + CUSTPRTOP(pctl, type) = func; \ + CUSTPRTOPARG(pctl, type) = private; \ +} + +NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *) +NVLIST_PRINTCTL_AREPLACE(byte_array, uchar_t *) +NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *) +NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *) +NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *) +NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *) +NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *) +NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *) +NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *) +NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *) +NVLIST_PRINTCTL_AREPLACE(string_array, char **) +NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **) + +/* + * ====================================================================== + * | | + * | Interfaces to manage nvlist_prtctl_t cookies. | + * | | + * ====================================================================== + */ + + +static const struct nvlist_printops defprtops = { + { nvprint_boolean, NULL }, + { nvprint_boolean_value, NULL }, + { nvprint_byte, NULL }, + { nvprint_int8, NULL }, + { nvprint_uint8, NULL }, + { nvprint_int16, NULL }, + { nvprint_uint16, NULL }, + { nvprint_int32, NULL }, + { nvprint_uint32, NULL }, + { nvprint_int64, NULL }, + { nvprint_uint64, NULL }, + { nvprint_double, NULL }, + { nvprint_string, NULL }, + { nvprint_hrtime, NULL }, + { nvprint_nvlist, NULL }, + { nvaprint_boolean_array, NULL }, + { nvaprint_byte_array, NULL }, + { nvaprint_int8_array, NULL }, + { nvaprint_uint8_array, NULL }, + { nvaprint_int16_array, NULL }, + { nvaprint_uint16_array, NULL }, + { nvaprint_int32_array, NULL }, + { nvaprint_uint32_array, NULL }, + { nvaprint_int64_array, NULL }, + { nvaprint_uint64_array, NULL }, + { nvaprint_string_array, NULL }, + { nvaprint_nvlist_array, NULL }, +}; + +static void +prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl, + struct 
nvlist_printops *ops) +{ + pctl->nvprt_fp = fp; + pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED; + pctl->nvprt_indent = 0; + pctl->nvprt_indentinc = 1; + pctl->nvprt_nmfmt = "%s = "; + pctl->nvprt_eomfmt = "\n"; + pctl->nvprt_btwnarrfmt = " "; + pctl->nvprt_btwnarrfmt_nl = 0; + + pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops; + pctl->nvprt_custops = ops; +} + +nvlist_prtctl_t +nvlist_prtctl_alloc(void) +{ + struct nvlist_prtctl *pctl; + struct nvlist_printops *ops; + + if ((pctl = malloc(sizeof (*pctl))) == NULL) + return (NULL); + + if ((ops = calloc(1, sizeof (*ops))) == NULL) { + free(pctl); + return (NULL); + } + + prtctl_defaults(stdout, pctl, ops); + + return (pctl); +} + +void +nvlist_prtctl_free(nvlist_prtctl_t pctl) +{ + if (pctl != NULL) { + free(pctl->nvprt_custops); + free(pctl); + } +} + +/* + * ====================================================================== + * | | + * | Top-level print request interfaces. | + * | | + * ====================================================================== + */ + +/* + * nvlist_print - Prints elements in an event buffer + */ +static void +nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl) +{ + FILE *fp = pctl->nvprt_fp; + char *name; + uint_t nelem; + nvpair_t *nvp; + + if (nvl == NULL) + return; + + indent(pctl, 0); + (void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl)); + + nvp = nvlist_next_nvpair(nvl, NULL); + + while (nvp) { + data_type_t type = nvpair_type(nvp); + + name = nvpair_name(nvp); + nelem = 0; + + switch (type) { + case DATA_TYPE_BOOLEAN: { + RENDER(pctl, boolean, nvl, name, 1); + break; + } + case DATA_TYPE_BOOLEAN_VALUE: { + boolean_t val; + (void) nvpair_value_boolean_value(nvp, &val); + RENDER(pctl, boolean_value, nvl, name, val); + break; + } + case DATA_TYPE_BYTE: { + uchar_t val; + (void) nvpair_value_byte(nvp, &val); + RENDER(pctl, byte, nvl, name, val); + break; + } + case DATA_TYPE_INT8: { + int8_t val; + (void) nvpair_value_int8(nvp, &val); + 
RENDER(pctl, int8, nvl, name, val); + break; + } + case DATA_TYPE_UINT8: { + uint8_t val; + (void) nvpair_value_uint8(nvp, &val); + RENDER(pctl, uint8, nvl, name, val); + break; + } + case DATA_TYPE_INT16: { + int16_t val; + (void) nvpair_value_int16(nvp, &val); + RENDER(pctl, int16, nvl, name, val); + break; + } + case DATA_TYPE_UINT16: { + uint16_t val; + (void) nvpair_value_uint16(nvp, &val); + RENDER(pctl, uint16, nvl, name, val); + break; + } + case DATA_TYPE_INT32: { + int32_t val; + (void) nvpair_value_int32(nvp, &val); + RENDER(pctl, int32, nvl, name, val); + break; + } + case DATA_TYPE_UINT32: { + uint32_t val; + (void) nvpair_value_uint32(nvp, &val); + RENDER(pctl, uint32, nvl, name, val); + break; + } + case DATA_TYPE_INT64: { + int64_t val; + (void) nvpair_value_int64(nvp, &val); + RENDER(pctl, int64, nvl, name, val); + break; + } + case DATA_TYPE_UINT64: { + uint64_t val; + (void) nvpair_value_uint64(nvp, &val); + RENDER(pctl, uint64, nvl, name, val); + break; + } + case DATA_TYPE_DOUBLE: { + double val; + (void) nvpair_value_double(nvp, &val); + RENDER(pctl, double, nvl, name, val); + break; + } + case DATA_TYPE_STRING: { + char *val; + (void) nvpair_value_string(nvp, &val); + RENDER(pctl, string, nvl, name, val); + break; + } + case DATA_TYPE_BOOLEAN_ARRAY: { + boolean_t *val; + (void) nvpair_value_boolean_array(nvp, &val, &nelem); + ARENDER(pctl, boolean_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_BYTE_ARRAY: { + uchar_t *val; + (void) nvpair_value_byte_array(nvp, &val, &nelem); + ARENDER(pctl, byte_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + (void) nvpair_value_int8_array(nvp, &val, &nelem); + ARENDER(pctl, int8_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + (void) nvpair_value_uint8_array(nvp, &val, &nelem); + ARENDER(pctl, uint8_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + (void) 
nvpair_value_int16_array(nvp, &val, &nelem); + ARENDER(pctl, int16_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + (void) nvpair_value_uint16_array(nvp, &val, &nelem); + ARENDER(pctl, uint16_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + (void) nvpair_value_int32_array(nvp, &val, &nelem); + ARENDER(pctl, int32_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + (void) nvpair_value_uint32_array(nvp, &val, &nelem); + ARENDER(pctl, uint32_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_INT64_ARRAY: { + int64_t *val; + (void) nvpair_value_int64_array(nvp, &val, &nelem); + ARENDER(pctl, int64_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + (void) nvpair_value_uint64_array(nvp, &val, &nelem); + ARENDER(pctl, uint64_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_STRING_ARRAY: { + char **val; + (void) nvpair_value_string_array(nvp, &val, &nelem); + ARENDER(pctl, string_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_HRTIME: { + hrtime_t val; + (void) nvpair_value_hrtime(nvp, &val); + RENDER(pctl, hrtime, nvl, name, val); + break; + } + case DATA_TYPE_NVLIST: { + nvlist_t *val; + (void) nvpair_value_nvlist(nvp, &val); + RENDER(pctl, nvlist, nvl, name, val); + break; + } + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + (void) nvpair_value_nvlist_array(nvp, &val, &nelem); + ARENDER(pctl, nvlist_array, nvl, name, val, nelem); + break; + } + default: + (void) fprintf(fp, " unknown data type (%d)", type); + break; + } + nvp = nvlist_next_nvpair(nvl, nvp); + } +} + +void +nvlist_print(FILE *fp, nvlist_t *nvl) +{ + struct nvlist_prtctl pc; + + prtctl_defaults(fp, &pc, NULL); + nvlist_print_with_indent(nvl, &pc); +} + +void +nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl) +{ + nvlist_print_with_indent(nvl, pctl); +} + +#define NVP(elem, type, vtype, 
ptype, format) { \ + vtype value; \ +\ + (void) nvpair_value_##type(elem, &value); \ + (void) printf("%*s%s: " format "\n", indent, "", \ + nvpair_name(elem), (ptype)value); \ +} + +#define NVPA(elem, type, vtype, ptype, format) { \ + uint_t i, count; \ + vtype *value; \ +\ + (void) nvpair_value_##type(elem, &value, &count); \ + for (i = 0; i < count; i++) { \ + (void) printf("%*s%s[%d]: " format "\n", indent, "", \ + nvpair_name(elem), i, (ptype)value[i]); \ + } \ +} + +/* + * Similar to nvlist_print() but handles arrays slightly differently. + */ +void +dump_nvlist(nvlist_t *list, int indent) +{ + nvpair_t *elem = NULL; + boolean_t bool_value; + nvlist_t *nvlist_value; + nvlist_t **nvlist_array_value; + uint_t i, count; + + if (list == NULL) { + return; + } + + while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(elem, &bool_value); + (void) printf("%*s%s: %s\n", indent, "", + nvpair_name(elem), bool_value ? 
"true" : "false"); + break; + + case DATA_TYPE_BYTE: + NVP(elem, byte, uchar_t, int, "%u"); + break; + + case DATA_TYPE_INT8: + NVP(elem, int8, int8_t, int, "%d"); + break; + + case DATA_TYPE_UINT8: + NVP(elem, uint8, uint8_t, int, "%u"); + break; + + case DATA_TYPE_INT16: + NVP(elem, int16, int16_t, int, "%d"); + break; + + case DATA_TYPE_UINT16: + NVP(elem, uint16, uint16_t, int, "%u"); + break; + + case DATA_TYPE_INT32: + NVP(elem, int32, int32_t, long, "%ld"); + break; + + case DATA_TYPE_UINT32: + NVP(elem, uint32, uint32_t, ulong_t, "%lu"); + break; + + case DATA_TYPE_INT64: + NVP(elem, int64, int64_t, longlong_t, "%lld"); + break; + + case DATA_TYPE_UINT64: + NVP(elem, uint64, uint64_t, u_longlong_t, "%llu"); + break; + + case DATA_TYPE_STRING: + NVP(elem, string, char *, char *, "'%s'"); + break; + + case DATA_TYPE_BYTE_ARRAY: + NVPA(elem, byte_array, uchar_t, int, "%u"); + break; + + case DATA_TYPE_INT8_ARRAY: + NVPA(elem, int8_array, int8_t, int, "%d"); + break; + + case DATA_TYPE_UINT8_ARRAY: + NVPA(elem, uint8_array, uint8_t, int, "%u"); + break; + + case DATA_TYPE_INT16_ARRAY: + NVPA(elem, int16_array, int16_t, int, "%d"); + break; + + case DATA_TYPE_UINT16_ARRAY: + NVPA(elem, uint16_array, uint16_t, int, "%u"); + break; + + case DATA_TYPE_INT32_ARRAY: + NVPA(elem, int32_array, int32_t, long, "%ld"); + break; + + case DATA_TYPE_UINT32_ARRAY: + NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu"); + break; + + case DATA_TYPE_INT64_ARRAY: + NVPA(elem, int64_array, int64_t, longlong_t, "%lld"); + break; + + case DATA_TYPE_UINT64_ARRAY: + NVPA(elem, uint64_array, uint64_t, u_longlong_t, + "%llu"); + break; + + case DATA_TYPE_STRING_ARRAY: + NVPA(elem, string_array, char *, char *, "'%s'"); + break; + + case DATA_TYPE_NVLIST: + (void) nvpair_value_nvlist(elem, &nvlist_value); + (void) printf("%*s%s:\n", indent, "", + nvpair_name(elem)); + dump_nvlist(nvlist_value, indent + 4); + break; + + case DATA_TYPE_NVLIST_ARRAY: + (void) nvpair_value_nvlist_array(elem, + 
&nvlist_array_value, &count); + for (i = 0; i < count; i++) { + (void) printf("%*s%s[%u]:\n", indent, "", + nvpair_name(elem), i); + dump_nvlist(nvlist_array_value[i], indent + 4); + } + break; + + default: + (void) printf(dgettext(TEXT_DOMAIN, "bad config type " + "%d for %s\n"), nvpair_type(elem), + nvpair_name(elem)); + } + } +} + +/* + * ====================================================================== + * | | + * | Misc private interface. | + * | | + * ====================================================================== + */ + +/* + * Determine if string 'value' matches 'nvp' value. The 'value' string is + * converted, depending on the type of 'nvp', prior to match. For numeric + * types, a radix independent sscanf conversion of 'value' is used. If 'nvp' + * is an array type, 'ai' is the index into the array against which we are + * checking for match. If nvp is of DATA_TYPE_STRING*, the caller can pass + * in a regex_t compilation of value in 'value_regex' to trigger regular + * expression string match instead of simple strcmp(). + * + * Return 1 on match, 0 on no-match, and -1 on error. If the error is + * related to value syntax error and 'ep' is non-NULL, *ep will point into + * the 'value' string at the location where the error exists. + * + * NOTE: It may be possible to move the non-regex_t version of this into + * common code used by library/kernel/boot. 
+ */
+int
+nvpair_value_match_regex(nvpair_t *nvp, int ai,
+    char *value, regex_t *value_regex, char **ep)
+{
+	char	*evalue;
+	uint_t	a_len;
+	int	sr;	/* sscanf result; 1 means "value converted cleanly" */
+
+	if (ep)
+		*ep = NULL;
+
+	if ((nvp == NULL) || (value == NULL))
+		return (-1);		/* error fail match - invalid args */
+
+	/* make sure array and index combination make sense */
+	if ((nvpair_type_is_array(nvp) && (ai < 0)) ||
+	    (!nvpair_type_is_array(nvp) && (ai >= 0)))
+		return (-1);		/* error fail match - bad index */
+
+	/* non-string values should be single 'chunk' */
+	if ((nvpair_type(nvp) != DATA_TYPE_STRING) &&
+	    (nvpair_type(nvp) != DATA_TYPE_STRING_ARRAY)) {
+		value += strspn(value, " \t");
+		evalue = value + strcspn(value, " \t");
+		if (*evalue) {
+			if (ep)
+				*ep = evalue;
+			return (-1);	/* error fail match - syntax */
+		}
+	}
+
+	sr = EOF;
+	switch (nvpair_type(nvp)) {
+	case DATA_TYPE_STRING: {
+		char *val;
+
+		/*
+		 * Strings are compared as-is (strcmp or regexec), so there is
+		 * no sscanf conversion that can fail.  Mark the conversion as
+		 * good so that a plain mismatch is reported below as
+		 * "fail match" (0) rather than as a syntax error (-1).
+		 */
+		sr = 1;
+
+		/* check string value for match */
+		if (nvpair_value_string(nvp, &val) == 0) {
+			if (value_regex) {
+				if (regexec(value_regex, val,
+				    (size_t)0, NULL, 0) == 0)
+					return (1);	/* match */
+			} else {
+				if (strcmp(value, val) == 0)
+					return (1);	/* match */
+			}
+		}
+		break;
+	}
+	case DATA_TYPE_STRING_ARRAY: {
+		char **val_array;
+
+		/* no sscanf conversion for strings; a mismatch is not an error */
+		sr = 1;
+
+		/* check indexed string value of array for match */
+		if ((nvpair_value_string_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len)) {
+			if (value_regex) {
+				if (regexec(value_regex, val_array[ai],
+				    (size_t)0, NULL, 0) == 0)
+					return (1);
+			} else {
+				if (strcmp(value, val_array[ai]) == 0)
+					return (1);
+			}
+		}
+		break;
+	}
+	case DATA_TYPE_BYTE: {
+		uchar_t val, val_arg;
+
+		/* scanf uchar_t from value and check for match */
+		sr = sscanf(value, "%c", &val_arg);
+		if ((sr == 1) && (nvpair_value_byte(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_BYTE_ARRAY: {
+		uchar_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%c", &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_byte_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_INT8: {
+		int8_t val, val_arg;
+
+		/* scanf int8_t from value and check for match */
+		sr = sscanf(value, "%"SCNi8, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_int8(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_INT8_ARRAY: {
+		int8_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi8, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_int8_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_UINT8: {
+		uint8_t val, val_arg;
+
+		/* scanf uint8_t from value and check for match */
+		sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_uint8(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_UINT8_ARRAY: {
+		uint8_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_uint8_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_INT16: {
+		int16_t val, val_arg;
+
+		/* scanf int16_t from value and check for match */
+		sr = sscanf(value, "%"SCNi16, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_int16(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_INT16_ARRAY: {
+		int16_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi16, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_int16_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_UINT16: {
+		uint16_t val, val_arg;
+
+		/* scanf uint16_t from value and check for match */
+		sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_uint16(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_UINT16_ARRAY: {
+		uint16_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_uint16_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_INT32: {
+		int32_t val, val_arg;
+
+		/* scanf int32_t from value and check for match */
+		sr = sscanf(value, "%"SCNi32, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_int32(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_INT32_ARRAY: {
+		int32_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi32, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_int32_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_UINT32: {
+		uint32_t val, val_arg;
+
+		/* scanf uint32_t from value and check for match */
+		sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_uint32(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_UINT32_ARRAY: {
+		uint32_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_uint32_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_INT64: {
+		int64_t val, val_arg;
+
+		/* scanf int64_t from value and check for match */
+		sr = sscanf(value, "%"SCNi64, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_int64(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_INT64_ARRAY: {
+		int64_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi64, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_int64_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_UINT64: {
+		uint64_t val_arg, val;
+
+		/* scanf uint64_t from value and check for match */
+		sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_uint64(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_UINT64_ARRAY: {
+		uint64_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_uint64_array(nvp, &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_BOOLEAN_VALUE: {
+		boolean_t val, val_arg;
+
+		/* scanf boolean_t from value and check for match */
+		sr = sscanf(value, "%"SCNi32, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_boolean_value(nvp, &val) == 0) &&
+		    (val == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_BOOLEAN_ARRAY: {
+		boolean_t *val_array, val_arg;
+
+		/* check indexed value of array for match */
+		sr = sscanf(value, "%"SCNi32, &val_arg);
+		if ((sr == 1) &&
+		    (nvpair_value_boolean_array(nvp,
+		    &val_array, &a_len) == 0) &&
+		    (ai < a_len) &&
+		    (val_array[ai] == val_arg))
+			return (1);
+		break;
+	}
+	case DATA_TYPE_HRTIME:
+	case DATA_TYPE_NVLIST:
+	case DATA_TYPE_NVLIST_ARRAY:
+	case DATA_TYPE_BOOLEAN:
+	case DATA_TYPE_DOUBLE:
+	case DATA_TYPE_UNKNOWN:
+	default:
+		/*
+		 * unknown/unsupported data type
+		 */
+		return (-1);		/* error fail match */
+	}
+
+	/*
+	 * check to see if sscanf failed conversion, return approximate
+	 * pointer to problem
+	 */
+	if (sr != 1) {
+		if (ep)
+			*ep = value;
+		return (-1);		/* error fail match - syntax */
+	}
+
+	return (0);			/* fail match */
+}
+
+/*
+ * Convenience form of nvpair_value_match_regex() that passes no compiled
+ * regex, so string values are compared with strcmp().
+ */
+int
+nvpair_value_match(nvpair_t *nvp, int ai, char *value, char **ep)
+{
+	return (nvpair_value_match_regex(nvp, ai, value, NULL, ep));
+}
diff --git a/lib/libnvpair/libnvpair.h
b/lib/libnvpair/libnvpair.h new file mode 100644 index 0000000..4c2615d --- /dev/null +++ b/lib/libnvpair/libnvpair.h @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _LIBNVPAIR_H +#define _LIBNVPAIR_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * All interfaces described in this file are private to Solaris, and + * are subject to change at any time and without notice. The public + * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR, + * are all imported from <sys/nvpair.h> included above. + */ + +extern int nvpair_value_match(nvpair_t *, int, char *, char **); +extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, + char **); + +extern void nvlist_print(FILE *, nvlist_t *); +extern void dump_nvlist(nvlist_t *, int); + +/* + * Private nvlist printing interface that allows the caller some control + * over output rendering (as opposed to nvlist_print and dump_nvlist).
+ * + * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc + * (NULL on failure); on return the cookie is set up for default formatting + * and rendering. Quote the cookie in subsequent customisation functions and + * then pass the cookie to nvlist_prt to render the nvlist. Finally, + * use nvlist_prtctl_free to release the cookie. + * + * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions + * we have a corresponding brace of functions that appoint replacement + * rendering functions: + * + * extern void nvlist_prtctlop_xxx(nvlist_prtctl_t, + * int (*)(nvlist_prtctl_t ctl, void *private, nvlist_t *nvl, + * const char *name, xxxtype value), void *private) + * + * and + * + * extern void nvlist_prtctlop_xxx_array(nvlist_prtctl_t, + * int (*)(nvlist_prtctl_t ctl, void *private, nvlist_t *nvl, + * const char *name, xxxtype *values, uint_t count), void *private) + * + * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8" + * and char * for "string". The function that is appointed to render the + * specified datatype receives as arguments the cookie, the private pointer + * registered along with it, the nvlist being rendered, the nvlist + * member name, the value of that member (or a pointer for array function), + * and (for array rendering functions) a count of the number of elements.
+ */ + +typedef struct nvlist_prtctl *nvlist_prtctl_t; /* opaque */ + +enum nvlist_indent_mode { + NVLIST_INDENT_ABS, /* Absolute indentation */ + NVLIST_INDENT_TABBED /* Indent with tabstops */ +}; + +extern nvlist_prtctl_t nvlist_prtctl_alloc(void); +extern void nvlist_prtctl_free(nvlist_prtctl_t); +extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t); + +/* Output stream */ +extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *); +extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t); + +/* Indentation mode, start indent, indent increment; default tabbed/0/1 */ +extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode, + int, int); +extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int); + +enum nvlist_prtctl_fmt { + NVLIST_FMT_MEMBER_NAME, /* name fmt; default "%s = " */ + NVLIST_FMT_MEMBER_POSTAMBLE, /* after nvlist member; default "\n" */ + NVLIST_FMT_BTWN_ARRAY /* between array members; default " " */ +}; + +extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, + const char *); +extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...); + +/* + * Function prototypes for interfaces that appoint a new rendering function + * for single-valued nvlist members. + * + * A replacement function receives arguments as follows: + * + * nvlist_prtctl_t Print control structure; do not change preferences + * for this object from a print callback function. + * + * void * The function-private cookie argument registered + * when the replacement function was appointed. + * + * nvlist_t * The full nvlist that is being processed. The + * rendering function is called to render a single + * member (name and value passed as below) but it may + * want to reference or incorporate other aspects of + * the full nvlist. 
+ * + * const char * Member name to render + * + * valtype Value of the member to render + * + * The function must return non-zero if it has rendered output for this + * member, or 0 if it wants to default to standard rendering for this + * one member. + */ + +#define NVLIST_PRINTCTL_SVDECL(funcname, valtype) \ + extern void funcname(nvlist_prtctl_t, \ + int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \ + void *) + +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *); + +#undef NVLIST_PRINTCTL_SVDECL /* was just for "clarity" above */ + +/* + * Function prototypes for interfaces that appoint a new rendering function + * for array-valued nvlist members. + * + * One additional argument is taken: uint_t for the number of array elements + * + * Return values as above. 
+ */ +#define NVLIST_PRINTCTL_AVDECL(funcname, vtype) \ + extern void funcname(nvlist_prtctl_t, \ + int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \ + void *) + +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **); + +#undef NVLIST_PRINTCTL_AVDECL /* was just for "clarity" above */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBNVPAIR_H */ diff --git a/lib/libnvpair/nvpair_alloc_system.c b/lib/libnvpair/nvpair_alloc_system.c new file mode 100644 index 0000000..1aefc10 --- /dev/null +++ b/lib/libnvpair/nvpair_alloc_system.c @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include +#include +/* + * nv_ao_alloc() hook of the ops table below: returns size bytes obtained + * from malloc(). The nva handle is not consulted. + */ +/*ARGSUSED*/ +static void * +nv_alloc_sys(nv_alloc_t *nva, size_t size) +{ + return (malloc(size)); +} +/* + * nv_ao_free() hook of the ops table below: releases buf with free(). + * Neither the nva handle nor size is used. + */ +/*ARGSUSED*/ +static void +nv_free_sys(nv_alloc_t *nva, void *buf, size_t size) +{ + free(buf); +} +/* + * Operations vector wiring the two malloc()/free() hooks above; the + * init, fini and reset slots are left NULL. + */ +const nv_alloc_ops_t system_ops_def = { + NULL, /* nv_ao_init() */ + NULL, /* nv_ao_fini() */ + nv_alloc_sys, /* nv_ao_alloc() */ + nv_free_sys, /* nv_ao_free() */ + NULL /* nv_ao_reset() */ +}; +/* + * Allocator instance built on system_ops_def (second member NULL); it is + * exported through the nv_alloc_nosleep pointer defined right after it. + */ +nv_alloc_t nv_alloc_nosleep_def = { + &system_ops_def, + NULL +}; + +nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; diff --git a/lib/libuutil/common/libuutil.h b/lib/libuutil/common/libuutil.h new file mode 100644 index 0000000..6675424 --- /dev/null +++ b/lib/libuutil/common/libuutil.h @@ -0,0 +1,390 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _LIBUUTIL_H +#define _LIBUUTIL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Standard flags codes. + */ +#define UU_DEFAULT 0 + +/* + * Standard error codes. + */ +#define UU_ERROR_NONE 0 /* no error */ +#define UU_ERROR_INVALID_ARGUMENT 1 /* invalid argument */ +#define UU_ERROR_UNKNOWN_FLAG 2 /* passed flag invalid */ +#define UU_ERROR_NO_MEMORY 3 /* out of memory */ +#define UU_ERROR_CALLBACK_FAILED 4 /* callback-initiated error */ +#define UU_ERROR_NOT_SUPPORTED 5 /* operation not supported */ +#define UU_ERROR_EMPTY 6 /* no value provided */ +#define UU_ERROR_UNDERFLOW 7 /* value is too small */ +#define UU_ERROR_OVERFLOW 8 /* value is too large */ +#define UU_ERROR_INVALID_CHAR 9 /* value contains unexpected char */ +#define UU_ERROR_INVALID_DIGIT 10 /* value contains digit not in base */ + +#define UU_ERROR_SYSTEM 99 /* underlying system error */ +#define UU_ERROR_UNKNOWN 100 /* error status not known */ + +/* + * Standard program exit codes. + */ +#define UU_EXIT_OK (*(uu_exit_ok())) +#define UU_EXIT_FATAL (*(uu_exit_fatal())) +#define UU_EXIT_USAGE (*(uu_exit_usage())) + +/* + * Exit status profiles. + */ +#define UU_PROFILE_DEFAULT 0 +#define UU_PROFILE_LAUNCHER 1 + +/* + * Error reporting functions. + */ +uint32_t uu_error(void); +const char *uu_strerror(uint32_t); + +/* + * Program notification functions. + */ +extern void uu_alt_exit(int); +extern const char *uu_setpname(char *); +extern const char *uu_getpname(void); +/*PRINTFLIKE1*/ +extern void uu_warn(const char *, ...); +extern void uu_vwarn(const char *, va_list); +/*PRINTFLIKE1*/ +extern void uu_die(const char *, ...)
__NORETURN; +extern void uu_vdie(const char *, va_list) __NORETURN; +/*PRINTFLIKE2*/ +extern void uu_xdie(int, const char *, ...) __NORETURN; +extern void uu_vxdie(int, const char *, va_list) __NORETURN; + +/* + * Exit status functions (not to be used directly) + */ +extern int *uu_exit_ok(void); +extern int *uu_exit_fatal(void); +extern int *uu_exit_usage(void); + +/* + * string->number conversions + */ +extern int uu_strtoint(const char *, void *, size_t, int, int64_t, int64_t); +extern int uu_strtouint(const char *, void *, size_t, int, uint64_t, uint64_t); + +/* + * Debug print facility functions. + */ +typedef struct uu_dprintf uu_dprintf_t; + +typedef enum { + UU_DPRINTF_SILENT, + UU_DPRINTF_FATAL, + UU_DPRINTF_WARNING, + UU_DPRINTF_NOTICE, + UU_DPRINTF_INFO, + UU_DPRINTF_DEBUG +} uu_dprintf_severity_t; + +extern uu_dprintf_t *uu_dprintf_create(const char *, uu_dprintf_severity_t, + uint_t); +/*PRINTFLIKE3*/ +extern void uu_dprintf(uu_dprintf_t *, uu_dprintf_severity_t, + const char *, ...); +extern void uu_dprintf_destroy(uu_dprintf_t *); +extern const char *uu_dprintf_getname(uu_dprintf_t *); + +/* + * Identifier test flags and function. + */ +#define UU_NAME_DOMAIN 0x1 /* allow SUNW, or com.sun, prefix */ +#define UU_NAME_PATH 0x2 /* allow '/'-delimited paths */ + +int uu_check_name(const char *, uint_t); + +/* + * File creation functions. + */ +extern int uu_open_tmp(const char *dir, uint_t uflags); + +/* + * Convenience functions. 
+ */ +#define UU_NELEM(a) (sizeof (a) / sizeof ((a)[0])) + +/*PRINTFLIKE1*/ +extern char *uu_msprintf(const char *format, ...); +extern void *uu_zalloc(size_t); +extern char *uu_strdup(const char *); +extern void uu_free(void *); + +extern boolean_t uu_strcaseeq(const char *a, const char *b); +extern boolean_t uu_streq(const char *a, const char *b); +extern char *uu_strndup(const char *s, size_t n); +extern boolean_t uu_strbw(const char *a, const char *b); +extern void *uu_memdup(const void *buf, size_t sz); +extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len); + +/* + * Comparison function type definition. + * Developers should be careful in their use of the _private argument. If you + * break interface guarantees, you get undefined behavior. + */ +typedef int uu_compare_fn_t(const void *__left, const void *__right, + void *__private); + +/* + * Walk variant flags. + * A data structure need not provide support for all variants and + * combinations. Refer to the appropriate documentation. + */ +#define UU_WALK_ROBUST 0x00000001 /* walk can survive removes */ +#define UU_WALK_REVERSE 0x00000002 /* reverse walk order */ + +#define UU_WALK_PREORDER 0x00000010 /* walk tree in pre-order */ +#define UU_WALK_POSTORDER 0x00000020 /* walk tree in post-order */ + +/* + * Walk callback function return codes. + */ +#define UU_WALK_ERROR -1 +#define UU_WALK_NEXT 0 +#define UU_WALK_DONE 1 + +/* + * Walk callback function type definition. + */ +typedef int uu_walk_fn_t(void *_elem, void *_private); + +/* + * lists: opaque structures + */ +typedef struct uu_list_pool uu_list_pool_t; +typedef struct uu_list uu_list_t; + +typedef struct uu_list_node { + uintptr_t uln_opaque[2]; +} uu_list_node_t; + +typedef struct uu_list_walk uu_list_walk_t; + +typedef uintptr_t uu_list_index_t; + +/* + * lists: interface + * + * basic usage: + * typedef struct foo { + * ... + * uu_list_node_t foo_node; + * ... 
+ * } foo_t; + * + * static int + * foo_compare(const void *l_arg, const void *r_arg, void *private) + * { + * const foo_t *l = l_arg; + * const foo_t *r = r_arg; + * + * if (... l greater than r ...) + * return (1); + * if (... l less than r ...) + * return (-1); + * return (0); + * } + * + * ... + * // at initialization time + * foo_pool = uu_list_pool_create("foo_pool", + * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare, + * debugging ? UU_LIST_POOL_DEBUG : 0); + * ... + */ +uu_list_pool_t *uu_list_pool_create(const char *, size_t, size_t, + uu_compare_fn_t *, uint32_t); +#define UU_LIST_POOL_DEBUG 0x00000001 + +void uu_list_pool_destroy(uu_list_pool_t *); + +/* + * usage: + * + * foo_t *a; + * a = malloc(sizeof(*a)); + * uu_list_node_init(a, &a->foo_node, pool); + * ... + * uu_list_node_fini(a, &a->foo_node, pool); + * free(a); + */ +void uu_list_node_init(void *, uu_list_node_t *, uu_list_pool_t *); +void uu_list_node_fini(void *, uu_list_node_t *, uu_list_pool_t *); + +uu_list_t *uu_list_create(uu_list_pool_t *, void *_parent, uint32_t); +#define UU_LIST_DEBUG 0x00000001 +#define UU_LIST_SORTED 0x00000002 /* list is sorted */ + +void uu_list_destroy(uu_list_t *); /* list must be empty */ + +size_t uu_list_numnodes(uu_list_t *); + +void *uu_list_first(uu_list_t *); +void *uu_list_last(uu_list_t *); + +void *uu_list_next(uu_list_t *, void *); +void *uu_list_prev(uu_list_t *, void *); + +int uu_list_walk(uu_list_t *, uu_walk_fn_t *, void *, uint32_t); + +uu_list_walk_t *uu_list_walk_start(uu_list_t *, uint32_t); +void *uu_list_walk_next(uu_list_walk_t *); +void uu_list_walk_end(uu_list_walk_t *); + +void *uu_list_find(uu_list_t *, void *, void *, uu_list_index_t *); +void uu_list_insert(uu_list_t *, void *, uu_list_index_t); + +void *uu_list_nearest_next(uu_list_t *, uu_list_index_t); +void *uu_list_nearest_prev(uu_list_t *, uu_list_index_t); + +void *uu_list_teardown(uu_list_t *, void **); + +void uu_list_remove(uu_list_t *, void *); + +/* + * lists: interfaces for non-sorted
lists only + */ +int uu_list_insert_before(uu_list_t *, void *_target, void *_elem); +int uu_list_insert_after(uu_list_t *, void *_target, void *_elem); + +/* + * avl trees: opaque structures + */ +typedef struct uu_avl_pool uu_avl_pool_t; +typedef struct uu_avl uu_avl_t; + +typedef struct uu_avl_node { +#ifdef _LP64 + uintptr_t uan_opaque[3]; +#else + uintptr_t uan_opaque[4]; +#endif +} uu_avl_node_t; + +typedef struct uu_avl_walk uu_avl_walk_t; + +typedef uintptr_t uu_avl_index_t; + +/* + * avl trees: interface + * + * basic usage: + * typedef struct foo { + * ... + * uu_avl_node_t foo_node; + * ... + * } foo_t; + * + * static int + * foo_compare(const void *l_arg, const void *r_arg, void *private) + * { + * const foo_t *l = l_arg; + * const foo_t *r = r_arg; + * + * if (... l greater than r ...) + * return (1); + * if (... l less than r ...) + * return (-1); + * return (0); + * } + * + * ... + * // at initialization time + * foo_pool = uu_avl_pool_create("foo_pool", + * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare, + * debugging ? UU_AVL_POOL_DEBUG : 0); + * ... + */ +uu_avl_pool_t *uu_avl_pool_create(const char *, size_t, size_t, + uu_compare_fn_t *, uint32_t); +#define UU_AVL_POOL_DEBUG 0x00000001 + +void uu_avl_pool_destroy(uu_avl_pool_t *); + +/* + * usage: + * + * foo_t *a; + * a = malloc(sizeof(*a)); + * uu_avl_node_init(a, &a->foo_node, pool); + * ...
+ * uu_avl_node_fini(a, &a->foo_node, pool); + * free(a); + */ +void uu_avl_node_init(void *, uu_avl_node_t *, uu_avl_pool_t *); +void uu_avl_node_fini(void *, uu_avl_node_t *, uu_avl_pool_t *); + +uu_avl_t *uu_avl_create(uu_avl_pool_t *, void *_parent, uint32_t); +#define UU_AVL_DEBUG 0x00000001 + +void uu_avl_destroy(uu_avl_t *); /* tree must be empty */ + +size_t uu_avl_numnodes(uu_avl_t *); + +void *uu_avl_first(uu_avl_t *); +void *uu_avl_last(uu_avl_t *); + +void *uu_avl_next(uu_avl_t *, void *); +void *uu_avl_prev(uu_avl_t *, void *); + +int uu_avl_walk(uu_avl_t *, uu_walk_fn_t *, void *, uint32_t); + +uu_avl_walk_t *uu_avl_walk_start(uu_avl_t *, uint32_t); +void *uu_avl_walk_next(uu_avl_walk_t *); +void uu_avl_walk_end(uu_avl_walk_t *); + +void *uu_avl_find(uu_avl_t *, void *, void *, uu_avl_index_t *); +void uu_avl_insert(uu_avl_t *, void *, uu_avl_index_t); + +void *uu_avl_nearest_next(uu_avl_t *, uu_avl_index_t); +void *uu_avl_nearest_prev(uu_avl_t *, uu_avl_index_t); + +void *uu_avl_teardown(uu_avl_t *, void **); + +void uu_avl_remove(uu_avl_t *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBUUTIL_H */ diff --git a/lib/libuutil/common/libuutil_common.h b/lib/libuutil/common/libuutil_common.h new file mode 100644 index 0000000..9ebaaed --- /dev/null +++ b/lib/libuutil/common/libuutil_common.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBUUTIL_COMMON_H +#define _LIBUUTIL_COMMON_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include +#include + +#endif /* _LIBUUTIL_COMMON_H */ diff --git a/lib/libuutil/common/libuutil_impl.h b/lib/libuutil/common/libuutil_impl.h new file mode 100644 index 0000000..9466e59 --- /dev/null +++ b/lib/libuutil/common/libuutil_impl.h @@ -0,0 +1,181 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _LIBUUTIL_IMPL_H +#define _LIBUUTIL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include +#include + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void uu_set_error(uint_t); +#pragma rarely_called(uu_set_error) + +/*PRINTFLIKE1*/ +void uu_panic(const char *format, ...); +#pragma rarely_called(uu_panic) + +struct uu_dprintf { + char *uud_name; + uu_dprintf_severity_t uud_severity; + uint_t uud_flags; +}; + +/* + * For debugging purposes, libuutil keeps around linked lists of all uu_lists + * and uu_avls, along with pointers to their parents. These can cause false + * negatives when looking for memory leaks, so we encode the pointers by + * storing them with swapped endianness; this is not perfect, but it's about + * the best we can do without wasting a lot of space. + */ +#ifdef _LP64 +#define UU_PTR_ENCODE(ptr) BSWAP_64((uintptr_t)(void *)(ptr)) +#else +#define UU_PTR_ENCODE(ptr) BSWAP_32((uintptr_t)(void *)(ptr)) +#endif + +#define UU_PTR_DECODE(ptr) ((void *)UU_PTR_ENCODE(ptr)) + +/* + * uu_list structures + */ +typedef struct uu_list_node_impl { + struct uu_list_node_impl *uln_next; + struct uu_list_node_impl *uln_prev; +} uu_list_node_impl_t; + +struct uu_list_walk { + uu_list_walk_t *ulw_next; + uu_list_walk_t *ulw_prev; + + uu_list_t *ulw_list; + int8_t ulw_dir; + uint8_t ulw_robust; + uu_list_node_impl_t *ulw_next_result; +}; + +struct uu_list { + uintptr_t ul_next_enc; + uintptr_t ul_prev_enc; + + uu_list_pool_t *ul_pool; + uintptr_t ul_parent_enc; /* encoded parent pointer */ + size_t ul_offset; + size_t ul_numnodes; + uint8_t ul_debug; + uint8_t ul_sorted; + uint8_t ul_index; /* mark for uu_list_index_ts */ + + uu_list_node_impl_t ul_null_node; + uu_list_walk_t ul_null_walk; /* for robust walkers */ +}; + +#define UU_LIST_PTR(ptr) ((uu_list_t *)UU_PTR_DECODE(ptr)) + +#define UU_LIST_POOL_MAXNAME 64 + +struct uu_list_pool { + uu_list_pool_t *ulp_next; + uu_list_pool_t *ulp_prev; + + char 
ulp_name[UU_LIST_POOL_MAXNAME]; + size_t ulp_nodeoffset; + size_t ulp_objsize; + uu_compare_fn_t *ulp_cmp; + uint8_t ulp_debug; + uint8_t ulp_last_index; + pthread_mutex_t ulp_lock; /* protects null_list */ + uu_list_t ulp_null_list; +}; + +/* + * uu_avl structures + */ +typedef struct avl_node uu_avl_node_impl_t; + +struct uu_avl_walk { + uu_avl_walk_t *uaw_next; + uu_avl_walk_t *uaw_prev; + + uu_avl_t *uaw_avl; + void *uaw_next_result; + int8_t uaw_dir; + uint8_t uaw_robust; +}; + +struct uu_avl { + uintptr_t ua_next_enc; + uintptr_t ua_prev_enc; + + uu_avl_pool_t *ua_pool; + uintptr_t ua_parent_enc; + uint8_t ua_debug; + uint8_t ua_index; /* mark for uu_avl_index_ts */ + + struct avl_tree ua_tree; + uu_avl_walk_t ua_null_walk; +}; + +#define UU_AVL_PTR(x) ((uu_avl_t *)UU_PTR_DECODE(x)) + +#define UU_AVL_POOL_MAXNAME 64 + +struct uu_avl_pool { + uu_avl_pool_t *uap_next; + uu_avl_pool_t *uap_prev; + + char uap_name[UU_AVL_POOL_MAXNAME]; + size_t uap_nodeoffset; + size_t uap_objsize; + uu_compare_fn_t *uap_cmp; + uint8_t uap_debug; + uint8_t uap_last_index; + pthread_mutex_t uap_lock; /* protects null_avl */ + uu_avl_t uap_null_avl; +}; + +/* + * atfork() handlers + */ +void uu_avl_lockup(void); +void uu_avl_release(void); + +void uu_list_lockup(void); +void uu_list_release(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBUUTIL_IMPL_H */ diff --git a/lib/libuutil/common/uu_alloc.c b/lib/libuutil/common/uu_alloc.c new file mode 100644 index 0000000..2bef759 --- /dev/null +++ b/lib/libuutil/common/uu_alloc.c @@ -0,0 +1,135 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "libuutil_common.h" + +#include +#include +#include +#include + +void * +uu_zalloc(size_t n) +{ + void *p = malloc(n); + + if (p == NULL) { + uu_set_error(UU_ERROR_SYSTEM); + return (NULL); + } + + (void) memset(p, 0, n); + + return (p); +} + +void +uu_free(void *p) +{ + free(p); +} + +char * +uu_strdup(const char *str) +{ + char *buf = NULL; + + if (str != NULL) { + size_t sz; + + sz = strlen(str) + 1; + buf = uu_zalloc(sz); + if (buf != NULL) + (void) memcpy(buf, str, sz); + } + return (buf); +} + +/* + * Duplicate up to n bytes of a string. Kind of sort of like + * strdup(strlcpy(s, n)). + */ +char * +uu_strndup(const char *s, size_t n) +{ + size_t len; + char *p; + + len = strnlen(s, n); + p = uu_zalloc(len + 1); + if (p == NULL) + return (NULL); + + if (len > 0) + (void) memcpy(p, s, len); + p[len] = '\0'; + + return (p); +} + +/* + * Duplicate a block of memory. Combines malloc with memcpy, much as + * strdup combines malloc, strlen, and strcpy. + */ +void * +uu_memdup(const void *buf, size_t sz) +{ + void *p; + + p = uu_zalloc(sz); + if (p == NULL) + return (NULL); + (void) memcpy(p, buf, sz); + return (p); +} + +char * +uu_msprintf(const char *format, ...) 
+{ + va_list args; + char attic[1]; + uint_t M, m; + char *b; + + va_start(args, format); + M = vsnprintf(attic, 1, format, args); + va_end(args); + + for (;;) { + m = M; + if ((b = uu_zalloc(m + 1)) == NULL) + return (NULL); + + va_start(args, format); + M = vsnprintf(b, m + 1, format, args); + va_end(args); + + if (M == m) + break; /* sizes match */ + + uu_free(b); + } + + return (b); +} diff --git a/lib/libuutil/common/uu_avl.c b/lib/libuutil/common/uu_avl.c new file mode 100644 index 0000000..308e920 --- /dev/null +++ b/lib/libuutil/common/uu_avl.c @@ -0,0 +1,569 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "libuutil_common.h" + +#include +#include +#include +#include + +static uu_avl_pool_t uu_null_apool = { &uu_null_apool, &uu_null_apool }; +static pthread_mutex_t uu_apool_list_lock = PTHREAD_MUTEX_INITIALIZER; + +/* + * The index mark change on every insert and delete, to catch stale + * references. + * + * We leave the low bit alone, since the avl code uses it. 
+ */ +#define INDEX_MAX (sizeof (uintptr_t) - 2) +#define INDEX_NEXT(m) (((m) == INDEX_MAX)? 2 : (((m) + 2) & INDEX_MAX)) + +#define INDEX_DECODE(i) ((i) & ~INDEX_MAX) +#define INDEX_ENCODE(p, n) (((n) & ~INDEX_MAX) | (p)->ua_index) +#define INDEX_VALID(p, i) (((i) & INDEX_MAX) == (p)->ua_index) +#define INDEX_CHECK(i) (((i) & INDEX_MAX) != 0) + +/* + * When an element is inactive (not in a tree), we keep a marked pointer to + * its containing pool in its first word, and a NULL pointer in its second. + * + * On insert, we use these to verify that it comes from the correct pool. + * + * NODE_ARRAY(p, n) views the node embedded in element n of pool p as an + * array of words. It expands its own pool argument rather than relying on + * the caller having a local that happens to be named "pp". + */ +#define NODE_ARRAY(p, n) ((uintptr_t *)((uintptr_t)(n) + \ + (p)->uap_nodeoffset)) + +#define POOL_TO_MARKER(pp) (((uintptr_t)(pp) | 1)) + +#define DEAD_MARKER 0xc4 + +/* + * Create a pool describing objects of objsize bytes that embed a + * uu_avl_node_t at nodeoffset and are ordered by compare_func. + * + * Fails with UU_ERROR_INVALID_ARGUMENT if name is NULL or rejected by + * uu_check_name(), if the node does not fit inside the object, or if + * compare_func is NULL; with UU_ERROR_UNKNOWN_FLAG for any flag other than + * UU_AVL_POOL_DEBUG; and with UU_ERROR_NO_MEMORY if allocation fails. + * On success the pool is linked onto the global pool list under + * uu_apool_list_lock. + */ +uu_avl_pool_t * +uu_avl_pool_create(const char *name, size_t objsize, size_t nodeoffset, + uu_compare_fn_t *compare_func, uint32_t flags) +{ + uu_avl_pool_t *pp, *next, *prev; + + if (name == NULL || + uu_check_name(name, UU_NAME_DOMAIN) == -1 || + nodeoffset + sizeof (uu_avl_node_t) > objsize || + compare_func == NULL) { + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (NULL); + } + + if (flags & ~UU_AVL_POOL_DEBUG) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + pp = uu_zalloc(sizeof (uu_avl_pool_t)); + if (pp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + (void) strlcpy(pp->uap_name, name, sizeof (pp->uap_name)); + pp->uap_nodeoffset = nodeoffset; + pp->uap_objsize = objsize; + pp->uap_cmp = compare_func; + if (flags & UU_AVL_POOL_DEBUG) + pp->uap_debug = 1; + pp->uap_last_index = 0; + + (void) pthread_mutex_init(&pp->uap_lock, NULL); + + /* an empty pool's list of avls points back at its own sentinel */ + pp->uap_null_avl.ua_next_enc = UU_PTR_ENCODE(&pp->uap_null_avl); + pp->uap_null_avl.ua_prev_enc = UU_PTR_ENCODE(&pp->uap_null_avl); + + (void) pthread_mutex_lock(&uu_apool_list_lock); + pp->uap_next = next = &uu_null_apool; + pp->uap_prev = prev = next->uap_prev; + next->uap_prev = pp; + prev->uap_next = pp; + (void) 
pthread_mutex_unlock(&uu_apool_list_lock); + + return (pp); +} + +/* + * Unlink pp from the global pool list and free it. A debug pool panics + * first if its list of avls is not empty. + */ +void +uu_avl_pool_destroy(uu_avl_pool_t *pp) +{ + if (pp->uap_debug) { + if (pp->uap_null_avl.ua_next_enc != + UU_PTR_ENCODE(&pp->uap_null_avl) || + pp->uap_null_avl.ua_prev_enc != + UU_PTR_ENCODE(&pp->uap_null_avl)) { + uu_panic("uu_avl_pool_destroy: Pool \"%.*s\" (%p) has " + "outstanding avls, or is corrupt.\n", + (int)sizeof (pp->uap_name), pp->uap_name, + (void *)pp); + } + } + (void) pthread_mutex_lock(&uu_apool_list_lock); + pp->uap_next->uap_prev = pp->uap_prev; + pp->uap_prev->uap_next = pp->uap_next; + (void) pthread_mutex_unlock(&uu_apool_list_lock); + pp->uap_prev = NULL; + pp->uap_next = NULL; + uu_free(pp); +} + +/* + * Put the node np, embedded in the object at base, into the inactive + * state: the pool marker in its first word and zero in its second. + * A debug pool panics if np does not lie at the pool's node offset or + * does not fit inside the object. + */ +void +uu_avl_node_init(void *base, uu_avl_node_t *np, uu_avl_pool_t *pp) +{ + uintptr_t *na = (uintptr_t *)np; + + if (pp->uap_debug) { + uintptr_t offset = (uintptr_t)np - (uintptr_t)base; + if (offset + sizeof (*np) > pp->uap_objsize) { + uu_panic("uu_avl_node_init(%p, %p, %p (\"%s\")): " + "offset %ld doesn't fit in object (size %ld)\n", + base, (void *)np, (void *)pp, pp->uap_name, + (long)offset, (long)pp->uap_objsize); + } + if (offset != pp->uap_nodeoffset) { + /* report the offset that was compared, not the size */ + uu_panic("uu_avl_node_init(%p, %p, %p (\"%s\")): " + "offset %ld doesn't match pool's offset (%ld)\n", + base, (void *)np, (void *)pp, pp->uap_name, + (long)offset, (long)pp->uap_nodeoffset); + } + } + + na[0] = POOL_TO_MARKER(pp); + na[1] = 0; +} + +/* + * Retire an inactive node by overwriting its words with DEAD_MARKER. + * A debug pool panics if the node was already retired or is not in the + * inactive state for this pool. + */ +void +uu_avl_node_fini(void *base, uu_avl_node_t *np, uu_avl_pool_t *pp) +{ + uintptr_t *na = (uintptr_t *)np; + + if (pp->uap_debug) { + if (na[0] == DEAD_MARKER && na[1] == DEAD_MARKER) { + uu_panic("uu_avl_node_fini(%p, %p, %p (\"%s\")): " + "node already finied\n", + base, (void *)np, (void *)pp, pp->uap_name); + } + if (na[0] != POOL_TO_MARKER(pp) || na[1] != 0) { + uu_panic("uu_avl_node_fini(%p, %p, %p (\"%s\")): " + "node corrupt, in tree, or in different pool\n", + base, (void *)np, (void *)pp, pp->uap_name); + } + } + + na[0] = DEAD_MARKER; + na[1] = 
DEAD_MARKER; + na[2] = DEAD_MARKER; +} + +struct uu_avl_node_compare_info { + uu_compare_fn_t *ac_compare; + void *ac_private; + void *ac_right; + void *ac_found; +}; + +static int +uu_avl_node_compare(const void *l, const void *r) +{ + struct uu_avl_node_compare_info *info = + (struct uu_avl_node_compare_info *)l; + + int res = info->ac_compare(r, info->ac_right, info->ac_private); + + if (res == 0) { + if (info->ac_found == NULL) + info->ac_found = (void *)r; + return (-1); + } + if (res < 0) + return (1); + return (-1); +} + +uu_avl_t * +uu_avl_create(uu_avl_pool_t *pp, void *parent, uint32_t flags) +{ + uu_avl_t *ap, *next, *prev; + + if (flags & ~UU_AVL_DEBUG) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + ap = uu_zalloc(sizeof (*ap)); + if (ap == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + ap->ua_pool = pp; + ap->ua_parent_enc = UU_PTR_ENCODE(parent); + ap->ua_debug = pp->uap_debug || (flags & UU_AVL_DEBUG); + ap->ua_index = (pp->uap_last_index = INDEX_NEXT(pp->uap_last_index)); + + avl_create(&ap->ua_tree, &uu_avl_node_compare, pp->uap_objsize, + pp->uap_nodeoffset); + + ap->ua_null_walk.uaw_next = &ap->ua_null_walk; + ap->ua_null_walk.uaw_prev = &ap->ua_null_walk; + + (void) pthread_mutex_lock(&pp->uap_lock); + next = &pp->uap_null_avl; + prev = UU_PTR_DECODE(next->ua_prev_enc); + ap->ua_next_enc = UU_PTR_ENCODE(next); + ap->ua_prev_enc = UU_PTR_ENCODE(prev); + next->ua_prev_enc = UU_PTR_ENCODE(ap); + prev->ua_next_enc = UU_PTR_ENCODE(ap); + (void) pthread_mutex_unlock(&pp->uap_lock); + + return (ap); +} + +void +uu_avl_destroy(uu_avl_t *ap) +{ + uu_avl_pool_t *pp = ap->ua_pool; + + if (ap->ua_debug) { + if (avl_numnodes(&ap->ua_tree) != 0) { + uu_panic("uu_avl_destroy(%p): tree not empty\n", + (void *)ap); + } + if (ap->ua_null_walk.uaw_next != &ap->ua_null_walk || + ap->ua_null_walk.uaw_prev != &ap->ua_null_walk) { + uu_panic("uu_avl_destroy(%p): outstanding walkers\n", + (void *)ap); + } + } + (void) 
pthread_mutex_lock(&pp->uap_lock); + UU_AVL_PTR(ap->ua_next_enc)->ua_prev_enc = ap->ua_prev_enc; + UU_AVL_PTR(ap->ua_prev_enc)->ua_next_enc = ap->ua_next_enc; + (void) pthread_mutex_unlock(&pp->uap_lock); + ap->ua_prev_enc = UU_PTR_ENCODE(NULL); + ap->ua_next_enc = UU_PTR_ENCODE(NULL); + + ap->ua_pool = NULL; + avl_destroy(&ap->ua_tree); + + uu_free(ap); +} + +size_t +uu_avl_numnodes(uu_avl_t *ap) +{ + return (avl_numnodes(&ap->ua_tree)); +} + +void * +uu_avl_first(uu_avl_t *ap) +{ + return (avl_first(&ap->ua_tree)); +} + +void * +uu_avl_last(uu_avl_t *ap) +{ + return (avl_last(&ap->ua_tree)); +} + +void * +uu_avl_next(uu_avl_t *ap, void *node) +{ + return (AVL_NEXT(&ap->ua_tree, node)); +} + +void * +uu_avl_prev(uu_avl_t *ap, void *node) +{ + return (AVL_PREV(&ap->ua_tree, node)); +} + +static void +_avl_walk_init(uu_avl_walk_t *wp, uu_avl_t *ap, uint32_t flags) +{ + uu_avl_walk_t *next, *prev; + + int robust = (flags & UU_WALK_ROBUST); + int direction = (flags & UU_WALK_REVERSE)? -1 : 1; + + (void) memset(wp, 0, sizeof (*wp)); + wp->uaw_avl = ap; + wp->uaw_robust = robust; + wp->uaw_dir = direction; + + if (direction > 0) + wp->uaw_next_result = avl_first(&ap->ua_tree); + else + wp->uaw_next_result = avl_last(&ap->ua_tree); + + if (ap->ua_debug || robust) { + wp->uaw_next = next = &ap->ua_null_walk; + wp->uaw_prev = prev = next->uaw_prev; + next->uaw_prev = wp; + prev->uaw_next = wp; + } +} + +static void * +_avl_walk_advance(uu_avl_walk_t *wp, uu_avl_t *ap) +{ + void *np = wp->uaw_next_result; + + avl_tree_t *t = &ap->ua_tree; + + if (np == NULL) + return (NULL); + + wp->uaw_next_result = (wp->uaw_dir > 0)? 
AVL_NEXT(t, np) : + AVL_PREV(t, np); + + return (np); +} + +/* + * Take a walker off its tree's walker list, if it was put on one, and + * clear its tree and next-result pointers. + */ +static void +_avl_walk_fini(uu_avl_walk_t *wp) +{ + if (wp->uaw_next != NULL) { + wp->uaw_next->uaw_prev = wp->uaw_prev; + wp->uaw_prev->uaw_next = wp->uaw_next; + wp->uaw_next = NULL; + wp->uaw_prev = NULL; + } + wp->uaw_avl = NULL; + wp->uaw_next_result = NULL; +} + +/* + * Allocate and start a walker over ap. Fails with UU_ERROR_UNKNOWN_FLAG + * for flags other than UU_WALK_ROBUST and UU_WALK_REVERSE, and with + * UU_ERROR_NO_MEMORY if the walker cannot be allocated. + */ +uu_avl_walk_t * +uu_avl_walk_start(uu_avl_t *ap, uint32_t flags) +{ + uu_avl_walk_t *wp; + + if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + wp = uu_zalloc(sizeof (*wp)); + if (wp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + _avl_walk_init(wp, ap, flags); + return (wp); +} + +/* + * Return the walker's next element, or NULL when the walk is finished. + */ +void * +uu_avl_walk_next(uu_avl_walk_t *wp) +{ + return (_avl_walk_advance(wp, wp->uaw_avl)); +} + +/* + * Finish a walk begun with uu_avl_walk_start() and free the walker. + */ +void +uu_avl_walk_end(uu_avl_walk_t *wp) +{ + _avl_walk_fini(wp); + uu_free(wp); +} + +/* + * Call func(elem, private) on the elements of ap for as long as it + * returns UU_WALK_NEXT. Returns 0 if the last status was non-negative; + * otherwise sets UU_ERROR_CALLBACK_FAILED and returns -1. Unknown flags + * fail with UU_ERROR_UNKNOWN_FLAG. + */ +int +uu_avl_walk(uu_avl_t *ap, uu_walk_fn_t *func, void *private, uint32_t flags) +{ + void *e; + uu_avl_walk_t my_walk; + + int status = UU_WALK_NEXT; + + if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (-1); + } + + _avl_walk_init(&my_walk, ap, flags); + while (status == UU_WALK_NEXT && + (e = _avl_walk_advance(&my_walk, ap)) != NULL) + status = (*func)(e, private); + _avl_walk_fini(&my_walk); + + if (status >= 0) + return (0); + uu_set_error(UU_ERROR_CALLBACK_FAILED); + return (-1); +} + +void +uu_avl_remove(uu_avl_t *ap, void *elem) +{ + uu_avl_walk_t *wp; + uu_avl_pool_t *pp = ap->ua_pool; + uintptr_t *na = NODE_ARRAY(pp, elem); + + if (ap->ua_debug) { + /* + * invalidate outstanding uu_avl_index_ts. + */ + ap->ua_index = INDEX_NEXT(ap->ua_index); + } + + /* + * Robust walkers must be advanced, if we are removing the node + * they are currently using. In debug mode, non-robust walkers + * are also on the walker list. 
+ */ + for (wp = ap->ua_null_walk.uaw_next; wp != &ap->ua_null_walk; + wp = wp->uaw_next) { + if (wp->uaw_robust) { + if (elem == wp->uaw_next_result) + (void) _avl_walk_advance(wp, ap); + } else if (wp->uaw_next_result != NULL) { + uu_panic("uu_avl_remove(%p, %p): active non-robust " + "walker\n", (void *)ap, elem); + } + } + + avl_remove(&ap->ua_tree, elem); + + na[0] = POOL_TO_MARKER(pp); + na[1] = 0; +} + +void * +uu_avl_teardown(uu_avl_t *ap, void **cookie) +{ + void *elem = avl_destroy_nodes(&ap->ua_tree, cookie); + + if (elem != NULL) { + uu_avl_pool_t *pp = ap->ua_pool; + uintptr_t *na = NODE_ARRAY(pp, elem); + + na[0] = POOL_TO_MARKER(pp); + na[1] = 0; + } + return (elem); +} + +void * +uu_avl_find(uu_avl_t *ap, void *elem, void *private, uu_avl_index_t *out) +{ + struct uu_avl_node_compare_info info; + void *result; + + info.ac_compare = ap->ua_pool->uap_cmp; + info.ac_private = private; + info.ac_right = elem; + info.ac_found = NULL; + + result = avl_find(&ap->ua_tree, &info, out); + if (out != NULL) + *out = INDEX_ENCODE(ap, *out); + + if (ap->ua_debug && result != NULL) + uu_panic("uu_avl_find: internal error: avl_find succeeded\n"); + + return (info.ac_found); +} + +void +uu_avl_insert(uu_avl_t *ap, void *elem, uu_avl_index_t idx) +{ + if (ap->ua_debug) { + uu_avl_pool_t *pp = ap->ua_pool; + uintptr_t *na = NODE_ARRAY(pp, elem); + + if (na[1] != 0) + uu_panic("uu_avl_insert(%p, %p, %p): node already " + "in tree, or corrupt\n", + (void *)ap, elem, (void *)idx); + if (na[0] == 0) + uu_panic("uu_avl_insert(%p, %p, %p): node not " + "initialized\n", + (void *)ap, elem, (void *)idx); + if (na[0] != POOL_TO_MARKER(pp)) + uu_panic("uu_avl_insert(%p, %p, %p): node from " + "other pool, or corrupt\n", + (void *)ap, elem, (void *)idx); + + if (!INDEX_VALID(ap, idx)) + uu_panic("uu_avl_insert(%p, %p, %p): %s\n", + (void *)ap, elem, (void *)idx, + INDEX_CHECK(idx)? "outdated index" : + "invalid index"); + + /* + * invalidate outstanding uu_avl_index_ts. 
+ */ + ap->ua_index = INDEX_NEXT(ap->ua_index); + } + avl_insert(&ap->ua_tree, elem, INDEX_DECODE(idx)); +} + +void * +uu_avl_nearest_next(uu_avl_t *ap, uu_avl_index_t idx) +{ + if (ap->ua_debug && !INDEX_VALID(ap, idx)) + uu_panic("uu_avl_nearest_next(%p, %p): %s\n", + (void *)ap, (void *)idx, INDEX_CHECK(idx)? + "outdated index" : "invalid index"); + return (avl_nearest(&ap->ua_tree, INDEX_DECODE(idx), AVL_AFTER)); +} + +void * +uu_avl_nearest_prev(uu_avl_t *ap, uu_avl_index_t idx) +{ + if (ap->ua_debug && !INDEX_VALID(ap, idx)) + uu_panic("uu_avl_nearest_prev(%p, %p): %s\n", + (void *)ap, (void *)idx, INDEX_CHECK(idx)? + "outdated index" : "invalid index"); + return (avl_nearest(&ap->ua_tree, INDEX_DECODE(idx), AVL_BEFORE)); +} + +/* + * called from uu_lockup() and uu_release(), as part of our fork1()-safety. + */ +void +uu_avl_lockup(void) +{ + uu_avl_pool_t *pp; + + (void) pthread_mutex_lock(&uu_apool_list_lock); + for (pp = uu_null_apool.uap_next; pp != &uu_null_apool; + pp = pp->uap_next) + (void) pthread_mutex_lock(&pp->uap_lock); +} + +void +uu_avl_release(void) +{ + uu_avl_pool_t *pp; + + for (pp = uu_null_apool.uap_next; pp != &uu_null_apool; + pp = pp->uap_next) + (void) pthread_mutex_unlock(&pp->uap_lock); + (void) pthread_mutex_unlock(&uu_apool_list_lock); +} diff --git a/lib/libuutil/common/uu_dprintf.c b/lib/libuutil/common/uu_dprintf.c new file mode 100644 index 0000000..5b990a5 --- /dev/null +++ b/lib/libuutil/common/uu_dprintf.c @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "libuutil_common.h" + +#include +#include +#include +#include +#include +#include + +#define FACILITY_FMT "%s (%s): " + +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +static const char * +strseverity(uu_dprintf_severity_t severity) +{ + switch (severity) { + case UU_DPRINTF_SILENT: + return (dgettext(TEXT_DOMAIN, "silent")); + case UU_DPRINTF_FATAL: + return (dgettext(TEXT_DOMAIN, "FATAL")); + case UU_DPRINTF_WARNING: + return (dgettext(TEXT_DOMAIN, "WARNING")); + case UU_DPRINTF_NOTICE: + return (dgettext(TEXT_DOMAIN, "note")); + case UU_DPRINTF_INFO: + return (dgettext(TEXT_DOMAIN, "info")); + case UU_DPRINTF_DEBUG: + return (dgettext(TEXT_DOMAIN, "debug")); + default: + return (dgettext(TEXT_DOMAIN, "unspecified")); + } +} + +uu_dprintf_t * +uu_dprintf_create(const char *name, uu_dprintf_severity_t severity, + uint_t flags) +{ + uu_dprintf_t *D; + + if (uu_check_name(name, UU_NAME_DOMAIN) == -1) { + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (NULL); + } + + if ((D = uu_zalloc(sizeof (uu_dprintf_t))) == NULL) + return (NULL); + + if (name != NULL) { + D->uud_name = strdup(name); + if (D->uud_name == NULL) { + uu_free(D); + return (NULL); + } + } else { + D->uud_name = NULL; + } + + D->uud_severity = severity; + D->uud_flags = flags; + + return (D); +} + +/*PRINTFLIKE3*/ +void +uu_dprintf(uu_dprintf_t *D, uu_dprintf_severity_t severity, + const char *format, ...) 
+{ + va_list alist; + + /* XXX Assert that severity is not UU_DPRINTF_SILENT. */ + + if (severity > D->uud_severity) + return; + + (void) fprintf(stderr, FACILITY_FMT, D->uud_name, + strseverity(severity)); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); +} + +void +uu_dprintf_destroy(uu_dprintf_t *D) +{ + if (D->uud_name) + free(D->uud_name); + + uu_free(D); +} + +const char * +uu_dprintf_getname(uu_dprintf_t *D) +{ + return (D->uud_name); +} diff --git a/lib/libuutil/common/uu_ident.c b/lib/libuutil/common/uu_ident.c new file mode 100644 index 0000000..9a64384 --- /dev/null +++ b/lib/libuutil/common/uu_ident.c @@ -0,0 +1,122 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "libuutil_common.h" + +#include + +/* + * We require names of the form: + * [provider,]identifier[/[provider,]identifier]... + * + * Where provider is either a stock symbol (SUNW) or a java-style reversed + * domain name (com.sun). 
+ * + * Both providers and identifiers must start with a letter, and may + * only contain alphanumerics, dashes, and underlines. Providers + * may also contain periods. + * + * Note that we do _not_ use the macros in , since they are affected + * by the current locale settings. + */ + +#define IS_ALPHA(c) \ + (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +#define IS_DIGIT(c) \ + ((c) >= '0' && (c) <= '9') + +static int +is_valid_ident(const char *s, const char *e, int allowdot) +{ + char c; + + if (s >= e) + return (0); /* name is empty */ + + c = *s++; + if (!IS_ALPHA(c)) + return (0); /* does not start with letter */ + + while (s < e && (c = *s++) != 0) { + if (IS_ALPHA(c) || IS_DIGIT(c) || c == '-' || c == '_' || + (allowdot && c == '.')) + continue; + return (0); /* invalid character */ + } + return (1); +} + +static int +is_valid_component(const char *b, const char *e, uint_t flags) +{ + char *sp; + + if (flags & UU_NAME_DOMAIN) { + sp = strchr(b, ','); + if (sp != NULL && sp < e) { + if (!is_valid_ident(b, sp, 1)) + return (0); + b = sp + 1; + } + } + + return (is_valid_ident(b, e, 0)); +} + +int +uu_check_name(const char *name, uint_t flags) +{ + const char *end = name + strlen(name); + const char *p; + + if (flags & ~(UU_NAME_DOMAIN | UU_NAME_PATH)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (-1); + } + + if (!(flags & UU_NAME_PATH)) { + if (!is_valid_component(name, end, flags)) + goto bad; + return (0); + } + + while ((p = strchr(name, '/')) != NULL) { + if (!is_valid_component(name, p - 1, flags)) + goto bad; + name = p + 1; + } + if (!is_valid_component(name, end, flags)) + goto bad; + + return (0); + +bad: + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (-1); +} diff --git a/lib/libuutil/common/uu_list.c b/lib/libuutil/common/uu_list.c new file mode 100644 index 0000000..35c7ba8 --- /dev/null +++ b/lib/libuutil/common/uu_list.c @@ -0,0 +1,718 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the 
terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "libuutil_common.h" + +#include +#include +#include +#include + +#define ELEM_TO_NODE(lp, e) \ + ((uu_list_node_impl_t *)((uintptr_t)(e) + (lp)->ul_offset)) + +#define NODE_TO_ELEM(lp, n) \ + ((void *)((uintptr_t)(n) - (lp)->ul_offset)) + +/* + * uu_list_index_ts define a location for insertion. They are simply a + * pointer to the object after the insertion point. We store a mark + * in the low-bits of the index, to help prevent mistakes. + * + * When debugging, the index mark changes on every insert and delete, to + * catch stale references. + */ +#define INDEX_MAX (sizeof (uintptr_t) - 1) +#define INDEX_NEXT(m) (((m) == INDEX_MAX)? 
1 : ((m) + 1) & INDEX_MAX) + +#define INDEX_TO_NODE(i) ((uu_list_node_impl_t *)((i) & ~INDEX_MAX)) +#define NODE_TO_INDEX(p, n) (((uintptr_t)(n) & ~INDEX_MAX) | (p)->ul_index) +#define INDEX_VALID(p, i) (((i) & INDEX_MAX) == (p)->ul_index) +#define INDEX_CHECK(i) (((i) & INDEX_MAX) != 0) + +#define POOL_TO_MARKER(pp) ((void *)((uintptr_t)(pp) | 1)) + +static uu_list_pool_t uu_null_lpool = { &uu_null_lpool, &uu_null_lpool }; +static pthread_mutex_t uu_lpool_list_lock = PTHREAD_MUTEX_INITIALIZER; + +uu_list_pool_t * +uu_list_pool_create(const char *name, size_t objsize, + size_t nodeoffset, uu_compare_fn_t *compare_func, uint32_t flags) +{ + uu_list_pool_t *pp, *next, *prev; + + if (name == NULL || + uu_check_name(name, UU_NAME_DOMAIN) == -1 || + nodeoffset + sizeof (uu_list_node_t) > objsize) { + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (NULL); + } + + if (flags & ~UU_LIST_POOL_DEBUG) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + pp = uu_zalloc(sizeof (uu_list_pool_t)); + if (pp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + (void) strlcpy(pp->ulp_name, name, sizeof (pp->ulp_name)); + pp->ulp_nodeoffset = nodeoffset; + pp->ulp_objsize = objsize; + pp->ulp_cmp = compare_func; + if (flags & UU_LIST_POOL_DEBUG) + pp->ulp_debug = 1; + pp->ulp_last_index = 0; + + (void) pthread_mutex_init(&pp->ulp_lock, NULL); + + pp->ulp_null_list.ul_next_enc = UU_PTR_ENCODE(&pp->ulp_null_list); + pp->ulp_null_list.ul_prev_enc = UU_PTR_ENCODE(&pp->ulp_null_list); + + (void) pthread_mutex_lock(&uu_lpool_list_lock); + pp->ulp_next = next = &uu_null_lpool; + pp->ulp_prev = prev = next->ulp_prev; + next->ulp_prev = pp; + prev->ulp_next = pp; + (void) pthread_mutex_unlock(&uu_lpool_list_lock); + + return (pp); +} + +void +uu_list_pool_destroy(uu_list_pool_t *pp) +{ + if (pp->ulp_debug) { + if (pp->ulp_null_list.ul_next_enc != + UU_PTR_ENCODE(&pp->ulp_null_list) || + pp->ulp_null_list.ul_prev_enc != + UU_PTR_ENCODE(&pp->ulp_null_list)) { + 
uu_panic("uu_list_pool_destroy: Pool \"%.*s\" (%p) has " + "outstanding lists, or is corrupt.\n", + (int)sizeof (pp->ulp_name), pp->ulp_name, + (void *)pp); + } + } + (void) pthread_mutex_lock(&uu_lpool_list_lock); + pp->ulp_next->ulp_prev = pp->ulp_prev; + pp->ulp_prev->ulp_next = pp->ulp_next; + (void) pthread_mutex_unlock(&uu_lpool_list_lock); + pp->ulp_prev = NULL; + pp->ulp_next = NULL; + uu_free(pp); +} + +/* + * Put the node np_arg, embedded in the object at base, into the + * off-list state: the pool marker in uln_next and NULL in uln_prev. + * A debug pool panics if the node does not lie at the pool's node offset + * or does not fit inside the object. + */ +void +uu_list_node_init(void *base, uu_list_node_t *np_arg, uu_list_pool_t *pp) +{ + uu_list_node_impl_t *np = (uu_list_node_impl_t *)np_arg; + + if (pp->ulp_debug) { + uintptr_t offset = (uintptr_t)np - (uintptr_t)base; + if (offset + sizeof (*np) > pp->ulp_objsize) { + uu_panic("uu_list_node_init(%p, %p, %p (\"%s\")): " + "offset %ld doesn't fit in object (size %ld)\n", + base, (void *)np, (void *)pp, pp->ulp_name, + (long)offset, (long)pp->ulp_objsize); + } + if (offset != pp->ulp_nodeoffset) { + /* report the offset that was compared, not the size */ + uu_panic("uu_list_node_init(%p, %p, %p (\"%s\")): " + "offset %ld doesn't match pool's offset (%ld)\n", + base, (void *)np, (void *)pp, pp->ulp_name, + (long)offset, (long)pp->ulp_nodeoffset); + } + } + np->uln_next = POOL_TO_MARKER(pp); + np->uln_prev = NULL; +} + +/* + * Retire an off-list node by clearing both of its links. A debug pool + * panics if the node was already retired or is not in the off-list state + * for this pool. + */ +void +uu_list_node_fini(void *base, uu_list_node_t *np_arg, uu_list_pool_t *pp) +{ + uu_list_node_impl_t *np = (uu_list_node_impl_t *)np_arg; + + if (pp->ulp_debug) { + if (np->uln_next == NULL && + np->uln_prev == NULL) { + uu_panic("uu_list_node_fini(%p, %p, %p (\"%s\")): " + "node already finied\n", + base, (void *)np_arg, (void *)pp, pp->ulp_name); + } + if (np->uln_next != POOL_TO_MARKER(pp) || + np->uln_prev != NULL) { + uu_panic("uu_list_node_fini(%p, %p, %p (\"%s\")): " + "node corrupt or on list\n", + base, (void *)np_arg, (void *)pp, pp->ulp_name); + } + } + np->uln_next = NULL; + np->uln_prev = NULL; +} + +uu_list_t * +uu_list_create(uu_list_pool_t *pp, void *parent, uint32_t flags) +{ + uu_list_t *lp, *next, *prev; + + if (flags & ~(UU_LIST_DEBUG | UU_LIST_SORTED)) { + 
uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + if ((flags & UU_LIST_SORTED) && pp->ulp_cmp == NULL) { + if (pp->ulp_debug) + uu_panic("uu_list_create(%p, ...): requested " + "UU_LIST_SORTED, but pool has no comparison func\n", + (void *)pp); + uu_set_error(UU_ERROR_NOT_SUPPORTED); + return (NULL); + } + + lp = uu_zalloc(sizeof (*lp)); + if (lp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + lp->ul_pool = pp; + lp->ul_parent_enc = UU_PTR_ENCODE(parent); + lp->ul_offset = pp->ulp_nodeoffset; + lp->ul_debug = pp->ulp_debug || (flags & UU_LIST_DEBUG); + lp->ul_sorted = (flags & UU_LIST_SORTED); + lp->ul_numnodes = 0; + lp->ul_index = (pp->ulp_last_index = INDEX_NEXT(pp->ulp_last_index)); + + lp->ul_null_node.uln_next = &lp->ul_null_node; + lp->ul_null_node.uln_prev = &lp->ul_null_node; + + lp->ul_null_walk.ulw_next = &lp->ul_null_walk; + lp->ul_null_walk.ulw_prev = &lp->ul_null_walk; + + (void) pthread_mutex_lock(&pp->ulp_lock); + next = &pp->ulp_null_list; + prev = UU_PTR_DECODE(next->ul_prev_enc); + lp->ul_next_enc = UU_PTR_ENCODE(next); + lp->ul_prev_enc = UU_PTR_ENCODE(prev); + next->ul_prev_enc = UU_PTR_ENCODE(lp); + prev->ul_next_enc = UU_PTR_ENCODE(lp); + (void) pthread_mutex_unlock(&pp->ulp_lock); + + return (lp); +} + +void +uu_list_destroy(uu_list_t *lp) +{ + uu_list_pool_t *pp = lp->ul_pool; + + if (lp->ul_debug) { + if (lp->ul_null_node.uln_next != &lp->ul_null_node || + lp->ul_null_node.uln_prev != &lp->ul_null_node) { + uu_panic("uu_list_destroy(%p): list not empty\n", + (void *)lp); + } + if (lp->ul_numnodes != 0) { + uu_panic("uu_list_destroy(%p): numnodes is nonzero, " + "but list is empty\n", (void *)lp); + } + if (lp->ul_null_walk.ulw_next != &lp->ul_null_walk || + lp->ul_null_walk.ulw_prev != &lp->ul_null_walk) { + uu_panic("uu_list_destroy(%p): outstanding walkers\n", + (void *)lp); + } + } + + (void) pthread_mutex_lock(&pp->ulp_lock); + UU_LIST_PTR(lp->ul_next_enc)->ul_prev_enc = lp->ul_prev_enc; + 
UU_LIST_PTR(lp->ul_prev_enc)->ul_next_enc = lp->ul_next_enc; + (void) pthread_mutex_unlock(&pp->ulp_lock); + lp->ul_prev_enc = UU_PTR_ENCODE(NULL); + lp->ul_next_enc = UU_PTR_ENCODE(NULL); + lp->ul_pool = NULL; + uu_free(lp); +} + +static void +list_insert(uu_list_t *lp, uu_list_node_impl_t *np, uu_list_node_impl_t *prev, + uu_list_node_impl_t *next) +{ + if (lp->ul_debug) { + if (next->uln_prev != prev || prev->uln_next != next) + uu_panic("insert(%p): internal error: %p and %p not " + "neighbors\n", (void *)lp, (void *)next, + (void *)prev); + + if (np->uln_next != POOL_TO_MARKER(lp->ul_pool) || + np->uln_prev != NULL) { + uu_panic("insert(%p): elem %p node %p corrupt, " + "not initialized, or already in a list.\n", + (void *)lp, NODE_TO_ELEM(lp, np), (void *)np); + } + /* + * invalidate outstanding uu_list_index_ts. + */ + lp->ul_index = INDEX_NEXT(lp->ul_index); + } + np->uln_next = next; + np->uln_prev = prev; + next->uln_prev = np; + prev->uln_next = np; + + lp->ul_numnodes++; +} + +void +uu_list_insert(uu_list_t *lp, void *elem, uu_list_index_t idx) +{ + uu_list_node_impl_t *np; + + np = INDEX_TO_NODE(idx); + if (np == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (!INDEX_VALID(lp, idx)) + uu_panic("uu_list_insert(%p, %p, %p): %s\n", + (void *)lp, elem, (void *)idx, + INDEX_CHECK(idx)? 
"outdated index" : + "invalid index"); + if (np->uln_prev == NULL) + uu_panic("uu_list_insert(%p, %p, %p): out-of-date " + "index\n", (void *)lp, elem, (void *)idx); + } + + list_insert(lp, ELEM_TO_NODE(lp, elem), np->uln_prev, np); +} + +void * +uu_list_find(uu_list_t *lp, void *elem, void *private, uu_list_index_t *out) +{ + int sorted = lp->ul_sorted; + uu_compare_fn_t *func = lp->ul_pool->ulp_cmp; + uu_list_node_impl_t *np; + + if (func == NULL) { + if (out != NULL) + *out = 0; + uu_set_error(UU_ERROR_NOT_SUPPORTED); + return (NULL); + } + for (np = lp->ul_null_node.uln_next; np != &lp->ul_null_node; + np = np->uln_next) { + void *ep = NODE_TO_ELEM(lp, np); + int cmp = func(ep, elem, private); + if (cmp == 0) { + if (out != NULL) + *out = NODE_TO_INDEX(lp, np); + return (ep); + } + if (sorted && cmp > 0) { + if (out != NULL) + *out = NODE_TO_INDEX(lp, np); + return (NULL); + } + } + if (out != NULL) + *out = NODE_TO_INDEX(lp, 0); + return (NULL); +} + +void * +uu_list_nearest_next(uu_list_t *lp, uu_list_index_t idx) +{ + uu_list_node_impl_t *np = INDEX_TO_NODE(idx); + + if (np == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (!INDEX_VALID(lp, idx)) + uu_panic("uu_list_nearest_next(%p, %p): %s\n", + (void *)lp, (void *)idx, + INDEX_CHECK(idx)? "outdated index" : + "invalid index"); + if (np->uln_prev == NULL) + uu_panic("uu_list_nearest_next(%p, %p): out-of-date " + "index\n", (void *)lp, (void *)idx); + } + + if (np == &lp->ul_null_node) + return (NULL); + else + return (NODE_TO_ELEM(lp, np)); +} + +void * +uu_list_nearest_prev(uu_list_t *lp, uu_list_index_t idx) +{ + uu_list_node_impl_t *np = INDEX_TO_NODE(idx); + + if (np == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (!INDEX_VALID(lp, idx)) + uu_panic("uu_list_nearest_prev(%p, %p): %s\n", + (void *)lp, (void *)idx, INDEX_CHECK(idx)? 
+ "outdated index" : "invalid index"); + if (np->uln_prev == NULL) + uu_panic("uu_list_nearest_prev(%p, %p): out-of-date " + "index\n", (void *)lp, (void *)idx); + } + + if ((np = np->uln_prev) == &lp->ul_null_node) + return (NULL); + else + return (NODE_TO_ELEM(lp, np)); +} + +static void +list_walk_init(uu_list_walk_t *wp, uu_list_t *lp, uint32_t flags) +{ + uu_list_walk_t *next, *prev; + + int robust = (flags & UU_WALK_ROBUST); + int direction = (flags & UU_WALK_REVERSE)? -1 : 1; + + (void) memset(wp, 0, sizeof (*wp)); + wp->ulw_list = lp; + wp->ulw_robust = robust; + wp->ulw_dir = direction; + if (direction > 0) + wp->ulw_next_result = lp->ul_null_node.uln_next; + else + wp->ulw_next_result = lp->ul_null_node.uln_prev; + + if (lp->ul_debug || robust) { + /* + * Add this walker to the list's list of walkers so + * uu_list_remove() can advance us if somebody tries to + * remove ulw_next_result. + */ + wp->ulw_next = next = &lp->ul_null_walk; + wp->ulw_prev = prev = next->ulw_prev; + next->ulw_prev = wp; + prev->ulw_next = wp; + } +} + +static uu_list_node_impl_t * +list_walk_advance(uu_list_walk_t *wp, uu_list_t *lp) +{ + uu_list_node_impl_t *np = wp->ulw_next_result; + uu_list_node_impl_t *next; + + if (np == &lp->ul_null_node) + return (NULL); + + next = (wp->ulw_dir > 0)? np->uln_next : np->uln_prev; + + wp->ulw_next_result = next; + return (np); +} + +static void +list_walk_fini(uu_list_walk_t *wp) +{ + /* GLXXX debugging? 
*/ + if (wp->ulw_next != NULL) { + wp->ulw_next->ulw_prev = wp->ulw_prev; + wp->ulw_prev->ulw_next = wp->ulw_next; + wp->ulw_next = NULL; + wp->ulw_prev = NULL; + } + wp->ulw_list = NULL; + wp->ulw_next_result = NULL; +} + +uu_list_walk_t * +uu_list_walk_start(uu_list_t *lp, uint32_t flags) +{ + uu_list_walk_t *wp; + + if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + wp = uu_zalloc(sizeof (*wp)); + if (wp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + list_walk_init(wp, lp, flags); + return (wp); +} + +void * +uu_list_walk_next(uu_list_walk_t *wp) +{ + uu_list_t *lp = wp->ulw_list; + uu_list_node_impl_t *np = list_walk_advance(wp, lp); + + if (np == NULL) + return (NULL); + + return (NODE_TO_ELEM(lp, np)); +} + +void +uu_list_walk_end(uu_list_walk_t *wp) +{ + list_walk_fini(wp); + uu_free(wp); +} + +int +uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags) +{ + uu_list_node_impl_t *np; + + int status = UU_WALK_NEXT; + + int robust = (flags & UU_WALK_ROBUST); + int reverse = (flags & UU_WALK_REVERSE); + + if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (-1); + } + + if (lp->ul_debug || robust) { + uu_list_walk_t my_walk; + void *e; + + list_walk_init(&my_walk, lp, flags); + while (status == UU_WALK_NEXT && + (e = uu_list_walk_next(&my_walk)) != NULL) + status = (*func)(e, private); + list_walk_fini(&my_walk); + } else { + if (!reverse) { + for (np = lp->ul_null_node.uln_next; + status == UU_WALK_NEXT && np != &lp->ul_null_node; + np = np->uln_next) { + status = (*func)(NODE_TO_ELEM(lp, np), private); + } + } else { + for (np = lp->ul_null_node.uln_prev; + status == UU_WALK_NEXT && np != &lp->ul_null_node; + np = np->uln_prev) { + status = (*func)(NODE_TO_ELEM(lp, np), private); + } + } + } + if (status >= 0) + return (0); + uu_set_error(UU_ERROR_CALLBACK_FAILED); + return (-1); +} + +void 
+uu_list_remove(uu_list_t *lp, void *elem) +{ + uu_list_node_impl_t *np = ELEM_TO_NODE(lp, elem); + uu_list_walk_t *wp; + + if (lp->ul_debug) { + if (np->uln_prev == NULL) + uu_panic("uu_list_remove(%p, %p): elem not on list\n", + (void *)lp, elem); + /* + * invalidate outstanding uu_list_index_ts. + */ + lp->ul_index = INDEX_NEXT(lp->ul_index); + } + + /* + * robust walkers must be advanced. In debug mode, non-robust + * walkers are also on the list. If there are any, it's an error. + */ + for (wp = lp->ul_null_walk.ulw_next; wp != &lp->ul_null_walk; + wp = wp->ulw_next) { + if (wp->ulw_robust) { + if (np == wp->ulw_next_result) + (void) list_walk_advance(wp, lp); + } else if (wp->ulw_next_result != NULL) { + uu_panic("uu_list_remove(%p, %p): active non-robust " + "walker\n", (void *)lp, elem); + } + } + + np->uln_next->uln_prev = np->uln_prev; + np->uln_prev->uln_next = np->uln_next; + + lp->ul_numnodes--; + + np->uln_next = POOL_TO_MARKER(lp->ul_pool); + np->uln_prev = NULL; +} + +void * +uu_list_teardown(uu_list_t *lp, void **cookie) +{ + void *ep; + + /* + * XXX: disable list modification until list is empty + */ + if (lp->ul_debug && *cookie != NULL) + uu_panic("uu_list_teardown(%p, %p): unexpected cookie\n", + (void *)lp, (void *)cookie); + + ep = uu_list_first(lp); + if (ep) + uu_list_remove(lp, ep); + return (ep); +} + +int +uu_list_insert_before(uu_list_t *lp, void *target, void *elem) +{ + uu_list_node_impl_t *np = ELEM_TO_NODE(lp, target); + + if (target == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (np->uln_prev == NULL) + uu_panic("uu_list_insert_before(%p, %p, %p): %p is " + "not currently on a list\n", + (void *)lp, target, elem, target); + } + if (lp->ul_sorted) { + if (lp->ul_debug) + uu_panic("uu_list_insert_before(%p, ...): list is " + "UU_LIST_SORTED\n", (void *)lp); + uu_set_error(UU_ERROR_NOT_SUPPORTED); + return (-1); + } + + list_insert(lp, ELEM_TO_NODE(lp, elem), np->uln_prev, np); + return (0); +} + +int 
+uu_list_insert_after(uu_list_t *lp, void *target, void *elem) +{ + uu_list_node_impl_t *np = ELEM_TO_NODE(lp, target); + + if (target == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (np->uln_prev == NULL) + uu_panic("uu_list_insert_after(%p, %p, %p): %p is " + "not currently on a list\n", + (void *)lp, target, elem, target); + } + if (lp->ul_sorted) { + if (lp->ul_debug) + uu_panic("uu_list_insert_after(%p, ...): list is " + "UU_LIST_SORTED\n", (void *)lp); + uu_set_error(UU_ERROR_NOT_SUPPORTED); + return (-1); + } + + list_insert(lp, ELEM_TO_NODE(lp, elem), np, np->uln_next); + return (0); +} + +size_t +uu_list_numnodes(uu_list_t *lp) +{ + return (lp->ul_numnodes); +} + +void * +uu_list_first(uu_list_t *lp) +{ + uu_list_node_impl_t *n = lp->ul_null_node.uln_next; + if (n == &lp->ul_null_node) + return (NULL); + return (NODE_TO_ELEM(lp, n)); +} + +void * +uu_list_last(uu_list_t *lp) +{ + uu_list_node_impl_t *n = lp->ul_null_node.uln_prev; + if (n == &lp->ul_null_node) + return (NULL); + return (NODE_TO_ELEM(lp, n)); +} + +void * +uu_list_next(uu_list_t *lp, void *elem) +{ + uu_list_node_impl_t *n = ELEM_TO_NODE(lp, elem); + + n = n->uln_next; + if (n == &lp->ul_null_node) + return (NULL); + return (NODE_TO_ELEM(lp, n)); +} + +void * +uu_list_prev(uu_list_t *lp, void *elem) +{ + uu_list_node_impl_t *n = ELEM_TO_NODE(lp, elem); + + n = n->uln_prev; + if (n == &lp->ul_null_node) + return (NULL); + return (NODE_TO_ELEM(lp, n)); +} + +/* + * called from uu_lockup() and uu_release(), as part of our fork1()-safety. 
+ */ +void +uu_list_lockup(void) +{ + uu_list_pool_t *pp; + + (void) pthread_mutex_lock(&uu_lpool_list_lock); + for (pp = uu_null_lpool.ulp_next; pp != &uu_null_lpool; + pp = pp->ulp_next) + (void) pthread_mutex_lock(&pp->ulp_lock); +} + +void +uu_list_release(void) +{ + uu_list_pool_t *pp; + + for (pp = uu_null_lpool.ulp_next; pp != &uu_null_lpool; + pp = pp->ulp_next) + (void) pthread_mutex_unlock(&pp->ulp_lock); + (void) pthread_mutex_unlock(&uu_lpool_list_lock); +} diff --git a/lib/libuutil/common/uu_misc.c b/lib/libuutil/common/uu_misc.c new file mode 100644 index 0000000..3d5b40c --- /dev/null +++ b/lib/libuutil/common/uu_misc.c @@ -0,0 +1,280 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#include "libuutil_common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * All of the old code under !defined(PTHREAD_ONCE_KEY_NP) + * is here to enable the building of a native version of + * libuutil.so when the build machine has not yet been upgraded + * to a version of libc that provides pthread_key_create_once_np(). + * It should all be deleted when solaris_nevada ships. + * The code is not MT-safe in a relaxed memory model. + */ + +#if defined(PTHREAD_ONCE_KEY_NP) +static pthread_key_t uu_error_key = PTHREAD_ONCE_KEY_NP; +#else /* PTHREAD_ONCE_KEY_NP */ +static pthread_key_t uu_error_key = 0; +static pthread_mutex_t uu_key_lock = PTHREAD_MUTEX_INITIALIZER; +#endif /* PTHREAD_ONCE_KEY_NP */ + +static int uu_error_key_setup = 0; + +static pthread_mutex_t uu_panic_lock = PTHREAD_MUTEX_INITIALIZER; +/* LINTED static unused */ +static const char *uu_panic_format; +/* LINTED static unused */ +static va_list uu_panic_args; +static pthread_t uu_panic_thread; + +static uint32_t _uu_main_error; + +void +uu_set_error(uint_t code) +{ + if (thr_main() != 0) { + _uu_main_error = code; + return; + } +#if defined(PTHREAD_ONCE_KEY_NP) + if (pthread_key_create_once_np(&uu_error_key, NULL) != 0) + uu_error_key_setup = -1; + else + uu_error_key_setup = 1; +#else /* PTHREAD_ONCE_KEY_NP */ + if (uu_error_key_setup == 0) { + (void) pthread_mutex_lock(&uu_key_lock); + if (uu_error_key_setup == 0) { + if (pthread_key_create(&uu_error_key, NULL) != 0) + uu_error_key_setup = -1; + else + uu_error_key_setup = 1; + } + (void) pthread_mutex_unlock(&uu_key_lock); + } +#endif /* PTHREAD_ONCE_KEY_NP */ + if (uu_error_key_setup > 0) + (void) pthread_setspecific(uu_error_key, + (void *)(uintptr_t)code); +} + +uint32_t +uu_error(void) +{ + if (thr_main() != 0) + return (_uu_main_error); + + if (uu_error_key_setup < 0) /* can't happen? 
*/ + return (UU_ERROR_UNKNOWN); + + /* + * Because UU_ERROR_NONE == 0, if uu_set_error() was + * never called, then this will return UU_ERROR_NONE: + */ + return ((uint32_t)(uintptr_t)pthread_getspecific(uu_error_key)); +} + +const char * +uu_strerror(uint32_t code) +{ + const char *str; + + switch (code) { + case UU_ERROR_NONE: + str = dgettext(TEXT_DOMAIN, "No error"); + break; + + case UU_ERROR_INVALID_ARGUMENT: + str = dgettext(TEXT_DOMAIN, "Invalid argument"); + break; + + case UU_ERROR_UNKNOWN_FLAG: + str = dgettext(TEXT_DOMAIN, "Unknown flag passed"); + break; + + case UU_ERROR_NO_MEMORY: + str = dgettext(TEXT_DOMAIN, "Out of memory"); + break; + + case UU_ERROR_CALLBACK_FAILED: + str = dgettext(TEXT_DOMAIN, "Callback-initiated failure"); + break; + + case UU_ERROR_NOT_SUPPORTED: + str = dgettext(TEXT_DOMAIN, "Operation not supported"); + break; + + case UU_ERROR_EMPTY: + str = dgettext(TEXT_DOMAIN, "No value provided"); + break; + + case UU_ERROR_UNDERFLOW: + str = dgettext(TEXT_DOMAIN, "Value too small"); + break; + + case UU_ERROR_OVERFLOW: + str = dgettext(TEXT_DOMAIN, "Value too large"); + break; + + case UU_ERROR_INVALID_CHAR: + str = dgettext(TEXT_DOMAIN, + "Value contains unexpected character"); + break; + + case UU_ERROR_INVALID_DIGIT: + str = dgettext(TEXT_DOMAIN, + "Value contains digit not in base"); + break; + + case UU_ERROR_SYSTEM: + str = dgettext(TEXT_DOMAIN, "Underlying system error"); + break; + + case UU_ERROR_UNKNOWN: + str = dgettext(TEXT_DOMAIN, "Error status not known"); + break; + + default: + errno = ESRCH; + str = NULL; + break; + } + return (str); +} + +void +uu_panic(const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + + (void) pthread_mutex_lock(&uu_panic_lock); + if (uu_panic_thread == 0) { + uu_panic_thread = pthread_self(); + uu_panic_format = format; + va_copy(uu_panic_args, args); + } + (void) pthread_mutex_unlock(&uu_panic_lock); + + (void) vfprintf(stderr, format, args); + + if (uu_panic_thread == pthread_self()) + abort(); + else + for (;;) + (void) pause(); +} + +int +assfail(const char *astring, const char *file, int line) +{ + __assert(astring, file, line); + /*NOTREACHED*/ + return (0); +} + +static void +uu_lockup(void) +{ + (void) pthread_mutex_lock(&uu_panic_lock); +#if !defined(PTHREAD_ONCE_KEY_NP) + (void) pthread_mutex_lock(&uu_key_lock); +#endif + uu_avl_lockup(); + uu_list_lockup(); +} + +static void +uu_release(void) +{ + (void) pthread_mutex_unlock(&uu_panic_lock); +#if !defined(PTHREAD_ONCE_KEY_NP) + (void) pthread_mutex_unlock(&uu_key_lock); +#endif + uu_avl_release(); + uu_list_release(); +} + +static void +uu_release_child(void) +{ + uu_panic_format = NULL; + uu_panic_thread = 0; + + uu_release(); +} + +#pragma init(uu_init) +static void +uu_init(void) +{ + (void) pthread_atfork(uu_lockup, uu_release, uu_release_child); +} + +/* + * Dump a block of memory in hex+ascii, for debugging + */ +void +uu_dump(FILE *out, const char *prefix, const void *buf, size_t len) +{ + const unsigned char *p = buf; + int i; + + for (i = 0; i < len; i += 16) { + int j; + + (void) fprintf(out, "%s", prefix); + for (j = 0; j < 16 && i + j < len; j++) { + (void) fprintf(out, "%2.2x ", p[i + j]); + } + for (; j < 16; j++) { + (void) fprintf(out, " "); + } + for (j = 0; j < 16 && i + j < len; j++) { + (void) fprintf(out, "%c", + isprint(p[i + j]) ? 
p[i + j] : '.'); + } + (void) fprintf(out, "\n"); + } +} diff --git a/lib/libuutil/common/uu_open.c b/lib/libuutil/common/uu_open.c new file mode 100644 index 0000000..7256662 --- /dev/null +++ b/lib/libuutil/common/uu_open.c @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "libuutil_common.h" + +#include + +#include +#include +#include +#include +#include + +#ifdef _LP64 +#define TMPPATHFMT "%s/uu%ld" +#else /* _LP64 */ +#define TMPPATHFMT "%s/uu%lld" +#endif /* _LP64 */ + +/*ARGSUSED*/ +int +uu_open_tmp(const char *dir, uint_t uflags) +{ + int f; + char *fname = uu_zalloc(PATH_MAX); + + if (fname == NULL) + return (-1); + + for (;;) { + (void) snprintf(fname, PATH_MAX, "%s/uu%lld", dir, gethrtime()); + + f = open(fname, O_CREAT | O_EXCL | O_RDWR, 0600); + + if (f >= 0 || errno != EEXIST) + break; + } + + if (f >= 0) + (void) unlink(fname); + + uu_free(fname); + + return (f); +} diff --git a/lib/libuutil/common/uu_pname.c b/lib/libuutil/common/uu_pname.c new file mode 100644 index 0000000..3307a26 --- /dev/null +++ b/lib/libuutil/common/uu_pname.c @@ -0,0 +1,207 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "libuutil_common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const char PNAME_FMT[] = "%s: "; +static const char ERRNO_FMT[] = ": %s\n"; + +static const char *pname; + +static void +uu_die_internal(int status, const char *format, va_list alist) __NORETURN; + +int uu_exit_ok_value = EXIT_SUCCESS; +int uu_exit_fatal_value = EXIT_FAILURE; +int uu_exit_usage_value = 2; + +int * +uu_exit_ok(void) +{ + return (&uu_exit_ok_value); +} + +int * +uu_exit_fatal(void) +{ + return (&uu_exit_fatal_value); +} + +int * +uu_exit_usage(void) +{ + return (&uu_exit_usage_value); +} + +void +uu_alt_exit(int profile) +{ + switch (profile) { + case UU_PROFILE_DEFAULT: + uu_exit_ok_value = EXIT_SUCCESS; + uu_exit_fatal_value = EXIT_FAILURE; + uu_exit_usage_value = 2; + break; + case UU_PROFILE_LAUNCHER: + uu_exit_ok_value = EXIT_SUCCESS; + uu_exit_fatal_value = 124; + uu_exit_usage_value = 125; + break; + } +} + +static void +uu_warn_internal(int err, const char *format, va_list alist) +{ + if (pname != NULL) + (void) fprintf(stderr, PNAME_FMT, pname); + + (void) vfprintf(stderr, format, alist); + + if (strrchr(format, '\n') == NULL) + (void) fprintf(stderr, ERRNO_FMT, strerror(err)); +} + +void +uu_vwarn(const char *format, va_list alist) +{ + uu_warn_internal(errno, format, alist); +} + +/*PRINTFLIKE1*/ +void +uu_warn(const char *format, ...) 
+{ + va_list alist; + va_start(alist, format); + uu_warn_internal(errno, format, alist); + va_end(alist); +} + +static void +uu_die_internal(int status, const char *format, va_list alist) +{ + uu_warn_internal(errno, format, alist); +#ifdef DEBUG + { + char *cp; + + if (!issetugid()) { + cp = getenv("UU_DIE_ABORTS"); + if (cp != NULL && *cp != '\0') + abort(); + } + } +#endif + exit(status); +} + +void +uu_vdie(const char *format, va_list alist) +{ + uu_die_internal(UU_EXIT_FATAL, format, alist); +} + +/*PRINTFLIKE1*/ +void +uu_die(const char *format, ...) +{ + va_list alist; + va_start(alist, format); + uu_die_internal(UU_EXIT_FATAL, format, alist); + va_end(alist); +} + +void +uu_vxdie(int status, const char *format, va_list alist) +{ + uu_die_internal(status, format, alist); +} + +/*PRINTFLIKE2*/ +void +uu_xdie(int status, const char *format, ...) +{ + va_list alist; + va_start(alist, format); + uu_die_internal(status, format, alist); + va_end(alist); +} + +const char * +uu_setpname(char *arg0) +{ + /* + * Having a NULL argv[0], while uncommon, is possible. It + * makes more sense to handle this event in uu_setpname rather + * than in each of its consumers. + */ + if (arg0 == NULL) { + pname = getexecname(); + if (pname == NULL) + pname = "unknown_command"; + return (pname); + } + + /* + * Guard against '/' at end of command invocation. + */ + for (;;) { + char *p = strrchr(arg0, '/'); + if (p == NULL) { + pname = arg0; + break; + } else { + if (*(p + 1) == '\0') { + *p = '\0'; + continue; + } + + pname = p + 1; + break; + } + } + + return (pname); +} + +const char * +uu_getpname(void) +{ + return (pname); +} diff --git a/lib/libuutil/common/uu_string.c b/lib/libuutil/common/uu_string.c new file mode 100644 index 0000000..66afba0 --- /dev/null +++ b/lib/libuutil/common/uu_string.c @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * String helper functions + */ + +#include +#include +#include +#include +#include +#include "libuutil.h" + +/* Return true if strings are equal */ +boolean_t +uu_streq(const char *a, const char *b) +{ + return (strcmp(a, b) == 0); +} + +/* Return true if strings are equal, case-insensitively */ +boolean_t +uu_strcaseeq(const char *a, const char *b) +{ + return (strcasecmp(a, b) == 0); +} + +/* Return true if string a Begins With string b */ +boolean_t +uu_strbw(const char *a, const char *b) +{ + return (strncmp(a, b, strlen(b)) == 0); +} diff --git a/lib/libuutil/common/uu_strtoint.c b/lib/libuutil/common/uu_strtoint.c new file mode 100644 index 0000000..8fd1148 --- /dev/null +++ b/lib/libuutil/common/uu_strtoint.c @@ -0,0 +1,300 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "libuutil_common.h" + +#include +#include + +#define MAX_BASE 36 + +#define IS_DIGIT(x) ((x) >= '0' && (x) <= '9') + +#define CTOI(x) (((x) >= '0' && (x) <= '9') ? (x) - '0' : \ + ((x) >= 'a' && (x) <= 'z') ? (x) + 10 - 'a' : (x) + 10 - 'A') + +static int +strtoint(const char *s_arg, uint64_t *out, uint32_t base, int sign) +{ + const unsigned char *s = (const unsigned char *)s_arg; + + uint64_t val = 0; + uint64_t multmax; + + unsigned c, i; + + int neg = 0; + + int bad_digit = 0; + int bad_char = 0; + int overflow = 0; + + if (s == NULL || base == 1 || base > MAX_BASE) { + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (-1); + } + + while ((c = *s) != 0 && isspace(c)) + s++; + + switch (c) { + case '-': + if (!sign) + overflow = 1; /* becomes underflow below */ + neg = 1; + /*FALLTHRU*/ + case '+': + c = *++s; + break; + default: + break; + } + + if (c == '\0') { + uu_set_error(UU_ERROR_EMPTY); + return (-1); + } + + if (base == 0) { + if (c != '0') + base = 10; + else if (s[1] == 'x' || s[1] == 'X') + base = 16; + else + base = 8; + } + + if (base == 16 && c == '0' && (s[1] == 'x' || s[1] == 'X')) + c = *(s += 2); + + if ((val = CTOI(c)) >= base) { + if (IS_DIGIT(c)) + bad_digit = 1; + else + bad_char = 1; + val = 0; + } + + multmax = (uint64_t)UINT64_MAX / (uint64_t)base; + + for (c = *++s; c != '\0'; c = *++s) { + if ((i 
= CTOI(c)) >= base) { + if (isspace(c)) + break; + if (IS_DIGIT(c)) + bad_digit = 1; + else + bad_char = 1; + i = 0; + } + + if (val > multmax) + overflow = 1; + + val *= base; + if ((uint64_t)UINT64_MAX - val < (uint64_t)i) + overflow = 1; + + val += i; + } + + while ((c = *s) != 0) { + if (!isspace(c)) + bad_char = 1; + s++; + } + + if (sign) { + if (neg) { + if (val > -(uint64_t)INT64_MIN) + overflow = 1; + } else { + if (val > INT64_MAX) + overflow = 1; + } + } + + if (neg) + val = -val; + + if (bad_char | bad_digit | overflow) { + if (bad_char) + uu_set_error(UU_ERROR_INVALID_CHAR); + else if (bad_digit) + uu_set_error(UU_ERROR_INVALID_DIGIT); + else if (overflow) { + if (neg) + uu_set_error(UU_ERROR_UNDERFLOW); + else + uu_set_error(UU_ERROR_OVERFLOW); + } + return (-1); + } + + *out = val; + return (0); +} + +int +uu_strtoint(const char *s, void *v, size_t sz, int base, + int64_t min, int64_t max) +{ + uint64_t val_u; + int64_t val; + + if (min > max) + goto bad_argument; + + switch (sz) { + case 1: + if (max > INT8_MAX || min < INT8_MIN) + goto bad_argument; + break; + case 2: + if (max > INT16_MAX || min < INT16_MIN) + goto bad_argument; + break; + case 4: + if (max > INT32_MAX || min < INT32_MIN) + goto bad_argument; + break; + case 8: + if (max > INT64_MAX || min < INT64_MIN) + goto bad_argument; + break; + default: + goto bad_argument; + } + + if (min == 0 && max == 0) { + min = -(1ULL << (8 * sz - 1)); + max = (1ULL << (8 * sz - 1)) - 1; + } + + if (strtoint(s, &val_u, base, 1) == -1) + return (-1); + + val = (int64_t)val_u; + + if (val < min) { + uu_set_error(UU_ERROR_UNDERFLOW); + return (-1); + } else if (val > max) { + uu_set_error(UU_ERROR_OVERFLOW); + return (-1); + } + + switch (sz) { + case 1: + *(int8_t *)v = val; + return (0); + case 2: + *(int16_t *)v = val; + return (0); + case 4: + *(int32_t *)v = val; + return (0); + case 8: + *(int64_t *)v = val; + return (0); + default: + break; /* fall through to bad_argument */ + } + +bad_argument: + 
uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (-1); +} + +int +uu_strtouint(const char *s, void *v, size_t sz, int base, + uint64_t min, uint64_t max) +{ + uint64_t val; + + if (min > max) + goto bad_argument; + + switch (sz) { + case 1: + if (max > UINT8_MAX) + goto bad_argument; + break; + case 2: + if (max > UINT16_MAX) + goto bad_argument; + break; + case 4: + if (max > UINT32_MAX) + goto bad_argument; + break; + case 8: + if (max > UINT64_MAX) + goto bad_argument; + break; + default: + goto bad_argument; + } + + if (min == 0 && max == 0) { + /* we have to be careful, since << can overflow */ + max = (1ULL << (8 * sz - 1)) * 2 - 1; + } + + if (strtoint(s, &val, base, 0) == -1) + return (-1); + + if (val < min) { + uu_set_error(UU_ERROR_UNDERFLOW); + return (-1); + } else if (val > max) { + uu_set_error(UU_ERROR_OVERFLOW); + return (-1); + } + + switch (sz) { + case 1: + *(uint8_t *)v = val; + return (0); + case 2: + *(uint16_t *)v = val; + return (0); + case 4: + *(uint32_t *)v = val; + return (0); + case 8: + *(uint64_t *)v = val; + return (0); + default: + break; /* shouldn't happen, fall through */ + } + +bad_argument: + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (-1); +} diff --git a/lib/libzfs/common/libzfs.h b/lib/libzfs/common/libzfs.h new file mode 100644 index 0000000..ea34cc9 --- /dev/null +++ b/lib/libzfs/common/libzfs.h @@ -0,0 +1,705 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _LIBZFS_H +#define _LIBZFS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Miscellaneous ZFS constants + */ +#define ZFS_MAXNAMELEN MAXNAMELEN +#define ZPOOL_MAXNAMELEN MAXNAMELEN +#define ZFS_MAXPROPLEN MAXPATHLEN +#define ZPOOL_MAXPROPLEN MAXPATHLEN + +/* + * libzfs errors + */ +enum { + EZFS_NOMEM = 2000, /* out of memory */ + EZFS_BADPROP, /* invalid property value */ + EZFS_PROPREADONLY, /* cannot set readonly property */ + EZFS_PROPTYPE, /* property does not apply to dataset type */ + EZFS_PROPNONINHERIT, /* property is not inheritable */ + EZFS_PROPSPACE, /* bad quota or reservation */ + EZFS_BADTYPE, /* dataset is not of appropriate type */ + EZFS_BUSY, /* pool or dataset is busy */ + EZFS_EXISTS, /* pool or dataset already exists */ + EZFS_NOENT, /* no such pool or dataset */ + EZFS_BADSTREAM, /* bad backup stream */ + EZFS_DSREADONLY, /* dataset is readonly */ + EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ + EZFS_INVALIDNAME, /* invalid dataset name */ + EZFS_BADRESTORE, /* unable to restore to destination */ + EZFS_BADBACKUP, /* backup failed */ + EZFS_BADTARGET, /* bad attach/detach/replace target */ + EZFS_NODEVICE, /* no such device in pool */ + EZFS_BADDEV, /* invalid device to add */ + EZFS_NOREPLICAS, /* no valid replicas */ + EZFS_RESILVERING, /* currently resilvering */ + EZFS_BADVERSION, /* unsupported version */ + EZFS_POOLUNAVAIL, /* pool is currently unavailable */ + EZFS_DEVOVERFLOW, /* too many devices in one vdev */ + EZFS_BADPATH, /* must be an absolute path */ + EZFS_CROSSTARGET, /* 
rename or clone across pool or dataset */ + EZFS_ZONED, /* used improperly in local zone */ + EZFS_MOUNTFAILED, /* failed to mount dataset */ + EZFS_UMOUNTFAILED, /* failed to unmount dataset */ + EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ + EZFS_SHARENFSFAILED, /* share(1M) failed */ + EZFS_PERM, /* permission denied */ + EZFS_NOSPC, /* out of space */ + EZFS_FAULT, /* bad address */ + EZFS_IO, /* I/O error */ + EZFS_INTR, /* signal received */ + EZFS_ISSPARE, /* device is a hot spare */ + EZFS_INVALCONFIG, /* invalid vdev configuration */ + EZFS_RECURSIVE, /* recursive dependency */ + EZFS_NOHISTORY, /* no history object */ + EZFS_POOLPROPS, /* couldn't retrieve pool props */ + EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ + EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ + EZFS_NAMETOOLONG, /* dataset name is too long */ + EZFS_OPENFAILED, /* open of device failed */ + EZFS_NOCAP, /* couldn't get capacity */ + EZFS_LABELFAILED, /* write of label failed */ + EZFS_BADWHO, /* invalid permission who */ + EZFS_BADPERM, /* invalid permission */ + EZFS_BADPERMSET, /* invalid permission set name */ + EZFS_NODELEGATION, /* delegated administration is disabled */ + EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ + EZFS_SHARESMBFAILED, /* failed to share over smb */ + EZFS_BADCACHE, /* bad cache file */ + EZFS_ISL2CACHE, /* device is for the level 2 ARC */ + EZFS_VDEVNOTSUP, /* unsupported vdev type */ + EZFS_NOTSUP, /* ops not supported on this dataset */ + EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ + EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ + EZFS_REFTAG_RELE, /* snapshot release: tag not found */ + EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ + EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ + EZFS_PIPEFAILED, /* pipe create failed */ + EZFS_THREADCREATEFAILED, /* thread create failed */ + EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ + 
EZFS_SCRUBBING, /* currently scrubbing */ + EZFS_NO_SCRUB, /* no active scrub */ + EZFS_DIFF, /* general failure of zfs diff */ + EZFS_DIFFDATA, /* bad zfs diff data */ + EZFS_POOLREADONLY, /* pool is in read-only mode */ + EZFS_UNKNOWN +}; + +/* + * The following data structures are all part + * of the zfs_allow_t data structure which is + * used for printing 'allow' permissions. + * It is a linked list of zfs_allow_t's which + * then contain avl tree's for user/group/sets/... + * and each one of the entries in those trees have + * avl tree's for the permissions they belong to and + * whether they are local,descendent or local+descendent + * permissions. The AVL trees are used primarily for + * sorting purposes, but also so that we can quickly find + * a given user and or permission. + */ +typedef struct zfs_perm_node { + avl_node_t z_node; + char z_pname[MAXPATHLEN]; +} zfs_perm_node_t; + +typedef struct zfs_allow_node { + avl_node_t z_node; + char z_key[MAXPATHLEN]; /* name, such as joe */ + avl_tree_t z_localdescend; /* local+descendent perms */ + avl_tree_t z_local; /* local permissions */ + avl_tree_t z_descend; /* descendent permissions */ +} zfs_allow_node_t; + +typedef struct zfs_allow { + struct zfs_allow *z_next; + char z_setpoint[MAXPATHLEN]; + avl_tree_t z_sets; + avl_tree_t z_crperms; + avl_tree_t z_user; + avl_tree_t z_group; + avl_tree_t z_everyone; +} zfs_allow_t; + +/* + * Basic handle types + */ +typedef struct zfs_handle zfs_handle_t; +typedef struct zpool_handle zpool_handle_t; +typedef struct libzfs_handle libzfs_handle_t; + +/* + * Library initialization + */ +extern libzfs_handle_t *libzfs_init(void); +extern void libzfs_fini(libzfs_handle_t *); + +extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *); +extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); + +extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); + +extern int libzfs_errno(libzfs_handle_t *); +extern const char *libzfs_error_action(libzfs_handle_t *); 
+extern const char *libzfs_error_description(libzfs_handle_t *); +extern void libzfs_mnttab_init(libzfs_handle_t *); +extern void libzfs_mnttab_fini(libzfs_handle_t *); +extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); +extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, + struct mnttab *); +extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, + const char *, const char *); +extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); + +/* + * Basic handle functions + */ +extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); +extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); +extern void zpool_close(zpool_handle_t *); +extern const char *zpool_get_name(zpool_handle_t *); +extern int zpool_get_state(zpool_handle_t *); +extern char *zpool_state_to_name(vdev_state_t, vdev_aux_t); +extern void zpool_free_handles(libzfs_handle_t *); + +/* + * Iterate over all active pools in the system. + */ +typedef int (*zpool_iter_f)(zpool_handle_t *, void *); +extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); + +/* + * Functions to create and destroy pools + */ +extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, + nvlist_t *, nvlist_t *); +extern int zpool_destroy(zpool_handle_t *); +extern int zpool_add(zpool_handle_t *, nvlist_t *); + +typedef struct splitflags { + /* do not split, but return the config that would be split off */ + int dryrun : 1; + + /* after splitting, import the pool */ + int import : 1; +} splitflags_t; + +/* + * Functions to manipulate pool and vdev state + */ +extern int zpool_scan(zpool_handle_t *, pool_scan_func_t); +extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); + +extern int zpool_vdev_online(zpool_handle_t *, const char *, int, + vdev_state_t *); +extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); +extern int zpool_vdev_attach(zpool_handle_t *, const char *, + const char *, nvlist_t *, int); 
+extern int zpool_vdev_detach(zpool_handle_t *, const char *); +extern int zpool_vdev_remove(zpool_handle_t *, const char *); +extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, + splitflags_t); + +extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); +extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); +extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); + +extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, + boolean_t *, boolean_t *); +extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, + boolean_t *, boolean_t *, boolean_t *); +extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); + +/* + * Functions to manage pool properties + */ +extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); +extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, + size_t proplen, zprop_source_t *); +extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, + zprop_source_t *); + +extern const char *zpool_prop_to_name(zpool_prop_t); +extern const char *zpool_prop_values(zpool_prop_t); + +/* + * Pool health statistics. + */ +typedef enum { + /* + * The following correspond to faults as defined in the (fault.fs.zfs.*) + * event namespace. Each is associated with a corresponding message ID. 
+ */ + ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ + ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ + ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ + ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ + ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ + ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ + ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ + ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ + ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ + ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ + ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ + ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ + ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ + ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ + + /* + * These faults have no corresponding message ID. At the time we are + * checking the status, the original reason for the FMA fault (I/O or + * checksum errors) has been lost. + */ + ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ + ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ + + /* + * The following are not faults per se, but still an error possibly + * requiring administrative attention. There is no corresponding + * message ID. + */ + ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */ + ZPOOL_STATUS_RESILVERING, /* device being resilvered */ + ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ + ZPOOL_STATUS_REMOVED_DEV, /* removed device */ + + /* + * Finally, the following indicates a healthy pool. + */ + ZPOOL_STATUS_OK +} zpool_status_t; + +extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); +extern zpool_status_t zpool_import_status(nvlist_t *, char **); +extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); + +/* + * Statistics and configuration functions.
+ */ +extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); +extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); +extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); + +/* + * Import and export functions + */ +extern int zpool_export(zpool_handle_t *, boolean_t); +extern int zpool_export_force(zpool_handle_t *); +extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, + char *altroot); +extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, + nvlist_t *, int); + +/* + * Search for pools to import + */ + +typedef struct importargs { + char **path; /* a list of paths to search */ + int paths; /* number of paths to search */ + char *poolname; /* name of a pool to find */ + uint64_t guid; /* guid of a pool to find */ + char *cachefile; /* cachefile to use for import */ + int can_be_active : 1; /* can the pool be active? */ + int unique : 1; /* does 'poolname' already exist? */ + int exists : 1; /* set on return if pool already exists */ +} importargs_t; + +extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); + +/* legacy pool search routines */ +extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); +extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, + char *, uint64_t); + +/* + * Miscellaneous pool functions + */ +struct zfs_cmd; + +extern const char *zfs_history_event_names[LOG_END]; + +extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, + boolean_t verbose); +extern int zpool_upgrade(zpool_handle_t *, uint64_t); +extern int zpool_get_history(zpool_handle_t *, nvlist_t **); +extern int zpool_history_unpack(char *, uint64_t, uint64_t *, + nvlist_t ***, uint_t *); +extern void zpool_set_history_str(const char *subcommand, int argc, + char **argv, char *history_str); +extern int zpool_stage_history(libzfs_handle_t *, const char *); +extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, + size_t len); 
+extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); +extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); +extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, + nvlist_t *); + +/* + * Basic handle manipulations. These functions do not create or destroy the + * underlying datasets, only the references to them. + */ +extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); +extern void zfs_close(zfs_handle_t *); +extern zfs_type_t zfs_get_type(const zfs_handle_t *); +extern const char *zfs_get_name(const zfs_handle_t *); +extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); + +/* + * Property management functions. Some functions are shared with the kernel, + * and are found in sys/fs/zfs.h. + */ + +/* + * zfs dataset property management + */ +extern const char *zfs_prop_default_string(zfs_prop_t); +extern uint64_t zfs_prop_default_numeric(zfs_prop_t); +extern const char *zfs_prop_column_name(zfs_prop_t); +extern boolean_t zfs_prop_align_right(zfs_prop_t); + +extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, + nvlist_t *, uint64_t, zfs_handle_t *, const char *); + +extern const char *zfs_prop_to_name(zfs_prop_t); +extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); +extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, + zprop_source_t *, char *, size_t, boolean_t); +extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, + boolean_t); +extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, + zprop_source_t *, char *, size_t); +extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue); +extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal); +extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); +extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); +extern const char 
*zfs_prop_values(zfs_prop_t); +extern int zfs_prop_is_string(zfs_prop_t prop); +extern nvlist_t *zfs_get_user_props(zfs_handle_t *); +extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); + +typedef struct zprop_list { + int pl_prop; + char *pl_user_prop; + struct zprop_list *pl_next; + boolean_t pl_all; + size_t pl_width; + size_t pl_recvd_width; + boolean_t pl_fixed; +} zprop_list_t; + +extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t); +extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); + +#define ZFS_MOUNTPOINT_NONE "none" +#define ZFS_MOUNTPOINT_LEGACY "legacy" + +/* + * zpool property management + */ +extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); +extern const char *zpool_prop_default_string(zpool_prop_t); +extern uint64_t zpool_prop_default_numeric(zpool_prop_t); +extern const char *zpool_prop_column_name(zpool_prop_t); +extern boolean_t zpool_prop_align_right(zpool_prop_t); + +/* + * Functions shared by zfs and zpool property management. + */ +extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all, + boolean_t ordered, zfs_type_t type); +extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, + zfs_type_t); +extern void zprop_free_list(zprop_list_t *); + +#define ZFS_GET_NCOLS 5 + +typedef enum { + GET_COL_NONE, + GET_COL_NAME, + GET_COL_PROPERTY, + GET_COL_VALUE, + GET_COL_RECVD, + GET_COL_SOURCE +} zfs_get_column_t; + +/* + * Functions for printing zfs or zpool properties + */ +typedef struct zprop_get_cbdata { + int cb_sources; + zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; + int cb_colwidths[ZFS_GET_NCOLS + 1]; + boolean_t cb_scripted; + boolean_t cb_literal; + boolean_t cb_first; + zprop_list_t *cb_proplist; + zfs_type_t cb_type; +} zprop_get_cbdata_t; + +void zprop_print_one_property(const char *, zprop_get_cbdata_t *, + const char *, const char *, zprop_source_t, const char *, + const char *); + +/* + * Iterator functions. 
+ */ +typedef int (*zfs_iter_f)(zfs_handle_t *, void *); +extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); +extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *); + +typedef struct get_all_cb { + zfs_handle_t **cb_handles; + size_t cb_alloc; + size_t cb_used; + boolean_t cb_verbose; + int (*cb_getone)(zfs_handle_t *, void *); +} get_all_cb_t; + +void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); +int libzfs_dataset_cmp(const void *, const void *); + +/* + * Functions to create and destroy datasets. + */ +extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, + nvlist_t *); +extern int zfs_create_ancestors(libzfs_handle_t *, const char *); +extern int zfs_destroy(zfs_handle_t *, boolean_t); +extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); +extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); +extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); +extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); +extern int zfs_rename(zfs_handle_t *, const char *, boolean_t); + +typedef struct sendflags { + /* print informational messages (ie, -v was specified) */ + int verbose : 1; + + /* recursive send (ie, -R) */ + int replicate : 1; + + /* for incrementals, do all intermediate snapshots */ + int doall : 1; /* (ie, -I) */ + + /* if dataset is a clone, do incremental from its origin */ + int fromorigin : 1; + + /* do deduplication */ + int dedup : 1; + + /* send properties (ie, -p) */ + int props : 1; +} sendflags_t; + +typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); + +extern int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, + sendflags_t 
flags, int outfd, snapfilter_cb_t filter_func, + void *cb_arg, nvlist_t **debugnvp); + +extern int zfs_promote(zfs_handle_t *); +extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, + boolean_t, boolean_t, int, uint64_t, uint64_t); +extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); +extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); + +typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, + uid_t rid, uint64_t space); + +extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg); + +typedef struct recvflags { + /* print informational messages (ie, -v was specified) */ + int verbose : 1; + + /* the destination is a prefix, not the exact fs (ie, -d) */ + int isprefix : 1; + + /* + * Only the tail of the sent snapshot path is appended to the + * destination to determine the received snapshot name (ie, -e). + */ + int istail : 1; + + /* do not actually do the recv, just check if it would work (ie, -n) */ + int dryrun : 1; + + /* rollback/destroy filesystems as necessary (eg, -F) */ + int force : 1; + + /* set "canmount=off" on all modified filesystems */ + int canmountoff : 1; + + /* byteswap flag is used internally; callers need not specify */ + int byteswap : 1; + + /* do not mount file systems as they are extracted (private) */ + int nomount : 1; +} recvflags_t; + +extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t, + int, avl_tree_t *); + +typedef enum diff_flags { + ZFS_DIFF_PARSEABLE = 0x1, + ZFS_DIFF_TIMESTAMP = 0x2, + ZFS_DIFF_CLASSIFY = 0x4 +} diff_flags_t; + +extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, + int); + +/* + * Miscellaneous functions. 
+ */ +extern const char *zfs_type_to_name(zfs_type_t); +extern void zfs_refresh_properties(zfs_handle_t *); +extern int zfs_name_valid(const char *, zfs_type_t); +extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t); +extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, + zfs_type_t); +extern int zfs_spa_version(zfs_handle_t *, int *); + +/* + * Mount support functions. + */ +extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); +extern boolean_t zfs_is_mounted(zfs_handle_t *, char **); +extern int zfs_mount(zfs_handle_t *, const char *, int); +extern int zfs_unmount(zfs_handle_t *, const char *, int); +extern int zfs_unmountall(zfs_handle_t *, int); + +/* + * Share support functions. + */ +extern boolean_t zfs_is_shared(zfs_handle_t *); +extern int zfs_share(zfs_handle_t *); +extern int zfs_unshare(zfs_handle_t *); + +/* + * Protocol-specific share support functions. + */ +extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); +extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); +extern int zfs_share_nfs(zfs_handle_t *); +extern int zfs_share_smb(zfs_handle_t *); +extern int zfs_shareall(zfs_handle_t *); +extern int zfs_unshare_nfs(zfs_handle_t *, const char *); +extern int zfs_unshare_smb(zfs_handle_t *, const char *); +extern int zfs_unshareall_nfs(zfs_handle_t *); +extern int zfs_unshareall_smb(zfs_handle_t *); +extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); +extern int zfs_unshareall(zfs_handle_t *); +extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, + void *, void *, int, zfs_share_op_t); + +/* + * When dealing with nvlists, verify() is extremely useful + */ +#ifdef NDEBUG +#define verify(EX) ((void)(EX)) +#else +#define verify(EX) assert(EX) +#endif + +/* + * Utility function to convert a number to a human-readable form. 
+ */ +extern void zfs_nicenum(uint64_t, char *, size_t); +extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); + +/* + * Given a device or file, determine if it is part of a pool. + */ +extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, + boolean_t *); + +/* + * Label manipulation. + */ +extern int zpool_read_label(int, nvlist_t **); +extern int zpool_clear_label(int); + +/* is this zvol valid for use as a dump device? */ +extern int zvol_check_dump_config(char *); + +/* + * Management interfaces for SMB ACL files + */ + +int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); +int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); + +/* + * Enable and disable datasets within a pool by mounting/unmounting and + * sharing/unsharing them. + */ +extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); +extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); + +/* + * Mappings between vdev and FRU. + */ +extern void libzfs_fru_refresh(libzfs_handle_t *); +extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *); +extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *); +extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *, + const char *); +extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *); +extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_H */ diff --git a/lib/libzfs/common/libzfs_changelist.c b/lib/libzfs/common/libzfs_changelist.c new file mode 100644 index 0000000..4328d38 --- /dev/null +++ b/lib/libzfs/common/libzfs_changelist.c @@ -0,0 +1,693 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Portions Copyright 2007 Ramprakash Jelari + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "libzfs_impl.h" + +/* + * Structure to keep track of dataset state. Before changing the 'sharenfs' or + * 'mountpoint' property, we record whether the filesystem was previously + * mounted/shared. This prior state dictates whether we remount/reshare the + * dataset after the property has been changed. + * + * The interface consists of the following sequence of functions: + * + * changelist_gather() + * changelist_prefix() + * < change property > + * changelist_postfix() + * changelist_free() + * + * Other interfaces: + * + * changelist_remove() - remove a node from a gathered list + * changelist_rename() - renames all datasets appropriately when doing a rename + * changelist_unshare() - unshares all the nodes in a given changelist + * changelist_haszonedchild() - check if there is any child exported to + * a local zone + */ +typedef struct prop_changenode { + zfs_handle_t *cn_handle; + int cn_shared; + int cn_mounted; + int cn_zoned; + boolean_t cn_needpost; /* is postfix() needed? 
*/ + uu_list_node_t cn_listnode; +} prop_changenode_t; + +struct prop_changelist { + zfs_prop_t cl_prop; + zfs_prop_t cl_realprop; + zfs_prop_t cl_shareprop; /* used with sharenfs/sharesmb */ + uu_list_pool_t *cl_pool; + uu_list_t *cl_list; + boolean_t cl_waslegacy; + boolean_t cl_allchildren; + boolean_t cl_alldependents; + int cl_mflags; /* Mount flags */ + int cl_gflags; /* Gather request flags */ + boolean_t cl_haszonedchild; + boolean_t cl_sorted; +}; + +/* + * Pre-change pass over a gathered changelist. Only 'mountpoint' and + * 'sharesmb' need any work here: for 'mountpoint' every filesystem in the + * list is unmounted, and for 'sharesmb' every filesystem is unshared over SMB + * so that old resource names are removed. 'sharenfs' is left alone because + * it can simply be re-shared with different options without interrupting + * service. + * + * Returns 0 on success. If an unmount fails, the remaining nodes are marked + * as not needing post-processing, changelist_postfix() is run over the nodes + * that are still flagged, and -1 is returned. + */ +int +changelist_prefix(prop_changelist_t *clp) +{ + prop_changenode_t *node; + boolean_t failed = B_FALSE; + + if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && + clp->cl_prop != ZFS_PROP_SHARESMB) + return (0); + + for (node = uu_list_first(clp->cl_list); node != NULL; + node = uu_list_next(clp->cl_list, node)) { + + /* after a failed unmount, only clear the flag on the rest */ + if (failed) { + node->cn_needpost = B_FALSE; + continue; + } + + /* + * From the global zone, leave datasets that are exported + * to a local zone untouched. + */ + if (getzoneid() == GLOBAL_ZONEID && node->cn_zoned) + continue; + + if (!ZFS_IS_VOLUME(node->cn_handle)) { + /* + * Property specific processing. + */ + if (clp->cl_prop == ZFS_PROP_MOUNTPOINT) { + if (zfs_unmount(node->cn_handle, NULL, + clp->cl_mflags) != 0) { + failed = B_TRUE; + node->cn_needpost = B_FALSE; + } + } else if (clp->cl_prop == ZFS_PROP_SHARESMB) { + (void) zfs_unshare_smb(node->cn_handle, NULL); + } + } + } + + if (!failed) + return (0); + + (void) changelist_postfix(clp); + return (-1); +} + +/* + * If the property is 'mountpoint' or 'sharenfs', go through and remount and/or + * reshare the filesystems as necessary.
In changelist_gather() we recorded + * whether the filesystem was previously shared or mounted. The action we take + * depends on the previous state, and whether the value was previously 'legacy'. + * For non-legacy properties, we only remount/reshare the filesystem if it was + * previously mounted/shared. Otherwise, we always remount/reshare the + * filesystem. + * + * Every dataset is visited even after a failure. Returns 0 when the list is + * empty or all mount, share and unshare steps succeeded, and -1 when at least + * one of them failed. + */ +int +changelist_postfix(prop_changelist_t *clp) +{ + prop_changenode_t *cn; + char shareopts[ZFS_MAXPROPLEN]; + int errors = 0; + libzfs_handle_t *hdl; + + /* + * If we're changing the mountpoint, attempt to destroy the underlying + * mountpoint. All other datasets will have inherited from this dataset + * (in which case their mountpoints exist in the filesystem in the new + * location), or have explicit mountpoints set (in which case they won't + * be in the changelist). + */ + if ((cn = uu_list_last(clp->cl_list)) == NULL) + return (0); + + if (clp->cl_prop == ZFS_PROP_MOUNTPOINT) + remove_mountpoint(cn->cn_handle); + + /* + * It is possible that the changelist_prefix() used libshare + * to unshare some entries. Since libshare caches data, an + * attempt to reshare during postfix can fail unless libshare + * is uninitialized here so that it will reinitialize later. + */ + if (cn->cn_handle != NULL) { + hdl = cn->cn_handle->zfs_hdl; + assert(hdl != NULL); + zfs_uninit_libshare(hdl); + } + + /* + * We walk the datasets in reverse, because we want to mount any parent + * datasets before mounting the children. We walk all datasets even if + * there are errors. + */ + for (cn = uu_list_last(clp->cl_list); cn != NULL; + cn = uu_list_prev(clp->cl_list, cn)) { + + boolean_t sharenfs; + boolean_t sharesmb; + boolean_t mounted; + + /* + * If we are in the global zone, but this dataset is exported + * to a local zone, do nothing.
+ */ + if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) + continue; + + /* Only do post-processing if it's required */ + if (!cn->cn_needpost) + continue; + cn->cn_needpost = B_FALSE; + + zfs_refresh_properties(cn->cn_handle); + + if (ZFS_IS_VOLUME(cn->cn_handle)) + continue; + + /* + * Remount if previously mounted or mountpoint was legacy, + * or sharenfs or sharesmb property is set. + */ + sharenfs = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARENFS, + shareopts, sizeof (shareopts), NULL, NULL, 0, + B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); + + sharesmb = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARESMB, + shareopts, sizeof (shareopts), NULL, NULL, 0, + B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); + + mounted = zfs_is_mounted(cn->cn_handle, NULL); + + if (!mounted && (cn->cn_mounted || + ((sharenfs || sharesmb || clp->cl_waslegacy) && + (zfs_prop_get_int(cn->cn_handle, + ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { + + if (zfs_mount(cn->cn_handle, NULL, 0) != 0) + errors++; + else + mounted = B_TRUE; + } + + /* + * If the file system is mounted we always re-share even + * if the filesystem is currently shared, so that we can + * adopt any new options. + * + * Each failing step is counted as exactly one error, + * whatever non-zero value the callee returned, so that + * results of opposite sign can never cancel each other + * (or a failed mount counted above) and hide a failure. + */ + if (sharenfs && mounted) { + if (zfs_share_nfs(cn->cn_handle) != 0) + errors++; + } else if (cn->cn_shared || clp->cl_waslegacy) { + if (zfs_unshare_nfs(cn->cn_handle, NULL) != 0) + errors++; + } + if (sharesmb && mounted) { + if (zfs_share_smb(cn->cn_handle) != 0) + errors++; + } else if (cn->cn_shared || clp->cl_waslegacy) { + if (zfs_unshare_smb(cn->cn_handle, NULL) != 0) + errors++; + } + } + + return (errors ? -1 : 0); +} + +/* + * Is this "dataset" a child of "parent"?
+ */ +boolean_t +isa_child_of(const char *dataset, const char *parent) +{ + /* strlen() yields a size_t; keep it in that type for strncmp() */ + size_t len = strlen(parent); + + if (strncmp(dataset, parent, len) == 0 && + (dataset[len] == '@' || dataset[len] == '/' || + dataset[len] == '\0')) + return (B_TRUE); + else + return (B_FALSE); + +} + +/* + * If we rename a filesystem, child filesystem handles are no longer valid + * since we identify each dataset by its name in the ZFS namespace. As a + * result, we have to go through and fix up all the names appropriately. We + * could do this automatically if libzfs kept track of all open handles, but + * this is a lot less work. + * + * For every node whose name lies under 'src', the leading 'src' is replaced + * by 'dst' and the result is stored back into the handle. The new name is + * assembled with size-bounded copies, so an over-long result is truncated to + * the size of the local buffer instead of overrunning it. + */ +void +changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) +{ + prop_changenode_t *cn; + char newname[ZFS_MAXNAMELEN]; + + for (cn = uu_list_first(clp->cl_list); cn != NULL; + cn = uu_list_next(clp->cl_list, cn)) { + /* + * Do not rename a clone that's not in the source hierarchy. + */ + if (!isa_child_of(cn->cn_handle->zfs_name, src)) + continue; + + /* + * Destroy the previous mountpoint if needed. + */ + remove_mountpoint(cn->cn_handle); + + /* dst followed by whatever came after src in the old name */ + (void) strlcpy(newname, dst, sizeof (newname)); + (void) strlcat(newname, cn->cn_handle->zfs_name + strlen(src), + sizeof (newname)); + + (void) strlcpy(cn->cn_handle->zfs_name, newname, + sizeof (cn->cn_handle->zfs_name)); + } +} + +/* + * Given a gathered changelist for the 'sharenfs' or 'sharesmb' property, + * unshare all the datasets in the list. + */ +int +changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) +{ + prop_changenode_t *cn; + int ret = 0; + + if (clp->cl_prop != ZFS_PROP_SHARENFS && + clp->cl_prop != ZFS_PROP_SHARESMB) + return (0); + + for (cn = uu_list_first(clp->cl_list); cn != NULL; + cn = uu_list_next(clp->cl_list, cn)) { + if (zfs_unshare_proto(cn->cn_handle, NULL, proto) != 0) + ret = -1; + } + + return (ret); +} + +/* + * Check if there is any child exported to a local zone in a given changelist.
+ * This information has already been recorded while gathering the changelist + * via changelist_gather(). + */ +int +changelist_haszonedchild(prop_changelist_t *clp) +{ + return (clp->cl_haszonedchild); +} + +/* + * Remove a node from a gathered list. + */ +void +changelist_remove(prop_changelist_t *clp, const char *name) +{ + prop_changenode_t *cn; + + for (cn = uu_list_first(clp->cl_list); cn != NULL; + cn = uu_list_next(clp->cl_list, cn)) { + + if (strcmp(cn->cn_handle->zfs_name, name) == 0) { + uu_list_remove(clp->cl_list, cn); + zfs_close(cn->cn_handle); + free(cn); + return; + } + } +} + +/* + * Release any memory associated with a changelist. + */ +void +changelist_free(prop_changelist_t *clp) +{ + prop_changenode_t *cn; + void *cookie; + + if (clp->cl_list) { + cookie = NULL; + while ((cn = uu_list_teardown(clp->cl_list, &cookie)) != NULL) { + zfs_close(cn->cn_handle); + free(cn); + } + + uu_list_destroy(clp->cl_list); + } + if (clp->cl_pool) + uu_list_pool_destroy(clp->cl_pool); + + free(clp); +} + +/* + * Iteration callback used while gathering: decide whether 'zhp' belongs in + * the changelist passed as 'data' and, if so, add a node for it and (unless + * cl_alldependents is set) continue with its children. The handle is always + * consumed: it is either stored in the new node or closed. Returns 0 when + * the dataset is skipped or added without recursing, -1 if the node cannot + * be allocated, and otherwise the result of zfs_iter_children(). + */ +static int +change_one(zfs_handle_t *zhp, void *data) +{ + prop_changelist_t *clp = data; + char property[ZFS_MAXPROPLEN]; + char where[64]; + prop_changenode_t *cn; + /* + * Neither lookup below is unconditional (the first is skipped for a + * volume rename, the second when no share property is tracked), so + * start both from a known value instead of leaving them unset. + */ + zprop_source_t sourcetype = ZPROP_SRC_NONE; + zprop_source_t share_sourcetype = ZPROP_SRC_NONE; + + /* + * We only want to unmount/unshare those filesystems that may inherit + * from the target filesystem. If we find any filesystem with a + * locally set mountpoint, we ignore any children since changing the + * property will not affect them. If this is a rename, we iterate + * over all children regardless, since we need them unmounted in + * order to do the rename. Also, if this is a volume and we're doing + * a rename, then always add it to the changelist.
+ */ + + if (!(ZFS_IS_VOLUME(zhp) && clp->cl_realprop == ZFS_PROP_NAME) && + zfs_prop_get(zhp, clp->cl_prop, property, + sizeof (property), &sourcetype, where, sizeof (where), + B_FALSE) != 0) { + zfs_close(zhp); + return (0); + } + + /* + * If we are "watching" sharenfs or sharesmb + * then check out the companion property which is tracked + * in cl_shareprop + */ + if (clp->cl_shareprop != ZPROP_INVAL && + zfs_prop_get(zhp, clp->cl_shareprop, property, + sizeof (property), &share_sourcetype, where, sizeof (where), + B_FALSE) != 0) { + zfs_close(zhp); + return (0); + } + + if (clp->cl_alldependents || clp->cl_allchildren || + sourcetype == ZPROP_SRC_DEFAULT || + sourcetype == ZPROP_SRC_INHERITED || + (clp->cl_shareprop != ZPROP_INVAL && + (share_sourcetype == ZPROP_SRC_DEFAULT || + share_sourcetype == ZPROP_SRC_INHERITED))) { + if ((cn = zfs_alloc(zfs_get_handle(zhp), + sizeof (prop_changenode_t))) == NULL) { + zfs_close(zhp); + return (-1); + } + + cn->cn_handle = zhp; + cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) || + zfs_is_mounted(zhp, NULL); + cn->cn_shared = zfs_is_shared(zhp); + cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + cn->cn_needpost = B_TRUE; + + /* Indicate if any child is exported to a local zone. */ + if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) + clp->cl_haszonedchild = B_TRUE; + + uu_list_node_init(cn, &cn->cn_listnode, clp->cl_pool); + + if (clp->cl_sorted) { + uu_list_index_t idx; + + (void) uu_list_find(clp->cl_list, cn, NULL, + &idx); + uu_list_insert(clp->cl_list, cn, idx); + } else { + /* + * Add this child to beginning of the list. Children + * below this one in the hierarchy will get added above + * this one in the list. This produces a list in + * reverse dataset name order. + * This is necessary when the original mountpoint + * is legacy or none.
+ */ + ASSERT(!clp->cl_alldependents); + verify(uu_list_insert_before(clp->cl_list, + uu_list_first(clp->cl_list), cn) == 0); + } + + if (!clp->cl_alldependents) + return (zfs_iter_children(zhp, change_one, data)); + } else { + zfs_close(zhp); + } + + return (0); +} + +/*ARGSUSED*/ +static int +compare_mountpoints(const void *a, const void *b, void *unused) +{ + const prop_changenode_t *ca = a; + const prop_changenode_t *cb = b; + + char mounta[MAXPATHLEN]; + char mountb[MAXPATHLEN]; + + boolean_t hasmounta, hasmountb; + + /* + * When unsharing or unmounting filesystems, we need to do it in + * mountpoint order. This allows the user to have a mountpoint + * hierarchy that is different from the dataset hierarchy, and still + * allow it to be changed. However, if either dataset doesn't have a + * mountpoint (because it is a volume or a snapshot), we place it at the + * end of the list, because it doesn't affect our change at all. + */ + hasmounta = (zfs_prop_get(ca->cn_handle, ZFS_PROP_MOUNTPOINT, mounta, + sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0); + hasmountb = (zfs_prop_get(cb->cn_handle, ZFS_PROP_MOUNTPOINT, mountb, + sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0); + + if (!hasmounta && hasmountb) + return (-1); + else if (hasmounta && !hasmountb) + return (1); + else if (!hasmounta && !hasmountb) + return (0); + else + return (strcmp(mountb, mounta)); +} + +/* + * Given a ZFS handle and a property, construct a complete list of datasets + * that need to be modified as part of this process. For anything but the + * 'mountpoint' and 'sharenfs' properties, this just returns an empty list. + * Otherwise, we iterate over all children and look for any datasets that + * inherit the property. For each such dataset, we add it to the list and + * mark whether it was shared beforehand.
+ */ +prop_changelist_t * +changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, + int mnt_flags) +{ + prop_changelist_t *clp; + prop_changenode_t *cn; + zfs_handle_t *temp; + char property[ZFS_MAXPROPLEN]; + uu_compare_fn_t *compare = NULL; + boolean_t legacy = B_FALSE; + + if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL) + return (NULL); + + /* + * For mountpoint-related tasks, we want to sort everything by + * mountpoint, so that we mount and unmount them in the appropriate + * order, regardless of their position in the hierarchy. + */ + if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED || + prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS || + prop == ZFS_PROP_SHARESMB) { + + if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + property, sizeof (property), + NULL, NULL, 0, B_FALSE) == 0 && + (strcmp(property, "legacy") == 0 || + strcmp(property, "none") == 0)) { + + legacy = B_TRUE; + } + if (!legacy) { + compare = compare_mountpoints; + clp->cl_sorted = B_TRUE; + } + } + + clp->cl_pool = uu_list_pool_create("changelist_pool", + sizeof (prop_changenode_t), + offsetof(prop_changenode_t, cn_listnode), + compare, 0); + if (clp->cl_pool == NULL) { + assert(uu_error() == UU_ERROR_NO_MEMORY); + (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error"); + changelist_free(clp); + return (NULL); + } + + clp->cl_list = uu_list_create(clp->cl_pool, NULL, + clp->cl_sorted ? UU_LIST_SORTED : 0); + clp->cl_gflags = gather_flags; + clp->cl_mflags = mnt_flags; + + if (clp->cl_list == NULL) { + assert(uu_error() == UU_ERROR_NO_MEMORY); + (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error"); + changelist_free(clp); + return (NULL); + } + + /* + * If this is a rename or the 'zoned' property, we pretend we're + * changing the mountpoint and flag it so we can catch all children in + * change_one(). + * + * Flag cl_alldependents to catch all children plus the dependents + * (clones) that are not in the hierarchy. 
+ */ + if (prop == ZFS_PROP_NAME) { + clp->cl_prop = ZFS_PROP_MOUNTPOINT; + clp->cl_alldependents = B_TRUE; + } else if (prop == ZFS_PROP_ZONED) { + clp->cl_prop = ZFS_PROP_MOUNTPOINT; + clp->cl_allchildren = B_TRUE; + } else if (prop == ZFS_PROP_CANMOUNT) { + clp->cl_prop = ZFS_PROP_MOUNTPOINT; + } else if (prop == ZFS_PROP_VOLSIZE) { + clp->cl_prop = ZFS_PROP_MOUNTPOINT; + } else { + clp->cl_prop = prop; + } + clp->cl_realprop = prop; + + if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && + clp->cl_prop != ZFS_PROP_SHARENFS && + clp->cl_prop != ZFS_PROP_SHARESMB) + return (clp); + + /* + * If watching SHARENFS or SHARESMB then + * also watch its companion property. + */ + if (clp->cl_prop == ZFS_PROP_SHARENFS) + clp->cl_shareprop = ZFS_PROP_SHARESMB; + else if (clp->cl_prop == ZFS_PROP_SHARESMB) + clp->cl_shareprop = ZFS_PROP_SHARENFS; + + if (clp->cl_alldependents) { + if (zfs_iter_dependents(zhp, B_TRUE, change_one, clp) != 0) { + changelist_free(clp); + return (NULL); + } + } else if (zfs_iter_children(zhp, change_one, clp) != 0) { + changelist_free(clp); + return (NULL); + } + + /* + * We have to re-open ourselves because we auto-close all the handles + * and can't tell the difference. + */ + if ((temp = zfs_open(zhp->zfs_hdl, zfs_get_name(zhp), + ZFS_TYPE_DATASET)) == NULL) { + changelist_free(clp); + return (NULL); + } + + /* + * Always add ourself to the list. We add ourselves to the end so that + * we're the last to be unmounted. 
+ */ + if ((cn = zfs_alloc(zhp->zfs_hdl, + sizeof (prop_changenode_t))) == NULL) { + zfs_close(temp); + changelist_free(clp); + return (NULL); + } + + cn->cn_handle = temp; + cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) || + zfs_is_mounted(temp, NULL); + cn->cn_shared = zfs_is_shared(temp); + cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + cn->cn_needpost = B_TRUE; + + uu_list_node_init(cn, &cn->cn_listnode, clp->cl_pool); + if (clp->cl_sorted) { + uu_list_index_t idx; + (void) uu_list_find(clp->cl_list, cn, NULL, &idx); + uu_list_insert(clp->cl_list, cn, idx); + } else { + /* + * Add the target dataset to the end of the list. + * The list is not really unsorted. The list will be + * in reverse dataset name order. This is necessary + * when the original mountpoint is legacy or none. + */ + verify(uu_list_insert_after(clp->cl_list, + uu_list_last(clp->cl_list), cn) == 0); + } + + /* + * If the mountpoint property was previously 'legacy', or 'none', + * record it as the behavior of changelist_postfix() will be different. + */ + if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) { + /* + * do not automatically mount ex-legacy datasets if + * we specifically set canmount to noauto + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) != + ZFS_CANMOUNT_NOAUTO) + clp->cl_waslegacy = B_TRUE; + } + + return (clp); +} diff --git a/lib/libzfs/common/libzfs_config.c b/lib/libzfs/common/libzfs_config.c new file mode 100644 index 0000000..dc27238 --- /dev/null +++ b/lib/libzfs/common/libzfs_config.c @@ -0,0 +1,370 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * The pool configuration repository is stored in /etc/zfs/zpool.cache as a + * single packed nvlist. While it would be nice to just read in this + * file from userland, this wouldn't work from a local zone. So we have to have + * a zpool ioctl to return the complete configuration for all pools. In the + * global zone, this will be identical to reading the file and unpacking it in + * userland. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" + +typedef struct config_node { + char *cn_name; + nvlist_t *cn_config; + uu_avl_node_t cn_avl; +} config_node_t; + +/* ARGSUSED */ +static int +config_node_compare(const void *a, const void *b, void *unused) +{ + int ret; + + const config_node_t *ca = (config_node_t *)a; + const config_node_t *cb = (config_node_t *)b; + + ret = strcmp(ca->cn_name, cb->cn_name); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +void +namespace_clear(libzfs_handle_t *hdl) +{ + if (hdl->libzfs_ns_avl) { + config_node_t *cn; + void *cookie = NULL; + + while ((cn = uu_avl_teardown(hdl->libzfs_ns_avl, + &cookie)) != NULL) { + nvlist_free(cn->cn_config); + free(cn->cn_name); + free(cn); + } + + uu_avl_destroy(hdl->libzfs_ns_avl); + hdl->libzfs_ns_avl = NULL; + } + + if (hdl->libzfs_ns_avlpool) { + uu_avl_pool_destroy(hdl->libzfs_ns_avlpool); + hdl->libzfs_ns_avlpool = NULL; + } +} + +/* + * Loads the pool namespace, or re-loads it if the cache has changed. + */ +static int +namespace_reload(libzfs_handle_t *hdl) +{ + nvlist_t *config; + config_node_t *cn; + nvpair_t *elem; + zfs_cmd_t zc = { 0 }; + void *cookie; + + if (hdl->libzfs_ns_gen == 0) { + /* + * This is the first time we've accessed the configuration + * cache. Initialize the AVL tree and then fall through to the + * common code. 
+ */ + if ((hdl->libzfs_ns_avlpool = uu_avl_pool_create("config_pool", + sizeof (config_node_t), + offsetof(config_node_t, cn_avl), + config_node_compare, UU_DEFAULT)) == NULL) + return (no_memory(hdl)); + + if ((hdl->libzfs_ns_avl = uu_avl_create(hdl->libzfs_ns_avlpool, + NULL, UU_DEFAULT)) == NULL) + return (no_memory(hdl)); + } + + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) + return (-1); + + for (;;) { + zc.zc_cookie = hdl->libzfs_ns_gen; + if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_CONFIGS, &zc) != 0) { + switch (errno) { + case EEXIST: + /* + * The namespace hasn't changed. + */ + zcmd_free_nvlists(&zc); + return (0); + + case ENOMEM: + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + break; + + default: + zcmd_free_nvlists(&zc); + return (zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, "failed to read " + "pool configuration"))); + } + } else { + hdl->libzfs_ns_gen = zc.zc_cookie; + break; + } + } + + if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + + zcmd_free_nvlists(&zc); + + /* + * Clear out any existing configuration information. 
+ */ + cookie = NULL; + while ((cn = uu_avl_teardown(hdl->libzfs_ns_avl, &cookie)) != NULL) { + nvlist_free(cn->cn_config); + free(cn->cn_name); + free(cn); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(config, elem)) != NULL) { + nvlist_t *child; + uu_avl_index_t where; + + if ((cn = zfs_alloc(hdl, sizeof (config_node_t))) == NULL) { + nvlist_free(config); + return (-1); + } + + if ((cn->cn_name = zfs_strdup(hdl, + nvpair_name(elem))) == NULL) { + free(cn); + nvlist_free(config); + return (-1); + } + + verify(nvpair_value_nvlist(elem, &child) == 0); + if (nvlist_dup(child, &cn->cn_config, 0) != 0) { + free(cn->cn_name); + free(cn); + nvlist_free(config); + return (no_memory(hdl)); + } + verify(uu_avl_find(hdl->libzfs_ns_avl, cn, NULL, &where) + == NULL); + + uu_avl_insert(hdl->libzfs_ns_avl, cn, where); + } + + nvlist_free(config); + return (0); +} + +/* + * Retrieve the configuration for the given pool. The configuration is a nvlist + * describing the vdevs, as well as the statistics associated with each one. + */ +nvlist_t * +zpool_get_config(zpool_handle_t *zhp, nvlist_t **oldconfig) +{ + if (oldconfig) + *oldconfig = zhp->zpool_old_config; + return (zhp->zpool_config); +} + +/* + * Refresh the vdev statistics associated with the given pool. This is used in + * iostat to show configuration changes and determine the delta from the last + * time the function was called. This function can fail, in case the pool has + * been destroyed. 
+ */
+/*
+ * Returns -1 if the ioctl buffer cannot be allocated, grown, or unpacked.
+ * Returns 0 otherwise.  If the ioctl fails for any reason other than ENOMEM,
+ * the pool is marked POOL_STATE_UNAVAIL and 0 is returned; '*missing' is set
+ * to B_TRUE only when that failure was ENOENT or EINVAL.
+ */
+int
+zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing)
+{
+	zfs_cmd_t zc = { 0 };
+	int error;
+	nvlist_t *config;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	*missing = B_FALSE;
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+
+	/* First call on this handle: start with a 64K destination buffer. */
+	if (zhp->zpool_config_size == 0)
+		zhp->zpool_config_size = 1 << 16;
+
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size) != 0)
+		return (-1);
+
+	for (;;) {
+		int ioctl_errno;
+
+		if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_STATS, &zc) == 0) {
+			/*
+			 * The real error is returned in the zc_cookie field.
+			 */
+			error = zc.zc_cookie;
+			break;
+		}
+
+		/*
+		 * Capture errno immediately: the cleanup below releases
+		 * memory and is not guaranteed to leave errno untouched.
+		 */
+		ioctl_errno = errno;
+
+		if (ioctl_errno == ENOMEM) {
+			/* Buffer too small; grow it and retry the ioctl. */
+			if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+				zcmd_free_nvlists(&zc);
+				return (-1);
+			}
+			continue;
+		}
+
+		zcmd_free_nvlists(&zc);
+		if (ioctl_errno == ENOENT || ioctl_errno == EINVAL)
+			*missing = B_TRUE;
+		zhp->zpool_state = POOL_STATE_UNAVAIL;
+		return (0);
+	}
+
+	if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) {
+		zcmd_free_nvlists(&zc);
+		return (-1);
+	}
+
+	zcmd_free_nvlists(&zc);
+
+	/* Remember the size that worked so the next refresh starts there. */
+	zhp->zpool_config_size = zc.zc_nvlist_dst_size;
+
+	if (zhp->zpool_config != NULL) {
+		uint64_t oldtxg, newtxg;
+
+		verify(nvlist_lookup_uint64(zhp->zpool_config,
+		    ZPOOL_CONFIG_POOL_TXG, &oldtxg) == 0);
+		verify(nvlist_lookup_uint64(config,
+		    ZPOOL_CONFIG_POOL_TXG, &newtxg) == 0);
+
+		if (zhp->zpool_old_config != NULL)
+			nvlist_free(zhp->zpool_old_config);
+
+		/*
+		 * If the txg changed, the previous config is discarded and
+		 * no "old" config is kept; otherwise the previous config
+		 * becomes the old one.
+		 */
+		if (oldtxg != newtxg) {
+			nvlist_free(zhp->zpool_config);
+			zhp->zpool_old_config = NULL;
+		} else {
+			zhp->zpool_old_config = zhp->zpool_config;
+		}
+	}
+
+	zhp->zpool_config = config;
+	if (error)
+		zhp->zpool_state = POOL_STATE_UNAVAIL;
+	else
+		zhp->zpool_state = POOL_STATE_ACTIVE;
+
+	return (0);
+}
+
+/*
+ * Iterate over all pools in the system.
+ */
+int
+zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data)
+{
+	config_node_t *node;
+	zpool_handle_t *pool;
+	int rv = 0;
+
+	/*
+	 * Only the outermost zpool_iter() reloads the namespace.  A nested
+	 * call (libzfs_pool_iter already non-zero) walks the existing AVL
+	 * tree, because reloading would invalidate the tree the outer call
+	 * is still traversing.
+	 */
+	if (hdl->libzfs_pool_iter == 0 && namespace_reload(hdl) != 0)
+		return (-1);
+
+	hdl->libzfs_pool_iter++;
+	for (node = uu_avl_first(hdl->libzfs_ns_avl); node != NULL;
+	    node = uu_avl_next(hdl->libzfs_ns_avl, node)) {
+
+		if (zpool_open_silent(hdl, node->cn_name, &pool) != 0) {
+			rv = -1;
+			break;
+		}
+
+		/* The open succeeded but yielded no handle; skip this one. */
+		if (pool == NULL)
+			continue;
+
+		/* A non-zero callback result stops the walk and is returned. */
+		rv = func(pool, data);
+		if (rv != 0)
+			break;
+	}
+	hdl->libzfs_pool_iter--;
+
+	return (rv);
+}
+
+/*
+ * Walk the pool namespace and invoke 'func' on a dataset handle for each
+ * entry.  Each handle passed to the callback must be explicitly closed by the
+ * callback.  Entries for which no handle can be made are skipped; the first
+ * non-zero callback result ends the walk and is returned.
+ */
+int
+zfs_iter_root(libzfs_handle_t *hdl, zfs_iter_f func, void *data)
+{
+	config_node_t *node;
+
+	if (namespace_reload(hdl) != 0)
+		return (-1);
+
+	node = uu_avl_first(hdl->libzfs_ns_avl);
+	while (node != NULL) {
+		zfs_handle_t *root = make_dataset_handle(hdl, node->cn_name);
+
+		if (root != NULL) {
+			int rv = func(root, data);
+
+			if (rv != 0)
+				return (rv);
+		}
+
+		node = uu_avl_next(hdl->libzfs_ns_avl, node);
+	}
+
+	return (0);
+}
diff --git a/lib/libzfs/common/libzfs_dataset.c b/lib/libzfs/common/libzfs_dataset.c
new file mode 100644
index 0000000..b7c1360
--- /dev/null
+++ b/lib/libzfs/common/libzfs_dataset.c
@@ -0,0 +1,4058 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "libzfs_impl.h" +#include "zfs_deleg.h" + +static int userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); + +/* + * Given a single type (not a mask of types), return the type in a human + * readable form. + */ +const char * +zfs_type_to_name(zfs_type_t type) +{ + switch (type) { + case ZFS_TYPE_FILESYSTEM: + return (dgettext(TEXT_DOMAIN, "filesystem")); + case ZFS_TYPE_SNAPSHOT: + return (dgettext(TEXT_DOMAIN, "snapshot")); + case ZFS_TYPE_VOLUME: + return (dgettext(TEXT_DOMAIN, "volume")); + } + + return (NULL); +} + +/* + * Given a path and mask of ZFS types, return a string describing this dataset. + * This is used when we fail to open a dataset and we cannot get an exact type. + * We guess what the type would have been based on the path and the mask of + * acceptable types. 
+ */
+static const char *
+path_to_str(const char *path, int types)
+{
+	/*
+	 * A mask naming exactly one type is reported verbatim; the path is
+	 * not consulted at all.
+	 */
+	switch (types) {
+	case ZFS_TYPE_SNAPSHOT:
+		return (dgettext(TEXT_DOMAIN, "snapshot"));
+	case ZFS_TYPE_FILESYSTEM:
+		return (dgettext(TEXT_DOMAIN, "filesystem"));
+	case ZFS_TYPE_VOLUME:
+		return (dgettext(TEXT_DOMAIN, "volume"));
+	default:
+		break;
+	}
+
+	/*
+	 * Several types are acceptable, so let the path decide.  If snapshots
+	 * are among them and the path carries a '@', call it a "snapshot";
+	 * otherwise drop the snapshot bit and classify what remains.
+	 */
+	if ((types & ZFS_TYPE_SNAPSHOT) != 0) {
+		if (strchr(path, '@') != NULL)
+			return (dgettext(TEXT_DOMAIN, "snapshot"));
+		types &= ~ZFS_TYPE_SNAPSHOT;
+	}
+
+	/*
+	 * Only filesystems and/or volumes are left.  There is no way to tell
+	 * them apart from the name alone, so "filesystem" wins whenever it is
+	 * permitted and "volume" is the fallback.
+	 */
+	if ((types & ZFS_TYPE_FILESYSTEM) != 0)
+		return (dgettext(TEXT_DOMAIN, "filesystem"));
+
+	assert(types & ZFS_TYPE_VOLUME);
+	return (dgettext(TEXT_DOMAIN, "volume"));
+}
+
+/*
+ * Validate a ZFS path. This is used even before trying to open the dataset, to
+ * provide a more meaningful error message. We call zfs_error_aux() to
+ * explain exactly why the name was not valid.
+ */ +int +zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, + boolean_t modifying) +{ + namecheck_err_t why; + char what; + + if (dataset_namecheck(path, &why, &what) != 0) { + if (hdl != NULL) { + switch (why) { + case NAME_ERR_TOOLONG: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "name is too long")); + break; + + case NAME_ERR_LEADING_SLASH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "leading slash in name")); + break; + + case NAME_ERR_EMPTY_COMPONENT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "empty component in name")); + break; + + case NAME_ERR_TRAILING_SLASH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "trailing slash in name")); + break; + + case NAME_ERR_INVALCHAR: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "invalid character " + "'%c' in name"), what); + break; + + case NAME_ERR_MULTIPLE_AT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "multiple '@' delimiters in name")); + break; + + case NAME_ERR_NOLETTER: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool doesn't begin with a letter")); + break; + + case NAME_ERR_RESERVED: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "name is reserved")); + break; + + case NAME_ERR_DISKLIKE: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "reserved disk name")); + break; + } + } + + return (0); + } + + if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshot delimiter '@' in filesystem name")); + return (0); + } + + if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing '@' delimiter in snapshot name")); + return (0); + } + + if (modifying && strchr(path, '%') != NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid character %c in name"), '%'); + return (0); + } + + return (-1); +} + +int +zfs_name_valid(const char *name, zfs_type_t type) +{ + if (type == ZFS_TYPE_POOL) + return (zpool_name_valid(NULL, 
B_FALSE, name)); + return (zfs_validate_name(NULL, name, type, B_FALSE)); +} + +/* + * This function takes the raw DSL properties, and filters out the user-defined + * properties into a separate nvlist. + */ +static nvlist_t * +process_user_props(zfs_handle_t *zhp, nvlist_t *props) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvpair_t *elem; + nvlist_t *propval; + nvlist_t *nvl; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + if (!zfs_prop_user(nvpair_name(elem))) + continue; + + verify(nvpair_value_nvlist(elem, &propval) == 0); + if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) { + nvlist_free(nvl); + (void) no_memory(hdl); + return (NULL); + } + } + + return (nvl); +} + +static zpool_handle_t * +zpool_add_handle(zfs_handle_t *zhp, const char *pool_name) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zpool_handle_t *zph; + + if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) { + if (hdl->libzfs_pool_handles != NULL) + zph->zpool_next = hdl->libzfs_pool_handles; + hdl->libzfs_pool_handles = zph; + } + return (zph); +} + +static zpool_handle_t * +zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zpool_handle_t *zph = hdl->libzfs_pool_handles; + + while ((zph != NULL) && + (strncmp(pool_name, zpool_get_name(zph), len) != 0)) + zph = zph->zpool_next; + return (zph); +} + +/* + * Returns a handle to the pool that contains the provided dataset. + * If a handle to that pool already exists then that handle is returned. + * Otherwise, a new handle is created and added to the list of handles. 
+ */
+static zpool_handle_t *
+zpool_handle(zfs_handle_t *zhp)
+{
+	char *pool_name;
+	int len;
+	zpool_handle_t *zph;
+
+	/*
+	 * The pool name is everything up to the first '/' or '@' in the
+	 * dataset name; 'len' includes room for the terminating NUL.
+	 */
+	len = strcspn(zhp->zfs_name, "/@") + 1;
+	pool_name = zfs_alloc(zhp->zfs_hdl, len);
+	if (pool_name == NULL)
+		return (NULL);	/* allocation failed; caller treats as error */
+	(void) strlcpy(pool_name, zhp->zfs_name, len);
+
+	/* Reuse a cached pool handle when there is one, else open and cache. */
+	zph = zpool_find_handle(zhp, pool_name, len);
+	if (zph == NULL)
+		zph = zpool_add_handle(zhp, pool_name);
+
+	free(pool_name);
+	return (zph);
+}
+
+/*
+ * Close every pool handle on the library handle's list and empty the list.
+ */
+void
+zpool_free_handles(libzfs_handle_t *hdl)
+{
+	zpool_handle_t *next, *zph = hdl->libzfs_pool_handles;
+
+	while (zph != NULL) {
+		/* Fetch the successor first; zph is closed on the next line. */
+		next = zph->zpool_next;
+		zpool_close(zph);
+		zph = next;
+	}
+	hdl->libzfs_pool_handles = NULL;
+}
+
+/*
+ * Utility function to gather stats (objset and zpl) for the given object.
+ * Retries the ioctl for as long as it fails with ENOMEM and the destination
+ * buffer can be expanded.  Returns 0 on success, -1 on any other failure.
+ */
+static int
+get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
+{
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+	(void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
+
+	while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) {
+		if (errno != ENOMEM || zcmd_expand_dst_nvlist(hdl, zc) != 0)
+			return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Utility function to get the received properties of the given object.
+ */
+static int
+get_recvd_props_ioctl(zfs_handle_t *zhp)
+{
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	zfs_cmd_t zc = { 0 };
+	nvlist_t *received;
+	int rv;
+
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
+		return (-1);
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+	/* Retry for as long as the only problem is a too-small buffer. */
+	for (;;) {
+		if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) == 0)
+			break;
+
+		if (errno != ENOMEM) {
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
+
+		if (zcmd_expand_dst_nvlist(hdl, &zc) != 0)
+			return (-1);
+	}
+
+	rv = zcmd_read_dst_nvlist(hdl, &zc, &received);
+	zcmd_free_nvlists(&zc);
+	if (rv != 0)
+		return (-1);
+
+	/* Replace whatever received-property list the handle held before. */
+	nvlist_free(zhp->zfs_recvd_props);
+	zhp->zfs_recvd_props = received;
+
+	return (0);
+}
+
+/*
+ * Copy the objset stats out of a completed ioctl into the handle and install
+ * the unpacked property lists.  The handle's existing lists are left alone
+ * unless both new lists were built.  Returns 0 on success, -1 on failure.
+ */
+static int
+put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
+{
+	nvlist_t *props_all;
+	nvlist_t *props_user;
+
+	/* structure assignment */
+	zhp->zfs_dmustats = zc->zc_objset_stats;
+
+	if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &props_all) != 0)
+		return (-1);
+
+	/*
+	 * XXX The user properties are kept in their own list even though
+	 * they are also present in zfs_props; it is unclear why both exist.
+	 */
+	props_user = process_user_props(zhp, props_all);
+	if (props_user == NULL) {
+		nvlist_free(props_all);
+		return (-1);
+	}
+
+	nvlist_free(zhp->zfs_props);
+	zhp->zfs_props = props_all;
+
+	nvlist_free(zhp->zfs_user_props);
+	zhp->zfs_user_props = props_user;
+
+	return (0);
+}
+
+/*
+ * Fetch fresh stats for the handle and store them in it.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+get_stats(zfs_handle_t *zhp)
+{
+	zfs_cmd_t zc = { 0 };
+	int rc;
+
+	if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
+		return (-1);
+
+	/* put_stats_zhdl() runs only when the ioctl succeeded. */
+	if (get_stats_ioctl(zhp, &zc) != 0 || put_stats_zhdl(zhp, &zc) != 0)
+		rc = -1;
+	else
+		rc = 0;
+
+	zcmd_free_nvlists(&zc);
+	return (rc);
+}
+
+/*
+ * Refresh the properties currently stored in the handle.  Failures are
+ * ignored.
+ */
+void
+zfs_refresh_properties(zfs_handle_t *zhp)
+{
+	(void) get_stats(zhp);
+}
+
+/*
+ * Makes a handle from the given dataset name. Used by zfs_open() and
+ * zfs_iter_* to create child handles on the fly.
+ */
+static int
+make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
+{
+	if (put_stats_zhdl(zhp, zc) != 0)
+		return (-1);
+
+	/*
+	 * We've managed to open the dataset and gather statistics.  Determine
+	 * the high-level type.
+	 */
+	if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
+		zhp->zfs_head_type = ZFS_TYPE_VOLUME;
+	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
+		zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM;
+	else
+		abort();
+
+	if (zhp->zfs_dmustats.dds_is_snapshot)
+		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
+	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
+		zhp->zfs_type = ZFS_TYPE_VOLUME;
+	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
+		zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
+	else
+		abort();	/* we should never see any other types */
+
+	if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) {
+		int saved_errno = errno;
+
+		/*
+		 * put_stats_zhdl() already attached property lists to the
+		 * handle.  Callers release a failed handle with a bare
+		 * free(), so drop the lists here or they are leaked.
+		 */
+		nvlist_free(zhp->zfs_props);
+		nvlist_free(zhp->zfs_user_props);
+		zhp->zfs_props = NULL;
+		zhp->zfs_user_props = NULL;
+		errno = saved_errno;
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Build a dataset handle for 'path' by querying its stats.  Returns NULL on
+ * failure.  zfs_open() reports errno after a NULL return, so the errno left
+ * by the failed stats ioctl is preserved across the cleanup.
+ */
+zfs_handle_t *
+make_dataset_handle(libzfs_handle_t *hdl, const char *path)
+{
+	zfs_cmd_t zc = { 0 };
+
+	zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t));
+
+	if (zhp == NULL)
+		return (NULL);
+
+	zhp->zfs_hdl = hdl;
+	(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) {
+		free(zhp);
+		return (NULL);
+	}
+	if (get_stats_ioctl(zhp, &zc) == -1) {
+		int saved_errno = errno;
+
+		zcmd_free_nvlists(&zc);
+		free(zhp);
+		errno = saved_errno;
+		return (NULL);
+	}
+	if (make_dataset_handle_common(zhp, &zc) == -1) {
+		free(zhp);
+		zhp = NULL;
+	}
+	zcmd_free_nvlists(&zc);
+	return (zhp);
+}
+
+/*
+ * Build a dataset handle from an ioctl command block that already holds the
+ * dataset's name and stats.  The command block is not freed here.  Returns
+ * NULL on failure.
+ */
+static zfs_handle_t *
+make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
+{
+	zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t));
+
+	if (zhp == NULL)
+		return (NULL);
+
+	zhp->zfs_hdl = hdl;
+	(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
+	if (make_dataset_handle_common(zhp, zc) == -1) {
+		free(zhp);
+		return (NULL);
+	}
+	return (zhp);
+}
+
+/*
+ * Opens the given snapshot, filesystem, or volume. The 'types'
+ * argument is a mask of acceptable types.
The function will print an + * appropriate error message and return NULL if it can't be opened. + */ +zfs_handle_t * +zfs_open(libzfs_handle_t *hdl, const char *path, int types) +{ + zfs_handle_t *zhp; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); + + /* + * Validate the name before we even try to open it. + */ + if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET, B_FALSE)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid dataset name")); + (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + return (NULL); + } + + /* + * Try to get stats for the dataset, which will tell us if it exists. + */ + errno = 0; + if ((zhp = make_dataset_handle(hdl, path)) == NULL) { + (void) zfs_standard_error(hdl, errno, errbuf); + return (NULL); + } + + if (!(types & zhp->zfs_type)) { + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + zfs_close(zhp); + return (NULL); + } + + return (zhp); +} + +/* + * Release a ZFS handle. Nothing to do but free the associated memory. + */ +void +zfs_close(zfs_handle_t *zhp) +{ + if (zhp->zfs_mntopts) + free(zhp->zfs_mntopts); + nvlist_free(zhp->zfs_props); + nvlist_free(zhp->zfs_user_props); + nvlist_free(zhp->zfs_recvd_props); + free(zhp); +} + +typedef struct mnttab_node { + struct mnttab mtn_mt; + avl_node_t mtn_node; +} mnttab_node_t; + +static int +libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) +{ + const mnttab_node_t *mtn1 = arg1; + const mnttab_node_t *mtn2 = arg2; + int rv; + + rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); + + if (rv == 0) + return (0); + return (rv > 0 ? 
1 : -1); +} + +void +libzfs_mnttab_init(libzfs_handle_t *hdl) +{ + assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); + avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, + sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); +} + +void +libzfs_mnttab_update(libzfs_handle_t *hdl) +{ + struct mnttab entry; + + rewind(hdl->libzfs_mnttab); + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + mnttab_node_t *mtn; + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); + mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); + mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); + mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); + mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); + avl_add(&hdl->libzfs_mnttab_cache, mtn); + } +} + +void +libzfs_mnttab_fini(libzfs_handle_t *hdl) +{ + void *cookie = NULL; + mnttab_node_t *mtn; + + while (mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) { + free(mtn->mtn_mt.mnt_special); + free(mtn->mtn_mt.mnt_mountp); + free(mtn->mtn_mt.mnt_fstype); + free(mtn->mtn_mt.mnt_mntopts); + free(mtn); + } + avl_destroy(&hdl->libzfs_mnttab_cache); +} + +void +libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) +{ + hdl->libzfs_mnttab_enable = enable; +} + +int +libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, + struct mnttab *entry) +{ + mnttab_node_t find; + mnttab_node_t *mtn; + + if (!hdl->libzfs_mnttab_enable) { + struct mnttab srch = { 0 }; + + if (avl_numnodes(&hdl->libzfs_mnttab_cache)) + libzfs_mnttab_fini(hdl); + rewind(hdl->libzfs_mnttab); + srch.mnt_special = (char *)fsname; + srch.mnt_fstype = MNTTYPE_ZFS; + if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0) + return (0); + else + return (ENOENT); + } + + if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) + libzfs_mnttab_update(hdl); + + find.mtn_mt.mnt_special = (char *)fsname; + mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, 
NULL); + if (mtn) { + *entry = mtn->mtn_mt; + return (0); + } + return (ENOENT); +} + +void +libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, + const char *mountp, const char *mntopts) +{ + mnttab_node_t *mtn; + + if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) + return; + mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); + mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); + mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); + mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); + mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); + avl_add(&hdl->libzfs_mnttab_cache, mtn); +} + +void +libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) +{ + mnttab_node_t find; + mnttab_node_t *ret; + + find.mtn_mt.mnt_special = (char *)fsname; + if (ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) { + avl_remove(&hdl->libzfs_mnttab_cache, ret); + free(ret->mtn_mt.mnt_special); + free(ret->mtn_mt.mnt_mountp); + free(ret->mtn_mt.mnt_fstype); + free(ret->mtn_mt.mnt_mntopts); + free(ret); + } +} + +int +zfs_spa_version(zfs_handle_t *zhp, int *spa_version) +{ + zpool_handle_t *zpool_handle = zhp->zpool_hdl; + + if (zpool_handle == NULL) + return (-1); + + *spa_version = zpool_get_prop_int(zpool_handle, + ZPOOL_PROP_VERSION, NULL); + return (0); +} + +/* + * The choice of reservation property depends on the SPA version. + */ +static int +zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop) +{ + int spa_version; + + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); + + if (spa_version >= SPA_VERSION_REFRESERVATION) + *resv_prop = ZFS_PROP_REFRESERVATION; + else + *resv_prop = ZFS_PROP_RESERVATION; + + return (0); +} + +/* + * Given an nvlist of properties to set, validates that they are correct, and + * parses any numeric properties (index, boolean, etc) if they are specified as + * strings. 
+ */ +nvlist_t * +zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, + uint64_t zoned, zfs_handle_t *zhp, const char *errbuf) +{ + nvpair_t *elem; + uint64_t intval; + char *strval; + zfs_prop_t prop; + nvlist_t *ret; + int chosen_normal = -1; + int chosen_utf = -1; + + if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + + /* + * Make sure this property is valid and applies to this type. + */ + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + + prop = zfs_name_to_prop(propname); + if (prop == ZPROP_INVAL && zfs_prop_user(propname)) { + /* + * This is a user property: make sure it's a + * string, and that it's less than ZAP_MAXNAMELEN. + */ + if (nvpair_type(elem) != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property name '%s' is too long"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + (void) nvpair_value_string(elem, &strval); + if (nvlist_add_string(ret, propname, strval) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + + /* + * Currently, only user properties can be modified on + * snapshots. 
+ */ + if (type == ZFS_TYPE_SNAPSHOT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "this property can not be modified for snapshots")); + (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); + goto error; + } + + if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) { + zfs_userquota_prop_t uqtype; + char newpropname[128]; + char domain[128]; + uint64_t rid; + uint64_t valary[3]; + + if (userquota_propname_decode(propname, zoned, + &uqtype, domain, sizeof (domain), &rid) != 0) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "'%s' has an invalid user/group name"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (uqtype != ZFS_PROP_USERQUOTA && + uqtype != ZFS_PROP_GROUPQUOTA) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, + errbuf); + goto error; + } + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, "none") == 0) { + intval = 0; + } else if (zfs_nicestrtonum(hdl, + strval, &intval) != 0) { + (void) zfs_error(hdl, + EZFS_BADPROP, errbuf); + goto error; + } + } else if (nvpair_type(elem) == + DATA_TYPE_UINT64) { + (void) nvpair_value_uint64(elem, &intval); + if (intval == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "use 'none' to disable " + "userquota/groupquota")); + goto error; + } + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a number"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + /* + * Encode the prop name as + * userquota@-domain, to make it easy + * for the kernel to decode. 
+ */ + (void) snprintf(newpropname, sizeof (newpropname), + "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], + (longlong_t)rid, domain); + valary[0] = uqtype; + valary[1] = rid; + valary[2] = intval; + if (nvlist_add_uint64_array(ret, newpropname, + valary, 3) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + + if (prop == ZPROP_INVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (!zfs_prop_valid_for_type(prop, type)) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' does not " + "apply to datasets of this type"), propname); + (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); + goto error; + } + + if (zfs_prop_readonly(prop) && + (!zfs_prop_setonce(prop) || zhp != NULL)) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); + goto error; + } + + if (zprop_parse_value(hdl, elem, prop, type, ret, + &strval, &intval, errbuf) != 0) + goto error; + + /* + * Perform some additional checks for specific properties. 
+ */ + switch (prop) { + case ZFS_PROP_VERSION: + { + int version; + + if (zhp == NULL) + break; + version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (intval < version) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Can not downgrade; already at version %u"), + version); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + } + + case ZFS_PROP_RECORDSIZE: + case ZFS_PROP_VOLBLOCKSIZE: + /* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */ + if (intval < SPA_MINBLOCKSIZE || + intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be power of 2 from %u " + "to %uk"), propname, + (uint_t)SPA_MINBLOCKSIZE, + (uint_t)SPA_MAXBLOCKSIZE >> 10); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + + case ZFS_PROP_MLSLABEL: + { + /* + * Verify the mlslabel string and convert to + * internal hex label string. + */ + + m_label_t *new_sl; + char *hex = NULL; /* internal label string */ + + /* Default value is already OK. */ + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + break; + + /* Verify the label can be converted to binary form */ + if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || + (str_to_label(strval, &new_sl, MAC_LABEL, + L_NO_CORRECTION, NULL) == -1)) { + goto badlabel; + } + + /* Now translate to hex internal label string */ + if (label_to_str(new_sl, &hex, M_INTERNAL, + DEF_NAMES) != 0) { + if (hex) + free(hex); + goto badlabel; + } + m_label_free(new_sl); + + /* If string is already in internal form, we're done. */ + if (strcmp(strval, hex) == 0) { + free(hex); + break; + } + + /* Replace the label string with the internal form. 
*/ + (void) nvlist_remove(ret, zfs_prop_to_name(prop), + DATA_TYPE_STRING); + verify(nvlist_add_string(ret, zfs_prop_to_name(prop), + hex) == 0); + free(hex); + + break; + +badlabel: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid mlslabel '%s'"), strval); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + m_label_free(new_sl); /* OK if null */ + goto error; + + } + + case ZFS_PROP_MOUNTPOINT: + { + namecheck_err_t why; + + if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 || + strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0) + break; + + if (mountpoint_namecheck(strval, &why)) { + switch (why) { + case NAME_ERR_LEADING_SLASH: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "'%s' must be an absolute path, " + "'none', or 'legacy'"), propname); + break; + case NAME_ERR_TOOLONG: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "component of '%s' is too long"), + propname); + break; + } + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + } + + /*FALLTHRU*/ + + case ZFS_PROP_SHARESMB: + case ZFS_PROP_SHARENFS: + /* + * For the mountpoint and sharenfs or sharesmb + * properties, check if it can be set in a + * global/non-global zone based on + * the zoned property value: + * + * global zone non-global zone + * -------------------------------------------------- + * zoned=on mountpoint (no) mountpoint (yes) + * sharenfs (no) sharenfs (no) + * sharesmb (no) sharesmb (no) + * + * zoned=off mountpoint (yes) N/A + * sharenfs (yes) + * sharesmb (yes) + */ + if (zoned) { + if (getzoneid() == GLOBAL_ZONEID) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set on " + "dataset in a non-global zone"), + propname); + (void) zfs_error(hdl, EZFS_ZONED, + errbuf); + goto error; + } else if (prop == ZFS_PROP_SHARENFS || + prop == ZFS_PROP_SHARESMB) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set in " + "a non-global zone"), propname); + (void) zfs_error(hdl, EZFS_ZONED, + errbuf); + goto error; + } + } else if (getzoneid() != GLOBAL_ZONEID) { + /* 
+ * If zoned property is 'off', this must be in + * a global zone. If not, something is wrong. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set while dataset " + "'zoned' property is set"), propname); + (void) zfs_error(hdl, EZFS_ZONED, errbuf); + goto error; + } + + /* + * At this point, it is legitimate to set the + * property. Now we want to make sure that the + * property value is valid if it is sharenfs. + */ + if ((prop == ZFS_PROP_SHARENFS || + prop == ZFS_PROP_SHARESMB) && + strcmp(strval, "on") != 0 && + strcmp(strval, "off") != 0) { + zfs_share_proto_t proto; + + if (prop == ZFS_PROP_SHARESMB) + proto = PROTO_SMB; + else + proto = PROTO_NFS; + + /* + * Must be an valid sharing protocol + * option string so init the libshare + * in order to enable the parser and + * then parse the options. We use the + * control API since we don't care about + * the current configuration and don't + * want the overhead of loading it + * until we actually do something. + */ + + if (zfs_init_libshare(hdl, + SA_INIT_CONTROL_API) != SA_OK) { + /* + * An error occurred so we can't do + * anything + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set: problem " + "in share initialization"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + + if (zfs_parse_options(strval, proto) != SA_OK) { + /* + * There was an error in parsing so + * deal with it by issuing an error + * message and leaving after + * uninitializing the the libshare + * interface. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set to invalid " + "options"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + zfs_uninit_libshare(hdl); + goto error; + } + zfs_uninit_libshare(hdl); + } + + break; + case ZFS_PROP_UTF8ONLY: + chosen_utf = (int)intval; + break; + case ZFS_PROP_NORMALIZE: + chosen_normal = (int)intval; + break; + } + + /* + * For changes to existing volumes, we have some additional + * checks to enforce. 
+ */ + if (type == ZFS_TYPE_VOLUME && zhp != NULL) { + uint64_t volsize = zfs_prop_get_int(zhp, + ZFS_PROP_VOLSIZE); + uint64_t blocksize = zfs_prop_get_int(zhp, + ZFS_PROP_VOLBLOCKSIZE); + char buf[64]; + + switch (prop) { + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + if (intval > volsize) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is greater than current " + "volume size"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + break; + + case ZFS_PROP_VOLSIZE: + if (intval % blocksize != 0) { + zfs_nicenum(blocksize, buf, + sizeof (buf)); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a multiple of " + "volume block size (%s)"), + propname, buf); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + + if (intval == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be zero"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + break; + } + } + } + + /* + * If normalization was chosen, but no UTF8 choice was made, + * enforce rejection of non-UTF8 names. + * + * If normalization was chosen, but rejecting non-UTF8 names + * was explicitly not chosen, it is an error. 
+ */ + if (chosen_normal > 0 && chosen_utf < 0) { + if (nvlist_add_uint64(ret, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) { + (void) no_memory(hdl); + goto error; + } + } else if (chosen_normal > 0 && chosen_utf == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be set 'on' if normalization chosen"), + zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + return (ret); + +error: + nvlist_free(ret); + return (NULL); +} + +int +zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) +{ + uint64_t old_volsize; + uint64_t new_volsize; + uint64_t old_reservation; + uint64_t new_reservation; + zfs_prop_t resv_prop; + + /* + * If this is an existing volume, and someone is setting the volsize, + * make sure that it matches the reservation, or add it if necessary. + */ + old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); + if (zfs_which_resv_prop(zhp, &resv_prop) < 0) + return (-1); + old_reservation = zfs_prop_get_int(zhp, resv_prop); + if ((zvol_volsize_to_reservation(old_volsize, zhp->zfs_props) != + old_reservation) || nvlist_lookup_uint64(nvl, + zfs_prop_to_name(resv_prop), &new_reservation) != ENOENT) { + return (0); + } + if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), + &new_volsize) != 0) + return (-1); + new_reservation = zvol_volsize_to_reservation(new_volsize, + zhp->zfs_props); + if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), + new_reservation) != 0) { + (void) no_memory(zhp->zfs_hdl); + return (-1); + } + return (1); +} + +void +zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, + char *errbuf) +{ + switch (err) { + + case ENOSPC: + /* + * For quotas and reservations, ENOSPC indicates + * something different; setting a quota or reservation + * doesn't use any disk space. 
+ */ + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is less than current used or " + "reserved space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; + + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is greater than available space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; + + default: + (void) zfs_standard_error(hdl, err, errbuf); + break; + } + break; + + case EBUSY: + (void) zfs_standard_error(hdl, EBUSY, errbuf); + break; + + case EROFS: + (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); + break; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool and or dataset must be upgraded to set this " + "property or value")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + + case ERANGE: + if (prop == ZFS_PROP_COMPRESSION) { + (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property setting is not allowed on " + "bootable datasets")); + (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case EINVAL: + if (prop == ZPROP_INVAL) { + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case EOVERFLOW: + /* + * This platform can't address a volume this big. + */ +#ifdef _ILP32 + if (prop == ZFS_PROP_VOLSIZE) { + (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); + break; + } +#endif + /* FALLTHROUGH */ + default: + (void) zfs_standard_error(hdl, err, errbuf); + } +} + +/* + * Given a property name and value, set the property for the given dataset. 
+ */ +int +zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) +{ + zfs_cmd_t zc = { 0 }; + int ret = -1; + prop_changelist_t *cl = NULL; + char errbuf[1024]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *nvl = NULL, *realprops; + zfs_prop_t prop; + boolean_t do_prefix; + uint64_t idx; + int added_resv; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), + zhp->zfs_name); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_string(nvl, propname, propval) != 0) { + (void) no_memory(hdl); + goto error; + } + + if ((realprops = zfs_valid_proplist(hdl, zhp->zfs_type, nvl, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, errbuf)) == NULL) + goto error; + + nvlist_free(nvl); + nvl = realprops; + + prop = zfs_name_to_prop(propname); + + if (prop == ZFS_PROP_VOLSIZE) { + if ((added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) + goto error; + } + + if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) + goto error; + + if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "child dataset with inherited mountpoint is used " + "in a non-global zone")); + ret = zfs_error(hdl, EZFS_ZONED, errbuf); + goto error; + } + + /* + * If the dataset's canmount property is being set to noauto, + * then we want to prevent unmounting & remounting it. + */ + do_prefix = !((prop == ZFS_PROP_CANMOUNT) && + (zprop_string_to_index(prop, propval, &idx, + ZFS_TYPE_DATASET) == 0) && (idx == ZFS_CANMOUNT_NOAUTO)); + + if (do_prefix && (ret = changelist_prefix(cl)) != 0) + goto error; + + /* + * Execute the corresponding ioctl() to set this property. 
+ */ + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0) + goto error; + + ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); + + if (ret != 0) { + zfs_setprop_error(hdl, prop, errno, errbuf); + if (added_resv && errno == ENOSPC) { + /* clean up the volsize property we tried to set */ + uint64_t old_volsize = zfs_prop_get_int(zhp, + ZFS_PROP_VOLSIZE); + nvlist_free(nvl); + zcmd_free_nvlists(&zc); + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + goto error; + if (nvlist_add_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), + old_volsize) != 0) + goto error; + if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0) + goto error; + (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); + } + } else { + if (do_prefix) + ret = changelist_postfix(cl); + + /* + * Refresh the statistics so the new property value + * is reflected. + */ + if (ret == 0) + (void) get_stats(zhp); + } + +error: + nvlist_free(nvl); + zcmd_free_nvlists(&zc); + if (cl) + changelist_free(cl); + return (ret); +} + +/* + * Given a property, inherit the value from the parent dataset, or if received + * is TRUE, revert to the received value, if any. + */ +int +zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) +{ + zfs_cmd_t zc = { 0 }; + int ret; + prop_changelist_t *cl; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + zfs_prop_t prop; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot inherit %s for '%s'"), propname, zhp->zfs_name); + + zc.zc_cookie = received; + if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { + /* + * For user properties, the amount of work we have to do is very + * small, so just do it here. 
+ */ + if (!zfs_prop_user(propname)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); + + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) + return (zfs_standard_error(hdl, errno, errbuf)); + + return (0); + } + + /* + * Verify that this property is inheritable. + */ + if (zfs_prop_readonly(prop)) + return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); + + if (!zfs_prop_inheritable(prop) && !received) + return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); + + /* + * Check to see if the value applies to this type + */ + if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) + return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); + + /* + * Normalize the name, to get rid of shorthand abbreviations. + */ + propname = zfs_prop_to_name(prop); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); + + if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID && + zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset is used in a non-global zone")); + return (zfs_error(hdl, EZFS_ZONED, errbuf)); + } + + /* + * Determine datasets which will be affected by this change, if any. 
+ */ + if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) + return (-1); + + if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "child dataset with inherited mountpoint is used " + "in a non-global zone")); + ret = zfs_error(hdl, EZFS_ZONED, errbuf); + goto error; + } + + if ((ret = changelist_prefix(cl)) != 0) + goto error; + + if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) { + return (zfs_standard_error(hdl, errno, errbuf)); + } else { + + if ((ret = changelist_postfix(cl)) != 0) + goto error; + + /* + * Refresh the statistics so the new property is reflected. + */ + (void) get_stats(zhp); + } + +error: + changelist_free(cl); + return (ret); +} + +/* + * True DSL properties are stored in an nvlist. The following two functions + * extract them appropriately. + */ +static uint64_t +getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) +{ + nvlist_t *nv; + uint64_t value; + + *source = NULL; + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(prop), &nv) == 0) { + verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); + (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); + } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); + value = zfs_prop_default_numeric(prop); + *source = ""; + } + + return (value); +} + +static char * +getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) +{ + nvlist_t *nv; + char *value; + + *source = NULL; + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(prop), &nv) == 0) { + verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); + (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); + } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); + if ((value = (char *)zfs_prop_default_string(prop)) == NULL) + value = ""; + *source = ""; + } + + return (value); +} + +static boolean_t +zfs_is_recvd_props_mode(zfs_handle_t *zhp) +{ + return 
(zhp->zfs_props == zhp->zfs_recvd_props); +} + +static void +zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) +{ + *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; + zhp->zfs_props = zhp->zfs_recvd_props; +} + +static void +zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) +{ + zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; + *cookie = 0; +} + +/* + * Internal function for getting a numeric property. Both zfs_prop_get() and + * zfs_prop_get_int() are built using this interface. + * + * Certain properties can be overridden using 'mount -o'. In this case, scan + * the contents of the /etc/mnttab entry, searching for the appropriate options. + * If they differ from the on-disk values, report the current values and mark + * the source "temporary". + */ +static int +get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, + char **source, uint64_t *val) +{ + zfs_cmd_t zc = { 0 }; + nvlist_t *zplprops = NULL; + struct mnttab mnt; + char *mntopt_on = NULL; + char *mntopt_off = NULL; + boolean_t received = zfs_is_recvd_props_mode(zhp); + + *source = NULL; + + switch (prop) { + case ZFS_PROP_ATIME: + mntopt_on = MNTOPT_ATIME; + mntopt_off = MNTOPT_NOATIME; + break; + + case ZFS_PROP_DEVICES: + mntopt_on = MNTOPT_DEVICES; + mntopt_off = MNTOPT_NODEVICES; + break; + + case ZFS_PROP_EXEC: + mntopt_on = MNTOPT_EXEC; + mntopt_off = MNTOPT_NOEXEC; + break; + + case ZFS_PROP_READONLY: + mntopt_on = MNTOPT_RO; + mntopt_off = MNTOPT_RW; + break; + + case ZFS_PROP_SETUID: + mntopt_on = MNTOPT_SETUID; + mntopt_off = MNTOPT_NOSETUID; + break; + + case ZFS_PROP_XATTR: + mntopt_on = MNTOPT_XATTR; + mntopt_off = MNTOPT_NOXATTR; + break; + + case ZFS_PROP_NBMAND: + mntopt_on = MNTOPT_NBMAND; + mntopt_off = MNTOPT_NONBMAND; + break; + } + + /* + * Because looking up the mount options is potentially expensive + * (iterating over all of /etc/mnttab), we defer its calculation until + * we're looking up a property which requires its presence. 
+ */ + if (!zhp->zfs_mntcheck && + (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; + + if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) { + zhp->zfs_mntopts = zfs_strdup(hdl, + entry.mnt_mntopts); + if (zhp->zfs_mntopts == NULL) + return (-1); + } + + zhp->zfs_mntcheck = B_TRUE; + } + + if (zhp->zfs_mntopts == NULL) + mnt.mnt_mntopts = ""; + else + mnt.mnt_mntopts = zhp->zfs_mntopts; + + switch (prop) { + case ZFS_PROP_ATIME: + case ZFS_PROP_DEVICES: + case ZFS_PROP_EXEC: + case ZFS_PROP_READONLY: + case ZFS_PROP_SETUID: + case ZFS_PROP_XATTR: + case ZFS_PROP_NBMAND: + *val = getprop_uint64(zhp, prop, source); + + if (received) + break; + + if (hasmntopt(&mnt, mntopt_on) && !*val) { + *val = B_TRUE; + if (src) + *src = ZPROP_SRC_TEMPORARY; + } else if (hasmntopt(&mnt, mntopt_off) && *val) { + *val = B_FALSE; + if (src) + *src = ZPROP_SRC_TEMPORARY; + } + break; + + case ZFS_PROP_CANMOUNT: + case ZFS_PROP_VOLSIZE: + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + *val = getprop_uint64(zhp, prop, source); + + if (*source == NULL) { + /* not default, must be local */ + *source = zhp->zfs_name; + } + break; + + case ZFS_PROP_MOUNTED: + *val = (zhp->zfs_mntopts != NULL); + break; + + case ZFS_PROP_NUMCLONES: + *val = zhp->zfs_dmustats.dds_num_clones; + break; + + case ZFS_PROP_VERSION: + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + case ZFS_PROP_CASE: + if (!zfs_prop_valid_for_type(prop, zhp->zfs_head_type) || + zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { + zcmd_free_nvlists(&zc); + return (-1); + } + if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || + nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), + val) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + if 
(zplprops) + nvlist_free(zplprops); + zcmd_free_nvlists(&zc); + break; + + default: + switch (zfs_prop_get_type(prop)) { + case PROP_TYPE_NUMBER: + case PROP_TYPE_INDEX: + *val = getprop_uint64(zhp, prop, source); + /* + * If we tried to use a default value for a + * readonly property, it means that it was not + * present. + */ + if (zfs_prop_readonly(prop) && + *source != NULL && (*source)[0] == '\0') { + *source = NULL; + } + break; + + case PROP_TYPE_STRING: + default: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "cannot get non-numeric property")); + return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP, + dgettext(TEXT_DOMAIN, "internal error"))); + } + } + + return (0); +} + +/* + * Calculate the source type, given the raw source string. + */ +static void +get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, + char *statbuf, size_t statlen) +{ + if (statbuf == NULL || *srctype == ZPROP_SRC_TEMPORARY) + return; + + if (source == NULL) { + *srctype = ZPROP_SRC_NONE; + } else if (source[0] == '\0') { + *srctype = ZPROP_SRC_DEFAULT; + } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { + *srctype = ZPROP_SRC_RECEIVED; + } else { + if (strcmp(source, zhp->zfs_name) == 0) { + *srctype = ZPROP_SRC_LOCAL; + } else { + (void) strlcpy(statbuf, source, statlen); + *srctype = ZPROP_SRC_INHERITED; + } + } + +} + +int +zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, + size_t proplen, boolean_t literal) +{ + zfs_prop_t prop; + int err = 0; + + if (zhp->zfs_recvd_props == NULL) + if (get_recvd_props_ioctl(zhp) != 0) + return (-1); + + prop = zfs_name_to_prop(propname); + + if (prop != ZPROP_INVAL) { + uint64_t cookie; + if (!nvlist_exists(zhp->zfs_recvd_props, propname)) + return (-1); + zfs_set_recvd_props_mode(zhp, &cookie); + err = zfs_prop_get(zhp, prop, propbuf, proplen, + NULL, NULL, 0, literal); + zfs_unset_recvd_props_mode(zhp, &cookie); + } else if (zfs_prop_userquota(propname)) { + return (-1); + } else { + 
nvlist_t *propval; + char *recvdval; + if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, + propname, &propval) != 0) + return (-1); + verify(nvlist_lookup_string(propval, ZPROP_VALUE, + &recvdval) == 0); + (void) strlcpy(propbuf, recvdval, proplen); + } + + return (err == 0 ? 0 : -1); +} + +/* + * Retrieve a property from the given object. If 'literal' is specified, then + * numbers are left as exact values. Otherwise, numbers are converted to a + * human-readable form. + * + * Returns 0 on success, or -1 on error. + */ +int +zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, + zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal) +{ + char *source = NULL; + uint64_t val; + char *str; + const char *strval; + boolean_t received = zfs_is_recvd_props_mode(zhp); + + /* + * Check to see if this property applies to our object + */ + if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) + return (-1); + + if (received && zfs_prop_readonly(prop)) + return (-1); + + if (src) + *src = ZPROP_SRC_NONE; + + switch (prop) { + case ZFS_PROP_CREATION: + /* + * 'creation' is a time_t stored in the statistics. We convert + * this into a string unless 'literal' is specified. + */ + { + val = getprop_uint64(zhp, prop, &source); + time_t time = (time_t)val; + struct tm t; + + if (literal || + localtime_r(&time, &t) == NULL || + strftime(propbuf, proplen, "%a %b %e %k:%M %Y", + &t) == 0) + (void) snprintf(propbuf, proplen, "%llu", val); + } + break; + + case ZFS_PROP_MOUNTPOINT: + /* + * Getting the precise mountpoint can be tricky. + * + * - for 'none' or 'legacy', return those values. + * - for inherited mountpoints, we want to take everything + * after our ancestor and append it to the inherited value. + * + * If the pool has an alternate root, we want to prepend that + * root to any values we return. 
+ */ + + str = getprop_string(zhp, prop, &source); + + if (str[0] == '/') { + char buf[MAXPATHLEN]; + char *root = buf; + const char *relpath; + + /* + * If we inherit the mountpoint, even from a dataset + * with a received value, the source will be the path of + * the dataset we inherit from. If source is + * ZPROP_SOURCE_VAL_RECVD, the received value is not + * inherited. + */ + if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { + relpath = ""; + } else { + relpath = zhp->zfs_name + strlen(source); + if (relpath[0] == '/') + relpath++; + } + + if ((zpool_get_prop(zhp->zpool_hdl, + ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL)) || + (strcmp(root, "-") == 0)) + root[0] = '\0'; + /* + * Special case an alternate root of '/'. This will + * avoid having multiple leading slashes in the + * mountpoint path. + */ + if (strcmp(root, "/") == 0) + root++; + + /* + * If the mountpoint is '/' then skip over this + * if we are obtaining either an alternate root or + * an inherited mountpoint. + */ + if (str[1] == '\0' && (root[0] != '\0' || + relpath[0] != '\0')) + str++; + + if (relpath[0] == '\0') + (void) snprintf(propbuf, proplen, "%s%s", + root, str); + else + (void) snprintf(propbuf, proplen, "%s%s%s%s", + root, str, relpath[0] == '@' ? "" : "/", + relpath); + } else { + /* 'legacy' or 'none' */ + (void) strlcpy(propbuf, str, proplen); + } + + break; + + case ZFS_PROP_ORIGIN: + (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), + proplen); + /* + * If there is no parent at all, return failure to indicate that + * it doesn't apply to this dataset. + */ + if (propbuf[0] == '\0') + return (-1); + break; + + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + + /* + * If quota or reservation is 0, we translate this into 'none' + * (unless literal is set), and indicate that it's the default + * value. 
Otherwise, we print the number nicely and indicate + * that its set locally. + */ + if (val == 0) { + if (literal) + (void) strlcpy(propbuf, "0", proplen); + else + (void) strlcpy(propbuf, "none", proplen); + } else { + if (literal) + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + else + zfs_nicenum(val, propbuf, proplen); + } + break; + + case ZFS_PROP_COMPRESSRATIO: + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + (void) snprintf(propbuf, proplen, "%llu.%02llux", + (u_longlong_t)(val / 100), + (u_longlong_t)(val % 100)); + break; + + case ZFS_PROP_TYPE: + switch (zhp->zfs_type) { + case ZFS_TYPE_FILESYSTEM: + str = "filesystem"; + break; + case ZFS_TYPE_VOLUME: + str = "volume"; + break; + case ZFS_TYPE_SNAPSHOT: + str = "snapshot"; + break; + default: + abort(); + } + (void) snprintf(propbuf, proplen, "%s", str); + break; + + case ZFS_PROP_MOUNTED: + /* + * The 'mounted' property is a pseudo-property that described + * whether the filesystem is currently mounted. Even though + * it's a boolean value, the typical values of "on" and "off" + * don't make sense, so we translate to "yes" and "no". + */ + if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, + src, &source, &val) != 0) + return (-1); + if (val) + (void) strlcpy(propbuf, "yes", proplen); + else + (void) strlcpy(propbuf, "no", proplen); + break; + + case ZFS_PROP_NAME: + /* + * The 'name' property is a pseudo-property derived from the + * dataset name. It is presented as a real property to simplify + * consumers. + */ + (void) strlcpy(propbuf, zhp->zfs_name, proplen); + break; + + case ZFS_PROP_MLSLABEL: + { + m_label_t *new_sl = NULL; + char *ascii = NULL; /* human readable label */ + + (void) strlcpy(propbuf, + getprop_string(zhp, prop, &source), proplen); + + if (literal || (strcasecmp(propbuf, + ZFS_MLSLABEL_DEFAULT) == 0)) + break; + + /* + * Try to translate the internal hex string to + * human-readable output. 
If there are any + * problems just use the hex string. + */ + + if (str_to_label(propbuf, &new_sl, MAC_LABEL, + L_NO_CORRECTION, NULL) == -1) { + m_label_free(new_sl); + break; + } + + if (label_to_str(new_sl, &ascii, M_LABEL, + DEF_NAMES) != 0) { + if (ascii) + free(ascii); + m_label_free(new_sl); + break; + } + m_label_free(new_sl); + + (void) strlcpy(propbuf, ascii, proplen); + free(ascii); + } + break; + + default: + switch (zfs_prop_get_type(prop)) { + case PROP_TYPE_NUMBER: + if (get_numeric_property(zhp, prop, src, + &source, &val) != 0) + return (-1); + if (literal) + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + else + zfs_nicenum(val, propbuf, proplen); + break; + + case PROP_TYPE_STRING: + (void) strlcpy(propbuf, + getprop_string(zhp, prop, &source), proplen); + break; + + case PROP_TYPE_INDEX: + if (get_numeric_property(zhp, prop, src, + &source, &val) != 0) + return (-1); + if (zfs_prop_index_to_string(prop, val, &strval) != 0) + return (-1); + (void) strlcpy(propbuf, strval, proplen); + break; + + default: + abort(); + } + } + + get_source(zhp, src, source, statbuf, statlen); + + return (0); +} + +/* + * Utility function to get the given numeric property. Does no validation that + * the given property is the appropriate type; should only be used with + * hard-coded property types. + */ +uint64_t +zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) +{ + char *source; + uint64_t val; + + (void) get_numeric_property(zhp, prop, NULL, &source, &val); + + return (val); +} + +int +zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) +{ + char buf[64]; + + (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); + return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); +} + +/* + * Similar to zfs_prop_get(), but returns the value as an integer. 
+ */ +int +zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, + zprop_source_t *src, char *statbuf, size_t statlen) +{ + char *source; + + /* + * Check to see if this property applies to our object + */ + if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) { + return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, + dgettext(TEXT_DOMAIN, "cannot get property '%s'"), + zfs_prop_to_name(prop))); + } + + if (src) + *src = ZPROP_SRC_NONE; + + if (get_numeric_property(zhp, prop, src, &source, value) != 0) + return (-1); + + get_source(zhp, src, source, statbuf, statlen); + + return (0); +} + +static int +idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, + char **domainp, idmap_rid_t *ridp) +{ + idmap_get_handle_t *get_hdl = NULL; + idmap_stat status; + int err = EINVAL; + + if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) + goto out; + + if (isuser) { + err = idmap_get_sidbyuid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } else { + err = idmap_get_sidbygid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } + if (err == IDMAP_SUCCESS && + idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && + status == IDMAP_SUCCESS) + err = 0; + else + err = EINVAL; +out: + if (get_hdl) + idmap_get_destroy(get_hdl); + return (err); +} + +/* + * convert the propname into parameters needed by kernel + * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 + * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 + */ +static int +userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) +{ + zfs_userquota_prop_t type; + char *cp, *end; + char *numericsid = NULL; + boolean_t isuser; + + domain[0] = '\0'; + + /* Figure out the property type ({user|group}{quota|space}) */ + for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { + if (strncmp(propname, zfs_userquota_prop_prefixes[type], + strlen(zfs_userquota_prop_prefixes[type])) == 
0) + break; + } + if (type == ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + *typep = type; + + isuser = (type == ZFS_PROP_USERQUOTA || + type == ZFS_PROP_USERUSED); + + cp = strchr(propname, '@') + 1; + + if (strchr(cp, '@')) { + /* + * It's a SID name (eg "user@domain") that needs to be + * turned into S-1-domainID-RID. + */ + directory_error_t e; + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (isuser) { + e = directory_sid_from_user_name(NULL, + cp, &numericsid); + } else { + e = directory_sid_from_group_name(NULL, + cp, &numericsid); + } + if (e != NULL) { + directory_error_free(e); + return (ENOENT); + } + if (numericsid == NULL) + return (ENOENT); + cp = numericsid; + /* will be further decoded below */ + } + + if (strncmp(cp, "S-1-", 4) == 0) { + /* It's a numeric SID (eg "S-1-234-567-89") */ + (void) strlcpy(domain, cp, domainlen); + cp = strrchr(domain, '-'); + *cp = '\0'; + cp++; + + errno = 0; + *ridp = strtoull(cp, &end, 10); + if (numericsid) { + free(numericsid); + numericsid = NULL; + } + if (errno != 0 || *end != '\0') + return (EINVAL); + } else if (!isdigit(*cp)) { + /* + * It's a user/group name (eg "user") that needs to be + * turned into a uid/gid + */ + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (isuser) { + struct passwd *pw; + pw = getpwnam(cp); + if (pw == NULL) + return (ENOENT); + *ridp = pw->pw_uid; + } else { + struct group *gr; + gr = getgrnam(cp); + if (gr == NULL) + return (ENOENT); + *ridp = gr->gr_gid; + } + } else { + /* It's a user/group ID (eg "12345"). */ + uid_t id = strtoul(cp, &end, 10); + idmap_rid_t rid; + char *mapdomain; + + if (*end != '\0') + return (EINVAL); + if (id > MAXUID) { + /* It's an ephemeral ID. 
*/ + if (idmap_id_to_numeric_domain_rid(id, isuser, + &mapdomain, &rid) != 0) + return (ENOENT); + (void) strlcpy(domain, mapdomain, domainlen); + *ridp = rid; + } else { + *ridp = id; + } + } + + ASSERT3P(numericsid, ==, NULL); + return (0); +} + +static int +zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue, zfs_userquota_prop_t *typep) +{ + int err; + zfs_cmd_t zc = { 0 }; + + (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + err = userquota_propname_decode(propname, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), + typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); + zc.zc_objset_type = *typep; + if (err) + return (err); + + err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc); + if (err) + return (err); + + *propvalue = zc.zc_cookie; + return (0); +} + +int +zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue) +{ + zfs_userquota_prop_t type; + + return (zfs_prop_get_userquota_common(zhp, propname, propvalue, + &type)); +} + +int +zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal) +{ + int err; + uint64_t propvalue; + zfs_userquota_prop_t type; + + err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, + &type); + + if (err) + return (err); + + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", propvalue); + } else if (propvalue == 0 && + (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) { + (void) strlcpy(propbuf, "none", proplen); + } else { + zfs_nicenum(propvalue, propbuf, proplen); + } + return (0); +} + +/* + * Returns the name of the given zfs handle. + */ +const char * +zfs_get_name(const zfs_handle_t *zhp) +{ + return (zhp->zfs_name); +} + +/* + * Returns the type of the given zfs handle. 
+ */ +zfs_type_t +zfs_get_type(const zfs_handle_t *zhp) +{ + return (zhp->zfs_type); +} + +static int +zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) +{ + int rc; + uint64_t orig_cookie; + + orig_cookie = zc->zc_cookie; +top: + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); + + if (rc == -1) { + switch (errno) { + case ENOMEM: + /* expand nvlist memory and try again */ + if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { + zcmd_free_nvlists(zc); + return (-1); + } + zc->zc_cookie = orig_cookie; + goto top; + /* + * An errno value of ESRCH indicates normal completion. + * If ENOENT is returned, then the underlying dataset + * has been removed since we obtained the handle. + */ + case ESRCH: + case ENOENT: + rc = 1; + break; + default: + rc = zfs_standard_error(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, + "cannot iterate filesystems")); + break; + } + } + return (rc); +} + +/* + * Iterate over all child filesystems + */ +int +zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + zfs_cmd_t zc = { 0 }; + zfs_handle_t *nzhp; + int ret; + + if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) + return (0); + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, + &zc)) == 0) { + /* + * Silently ignore errors, as the only plausible explanation is + * that the pool has since been removed. + */ + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { + continue; + } + + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); + return (ret); + } + } + zcmd_free_nvlists(&zc); + return ((ret < 0) ? 
/*
 * Given a complete dataset name, copy the portion that names its parent
 * (everything before the last '/') into 'buf' as a NUL-terminated string.
 *
 * Returns 0 on success.  Returns -1 if 'path' contains no '/' (it names a
 * pool, so there is no parent) or if 'buflen' is 0.  A parent that does not
 * fit is truncated to buflen - 1 characters.
 *
 * 'buf' may be the same buffer as 'path' (check_parents() walks up the
 * hierarchy that way), so the copy must tolerate overlap.
 */
static int
parent_name(const char *path, char *buf, size_t buflen)
{
	const char *slash = strrchr(path, '/');
	size_t len;

	if (slash == NULL || buflen == 0)
		return (-1);

	/* Clamp so that both the copy and the terminator stay inside buf. */
	len = (size_t)(slash - path);
	if (len >= buflen)
		len = buflen - 1;

	/* memmove, not strncpy: source and destination may overlap. */
	(void) memmove(buf, path, len);
	buf[len] = '\0';

	return (0);
}
If accept_ancestor is true, then find the + * closest existing ancestor for the given path. In prefixlen return the + * length of already existing prefix of the given path. We also fetch the + * 'zoned' property, which is used to validate property settings when creating + * new datasets. + */ +static int +check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, + boolean_t accept_ancestor, int *prefixlen) +{ + zfs_cmd_t zc = { 0 }; + char parent[ZFS_MAXNAMELEN]; + char *slash; + zfs_handle_t *zhp; + char errbuf[1024]; + uint64_t is_zoned; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); + + /* get parent, and check to see if this is just a pool */ + if (parent_name(path, parent, sizeof (parent)) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing dataset name")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + + /* check to see if the pool exists */ + if ((slash = strchr(parent, '/')) == NULL) + slash = parent + strlen(parent); + (void) strncpy(zc.zc_name, parent, slash - parent); + zc.zc_name[slash - parent] = '\0'; + if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 && + errno == ENOENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no such pool '%s'"), zc.zc_name); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + + /* check to see if the parent dataset exists */ + while ((zhp = make_dataset_handle(hdl, parent)) == NULL) { + if (errno == ENOENT && accept_ancestor) { + /* + * Go deeper to find an ancestor, give up on top level. 
+ */ + if (parent_name(parent, parent, sizeof (parent)) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no such pool '%s'"), zc.zc_name); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + } else if (errno == ENOENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent does not exist")); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } else + return (zfs_standard_error(hdl, errno, errbuf)); + } + + is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + if (zoned != NULL) + *zoned = is_zoned; + + /* we are in a non-global zone, but parent is in the global zone */ + if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { + (void) zfs_standard_error(hdl, EPERM, errbuf); + zfs_close(zhp); + return (-1); + } + + /* make sure parent is a filesystem */ + if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent is not a filesystem")); + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + zfs_close(zhp); + return (-1); + } + + zfs_close(zhp); + if (prefixlen != NULL) + *prefixlen = strlen(parent); + return (0); +} + +/* + * Finds whether the dataset of the given type(s) exists. + */ +boolean_t +zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) +{ + zfs_handle_t *zhp; + + if (!zfs_validate_name(hdl, path, types, B_FALSE)) + return (B_FALSE); + + /* + * Try to get stats for the dataset, which will tell us if it exists. + */ + if ((zhp = make_dataset_handle(hdl, path)) != NULL) { + int ds_type = zhp->zfs_type; + + zfs_close(zhp); + if (types & ds_type) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Given a path to 'target', create all the ancestors between + * the prefixlen portion of the path, and the target itself. + * Fail if the initial prefixlen-ancestor does not already exist. 
+ */ +int +create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) +{ + zfs_handle_t *h; + char *cp; + const char *opname; + + /* make sure prefix exists */ + cp = target + prefixlen; + if (*cp != '/') { + assert(strchr(cp, '/') == NULL); + h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); + } else { + *cp = '\0'; + h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); + *cp = '/'; + } + if (h == NULL) + return (-1); + zfs_close(h); + + /* + * Attempt to create, mount, and share any ancestor filesystems, + * up to the prefixlen-long one. + */ + for (cp = target + prefixlen + 1; + cp = strchr(cp, '/'); *cp = '/', cp++) { + char *logstr; + + *cp = '\0'; + + h = make_dataset_handle(hdl, target); + if (h) { + /* it already exists, nothing to do here */ + zfs_close(h); + continue; + } + + logstr = hdl->libzfs_log_str; + hdl->libzfs_log_str = NULL; + if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, + NULL) != 0) { + hdl->libzfs_log_str = logstr; + opname = dgettext(TEXT_DOMAIN, "create"); + goto ancestorerr; + } + + hdl->libzfs_log_str = logstr; + h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); + if (h == NULL) { + opname = dgettext(TEXT_DOMAIN, "open"); + goto ancestorerr; + } + + if (zfs_mount(h, NULL, 0) != 0) { + opname = dgettext(TEXT_DOMAIN, "mount"); + goto ancestorerr; + } + + if (zfs_share(h) != 0) { + opname = dgettext(TEXT_DOMAIN, "share"); + goto ancestorerr; + } + + zfs_close(h); + } + + return (0); + +ancestorerr: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "failed to %s ancestor '%s'"), opname, target); + return (-1); +} + +/* + * Creates non-existing ancestors of the given path. 
+ */ +int +zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) +{ + int prefix; + char *path_copy; + int rc; + + if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) + return (-1); + + if ((path_copy = strdup(path)) != NULL) { + rc = create_parents(hdl, path_copy, prefix); + free(path_copy); + } + if (path_copy == NULL || rc != 0) + return (-1); + + return (0); +} + +/* + * Create a new filesystem or volume. + */ +int +zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, + nvlist_t *props) +{ + zfs_cmd_t zc = { 0 }; + int ret; + uint64_t size = 0; + uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + char errbuf[1024]; + uint64_t zoned; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot create '%s'"), path); + + /* validate the path, taking care to note the extended error message */ + if (!zfs_validate_name(hdl, path, type, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + /* validate parents exist */ + if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0) + return (-1); + + /* + * The failure modes when creating a dataset of a different type over + * one that already exists is a little strange. In particular, if you + * try to create a dataset on top of an existing dataset, the ioctl() + * will return ENOENT, not EEXIST. To prevent this from happening, we + * first try to see if the dataset exists. 
+ */ + (void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name)); + if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset already exists")); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + } + + if (type == ZFS_TYPE_VOLUME) + zc.zc_objset_type = DMU_OST_ZVOL; + else + zc.zc_objset_type = DMU_OST_ZFS; + + if (props && (props = zfs_valid_proplist(hdl, type, props, + zoned, NULL, errbuf)) == 0) + return (-1); + + if (type == ZFS_TYPE_VOLUME) { + /* + * If we are creating a volume, the size and block size must + * satisfy a few restraints. First, the blocksize must be a + * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the + * volsize must be a multiple of the block size, and cannot be + * zero. + */ + if (props == NULL || nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) { + nvlist_free(props); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing volume size")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + + if ((ret = nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &blocksize)) != 0) { + if (ret == ENOENT) { + blocksize = zfs_prop_default_numeric( + ZFS_PROP_VOLBLOCKSIZE); + } else { + nvlist_free(props); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing volume block size")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + } + + if (size == 0) { + nvlist_free(props); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volume size cannot be zero")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + + if (size % blocksize != 0) { + nvlist_free(props); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volume size must be a multiple of volume block " + "size")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + } + + if (props && zcmd_write_src_nvlist(hdl, &zc, props) != 0) + return (-1); + nvlist_free(props); + + /* create the dataset */ + ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); + + zcmd_free_nvlists(&zc); + + /* check for 
failure */ + if (ret != 0) { + char parent[ZFS_MAXNAMELEN]; + (void) parent_name(path, parent, sizeof (parent)); + + switch (errno) { + case ENOENT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no such parent '%s'"), parent); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EINVAL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent '%s' is not a filesystem"), parent); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + + case EDOM: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volume block size must be power of 2 from " + "%u to %uk"), + (uint_t)SPA_MINBLOCKSIZE, + (uint_t)SPA_MAXBLOCKSIZE >> 10); + + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to set this " + "property or value")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); +#ifdef _ILP32 + case EOVERFLOW: + /* + * This platform can't address a volume this big. + */ + if (type == ZFS_TYPE_VOLUME) + return (zfs_error(hdl, EZFS_VOLTOOBIG, + errbuf)); +#endif + /* FALLTHROUGH */ + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + + return (0); +} + +/* + * Destroys the given dataset. The caller must make sure that the filesystem + * isn't mounted, and that there are no active dependents. 
+ */ +int +zfs_destroy(zfs_handle_t *zhp, boolean_t defer) +{ + zfs_cmd_t zc = { 0 }; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (ZFS_IS_VOLUME(zhp)) { + zc.zc_objset_type = DMU_OST_ZVOL; + } else { + zc.zc_objset_type = DMU_OST_ZFS; + } + + zc.zc_defer_destroy = defer; + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) { + return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), + zhp->zfs_name)); + } + + remove_mountpoint(zhp); + + return (0); +} + +struct destroydata { + char *snapname; + boolean_t gotone; + boolean_t closezhp; +}; + +static int +zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) +{ + struct destroydata *dd = arg; + zfs_handle_t *szhp; + char name[ZFS_MAXNAMELEN]; + boolean_t closezhp = dd->closezhp; + int rv = 0; + + (void) strlcpy(name, zhp->zfs_name, sizeof (name)); + (void) strlcat(name, "@", sizeof (name)); + (void) strlcat(name, dd->snapname, sizeof (name)); + + szhp = make_dataset_handle(zhp->zfs_hdl, name); + if (szhp) { + dd->gotone = B_TRUE; + zfs_close(szhp); + } + + dd->closezhp = B_TRUE; + if (!dd->gotone) + rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); + if (closezhp) + zfs_close(zhp); + return (rv); +} + +/* + * Destroys all snapshots with the given name in zhp & descendants. 
+ */ +int +zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) +{ + zfs_cmd_t zc = { 0 }; + int ret; + struct destroydata dd = { 0 }; + + dd.snapname = snapname; + (void) zfs_check_snap_cb(zhp, &dd); + + if (!dd.gotone) { + return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, + dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), + zhp->zfs_name, snapname)); + } + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + zc.zc_defer_destroy = defer; + + ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc); + if (ret != 0) { + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot destroy '%s@%s'"), zc.zc_name, snapname); + + switch (errno) { + case EEXIST: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "snapshot is cloned")); + return (zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf)); + + default: + return (zfs_standard_error(zhp->zfs_hdl, errno, + errbuf)); + } + } + + return (0); +} + +/* + * Clones the given dataset. The target must be of the same type as the source. 
+ */ +int +zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) +{ + zfs_cmd_t zc = { 0 }; + char parent[ZFS_MAXNAMELEN]; + int ret; + char errbuf[1024]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_type_t type; + uint64_t zoned; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot create '%s'"), target); + + /* validate the target name */ + if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + /* validate parents exist */ + if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0) + return (-1); + + (void) parent_name(target, parent, sizeof (parent)); + + /* do the clone */ + if (ZFS_IS_VOLUME(zhp)) { + zc.zc_objset_type = DMU_OST_ZVOL; + type = ZFS_TYPE_VOLUME; + } else { + zc.zc_objset_type = DMU_OST_ZFS; + type = ZFS_TYPE_FILESYSTEM; + } + + if (props) { + if ((props = zfs_valid_proplist(hdl, type, props, zoned, + zhp, errbuf)) == NULL) + return (-1); + + if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { + nvlist_free(props); + return (-1); + } + + nvlist_free(props); + } + + (void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); + ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_CREATE, &zc); + + zcmd_free_nvlists(&zc); + + if (ret != 0) { + switch (errno) { + + case ENOENT: + /* + * The parent doesn't exist. We should have caught this + * above, but there may a race condition that has since + * destroyed the parent. + * + * At this point, we don't know whether it's the source + * that doesn't exist anymore, or whether the target + * dataset doesn't exist. 
+ */ + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "no such parent '%s'"), parent); + return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); + + case EXDEV: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "source and target pools differ")); + return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET, + errbuf)); + + default: + return (zfs_standard_error(zhp->zfs_hdl, errno, + errbuf)); + } + } + + return (ret); +} + +/* + * Promotes the given clone fs to be the clone parent. + */ +int +zfs_promote(zfs_handle_t *zhp) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_cmd_t zc = { 0 }; + char parent[MAXPATHLEN]; + int ret; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot promote '%s'"), zhp->zfs_name); + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots can not be promoted")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + + (void) strlcpy(parent, zhp->zfs_dmustats.dds_origin, sizeof (parent)); + if (parent[0] == '\0') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not a cloned filesystem")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + + (void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin, + sizeof (zc.zc_value)); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + ret = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); + + if (ret != 0) { + int save_errno = errno; + + switch (save_errno) { + case EEXIST: + /* There is a conflicting snapshot name. */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "conflicting snapshot '%s' from parent '%s'"), + zc.zc_string, parent); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + + default: + return (zfs_standard_error(hdl, save_errno, errbuf)); + } + } + return (ret); +} + +/* + * Takes a snapshot of the given dataset. 
+ */ +int +zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, + nvlist_t *props) +{ + const char *delim; + char parent[ZFS_MAXNAMELEN]; + zfs_handle_t *zhp; + zfs_cmd_t zc = { 0 }; + int ret; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot snapshot '%s'"), path); + + /* validate the target name */ + if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + if (props) { + if ((props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, + props, B_FALSE, NULL, errbuf)) == NULL) + return (-1); + + if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { + nvlist_free(props); + return (-1); + } + + nvlist_free(props); + } + + /* make sure the parent exists and is of the appropriate type */ + delim = strchr(path, '@'); + (void) strncpy(parent, path, delim - path); + parent[delim - path] = '\0'; + + if ((zhp = zfs_open(hdl, parent, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) == NULL) { + zcmd_free_nvlists(&zc); + return (-1); + } + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, delim+1, sizeof (zc.zc_value)); + if (ZFS_IS_VOLUME(zhp)) + zc.zc_objset_type = DMU_OST_ZVOL; + else + zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_cookie = recursive; + ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SNAPSHOT, &zc); + + zcmd_free_nvlists(&zc); + + /* + * if it was recursive, the one that actually failed will be in + * zc.zc_name. + */ + if (ret != 0) { + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); + (void) zfs_standard_error(hdl, errno, errbuf); + } + + zfs_close(zhp); + + return (ret); +} + +/* + * Destroy any more recent snapshots. We invoke this callback on any dependents + * of the snapshot first. If the 'cb_dependent' member is non-zero, then this + * is a dependent and we should just destroy it without checking the transaction + * group. 
+ */ +typedef struct rollback_data { + const char *cb_target; /* the snapshot */ + uint64_t cb_create; /* creation time reference */ + boolean_t cb_error; + boolean_t cb_dependent; + boolean_t cb_force; +} rollback_data_t; + +static int +rollback_destroy(zfs_handle_t *zhp, void *data) +{ + rollback_data_t *cbp = data; + + if (!cbp->cb_dependent) { + if (strcmp(zhp->zfs_name, cbp->cb_target) != 0 && + zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && + zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > + cbp->cb_create) { + char *logstr; + + cbp->cb_dependent = B_TRUE; + cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, + rollback_destroy, cbp); + cbp->cb_dependent = B_FALSE; + + logstr = zhp->zfs_hdl->libzfs_log_str; + zhp->zfs_hdl->libzfs_log_str = NULL; + cbp->cb_error |= zfs_destroy(zhp, B_FALSE); + zhp->zfs_hdl->libzfs_log_str = logstr; + } + } else { + /* We must destroy this clone; first unmount it */ + prop_changelist_t *clp; + + clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, + cbp->cb_force ? MS_FORCE: 0); + if (clp == NULL || changelist_prefix(clp) != 0) { + cbp->cb_error = B_TRUE; + zfs_close(zhp); + return (0); + } + if (zfs_destroy(zhp, B_FALSE) != 0) + cbp->cb_error = B_TRUE; + else + changelist_remove(clp, zhp->zfs_name); + (void) changelist_postfix(clp); + changelist_free(clp); + } + + zfs_close(zhp); + return (0); +} + +/* + * Given a dataset, rollback to a specific snapshot, discarding any + * data changes since then and making it the active dataset. + * + * Any snapshots more recent than the target are destroyed, along with + * their dependents. + */ +int +zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) +{ + rollback_data_t cb = { 0 }; + int err; + zfs_cmd_t zc = { 0 }; + boolean_t restore_resv = 0; + uint64_t old_volsize, new_volsize; + zfs_prop_t resv_prop; + + assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || + zhp->zfs_type == ZFS_TYPE_VOLUME); + + /* + * Destroy all recent snapshots and its dependends. 
+ */ + cb.cb_force = force; + cb.cb_target = snap->zfs_name; + cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); + (void) zfs_iter_children(zhp, rollback_destroy, &cb); + + if (cb.cb_error) + return (-1); + + /* + * Now that we have verified that the snapshot is the latest, + * rollback to the given snapshot. + */ + + if (zhp->zfs_type == ZFS_TYPE_VOLUME) { + if (zfs_which_resv_prop(zhp, &resv_prop) < 0) + return (-1); + old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); + restore_resv = + (old_volsize == zfs_prop_get_int(zhp, resv_prop)); + } + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (ZFS_IS_VOLUME(zhp)) + zc.zc_objset_type = DMU_OST_ZVOL; + else + zc.zc_objset_type = DMU_OST_ZFS; + + /* + * We rely on zfs_iter_children() to verify that there are no + * newer snapshots for the given dataset. Therefore, we can + * simply pass the name on to the ioctl() call. There is still + * an unlikely race condition where the user has taken a + * snapshot since we verified that this was the most recent. + * + */ + if ((err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_ROLLBACK, &zc)) != 0) { + (void) zfs_standard_error_fmt(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), + zhp->zfs_name); + return (err); + } + + /* + * For volumes, if the pre-rollback volsize matched the pre- + * rollback reservation and the volsize has changed then set + * the reservation property to the post-rollback volsize. + * Make a new handle since the rollback closed the dataset. + */ + if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && + (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { + if (restore_resv) { + new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); + if (old_volsize != new_volsize) + err = zfs_prop_set_int(zhp, resv_prop, + new_volsize); + } + zfs_close(zhp); + } + return (err); +} + +/* + * Iterate over all dependents for a given dataset. 
This includes both + * hierarchical dependents (children) and data dependents (snapshots and + * clones). The bulk of the processing occurs in get_dependents() in + * libzfs_graph.c. + */ +int +zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, + zfs_iter_f func, void *data) +{ + char **dependents; + size_t count; + int i; + zfs_handle_t *child; + int ret = 0; + + if (get_dependents(zhp->zfs_hdl, allowrecursion, zhp->zfs_name, + &dependents, &count) != 0) + return (-1); + + for (i = 0; i < count; i++) { + if ((child = make_dataset_handle(zhp->zfs_hdl, + dependents[i])) == NULL) + continue; + + if ((ret = func(child, data)) != 0) + break; + } + + for (i = 0; i < count; i++) + free(dependents[i]); + free(dependents); + + return (ret); +} + +/* + * Renames the given dataset. + */ +int +zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) +{ + int ret; + zfs_cmd_t zc = { 0 }; + char *delim; + prop_changelist_t *cl = NULL; + zfs_handle_t *zhrp = NULL; + char *parentname = NULL; + char parent[ZFS_MAXNAMELEN]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + + /* if we have the same exact name, just return success */ + if (strcmp(zhp->zfs_name, target) == 0) + return (0); + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot rename to '%s'"), target); + + /* + * Make sure the target name is valid + */ + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { + if ((strchr(target, '@') == NULL) || + *target == '@') { + /* + * Snapshot target name is abbreviated, + * reconstruct full dataset name + */ + (void) strlcpy(parent, zhp->zfs_name, + sizeof (parent)); + delim = strchr(parent, '@'); + if (strchr(target, '@') == NULL) + *(++delim) = '\0'; + else + *delim = '\0'; + (void) strlcat(parent, target, sizeof (parent)); + target = parent; + } else { + /* + * Make sure we're renaming within the same dataset. 
+ */ + delim = strchr(target, '@'); + if (strncmp(zhp->zfs_name, target, delim - target) + != 0 || zhp->zfs_name[delim - target] != '@') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots must be part of same " + "dataset")); + return (zfs_error(hdl, EZFS_CROSSTARGET, + errbuf)); + } + } + if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } else { + if (recursive) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "recursive rename must be a snapshot")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + + if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + /* validate parents */ + if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) + return (-1); + + /* make sure we're in the same pool */ + verify((delim = strchr(target, '/')) != NULL); + if (strncmp(zhp->zfs_name, target, delim - target) != 0 || + zhp->zfs_name[delim - target] != '/') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "datasets must be within same pool")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + } + + /* new name cannot be a child of the current dataset name */ + if (is_descendant(zhp->zfs_name, target)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "New dataset name cannot be a descendant of " + "current dataset name")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + } + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name); + + if (getzoneid() == GLOBAL_ZONEID && + zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset is used in a non-global zone")); + return (zfs_error(hdl, EZFS_ZONED, errbuf)); + } + + if (recursive) { + + parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); + if (parentname == NULL) { + ret = -1; + goto error; + } + delim = strchr(parentname, '@'); + *delim = '\0'; + zhrp = zfs_open(zhp->zfs_hdl, parentname, 
ZFS_TYPE_DATASET); + if (zhrp == NULL) { + ret = -1; + goto error; + } + + } else { + if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL) + return (-1); + + if (changelist_haszonedchild(cl)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "child dataset with inherited mountpoint is used " + "in a non-global zone")); + (void) zfs_error(hdl, EZFS_ZONED, errbuf); + goto error; + } + + if ((ret = changelist_prefix(cl)) != 0) + goto error; + } + + if (ZFS_IS_VOLUME(zhp)) + zc.zc_objset_type = DMU_OST_ZVOL; + else + zc.zc_objset_type = DMU_OST_ZFS; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); + + zc.zc_cookie = recursive; + + if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { + /* + * if it was recursive, the one that actually failed will + * be in zc.zc_name + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot rename '%s'"), zc.zc_name); + + if (recursive && errno == EEXIST) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "a child dataset already has a snapshot " + "with the new name")); + (void) zfs_error(hdl, EZFS_EXISTS, errbuf); + } else { + (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); + } + + /* + * On failure, we still want to remount any filesystems that + * were previously mounted, so we don't alter the system state. 
+ */ + if (!recursive) + (void) changelist_postfix(cl); + } else { + if (!recursive) { + changelist_rename(cl, zfs_get_name(zhp), target); + ret = changelist_postfix(cl); + } + } + +error: + if (parentname) { + free(parentname); + } + if (zhrp) { + zfs_close(zhrp); + } + if (cl) { + changelist_free(cl); + } + return (ret); +} + +nvlist_t * +zfs_get_user_props(zfs_handle_t *zhp) +{ + return (zhp->zfs_user_props); +} + +nvlist_t * +zfs_get_recvd_props(zfs_handle_t *zhp) +{ + if (zhp->zfs_recvd_props == NULL) + if (get_recvd_props_ioctl(zhp) != 0) + return (NULL); + return (zhp->zfs_recvd_props); +} + +/* + * This function is used by 'zfs list' to determine the exact set of columns to + * display, and their maximum widths. This does two main things: + * + * - If this is a list of all properties, then expand the list to include + * all native properties, and set a flag so that for each dataset we look + * for new unique user properties and add them to the list. + * + * - For non fixed-width properties, keep track of the maximum width seen + * so that we can size the column appropriately. If the user has + * requested received property values, we also need to compute the width + * of the RECEIVED column. + */ +int +zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zprop_list_t *entry; + zprop_list_t **last, **start; + nvlist_t *userprops, *propval; + nvpair_t *elem; + char *strval; + char buf[ZFS_MAXPROPLEN]; + + if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0) + return (-1); + + userprops = zfs_get_user_props(zhp); + + entry = *plp; + if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) { + /* + * Go through and add any user properties as necessary. We + * start by incrementing our list pointer to the first + * non-native property. 
+ */ + start = plp; + while (*start != NULL) { + if ((*start)->pl_prop == ZPROP_INVAL) + break; + start = &(*start)->pl_next; + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) { + /* + * See if we've already found this property in our list. + */ + for (last = start; *last != NULL; + last = &(*last)->pl_next) { + if (strcmp((*last)->pl_user_prop, + nvpair_name(elem)) == 0) + break; + } + + if (*last == NULL) { + if ((entry = zfs_alloc(hdl, + sizeof (zprop_list_t))) == NULL || + ((entry->pl_user_prop = zfs_strdup(hdl, + nvpair_name(elem)))) == NULL) { + free(entry); + return (-1); + } + + entry->pl_prop = ZPROP_INVAL; + entry->pl_width = strlen(nvpair_name(elem)); + entry->pl_all = B_TRUE; + *last = entry; + } + } + } + + /* + * Now go through and check the width of any non-fixed columns + */ + for (entry = *plp; entry != NULL; entry = entry->pl_next) { + if (entry->pl_fixed) + continue; + + if (entry->pl_prop != ZPROP_INVAL) { + if (zfs_prop_get(zhp, entry->pl_prop, + buf, sizeof (buf), NULL, NULL, 0, B_FALSE) == 0) { + if (strlen(buf) > entry->pl_width) + entry->pl_width = strlen(buf); + } + if (received && zfs_prop_get_recvd(zhp, + zfs_prop_to_name(entry->pl_prop), + buf, sizeof (buf), B_FALSE) == 0) + if (strlen(buf) > entry->pl_recvd_width) + entry->pl_recvd_width = strlen(buf); + } else { + if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, + &propval) == 0) { + verify(nvlist_lookup_string(propval, + ZPROP_VALUE, &strval) == 0); + if (strlen(strval) > entry->pl_width) + entry->pl_width = strlen(strval); + } + if (received && zfs_prop_get_recvd(zhp, + entry->pl_user_prop, + buf, sizeof (buf), B_FALSE) == 0) + if (strlen(buf) > entry->pl_recvd_width) + entry->pl_recvd_width = strlen(buf); + } + } + + return (0); +} + +int +zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, + char *resource, void *export, void *sharetab, + int sharemax, zfs_share_op_t operation) +{ + zfs_cmd_t zc = { 0 }; + int error; + + 
(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + if (resource) + (void) strlcpy(zc.zc_string, resource, sizeof (zc.zc_string)); + zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab; + zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export; + zc.zc_share.z_sharetype = operation; + zc.zc_share.z_sharemax = sharemax; + error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc); + return (error); +} + +void +zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) +{ + nvpair_t *curr; + + /* + * Keep a reference to the props-table against which we prune the + * properties. + */ + zhp->zfs_props_table = props; + + curr = nvlist_next_nvpair(zhp->zfs_props, NULL); + + while (curr) { + zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); + nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr); + + /* + * User properties will result in ZPROP_INVAL, and since we + * only know how to prune standard ZFS properties, we always + * leave these in the list. This can also happen if we + * encounter an unknown DSL property (when running older + * software, for example). 
+ */ + if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE) + (void) nvlist_remove(zhp->zfs_props, + nvpair_name(curr), nvpair_type(curr)); + curr = next; + } +} + +static int +zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, + zfs_smb_acl_op_t cmd, char *resource1, char *resource2) +{ + zfs_cmd_t zc = { 0 }; + nvlist_t *nvlist = NULL; + int error; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + zc.zc_cookie = (uint64_t)cmd; + + if (cmd == ZFS_SMB_ACL_RENAME) { + if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + } + + switch (cmd) { + case ZFS_SMB_ACL_ADD: + case ZFS_SMB_ACL_REMOVE: + (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); + break; + case ZFS_SMB_ACL_RENAME: + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, + resource1) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, + resource2) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) { + nvlist_free(nvlist); + return (-1); + } + break; + case ZFS_SMB_ACL_PURGE: + break; + default: + return (-1); + } + error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); + if (nvlist) + nvlist_free(nvlist); + return (error); +} + +int +zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, + resource, NULL)); +} + +int +zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, + resource, NULL)); +} + +int +zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, + NULL, NULL)); +} + +int +zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, + char *oldname, char *newname) +{ + 
return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, + oldname, newname)); +} + +int +zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg) +{ + zfs_cmd_t zc = { 0 }; + int error; + zfs_useracct_t buf[100]; + + (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + zc.zc_objset_type = type; + zc.zc_nvlist_dst = (uintptr_t)buf; + + /* CONSTCOND */ + while (1) { + zfs_useracct_t *zua = buf; + + zc.zc_nvlist_dst_size = sizeof (buf); + error = ioctl(zhp->zfs_hdl->libzfs_fd, + ZFS_IOC_USERSPACE_MANY, &zc); + if (error || zc.zc_nvlist_dst_size == 0) + break; + + while (zc.zc_nvlist_dst_size > 0) { + error = func(arg, zua->zu_domain, zua->zu_rid, + zua->zu_space); + if (error != 0) + return (error); + zua++; + zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); + } + } + + return (error); +} + +int +zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive, boolean_t temphold, boolean_t enoent_ok, + int cleanup_fd, uint64_t dsobj, uint64_t createtxg) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + ASSERT(!recursive || dsobj == 0); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) + >= sizeof (zc.zc_string)) + return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); + zc.zc_cookie = recursive; + zc.zc_temphold = temphold; + zc.zc_cleanup_fd = cleanup_fd; + zc.zc_sendobj = dsobj; + zc.zc_createtxg = createtxg; + + if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) { + char errbuf[ZFS_MAXNAMELEN+32]; + + /* + * if it was recursive, the one that actually failed will be in + * zc.zc_name. + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot hold '%s@%s'"), zc.zc_name, snapname); + switch (errno) { + case E2BIG: + /* + * Temporary tags wind up having the ds object id + * prepended. 
So even if we passed the length check + * above, it's still possible for the tag to wind + * up being slightly too long. + */ + return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf)); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + case EINVAL: + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case EEXIST: + return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf)); + case ENOENT: + if (enoent_ok) + return (ENOENT); + /* FALLTHROUGH */ + default: + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + } + + return (0); +} + +int +zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) + >= sizeof (zc.zc_string)) + return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); + zc.zc_cookie = recursive; + + if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) { + char errbuf[ZFS_MAXNAMELEN+32]; + + /* + * if it was recursive, the one that actually failed will be in + * zc.zc_name. 
+ */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot release '%s' from '%s@%s'"), tag, zc.zc_name, + snapname); + switch (errno) { + case ESRCH: + return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf)); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + case EINVAL: + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + default: + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + } + + return (0); +} + +uint64_t +zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) +{ + uint64_t numdb; + uint64_t nblocks, volblocksize; + int ncopies; + char *strval; + + if (nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) + ncopies = atoi(strval); + else + ncopies = 1; + if (nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize) != 0) + volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + nblocks = volsize/volblocksize; + /* start with metadnode L0-L6 */ + numdb = 7; + /* calculate number of indirects */ + while (nblocks > 1) { + nblocks += DNODES_PER_LEVEL - 1; + nblocks /= DNODES_PER_LEVEL; + numdb += nblocks; + } + numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); + volsize *= ncopies; + /* + * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't + * compressed, but in practice they compress down to about + * 1100 bytes + */ + numdb *= 1ULL << DN_MAX_INDBLKSHIFT; + volsize += numdb; + return (volsize); +} diff --git a/lib/libzfs/common/libzfs_diff.c b/lib/libzfs/common/libzfs_diff.c new file mode 100644 index 0000000..888224f --- /dev/null +++ b/lib/libzfs/common/libzfs_diff.c @@ -0,0 +1,826 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * zfs diff support + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libzfs_impl.h" + +#define ZDIFF_SNAPDIR "/.zfs/snapshot/" +#define ZDIFF_SHARESDIR "/.zfs/shares/" +#define ZDIFF_PREFIX "zfs-diff-%d" + +#define ZDIFF_ADDED '+' +#define ZDIFF_MODIFIED 'M' +#define ZDIFF_REMOVED '-' +#define ZDIFF_RENAMED 'R' + +static boolean_t +do_name_cmp(const char *fpath, const char *tpath) +{ + char *fname, *tname; + fname = strrchr(fpath, '/') + 1; + tname = strrchr(tpath, '/') + 1; + return (strcmp(fname, tname) == 0); +} + +typedef struct differ_info { + zfs_handle_t *zhp; + char *fromsnap; + char *frommnt; + char *tosnap; + char *tomnt; + char *ds; + char *dsmnt; + char *tmpsnap; + char errbuf[1024]; + boolean_t isclone; + boolean_t scripted; + boolean_t classify; + boolean_t timestamped; + uint64_t shares; + int zerr; + int cleanupfd; + int outputfd; + int datafd; +} differ_info_t; + +/* + * Given a {dsname, object id}, get the object path + */ +static int +get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, + char *pn, int maxlen, zfs_stat_t *sb) +{ + zfs_cmd_t zc = { 0 }; + int error; + + (void) strlcpy(zc.zc_name, dsname, sizeof 
(zc.zc_name)); + zc.zc_obj = obj; + + errno = 0; + error = ioctl(di->zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_STATS, &zc); + di->zerr = errno; + + /* we can get stats even if we failed to get a path */ + (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t)); + if (error == 0) { + ASSERT(di->zerr == 0); + (void) strlcpy(pn, zc.zc_value, maxlen); + return (0); + } + + if (di->zerr == EPERM) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "The sys_config privilege or diff delegated permission " + "is needed\nto discover path names")); + return (-1); + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Unable to determine path or stats for " + "object %lld in %s"), obj, dsname); + return (-1); + } +} + +/* + * stream_bytes + * + * Prints a file name out a character at a time. If the character is + * not in the range of what we consider "printable" ASCII, display it + * as an escaped 3-digit octal value. ASCII values less than a space + * are all control characters and we declare the upper end as the + * DELete character. This also is the last 7-bit ASCII character. + * We choose to treat all 8-bit ASCII as not printable for this + * application. 
/*
 * stream_bytes
 *
 * Writes 'string' to 'fp' one byte at a time.  Bytes strictly between
 * space and DEL (0177), other than backslash, are written as-is; every
 * other byte -- control characters, space, backslash, DEL and anything
 * with the high bit set -- is written as a backslash followed by its
 * 3-digit octal value.
 */
static void
stream_bytes(FILE *fp, const char *string)
{
	unsigned char c;

	/*
	 * Work on unsigned bytes: where plain char is signed, a byte >= 0200
	 * would otherwise be sign-extended when passed to fprintf() and come
	 * out as an 11-digit octal number instead of three digits.
	 */
	while ((c = (unsigned char)*string++) != '\0') {
		if (c > ' ' && c != '\\' && c < 0177)
			(void) fprintf(fp, "%c", c);
		else
			(void) fprintf(fp, "\\%03o", (unsigned int)c);
	}
}
type, const char *file, + zfs_stat_t *isb) +{ + if (di->timestamped) + (void) fprintf(fp, "%10lld.%09lld\t", + (longlong_t)isb->zs_ctime[0], + (longlong_t)isb->zs_ctime[1]); + (void) fprintf(fp, "%c\t", type); + if (di->classify) { + print_what(fp, isb->zs_mode); + (void) fprintf(fp, "\t"); + } + print_cmn(fp, di, file); + (void) fprintf(fp, "\n"); +} + +static int +write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj) +{ + struct zfs_stat fsb, tsb; + boolean_t same_name; + mode_t fmode, tmode; + char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN]; + int fobjerr, tobjerr; + int change; + + if (dobj == di->shares) + return (0); + + /* + * Check the from and to snapshots for info on the object. If + * we get ENOENT, then the object just didn't exist in that + * snapshot. If we get ENOTSUP, then we tried to get + * info on a non-ZPL object, which we don't care about anyway. + */ + fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname, + MAXPATHLEN, &fsb); + if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) + return (-1); + + tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname, + MAXPATHLEN, &tsb); + if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) + return (-1); + + /* + * Unallocated object sharing the same meta dnode block + */ + if (fobjerr && tobjerr) { + ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP); + di->zerr = 0; + return (0); + } + + di->zerr = 0; /* negate get_stats_for_obj() from side that failed */ + fmode = fsb.zs_mode & S_IFMT; + tmode = tsb.zs_mode & S_IFMT; + if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 || + tsb.zs_links == 0) + change = 0; + else + change = tsb.zs_links - fsb.zs_links; + + if (fobjerr) { + if (change) { + print_link_change(fp, di, change, tobjname, &tsb); + return (0); + } + print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); + return (0); + } else if (tobjerr) { + if (change) { + print_link_change(fp, di, change, fobjname, &fsb); + return (0); + } + print_file(fp, di, 
ZDIFF_REMOVED, fobjname, &fsb); + return (0); + } + + if (fmode != tmode && fsb.zs_gen == tsb.zs_gen) + tsb.zs_gen++; /* Force a generational difference */ + same_name = do_name_cmp(fobjname, tobjname); + + /* Simple modification or no change */ + if (fsb.zs_gen == tsb.zs_gen) { + /* No apparent changes. Could we assert !this? */ + if (fsb.zs_ctime[0] == tsb.zs_ctime[0] && + fsb.zs_ctime[1] == tsb.zs_ctime[1]) + return (0); + if (change) { + print_link_change(fp, di, change, + change > 0 ? fobjname : tobjname, &tsb); + } else if (same_name) { + print_file(fp, di, ZDIFF_MODIFIED, fobjname, &tsb); + } else { + print_rename(fp, di, fobjname, tobjname, &tsb); + } + return (0); + } else { + /* file re-created or object re-used */ + print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); + print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); + return (0); + } +} + +static int +write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) +{ + uint64_t o; + int err; + + for (o = dr->ddr_first; o <= dr->ddr_last; o++) { + if (err = write_inuse_diffs_one(fp, di, o)) + return (err); + } + return (0); +} + +static int +describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf, + int maxlen) +{ + struct zfs_stat sb; + + if (get_stats_for_obj(di, di->fromsnap, object, namebuf, + maxlen, &sb) != 0) { + /* Let it slide, if in the delete queue on from side */ + if (di->zerr == ENOENT && sb.zs_links == 0) { + di->zerr = 0; + return (0); + } + return (-1); + } + + print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb); + return (0); +} + +static int +write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *lhdl = di->zhp->zfs_hdl; + char fobjname[MAXPATHLEN]; + + (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name)); + zc.zc_obj = dr->ddr_first - 1; + + ASSERT(di->zerr == 0); + + while (zc.zc_obj < dr->ddr_last) { + int err; + + err = ioctl(lhdl->libzfs_fd, ZFS_IOC_NEXT_OBJ, &zc); + if (err == 0) { + if 
(zc.zc_obj == di->shares) { + zc.zc_obj++; + continue; + } + if (zc.zc_obj > dr->ddr_last) { + break; + } + err = describe_free(fp, di, zc.zc_obj, fobjname, + MAXPATHLEN); + if (err) + break; + } else if (errno == ESRCH) { + break; + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "next allocated object (> %lld) find failure"), + zc.zc_obj); + di->zerr = errno; + break; + } + } + if (di->zerr) + return (-1); + return (0); +} + +static void * +differ(void *arg) +{ + differ_info_t *di = arg; + dmu_diff_record_t dr; + FILE *ofp; + int err = 0; + + if ((ofp = fdopen(di->outputfd, "w")) == NULL) { + di->zerr = errno; + (void) strerror_r(errno, di->errbuf, sizeof (di->errbuf)); + (void) close(di->datafd); + return ((void *)-1); + } + + for (;;) { + char *cp = (char *)&dr; + int len = sizeof (dr); + int rv; + + do { + rv = read(di->datafd, cp, len); + cp += rv; + len -= rv; + } while (len > 0 && rv > 0); + + if (rv < 0 || (rv == 0 && len != sizeof (dr))) { + di->zerr = EPIPE; + break; + } else if (rv == 0) { + /* end of file at a natural breaking point */ + break; + } + + switch (dr.ddr_type) { + case DDR_FREE: + err = write_free_diffs(ofp, di, &dr); + break; + case DDR_INUSE: + err = write_inuse_diffs(ofp, di, &dr); + break; + default: + di->zerr = EPIPE; + break; + } + + if (err || di->zerr) + break; + } + + (void) fclose(ofp); + (void) close(di->datafd); + if (err) + return ((void *)-1); + if (di->zerr) { + ASSERT(di->zerr == EINVAL); + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Internal error: bad data from diff IOCTL")); + return ((void *)-1); + } + return ((void *)0); +} + +static int +find_shares_object(differ_info_t *di) +{ + char fullpath[MAXPATHLEN]; + struct stat64 sb = { 0 }; + + (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN); + (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN); + + if (stat64(fullpath, &sb) != 0) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + 
dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath); + return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf)); + } + + di->shares = (uint64_t)sb.st_ino; + return (0); +} + +static int +make_temp_snapshot(differ_info_t *di) +{ + libzfs_handle_t *hdl = di->zhp->zfs_hdl; + zfs_cmd_t zc = { 0 }; + + (void) snprintf(zc.zc_value, sizeof (zc.zc_value), + ZDIFF_PREFIX, getpid()); + (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name)); + zc.zc_cleanup_fd = di->cleanupfd; + + if (ioctl(hdl->libzfs_fd, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) { + int err = errno; + if (err == EPERM) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "The diff delegated " + "permission is needed in order\nto create a " + "just-in-time snapshot for diffing\n")); + return (zfs_error(hdl, EZFS_DIFF, di->errbuf)); + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "Cannot create just-in-time " + "snapshot of '%s'"), zc.zc_name); + return (zfs_standard_error(hdl, err, di->errbuf)); + } + } + + di->tmpsnap = zfs_strdup(hdl, zc.zc_value); + di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap); + return (0); +} + +static void +teardown_differ_info(differ_info_t *di) +{ + free(di->ds); + free(di->dsmnt); + free(di->fromsnap); + free(di->frommnt); + free(di->tosnap); + free(di->tmpsnap); + free(di->tomnt); + (void) close(di->cleanupfd); +} + +static int +get_snapshot_names(differ_info_t *di, const char *fromsnap, + const char *tosnap) +{ + libzfs_handle_t *hdl = di->zhp->zfs_hdl; + char *atptrf = NULL; + char *atptrt = NULL; + int fdslen, fsnlen; + int tdslen, tsnlen; + + /* + * Can accept + * dataset@snap1 + * dataset@snap1 dataset@snap2 + * dataset@snap1 @snap2 + * dataset@snap1 dataset + * @snap1 dataset@snap2 + */ + if (tosnap == NULL) { + /* only a from snapshot given, must be valid */ + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Badly formed snapshot name %s"), fromsnap); + + if 
(!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT, + B_FALSE)) { + return (zfs_error(hdl, EZFS_INVALIDNAME, + di->errbuf)); + } + + atptrf = strchr(fromsnap, '@'); + ASSERT(atptrf != NULL); + fdslen = atptrf - fromsnap; + + di->fromsnap = zfs_strdup(hdl, fromsnap); + di->ds = zfs_strdup(hdl, fromsnap); + di->ds[fdslen] = '\0'; + + /* the to snap will be a just-in-time snap of the head */ + return (make_temp_snapshot(di)); + } + + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Unable to determine which snapshots to compare")); + + atptrf = strchr(fromsnap, '@'); + atptrt = strchr(tosnap, '@'); + fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap); + tdslen = atptrt ? atptrt - tosnap : strlen(tosnap); + fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */ + tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */ + + if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0) || + (fsnlen == 0 && tsnlen == 0)) { + return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); + } else if ((fdslen > 0 && tdslen > 0) && + ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) { + /* + * not the same dataset name, might be okay if + * tosnap is a clone of a fromsnap descendant. 
+ */ + char origin[ZFS_MAXNAMELEN]; + zprop_source_t src; + zfs_handle_t *zhp; + + di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1); + (void) strncpy(di->ds, tosnap, tdslen); + di->ds[tdslen] = '\0'; + + zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM); + while (zhp != NULL) { + (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, + origin, sizeof (origin), &src, NULL, 0, B_FALSE); + + if (strncmp(origin, fromsnap, fsnlen) == 0) + break; + + (void) zfs_close(zhp); + zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM); + } + + if (zhp == NULL) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); + } else { + (void) zfs_close(zhp); + } + + di->isclone = B_TRUE; + di->fromsnap = zfs_strdup(hdl, fromsnap); + if (tsnlen) { + di->tosnap = zfs_strdup(hdl, tosnap); + } else { + return (make_temp_snapshot(di)); + } + } else { + int dslen = fdslen ? fdslen : tdslen; + + di->ds = zfs_alloc(hdl, dslen + 1); + (void) strncpy(di->ds, fdslen ? 
fromsnap : tosnap, dslen); + di->ds[dslen] = '\0'; + + di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf); + if (tsnlen) { + di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt); + } else { + return (make_temp_snapshot(di)); + } + } + return (0); +} + +static int +get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt) +{ + boolean_t mounted; + + mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt); + if (mounted == B_FALSE) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Cannot diff an unmounted snapshot")); + return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf)); + } + + /* Avoid a double slash at the beginning of root-mounted datasets */ + if (**mntpt == '/' && *(*mntpt + 1) == '\0') + **mntpt = '\0'; + return (0); +} + +static int +get_mountpoints(differ_info_t *di) +{ + char *strptr; + char *frommntpt; + + /* + * first get the mountpoint for the parent dataset + */ + if (get_mountpoint(di, di->ds, &di->dsmnt) != 0) + return (-1); + + strptr = strchr(di->tosnap, '@'); + ASSERT3P(strptr, !=, NULL); + di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt, + ZDIFF_SNAPDIR, ++strptr); + + strptr = strchr(di->fromsnap, '@'); + ASSERT3P(strptr, !=, NULL); + + frommntpt = di->dsmnt; + if (di->isclone) { + char *mntpt; + int err; + + *strptr = '\0'; + err = get_mountpoint(di, di->fromsnap, &mntpt); + *strptr = '@'; + if (err != 0) + return (-1); + frommntpt = mntpt; + } + + di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt, + ZDIFF_SNAPDIR, ++strptr); + + if (di->isclone) + free(frommntpt); + + return (0); +} + +static int +setup_differ_info(zfs_handle_t *zhp, const char *fromsnap, + const char *tosnap, differ_info_t *di) +{ + di->zhp = zhp; + + di->cleanupfd = open(ZFS_DEV, O_RDWR|O_EXCL); + VERIFY(di->cleanupfd >= 0); + + if (get_snapshot_names(di, fromsnap, tosnap) != 0) + return (-1); + + if (get_mountpoints(di) != 0) + return (-1); + + if (find_shares_object(di) != 0) + return (-1); + + 
return (0); +} + +int +zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, + const char *tosnap, int flags) +{ + zfs_cmd_t zc = { 0 }; + char errbuf[1024]; + differ_info_t di = { 0 }; + pthread_t tid; + int pipefd[2]; + int iocerr; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "zfs diff failed")); + + if (setup_differ_info(zhp, fromsnap, tosnap, &di)) { + teardown_differ_info(&di); + return (-1); + } + + if (pipe(pipefd)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + teardown_differ_info(&di); + return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); + } + + di.scripted = (flags & ZFS_DIFF_PARSEABLE); + di.classify = (flags & ZFS_DIFF_CLASSIFY); + di.timestamped = (flags & ZFS_DIFF_TIMESTAMP); + + di.outputfd = outfd; + di.datafd = pipefd[0]; + + if (pthread_create(&tid, NULL, differ, &di)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + (void) close(pipefd[0]); + (void) close(pipefd[1]); + teardown_differ_info(&di); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + + /* do the ioctl() */ + (void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1); + (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1); + zc.zc_cookie = pipefd[1]; + + iocerr = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DIFF, &zc); + if (iocerr != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "Unable to obtain diffs")); + if (errno == EPERM) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "\n The sys_mount privilege or diff delegated " + "permission is needed\n to execute the " + "diff ioctl")); + } else if (errno == EXDEV) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "\n Not an earlier snapshot from the same fs")); + } else if (errno != EPIPE || di.zerr == 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + } + (void) close(pipefd[1]); + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + teardown_differ_info(&di); + if (di.zerr != 0 && di.zerr != 
EPIPE) { + zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); + return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); + } else { + return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf)); + } + } + + (void) close(pipefd[1]); + (void) pthread_join(tid, NULL); + + if (di.zerr != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); + return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); + } + teardown_differ_info(&di); + return (0); +} diff --git a/lib/libzfs/common/libzfs_fru.c b/lib/libzfs/common/libzfs_fru.c new file mode 100644 index 0000000..788fa2c --- /dev/null +++ b/lib/libzfs/common/libzfs_fru.c @@ -0,0 +1,452 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "libzfs_impl.h" + +/* + * This file is responsible for determining the relationship between I/O + * devices paths and physical locations. In the world of MPxIO and external + * enclosures, the device path is not synonymous with the physical location. 
+ * If you remove a drive and insert it into a different slot, it will end up
+ * with the same path under MPxIO. If you recable storage enclosures, the
+ * device paths may change. All of this makes it difficult to implement the
+ * 'autoreplace' property, which is supposed to automatically manage disk
+ * replacement based on physical slot.
+ *
+ * In order to work around these limitations, we have a per-vdev FRU property
+ * that is the libtopo path (minus disk-specific authority information) to the
+ * physical location of the device on the system. This is an optional
+ * property, and is only needed when using the 'autoreplace' property or when
+ * generating FMA faults against vdevs.
+ */
+
+/*
+ * Because the FMA packages depend on ZFS, we have to dlopen() libtopo in case
+ * it is not present. We only need this once per library instance, so it is
+ * not part of the libzfs handle.
+ */
+static void *_topo_dlhandle;
+static topo_hdl_t *(*_topo_open)(int, const char *, int *);
+static void (*_topo_close)(topo_hdl_t *);
+static char *(*_topo_snap_hold)(topo_hdl_t *, const char *, int *);
+static void (*_topo_snap_release)(topo_hdl_t *);
+static topo_walk_t *(*_topo_walk_init)(topo_hdl_t *, const char *,
+    topo_walk_cb_t, void *, int *);
+static int (*_topo_walk_step)(topo_walk_t *, int);
+static void (*_topo_walk_fini)(topo_walk_t *);
+static void (*_topo_hdl_strfree)(topo_hdl_t *, char *);
+static char *(*_topo_node_name)(tnode_t *);
+static int (*_topo_prop_get_string)(tnode_t *, const char *, const char *,
+    char **, int *);
+static int (*_topo_node_fru)(tnode_t *, nvlist_t **, nvlist_t *, int *);
+static int (*_topo_fmri_nvl2str)(topo_hdl_t *, nvlist_t *, char **, int *);
+static int (*_topo_fmri_strcmp_noauth)(topo_hdl_t *, const char *,
+    const char *);
+
+#define	ZFS_FRU_HASH_SIZE	257
+
+/*
+ * Hash a device path into a bucket index in [0, ZFS_FRU_HASH_SIZE).
+ */
+static size_t
+fru_strhash(const char *key)
+{
+	ulong_t g, h = 0;
+	const char *p;
+
+	for (p = key; *p != '\0'; p++) {
+		h = (h << 4) + *p;
+
+		if ((g = (h & 0xf0000000)) != 0) {
+			h ^= (g >> 24);
+			h ^= g;
+		}
+	}
+
+	return (h % ZFS_FRU_HASH_SIZE);
+}
+
+/*
+ * Topology walk callback. Records the chassis ID the first time a "chassis"
+ * node is seen, and for every "disk" node adds a (devfs path, FRU string)
+ * entry to both the hash and the global list hanging off the handle. Always
+ * returns TOPO_WALK_NEXT; nodes that cannot be processed are skipped.
+ */
+static int
+libzfs_fru_gather(topo_hdl_t *thp, tnode_t *tn, void *arg)
+{
+	libzfs_handle_t *hdl = arg;
+	nvlist_t *fru;
+	char *devpath, *frustr;
+	int err;
+	libzfs_fru_t *frup;
+	size_t idx;
+
+	/*
+	 * If this is the chassis node, and we don't yet have the system
+	 * chassis ID, then fill in this value now.
+	 */
+	if (hdl->libzfs_chassis_id[0] == '\0' &&
+	    strcmp(_topo_node_name(tn), "chassis") == 0) {
+		if (_topo_prop_get_string(tn, FM_FMRI_AUTHORITY,
+		    FM_FMRI_AUTH_CHASSIS, &devpath, &err) == 0) {
+			(void) strlcpy(hdl->libzfs_chassis_id, devpath,
+			    sizeof (hdl->libzfs_chassis_id));
+			/* release the property string, as done below */
+			_topo_hdl_strfree(thp, devpath);
+		}
+	}
+
+	/*
+	 * Skip non-disk nodes.
+	 */
+	if (strcmp(_topo_node_name(tn), "disk") != 0)
+		return (TOPO_WALK_NEXT);
+
+	/*
+	 * Get the devfs path and FRU.
+	 */
+	if (_topo_prop_get_string(tn, "io", "devfs-path", &devpath, &err) != 0)
+		return (TOPO_WALK_NEXT);
+
+	if (libzfs_fru_lookup(hdl, devpath) != NULL) {
+		_topo_hdl_strfree(thp, devpath);
+		return (TOPO_WALK_NEXT);
+	}
+
+	if (_topo_node_fru(tn, &fru, NULL, &err) != 0) {
+		_topo_hdl_strfree(thp, devpath);
+		return (TOPO_WALK_NEXT);
+	}
+
+	/*
+	 * Convert the FRU into a string.
+	 */
+	if (_topo_fmri_nvl2str(thp, fru, &frustr, &err) != 0) {
+		nvlist_free(fru);
+		_topo_hdl_strfree(thp, devpath);
+		return (TOPO_WALK_NEXT);
+	}
+
+	nvlist_free(fru);
+
+	/*
+	 * Finally, we have a FRU string and device path. Add it to the hash.
+	 */
+	if ((frup = calloc(1, sizeof (libzfs_fru_t))) == NULL) {
+		_topo_hdl_strfree(thp, devpath);
+		_topo_hdl_strfree(thp, frustr);
+		return (TOPO_WALK_NEXT);
+	}
+
+	if ((frup->zf_device = strdup(devpath)) == NULL ||
+	    (frup->zf_fru = strdup(frustr)) == NULL) {
+		free(frup->zf_device);
+		free(frup);
+		_topo_hdl_strfree(thp, devpath);
+		_topo_hdl_strfree(thp, frustr);
+		return (TOPO_WALK_NEXT);
+	}
+
+	_topo_hdl_strfree(thp, devpath);
+	_topo_hdl_strfree(thp, frustr);
+
+	/* zf_chain links the hash bucket, zf_next links the global list */
+	idx = fru_strhash(frup->zf_device);
+	frup->zf_chain = hdl->libzfs_fru_hash[idx];
+	hdl->libzfs_fru_hash[idx] = frup;
+	frup->zf_next = hdl->libzfs_fru_list;
+	hdl->libzfs_fru_list = frup;
+
+	return (TOPO_WALK_NEXT);
+}
+
+/*
+ * Called during initialization to setup the dynamic libtopo connection.
+ */
+#pragma init(libzfs_init_fru)
+static void
+libzfs_init_fru(void)
+{
+	char path[MAXPATHLEN];
+	char isa[257];
+
+#if defined(_LP64)
+	if (sysinfo(SI_ARCHITECTURE_64, isa, sizeof (isa)) < 0)
+		isa[0] = '\0';
+#else
+	isa[0] = '\0';
+#endif
+	(void) snprintf(path, sizeof (path),
+	    "/usr/lib/fm/%s/libtopo.so", isa);
+
+	if ((_topo_dlhandle = dlopen(path, RTLD_LAZY)) == NULL)
+		return;
+
+	_topo_open = (topo_hdl_t *(*)())
+	    dlsym(_topo_dlhandle, "topo_open");
+	_topo_close = (void (*)())
+	    dlsym(_topo_dlhandle, "topo_close");
+	_topo_snap_hold = (char *(*)())
+	    dlsym(_topo_dlhandle, "topo_snap_hold");
+	_topo_snap_release = (void (*)())
+	    dlsym(_topo_dlhandle, "topo_snap_release");
+	_topo_walk_init = (topo_walk_t *(*)())
+	    dlsym(_topo_dlhandle, "topo_walk_init");
+	_topo_walk_step = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_walk_step");
+	_topo_walk_fini = (void (*)())
+	    dlsym(_topo_dlhandle, "topo_walk_fini");
+	_topo_hdl_strfree = (void (*)())
+	    dlsym(_topo_dlhandle, "topo_hdl_strfree");
+	_topo_node_name = (char *(*)())
+	    dlsym(_topo_dlhandle, "topo_node_name");
+	_topo_prop_get_string = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_prop_get_string");
+	_topo_node_fru = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_node_fru");
+	_topo_fmri_nvl2str = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_fmri_nvl2str");
+	_topo_fmri_strcmp_noauth = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_fmri_strcmp_noauth");
+
+	/* all-or-nothing: a partial symbol set is treated as "no libtopo" */
+	if (_topo_open == NULL || _topo_close == NULL ||
+	    _topo_snap_hold == NULL || _topo_snap_release == NULL ||
+	    _topo_walk_init == NULL || _topo_walk_step == NULL ||
+	    _topo_walk_fini == NULL || _topo_hdl_strfree == NULL ||
+	    _topo_node_name == NULL || _topo_prop_get_string == NULL ||
+	    _topo_node_fru == NULL || _topo_fmri_nvl2str == NULL ||
+	    _topo_fmri_strcmp_noauth == NULL) {
+		(void) dlclose(_topo_dlhandle);
+		_topo_dlhandle = NULL;
+	}
+}
+
+/*
+ * Refresh the mappings from device path -> FMRI. We do this by walking the
+ * hc topology looking for disk nodes, and recording the io/devfs-path and FRU.
+ * Note that we strip out the disk-specific authority information (serial,
+ * part, revision, etc) so that we are left with only the identifying
+ * characteristics of the slot (hc path and chassis-id).
+ */
+void
+libzfs_fru_refresh(libzfs_handle_t *hdl)
+{
+	int err;
+	char *uuid;
+	topo_hdl_t *thp;
+	topo_walk_t *twp;
+
+	if (_topo_dlhandle == NULL)
+		return;
+
+	/*
+	 * Clear the FRU hash and initialize our basic structures.
+	 */
+	libzfs_fru_clear(hdl, B_FALSE);
+
+	if ((hdl->libzfs_topo_hdl = _topo_open(TOPO_VERSION,
+	    NULL, &err)) == NULL)
+		return;
+
+	thp = hdl->libzfs_topo_hdl;
+
+	if ((uuid = _topo_snap_hold(thp, NULL, &err)) == NULL)
+		return;
+
+	_topo_hdl_strfree(thp, uuid);
+
+	if (hdl->libzfs_fru_hash == NULL &&
+	    (hdl->libzfs_fru_hash =
+	    calloc(ZFS_FRU_HASH_SIZE * sizeof (void *), 1)) == NULL)
+		return;
+
+	/*
+	 * We now have a topo snapshot, so iterate over the hc topology looking
+	 * for disks to add to the hash.
+	 */
+	twp = _topo_walk_init(thp, FM_FMRI_SCHEME_HC,
+	    libzfs_fru_gather, hdl, &err);
+	if (twp != NULL) {
+		(void) _topo_walk_step(twp, TOPO_WALK_CHILD);
+		_topo_walk_fini(twp);
+	}
+}
+
+/*
+ * Given a devfs path, return the FRU for the device, if known. This will
+ * automatically call libzfs_fru_refresh() if it hasn't already been called by
+ * the consumer. The string returned is valid until the next call to
+ * libzfs_fru_refresh().
+ */
+const char *
+libzfs_fru_lookup(libzfs_handle_t *hdl, const char *devpath)
+{
+	size_t idx = fru_strhash(devpath);
+	libzfs_fru_t *frup;
+
+	if (hdl->libzfs_fru_hash == NULL)
+		libzfs_fru_refresh(hdl);
+
+	if (hdl->libzfs_fru_hash == NULL)
+		return (NULL);
+
+	for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
+	    frup = frup->zf_chain) {
+		if (strcmp(devpath, frup->zf_device) == 0)
+			return (frup->zf_fru);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Given a fru path, return the device path. This will automatically call
+ * libzfs_fru_refresh() if it hasn't already been called by the consumer. The
+ * string returned is valid until the next call to libzfs_fru_refresh().
+ */
+const char *
+libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru)
+{
+	libzfs_fru_t *frup;
+	size_t idx;
+
+	if (hdl->libzfs_fru_hash == NULL)
+		libzfs_fru_refresh(hdl);
+
+	if (hdl->libzfs_fru_hash == NULL)
+		return (NULL);
+
+	/*
+	 * The hash is keyed by device path, not FRU, so every bucket has to
+	 * be scanned. Follow zf_chain (the per-bucket link) so that each
+	 * entry is compared exactly once.
+	 */
+	for (idx = 0; idx < ZFS_FRU_HASH_SIZE; idx++) {
+		for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
+		    frup = frup->zf_chain) {
+			if (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl,
+			    fru, frup->zf_fru))
+				return (frup->zf_device);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Change the stored FRU for the given vdev.
+ */
+int
+zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru)
+{
+	zfs_cmd_t zc = { 0 };
+
+	/*
+	 * strlcpy() always NUL-terminates; strncpy() would leave the buffer
+	 * unterminated if the source filled it completely.
+	 */
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_value, fru, sizeof (zc.zc_value));
+	zc.zc_guid = vdev_guid;
+
+	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SETFRU, &zc) != 0)
+		return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
+		    dgettext(TEXT_DOMAIN, "cannot set FRU")));
+
+	return (0);
+}
+
+/*
+ * Compare two FRUs, ignoring any authority information. If the FRU hash is
+ * still unavailable after a refresh attempt, fall back to a plain string
+ * comparison.
+ */
+boolean_t
+libzfs_fru_compare(libzfs_handle_t *hdl, const char *a, const char *b)
+{
+	if (hdl->libzfs_fru_hash == NULL)
+		libzfs_fru_refresh(hdl);
+
+	if (hdl->libzfs_fru_hash == NULL)
+		return (strcmp(a, b) == 0);
+
+	return (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, a, b));
+}
+
+/*
+ * This special function checks to see whether the FRU indicates it's supposed
+ * to be in the system chassis, but the chassis-id doesn't match. This can
+ * happen in a clustered case, where both head nodes have the same logical
+ * disk, but opening the device on the other head node is meaningless.
+ */
+boolean_t
+libzfs_fru_notself(libzfs_handle_t *hdl, const char *fru)
+{
+	const char *chassisid;
+	size_t len;
+
+	if (hdl->libzfs_fru_hash == NULL)
+		libzfs_fru_refresh(hdl);
+
+	if (hdl->libzfs_chassis_id[0] == '\0')
+		return (B_FALSE);
+
+	if (strstr(fru, "/chassis=0/") == NULL)
+		return (B_FALSE);
+
+	if ((chassisid = strstr(fru, ":chassis-id=")) == NULL)
+		return (B_FALSE);
+
+	/* step over the ":chassis-id=" prefix to the value itself */
+	chassisid += strlen(":chassis-id=");
+	len = strlen(hdl->libzfs_chassis_id);
+	if (strncmp(chassisid, hdl->libzfs_chassis_id, len) == 0 &&
+	    (chassisid[len] == '/' || chassisid[len] == ':'))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Clear memory associated with the FRU hash.
+ */
+void
+libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final)
+{
+	libzfs_fru_t *frup;
+
+	/* every entry is on the global list, so freeing it frees them all */
+	while ((frup = hdl->libzfs_fru_list) != NULL) {
+		hdl->libzfs_fru_list = frup->zf_next;
+		free(frup->zf_device);
+		free(frup->zf_fru);
+		free(frup);
+	}
+
+	hdl->libzfs_fru_list = NULL;
+
+	if (hdl->libzfs_topo_hdl != NULL) {
+		_topo_snap_release(hdl->libzfs_topo_hdl);
+		_topo_close(hdl->libzfs_topo_hdl);
+		hdl->libzfs_topo_hdl = NULL;
+	}
+
+	if (final) {
+		free(hdl->libzfs_fru_hash);
+		/* the lookup paths test this pointer against NULL */
+		hdl->libzfs_fru_hash = NULL;
+	} else if (hdl->libzfs_fru_hash != NULL) {
+		/* keep the bucket array for reuse, but drop stale pointers */
+		bzero(hdl->libzfs_fru_hash,
+		    ZFS_FRU_HASH_SIZE * sizeof (void *));
+	}
+}
diff --git a/lib/libzfs/common/libzfs_graph.c b/lib/libzfs/common/libzfs_graph.c
new file mode 100644
index 0000000..bc21c51
--- /dev/null
+++ b/lib/libzfs/common/libzfs_graph.c
@@ -0,0 +1,653 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Iterate over all children of the current object. This includes the normal
+ * dataset hierarchy, but also arbitrary hierarchies due to clones.
We want to
+ * walk all datasets in the pool, and construct a directed graph of the form:
+ *
+ *                      home
+ *                        |
+ *                   +----+----+
+ *                   |         |
+ *                   v         v             ws
+ *                  bar       baz             |
+ *                             |              |
+ *                             v              v
+ *                          @yesterday ----> foo
+ *
+ * In order to construct this graph, we have to walk every dataset in the pool,
+ * because the clone parent is stored as a property of the child, not the
+ * parent. The parent only keeps track of the number of clones.
+ *
+ * In the normal case (without clones) this would be rather expensive. To avoid
+ * unnecessary computation, we first try a walk of the subtree hierarchy
+ * starting from the initial node. At each dataset, we construct a node in the
+ * graph and an edge leading from its parent. If we don't see any snapshots
+ * with a non-zero clone count, then we are finished.
+ *
+ * If we do find a cloned snapshot, then we finish the walk of the current
+ * subtree, but indicate that we need to do a complete walk. We then perform a
+ * global walk of all datasets, avoiding the subtree we already processed.
+ *
+ * At the end of this, we'll end up with a directed graph of all relevant (and
+ * possibly some irrelevant) datasets in the system. We need to both find our
+ * limiting subgraph and determine a safe ordering in which to destroy the
+ * datasets. We do a topological ordering of our graph starting at our target
+ * dataset, and then walk the results in reverse.
+ *
+ * It's possible for the graph to have cycles if, for example, the user renames
+ * a clone to be the parent of its origin snapshot. The user can request to
+ * generate an error in this case, or ignore the cycle and continue.
+ *
+ * When removing datasets, we want to destroy the snapshots in chronological
+ * order (because this is the most efficient method). In order to accomplish
+ * this, we store the creation transaction group with each vertex and keep each
+ * vertex's edges sorted according to this value.
The topological sort will
+ * automatically walk the snapshots in the correct order.
+ */
+
+#include <assert.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <libzfs.h>
+
+#include "libzfs_impl.h"
+#include "zfs_namecheck.h"
+
+#define	MIN_EDGECOUNT	4
+
+/*
+ * Vertex structure. Indexed by dataset name, this structure maintains a list
+ * of edges to other vertices.
+ */
+struct zfs_edge;
+typedef struct zfs_vertex {
+	char			zv_dataset[ZFS_MAXNAMELEN];
+	struct zfs_vertex	*zv_next;	/* hash chain link */
+	int			zv_visited;	/* one of the VISIT_* values */
+	uint64_t		zv_txg;		/* creation txg, edge sort key */
+	struct zfs_edge		**zv_edges;
+	int			zv_edgecount;	/* edges in use */
+	int			zv_edgealloc;	/* edges allocated */
+} zfs_vertex_t;
+
+enum {
+	VISIT_SEEN = 1,
+	VISIT_SORT_PRE,
+	VISIT_SORT_POST
+};
+
+/*
+ * Edge structure. Simply maintains a pointer to the destination vertex. There
+ * is no need to store the source vertex, since we only use edges in the context
+ * of the source vertex.
+ */
+typedef struct zfs_edge {
+	zfs_vertex_t		*ze_dest;
+	struct zfs_edge		*ze_next;
+} zfs_edge_t;
+
+#define	ZFS_GRAPH_SIZE		1027	/* this could be dynamic some day */
+
+/*
+ * Graph structure. Vertices are maintained in a hash indexed by dataset name.
+ */
+typedef struct zfs_graph {
+	zfs_vertex_t		**zg_hash;
+	size_t			zg_size;
+	size_t			zg_nvertex;
+	const char		*zg_root;
+	int			zg_clone_count;
+} zfs_graph_t;
+
+/*
+ * Allocate a new edge pointing to the target vertex. Returns NULL if the
+ * allocation fails.
+ */
+static zfs_edge_t *
+zfs_edge_create(libzfs_handle_t *hdl, zfs_vertex_t *dest)
+{
+	zfs_edge_t *zep = zfs_alloc(hdl, sizeof (zfs_edge_t));
+
+	if (zep == NULL)
+		return (NULL);
+
+	zep->ze_dest = dest;
+
+	return (zep);
+}
+
+/*
+ * Destroy an edge.
+ */
+static void
+zfs_edge_destroy(zfs_edge_t *zep)
+{
+	free(zep);
+}
+
+/*
+ * Allocate a new vertex with the given name.
+ */
+static zfs_vertex_t *
+zfs_vertex_create(libzfs_handle_t *hdl, const char *dataset)
+{
+	zfs_vertex_t *zvp = zfs_alloc(hdl, sizeof (zfs_vertex_t));
+
+	if (zvp == NULL)
+		return (NULL);
+
+	assert(strlen(dataset) < ZFS_MAXNAMELEN);
+
+	(void) strlcpy(zvp->zv_dataset, dataset, sizeof (zvp->zv_dataset));
+
+	if ((zvp->zv_edges = zfs_alloc(hdl,
+	    MIN_EDGECOUNT * sizeof (void *))) == NULL) {
+		free(zvp);
+		return (NULL);
+	}
+
+	zvp->zv_edgealloc = MIN_EDGECOUNT;
+
+	return (zvp);
+}
+
+/*
+ * Destroy a vertex. Frees up any associated edges.
+ */
+static void
+zfs_vertex_destroy(zfs_vertex_t *zvp)
+{
+	int i;
+
+	for (i = 0; i < zvp->zv_edgecount; i++)
+		zfs_edge_destroy(zvp->zv_edges[i]);
+
+	free(zvp->zv_edges);
+	free(zvp);
+}
+
+/*
+ * Given a vertex, add an edge to the destination vertex. The edge array is
+ * doubled when full. Returns 0 on success, or -1 if an allocation fails, in
+ * which case the vertex is left unchanged.
+ */
+static int
+zfs_vertex_add_edge(libzfs_handle_t *hdl, zfs_vertex_t *zvp,
+    zfs_vertex_t *dest)
+{
+	zfs_edge_t *zep = zfs_edge_create(hdl, dest);
+
+	if (zep == NULL)
+		return (-1);
+
+	if (zvp->zv_edgecount == zvp->zv_edgealloc) {
+		void *ptr;
+
+		if ((ptr = zfs_realloc(hdl, zvp->zv_edges,
+		    zvp->zv_edgealloc * sizeof (void *),
+		    zvp->zv_edgealloc * 2 * sizeof (void *))) == NULL) {
+			/* the new edge was never linked in; don't leak it */
+			zfs_edge_destroy(zep);
+			return (-1);
+		}
+
+		zvp->zv_edges = ptr;
+		zvp->zv_edgealloc *= 2;
+	}
+
+	zvp->zv_edges[zvp->zv_edgecount++] = zep;
+
+	return (0);
+}
+
+/*
+ * qsort() comparator over an array of edge pointers: orders edges by the
+ * creation txg of their destination vertex, ascending.
+ */
+static int
+zfs_edge_compare(const void *a, const void *b)
+{
+	const zfs_edge_t *ea = *(zfs_edge_t *const *)a;
+	const zfs_edge_t *eb = *(zfs_edge_t *const *)b;
+
+	if (ea->ze_dest->zv_txg < eb->ze_dest->zv_txg)
+		return (-1);
+	if (ea->ze_dest->zv_txg > eb->ze_dest->zv_txg)
+		return (1);
+	return (0);
+}
+
+/*
+ * Sort the given vertex edges according to the creation txg of each vertex.
+ */
+static void
+zfs_vertex_sort_edges(zfs_vertex_t *zvp)
+{
+	if (zvp->zv_edgecount == 0)
+		return;
+
+	qsort(zvp->zv_edges, zvp->zv_edgecount, sizeof (void *),
+	    zfs_edge_compare);
+}
+
+/*
+ * Construct a new graph object.
We allow the size to be specified as a
+ * parameter so in the future we can size the hash according to the number of
+ * datasets in the pool.
+ */
+static zfs_graph_t *
+zfs_graph_create(libzfs_handle_t *hdl, const char *dataset, size_t size)
+{
+	zfs_graph_t *graph;
+
+	graph = zfs_alloc(hdl, sizeof (zfs_graph_t));
+	if (graph == NULL)
+		return (NULL);
+
+	graph->zg_size = size;
+	graph->zg_hash = zfs_alloc(hdl, size * sizeof (zfs_vertex_t *));
+	if (graph->zg_hash == NULL) {
+		free(graph);
+		return (NULL);
+	}
+
+	/* the root name is borrowed from the caller, not copied */
+	graph->zg_root = dataset;
+	graph->zg_clone_count = 0;
+
+	return (graph);
+}
+
+/*
+ * Destroy a graph object. Every hash chain is walked and each vertex on it
+ * is destroyed before the table and the graph itself are released.
+ */
+static void
+zfs_graph_destroy(zfs_graph_t *zgp)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < zgp->zg_size; bucket++) {
+		zfs_vertex_t *vertex = zgp->zg_hash[bucket];
+
+		while (vertex != NULL) {
+			/* grab the link before the vertex is freed */
+			zfs_vertex_t *following = vertex->zv_next;
+
+			zfs_vertex_destroy(vertex);
+			vertex = following;
+		}
+	}
+
+	free(zgp->zg_hash);
+	free(zgp);
+}
+
+/*
+ * Graph hash function. Classic bernstein k=33 hash function, taken from
+ * usr/src/cmd/sgs/tools/common/strhash.c
+ */
+static size_t
+zfs_graph_hash(zfs_graph_t *zgp, const char *str)
+{
+	size_t hash = 5381;
+
+	for (; *str != '\0'; str++) {
+		int ch = *str;
+
+		hash = hash * 33 + ch;
+	}
+
+	return (hash % zgp->zg_size);
+}
+
+/*
+ * Given a dataset name, finds the associated vertex, creating it if necessary.
+ */ +static zfs_vertex_t * +zfs_graph_lookup(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset, + uint64_t txg) +{ + size_t idx = zfs_graph_hash(zgp, dataset); + zfs_vertex_t *zvp; + + for (zvp = zgp->zg_hash[idx]; zvp != NULL; zvp = zvp->zv_next) { + if (strcmp(zvp->zv_dataset, dataset) == 0) { + if (zvp->zv_txg == 0) + zvp->zv_txg = txg; + return (zvp); + } + } + + if ((zvp = zfs_vertex_create(hdl, dataset)) == NULL) + return (NULL); + + zvp->zv_next = zgp->zg_hash[idx]; + zvp->zv_txg = txg; + zgp->zg_hash[idx] = zvp; + zgp->zg_nvertex++; + + return (zvp); +} + +/* + * Given two dataset names, create an edge between them. For the source vertex, + * mark 'zv_visited' to indicate that we have seen this vertex, and not simply + * created it as a destination of another edge. If 'dest' is NULL, then this + * is an individual vertex (i.e. the starting vertex), so don't add an edge. + */ +static int +zfs_graph_add(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *source, + const char *dest, uint64_t txg) +{ + zfs_vertex_t *svp, *dvp; + + if ((svp = zfs_graph_lookup(hdl, zgp, source, 0)) == NULL) + return (-1); + svp->zv_visited = VISIT_SEEN; + if (dest != NULL) { + dvp = zfs_graph_lookup(hdl, zgp, dest, txg); + if (dvp == NULL) + return (-1); + if (zfs_vertex_add_edge(hdl, svp, dvp) != 0) + return (-1); + } + + return (0); +} + +/* + * Iterate over all children of the given dataset, adding any vertices + * as necessary. Returns -1 if there was an error, or 0 otherwise. + * This is a simple recursive algorithm - the ZFS namespace typically + * is very flat. We manually invoke the necessary ioctl() calls to + * avoid the overhead and additional semantics of zfs_open(). + */ +static int +iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) +{ + zfs_cmd_t zc = { 0 }; + zfs_vertex_t *zvp; + + /* + * Look up the source vertex, and avoid it if we've seen it before. 
+ */ + zvp = zfs_graph_lookup(hdl, zgp, dataset, 0); + if (zvp == NULL) + return (-1); + if (zvp->zv_visited == VISIT_SEEN) + return (0); + + /* + * Iterate over all children + */ + for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { + /* + * Get statistics for this dataset, to determine the type of the + * dataset and clone statistics. If this fails, the dataset has + * since been removed, and we're pretty much screwed anyway. + */ + zc.zc_objset_stats.dds_origin[0] = '\0'; + if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) + continue; + + if (zc.zc_objset_stats.dds_origin[0] != '\0') { + if (zfs_graph_add(hdl, zgp, + zc.zc_objset_stats.dds_origin, zc.zc_name, + zc.zc_objset_stats.dds_creation_txg) != 0) + return (-1); + /* + * Count origins only if they are contained in the graph + */ + if (isa_child_of(zc.zc_objset_stats.dds_origin, + zgp->zg_root)) + zgp->zg_clone_count--; + } + + /* + * Add an edge between the parent and the child. + */ + if (zfs_graph_add(hdl, zgp, dataset, zc.zc_name, + zc.zc_objset_stats.dds_creation_txg) != 0) + return (-1); + + /* + * Recursively visit child + */ + if (iterate_children(hdl, zgp, zc.zc_name)) + return (-1); + } + + /* + * Now iterate over all snapshots. + */ + bzero(&zc, sizeof (zc)); + + for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0; + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { + + /* + * Get statistics for this dataset, to determine the type of the + * dataset and clone statistics. If this fails, the dataset has + * since been removed, and we're pretty much screwed anyway. + */ + if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) + continue; + + /* + * Add an edge between the parent and the child. 
+ */ + if (zfs_graph_add(hdl, zgp, dataset, zc.zc_name, + zc.zc_objset_stats.dds_creation_txg) != 0) + return (-1); + + zgp->zg_clone_count += zc.zc_objset_stats.dds_num_clones; + } + + zvp->zv_visited = VISIT_SEEN; + + return (0); +} + +/* + * Returns false if there are no snapshots with dependent clones in this + * subtree or if all of those clones are also in this subtree. Returns + * true if there is an error or there are external dependents. + */ +static boolean_t +external_dependents(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) +{ + zfs_cmd_t zc = { 0 }; + + /* + * Check whether this dataset is a clone or has clones since + * iterate_children() only checks the children. + */ + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) + return (B_TRUE); + + if (zc.zc_objset_stats.dds_origin[0] != '\0') { + if (zfs_graph_add(hdl, zgp, + zc.zc_objset_stats.dds_origin, zc.zc_name, + zc.zc_objset_stats.dds_creation_txg) != 0) + return (B_TRUE); + if (isa_child_of(zc.zc_objset_stats.dds_origin, dataset)) + zgp->zg_clone_count--; + } + + if ((zc.zc_objset_stats.dds_num_clones) || + iterate_children(hdl, zgp, dataset)) + return (B_TRUE); + + return (zgp->zg_clone_count != 0); +} + +/* + * Construct a complete graph of all necessary vertices. First, iterate over + * only our object's children. If no cloned snapshots are found, or all of + * the cloned snapshots are in this subtree then return a graph of the subtree. + * Otherwise, start at the root of the pool and iterate over all datasets. + */ +static zfs_graph_t * +construct_graph(libzfs_handle_t *hdl, const char *dataset) +{ + zfs_graph_t *zgp = zfs_graph_create(hdl, dataset, ZFS_GRAPH_SIZE); + int ret = 0; + + if (zgp == NULL) + return (zgp); + + if ((strchr(dataset, '/') == NULL) || + (external_dependents(hdl, zgp, dataset))) { + /* + * Determine pool name and try again. 
+ */ + int len = strcspn(dataset, "/@") + 1; + char *pool = zfs_alloc(hdl, len); + + if (pool == NULL) { + zfs_graph_destroy(zgp); + return (NULL); + } + (void) strlcpy(pool, dataset, len); + + if (iterate_children(hdl, zgp, pool) == -1 || + zfs_graph_add(hdl, zgp, pool, NULL, 0) != 0) { + free(pool); + zfs_graph_destroy(zgp); + return (NULL); + } + free(pool); + } + + if (ret == -1 || zfs_graph_add(hdl, zgp, dataset, NULL, 0) != 0) { + zfs_graph_destroy(zgp); + return (NULL); + } + + return (zgp); +} + +/* + * Given a graph, do a recursive topological sort into the given array. This is + * really just a depth first search, so that the deepest nodes appear first. + * hijack the 'zv_visited' marker to avoid visiting the same vertex twice. + */ +static int +topo_sort(libzfs_handle_t *hdl, boolean_t allowrecursion, char **result, + size_t *idx, zfs_vertex_t *zgv) +{ + int i; + + if (zgv->zv_visited == VISIT_SORT_PRE && !allowrecursion) { + /* + * If we've already seen this vertex as part of our depth-first + * search, then we have a cyclic dependency, and we must return + * an error. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "recursive dependency at '%s'"), + zgv->zv_dataset); + return (zfs_error(hdl, EZFS_RECURSIVE, + dgettext(TEXT_DOMAIN, + "cannot determine dependent datasets"))); + } else if (zgv->zv_visited >= VISIT_SORT_PRE) { + /* + * If we've already processed this as part of the topological + * sort, then don't bother doing so again. 
+ */ + return (0); + } + + zgv->zv_visited = VISIT_SORT_PRE; + + /* avoid doing a search if we don't have to */ + zfs_vertex_sort_edges(zgv); + for (i = 0; i < zgv->zv_edgecount; i++) { + if (topo_sort(hdl, allowrecursion, result, idx, + zgv->zv_edges[i]->ze_dest) != 0) + return (-1); + } + + /* we may have visited this in the course of the above */ + if (zgv->zv_visited == VISIT_SORT_POST) + return (0); + + if ((result[*idx] = zfs_alloc(hdl, + strlen(zgv->zv_dataset) + 1)) == NULL) + return (-1); + + (void) strcpy(result[*idx], zgv->zv_dataset); + *idx += 1; + zgv->zv_visited = VISIT_SORT_POST; + return (0); +} + +/* + * The only public interface for this file. Do the dirty work of constructing a + * child list for the given object. Construct the graph, do the toplogical + * sort, and then return the array of strings to the caller. + * + * The 'allowrecursion' parameter controls behavior when cycles are found. If + * it is set, the the cycle is ignored and the results returned as if the cycle + * did not exist. If it is not set, then the routine will generate an error if + * a cycle is found. + */ +int +get_dependents(libzfs_handle_t *hdl, boolean_t allowrecursion, + const char *dataset, char ***result, size_t *count) +{ + zfs_graph_t *zgp; + zfs_vertex_t *zvp; + + if ((zgp = construct_graph(hdl, dataset)) == NULL) + return (-1); + + if ((*result = zfs_alloc(hdl, + zgp->zg_nvertex * sizeof (char *))) == NULL) { + zfs_graph_destroy(zgp); + return (-1); + } + + if ((zvp = zfs_graph_lookup(hdl, zgp, dataset, 0)) == NULL) { + free(*result); + zfs_graph_destroy(zgp); + return (-1); + } + + *count = 0; + if (topo_sort(hdl, allowrecursion, *result, count, zvp) != 0) { + free(*result); + zfs_graph_destroy(zgp); + return (-1); + } + + /* + * Get rid of the last entry, which is our starting vertex and not + * strictly a dependent. 
+ */ + assert(*count > 0); + free((*result)[*count - 1]); + (*count)--; + + zfs_graph_destroy(zgp); + + return (0); +} diff --git a/lib/libzfs/common/libzfs_impl.h b/lib/libzfs/common/libzfs_impl.h new file mode 100644 index 0000000..c9b09a2 --- /dev/null +++ b/lib/libzfs/common/libzfs_impl.h @@ -0,0 +1,214 @@ +/* + * CDDL HEADER SART + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _LIBFS_IMPL_H +#define _LIBFS_IMPL_H + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef VERIFY +#undef VERIFY +#endif +#define VERIFY verify + +typedef struct libzfs_fru { + char *zf_device; + char *zf_fru; + struct libzfs_fru *zf_chain; + struct libzfs_fru *zf_next; +} libzfs_fru_t; + +struct libzfs_handle { + int libzfs_error; + int libzfs_fd; + FILE *libzfs_mnttab; + FILE *libzfs_sharetab; + zpool_handle_t *libzfs_pool_handles; + uu_avl_pool_t *libzfs_ns_avlpool; + uu_avl_t *libzfs_ns_avl; + uint64_t libzfs_ns_gen; + int libzfs_desc_active; + char libzfs_action[1024]; + char libzfs_desc[1024]; + char *libzfs_log_str; + int libzfs_printerr; + int libzfs_storeerr; /* stuff error messages into buffer */ + void *libzfs_sharehdl; /* libshare handle */ + uint_t libzfs_shareflags; + boolean_t libzfs_mnttab_enable; + avl_tree_t libzfs_mnttab_cache; + int libzfs_pool_iter; + topo_hdl_t *libzfs_topo_hdl; + libzfs_fru_t **libzfs_fru_hash; + libzfs_fru_t *libzfs_fru_list; + char libzfs_chassis_id[256]; +}; + +#define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */ + +struct zfs_handle { + libzfs_handle_t *zfs_hdl; + zpool_handle_t *zpool_hdl; + char zfs_name[ZFS_MAXNAMELEN]; + zfs_type_t zfs_type; /* type including snapshot */ + zfs_type_t zfs_head_type; /* type excluding snapshot */ + dmu_objset_stats_t zfs_dmustats; + nvlist_t *zfs_props; + nvlist_t *zfs_user_props; + nvlist_t *zfs_recvd_props; + boolean_t zfs_mntcheck; + char *zfs_mntopts; + uint8_t *zfs_props_table; +}; + +/* + * This is different from checking zfs_type, because it will also catch + * snapshots of volumes. 
+ */ +#define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) + +struct zpool_handle { + libzfs_handle_t *zpool_hdl; + zpool_handle_t *zpool_next; + char zpool_name[ZPOOL_MAXNAMELEN]; + int zpool_state; + size_t zpool_config_size; + nvlist_t *zpool_config; + nvlist_t *zpool_old_config; + nvlist_t *zpool_props; + diskaddr_t zpool_start_block; +}; + +typedef enum { + PROTO_NFS = 0, + PROTO_SMB = 1, + PROTO_END = 2 +} zfs_share_proto_t; + +/* + * The following can be used as a bitmask and any new values + * added must preserve that capability. + */ +typedef enum { + SHARED_NOT_SHARED = 0x0, + SHARED_NFS = 0x2, + SHARED_SMB = 0x4 +} zfs_share_type_t; + +int zfs_error(libzfs_handle_t *, int, const char *); +int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...); +void zfs_error_aux(libzfs_handle_t *, const char *, ...); +void *zfs_alloc(libzfs_handle_t *, size_t); +void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); +char *zfs_asprintf(libzfs_handle_t *, const char *, ...); +char *zfs_strdup(libzfs_handle_t *, const char *); +int no_memory(libzfs_handle_t *); + +int zfs_standard_error(libzfs_handle_t *, int, const char *); +int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); +int zpool_standard_error(libzfs_handle_t *, int, const char *); +int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); + +int get_dependents(libzfs_handle_t *, boolean_t, const char *, char ***, + size_t *); + + +int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, + nvlist_t *, char **, uint64_t *, const char *); +int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, + zfs_type_t type); + +/* + * Use this changelist_gather() flag to force attempting mounts + * on each change node regardless of whether or not it is currently + * mounted. 
+ */ +#define CL_GATHER_MOUNT_ALWAYS 1 + +typedef struct prop_changelist prop_changelist_t; + +int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t); +int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); +int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); +int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *); +int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **); +void zcmd_free_nvlists(zfs_cmd_t *); + +int changelist_prefix(prop_changelist_t *); +int changelist_postfix(prop_changelist_t *); +void changelist_rename(prop_changelist_t *, const char *, const char *); +void changelist_remove(prop_changelist_t *, const char *); +void changelist_free(prop_changelist_t *); +prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, int); +int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *); +int changelist_haszonedchild(prop_changelist_t *); + +void remove_mountpoint(zfs_handle_t *); +int create_parents(libzfs_handle_t *, char *, int); +boolean_t isa_child_of(const char *dataset, const char *parent); + +zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); + +int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); + +boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); + +int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, + boolean_t modifying); + +void namespace_clear(libzfs_handle_t *); + +/* + * libshare (sharemgr) interfaces used internally. 
+ */ + +extern int zfs_init_libshare(libzfs_handle_t *, int); +extern void zfs_uninit_libshare(libzfs_handle_t *); +extern int zfs_parse_options(char *, zfs_share_proto_t); + +extern int zfs_unshare_proto(zfs_handle_t *, + const char *, zfs_share_proto_t *); + +extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBFS_IMPL_H */ diff --git a/lib/libzfs/common/libzfs_import.c b/lib/libzfs/common/libzfs_import.c new file mode 100644 index 0000000..e137035 --- /dev/null +++ b/lib/libzfs/common/libzfs_import.c @@ -0,0 +1,1688 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Pool import support functions. + * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. 
Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "libzfs.h" +#include "libzfs_impl.h" + +/* + * Intermediate structures used to gather configuration information. + */ +typedef struct config_entry { + uint64_t ce_txg; + nvlist_t *ce_config; + struct config_entry *ce_next; +} config_entry_t; + +typedef struct vdev_entry { + uint64_t ve_guid; + config_entry_t *ve_configs; + struct vdev_entry *ve_next; +} vdev_entry_t; + +typedef struct pool_entry { + uint64_t pe_guid; + vdev_entry_t *pe_vdevs; + struct pool_entry *pe_next; +} pool_entry_t; + +typedef struct name_entry { + char *ne_name; + uint64_t ne_guid; + struct name_entry *ne_next; +} name_entry_t; + +typedef struct pool_list { + pool_entry_t *pools; + name_entry_t *names; +} pool_list_t; + +static char * +get_devid(const char *path) +{ + int fd; + ddi_devid_t devid; + char *minor, *ret; + + if ((fd = open(path, O_RDONLY)) < 0) + return (NULL); + + minor = NULL; + ret = NULL; + if (devid_get(fd, &devid) == 0) { + if (devid_get_minor_name(fd, &minor) == 0) + ret = devid_str_encode(devid, minor); + if (minor != NULL) + devid_str_free(minor); + devid_free(devid); + } + (void) close(fd); + + return (ret); +} + + +/* + * Go through and fix up any path and/or devid information for the given vdev + * configuration. 
+ */ +static int +fix_paths(nvlist_t *nv, name_entry_t *names) +{ + nvlist_t **child; + uint_t c, children; + uint64_t guid; + name_entry_t *ne, *best; + char *path, *devid; + int matched; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (fix_paths(child[c], names) != 0) + return (-1); + return (0); + } + + /* + * This is a leaf (file or disk) vdev. In either case, go through + * the name list and see if we find a matching guid. If so, replace + * the path and see if we can calculate a new devid. + * + * There may be multiple names associated with a particular guid, in + * which case we have overlapping slices or multiple paths to the same + * disk. If this is the case, then we want to pick the path that is + * the most similar to the original, where "most similar" is the number + * of matching characters starting from the end of the path. This will + * preserve slice numbers even if the disks have been reorganized, and + * will also catch preferred disk names if multiple paths exist. + */ + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + path = NULL; + + matched = 0; + best = NULL; + for (ne = names; ne != NULL; ne = ne->ne_next) { + if (ne->ne_guid == guid) { + const char *src, *dst; + int count; + + if (path == NULL) { + best = ne; + break; + } + + src = ne->ne_name + strlen(ne->ne_name) - 1; + dst = path + strlen(path) - 1; + for (count = 0; src >= ne->ne_name && dst >= path; + src--, dst--, count++) + if (*src != *dst) + break; + + /* + * At this point, 'count' is the number of characters + * matched from the end. 
+ */ + if (count > matched || best == NULL) { + best = ne; + matched = count; + } + } + } + + if (best == NULL) + return (0); + + if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0) + return (-1); + + if ((devid = get_devid(best->ne_name)) == NULL) { + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + } else { + if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) + return (-1); + devid_str_free(devid); + } + + return (0); +} + +/* + * Add the given configuration to the list of known devices. + */ +static int +add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path, + nvlist_t *config) +{ + uint64_t pool_guid, vdev_guid, top_guid, txg, state; + pool_entry_t *pe; + vdev_entry_t *ve; + config_entry_t *ce; + name_entry_t *ne; + + /* + * If this is a hot spare not currently in use or level 2 cache + * device, add it to the list of names to translate, but don't do + * anything else. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) == 0 && + (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && + nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { + if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) + return (-1); + + if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) { + free(ne); + return (-1); + } + ne->ne_guid = vdev_guid; + ne->ne_next = pl->names; + pl->names = ne; + return (0); + } + + /* + * If we have a valid config but cannot read any of these fields, then + * it means we have a half-initialized label. In vdev_label_init() + * we write a label with txg == 0 so that we can identify the device + * in case the user refers to the same disk later on. If we fail to + * create the pool, we'll be left with a label in this state + * which should not be considered part of a valid pool. 
+ */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, + &vdev_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0 || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(config); + return (0); + } + + /* + * First, see if we know about this pool. If not, then add it to the + * list of known pools. + */ + for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { + if (pe->pe_guid == pool_guid) + break; + } + + if (pe == NULL) { + if ((pe = zfs_alloc(hdl, sizeof (pool_entry_t))) == NULL) { + nvlist_free(config); + return (-1); + } + pe->pe_guid = pool_guid; + pe->pe_next = pl->pools; + pl->pools = pe; + } + + /* + * Second, see if we know about this toplevel vdev. Add it if its + * missing. + */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + if (ve->ve_guid == top_guid) + break; + } + + if (ve == NULL) { + if ((ve = zfs_alloc(hdl, sizeof (vdev_entry_t))) == NULL) { + nvlist_free(config); + return (-1); + } + ve->ve_guid = top_guid; + ve->ve_next = pe->pe_vdevs; + pe->pe_vdevs = ve; + } + + /* + * Third, see if we have a config with a matching transaction group. If + * so, then we do nothing. Otherwise, add it to the list of known + * configs. + */ + for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { + if (ce->ce_txg == txg) + break; + } + + if (ce == NULL) { + if ((ce = zfs_alloc(hdl, sizeof (config_entry_t))) == NULL) { + nvlist_free(config); + return (-1); + } + ce->ce_txg = txg; + ce->ce_config = config; + ce->ce_next = ve->ve_configs; + ve->ve_configs = ce; + } else { + nvlist_free(config); + } + + /* + * At this point we've successfully added our config to the list of + * known configs. The last thing to do is add the vdev guid -> path + * mappings so that we can fix up the configuration as necessary before + * doing the import. 
+ */ + if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) + return (-1); + + if ((ne->ne_name = zfs_strdup(hdl, path)) == NULL) { + free(ne); + return (-1); + } + + ne->ne_guid = vdev_guid; + ne->ne_next = pl->names; + pl->names = ne; + + return (0); +} + +/* + * Returns true if the named pool matches the given GUID. + */ +static int +pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid, + boolean_t *isactive) +{ + zpool_handle_t *zhp; + uint64_t theguid; + + if (zpool_open_silent(hdl, name, &zhp) != 0) + return (-1); + + if (zhp == NULL) { + *isactive = B_FALSE; + return (0); + } + + verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID, + &theguid) == 0); + + zpool_close(zhp); + + *isactive = (theguid == guid); + return (0); +} + +static nvlist_t * +refresh_config(libzfs_handle_t *hdl, nvlist_t *config) +{ + nvlist_t *nvl; + zfs_cmd_t zc = { 0 }; + int err; + + if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) + return (NULL); + + if (zcmd_alloc_dst_nvlist(hdl, &zc, + zc.zc_nvlist_conf_size * 2) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); + } + + while ((err = ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_TRYIMPORT, + &zc)) != 0 && errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); + } + } + + if (err) { + zcmd_free_nvlists(&zc); + return (NULL); + } + + if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); + } + + zcmd_free_nvlists(&zc); + return (nvl); +} + +/* + * Determine if the vdev id is a hole in the namespace. + */ +boolean_t +vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) +{ + for (int c = 0; c < holes; c++) { + + /* Top-level is a hole */ + if (hole_array[c] == id) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Convert our list of pools into the definitive set of configurations. We + * start by picking the best config for each toplevel vdev. 
Once that's done, + * we assemble the toplevel vdevs into a full config for the pool. We make a + * pass to fix up any incorrect paths, and then add it to the main list to + * return to the user. + */ +static nvlist_t * +get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) +{ + pool_entry_t *pe; + vdev_entry_t *ve; + config_entry_t *ce; + nvlist_t *ret = NULL, *config = NULL, *tmp, *nvtop, *nvroot; + nvlist_t **spares, **l2cache; + uint_t i, nspares, nl2cache; + boolean_t config_seen; + uint64_t best_txg; + char *name, *hostname; + uint64_t version, guid; + uint_t children = 0; + nvlist_t **child = NULL; + uint_t holes; + uint64_t *hole_array, max_id; + uint_t c; + boolean_t isactive; + uint64_t hostid; + nvlist_t *nvl; + boolean_t found_one = B_FALSE; + boolean_t valid_top_config = B_FALSE; + + if (nvlist_alloc(&ret, 0, 0) != 0) + goto nomem; + + for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { + uint64_t id, max_txg = 0; + + if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + config_seen = B_FALSE; + + /* + * Iterate over all toplevel vdevs. Grab the pool configuration + * from the first one we find, and then go through the rest and + * add them as necessary to the 'vdevs' member of the config. + */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + + /* + * Determine the best configuration for this vdev by + * selecting the config with the latest transaction + * group. + */ + best_txg = 0; + for (ce = ve->ve_configs; ce != NULL; + ce = ce->ce_next) { + + if (ce->ce_txg > best_txg) { + tmp = ce->ce_config; + best_txg = ce->ce_txg; + } + } + + /* + * We rely on the fact that the max txg for the + * pool will contain the most up-to-date information + * about the valid top-levels in the vdev namespace. 
+ */ + if (best_txg > max_txg) { + (void) nvlist_remove(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, + ZPOOL_CONFIG_HOLE_ARRAY, + DATA_TYPE_UINT64_ARRAY); + + max_txg = best_txg; + hole_array = NULL; + holes = 0; + max_id = 0; + valid_top_config = B_FALSE; + + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { + verify(nvlist_add_uint64(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + max_id) == 0); + valid_top_config = B_TRUE; + } + + if (nvlist_lookup_uint64_array(tmp, + ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, + &holes) == 0) { + verify(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_HOLE_ARRAY, + hole_array, holes) == 0); + } + } + + if (!config_seen) { + /* + * Copy the relevant pieces of data to the pool + * configuration: + * + * version + * pool guid + * name + * pool state + * hostid (if available) + * hostname (if available) + */ + uint64_t state; + + verify(nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VERSION, &version) == 0); + if (nvlist_add_uint64(config, + ZPOOL_CONFIG_VERSION, version) != 0) + goto nomem; + verify(nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_GUID, &guid) == 0); + if (nvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_GUID, guid) != 0) + goto nomem; + verify(nvlist_lookup_string(tmp, + ZPOOL_CONFIG_POOL_NAME, &name) == 0); + if (nvlist_add_string(config, + ZPOOL_CONFIG_POOL_NAME, name) != 0) + goto nomem; + verify(nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_STATE, &state) == 0); + if (nvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_STATE, state) != 0) + goto nomem; + hostid = 0; + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + if (nvlist_add_uint64(config, + ZPOOL_CONFIG_HOSTID, hostid) != 0) + goto nomem; + verify(nvlist_lookup_string(tmp, + ZPOOL_CONFIG_HOSTNAME, + &hostname) == 0); + if (nvlist_add_string(config, + ZPOOL_CONFIG_HOSTNAME, + hostname) != 0) + goto nomem; + } + + config_seen = B_TRUE; + } + + /* + * Add this top-level vdev to the child array. 
+ */ + verify(nvlist_lookup_nvlist(tmp, + ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); + verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, + &id) == 0); + + if (id >= children) { + nvlist_t **newchild; + + newchild = zfs_alloc(hdl, (id + 1) * + sizeof (nvlist_t *)); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + free(child); + child = newchild; + children = id + 1; + } + if (nvlist_dup(nvtop, &child[id], 0) != 0) + goto nomem; + + } + + /* + * If we have information about all the top-levels then + * clean up the nvlist which we've constructed. This + * means removing any extraneous devices that are + * beyond the valid range or adding devices to the end + * of our array which appear to be missing. + */ + if (valid_top_config) { + if (max_id < children) { + for (c = max_id; c < children; c++) + nvlist_free(child[c]); + children = max_id; + } else if (max_id > children) { + nvlist_t **newchild; + + newchild = zfs_alloc(hdl, (max_id) * + sizeof (nvlist_t *)); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + free(child); + child = newchild; + children = max_id; + } + } + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + /* + * The vdev namespace may contain holes as a result of + * device removal. We must add them back into the vdev + * tree before we process any missing devices. + */ + if (holes > 0) { + ASSERT(valid_top_config); + + for (c = 0; c < children; c++) { + nvlist_t *holey; + + if (child[c] != NULL || + !vdev_is_hole(hole_array, holes, c)) + continue; + + if (nvlist_alloc(&holey, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + + /* + * Holes in the namespace are treated as + * "hole" top-level vdevs and have a + * special flag set on them. 
+ */ + if (nvlist_add_string(holey, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_GUID, 0ULL) != 0) + goto nomem; + child[c] = holey; + } + } + + /* + * Look for any missing top-level vdevs. If this is the case, + * create a faked up 'missing' vdev as a placeholder. We cannot + * simply compress the child array, because the kernel performs + * certain checks to make sure the vdev IDs match their location + * in the configuration. + */ + for (c = 0; c < children; c++) { + if (child[c] == NULL) { + nvlist_t *missing; + if (nvlist_alloc(&missing, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + if (nvlist_add_string(missing, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(missing); + goto nomem; + } + child[c] = missing; + } + } + + /* + * Put all of this pool's top-level vdevs into a root vdev. + */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + child, children) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + children = 0; + child = NULL; + + /* + * Go through and fix up any paths and/or devids based on our + * known list of vdev GUID -> path mappings. + */ + if (fix_paths(nvroot, pl->names) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + /* + * Add the root vdev to this pool's configuration. 
+ */ + if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + nvroot) != 0) { + nvlist_free(nvroot); + goto nomem; + } + nvlist_free(nvroot); + + /* + * zdb uses this path to report on active pools that were + * imported or created using -R. + */ + if (active_ok) + goto add_pool; + + /* + * Determine if this pool is currently active, in which case we + * can't actually import it. + */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + if (pool_active(hdl, name, guid, &isactive) != 0) + goto error; + + if (isactive) { + nvlist_free(config); + config = NULL; + continue; + } + + if ((nvl = refresh_config(hdl, config)) == NULL) { + nvlist_free(config); + config = NULL; + continue; + } + + nvlist_free(config); + config = nvl; + + /* + * Go through and update the paths for spares, now that we have + * them. + */ + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + if (fix_paths(spares[i], pl->names) != 0) + goto nomem; + } + } + + /* + * Update the paths for l2cache devices. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + for (i = 0; i < nl2cache; i++) { + if (fix_paths(l2cache[i], pl->names) != 0) + goto nomem; + } + } + + /* + * Restore the original information read from the actual label. + */ + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, + DATA_TYPE_STRING); + if (hostid != 0) { + verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, + hostid) == 0); + verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, + hostname) == 0); + } + +add_pool: + /* + * Add this pool to the list of configs. 
+		 */
+		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &name) == 0);
+		if (nvlist_add_nvlist(ret, name, config) != 0)
+			goto nomem;
+
+		found_one = B_TRUE;
+		nvlist_free(config);
+		config = NULL;
+	}
+
+	if (!found_one) {
+		nvlist_free(ret);
+		ret = NULL;
+	}
+
+	return (ret);
+
+nomem:
+	(void) no_memory(hdl);
+error:
+	nvlist_free(config);
+	nvlist_free(ret);
+	for (c = 0; c < children; c++)
+		nvlist_free(child[c]);
+	free(child);
+
+	return (NULL);
+}
+
+/*
+ * Return the offset of the given label.
+ *
+ * 'size' must be a multiple of sizeof (vdev_label_t).  Labels in the first
+ * half of VDEV_LABELS are laid out from offset 0; the remaining labels are
+ * laid out so that the last one ends exactly at 'size'.
+ */
+static uint64_t
+label_offset(uint64_t size, int l)
+{
+	ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
+	return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+	    0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
+}
+
+/*
+ * Given a file descriptor, read the label information and return an nvlist
+ * describing the configuration, if there is one.
+ *
+ * Each of the VDEV_LABELS labels is tried in turn; the first one that unpacks
+ * and carries a pool state no greater than POOL_STATE_L2CACHE (and, unless it
+ * is a spare or l2cache device, a non-zero txg) is returned in '*config'.
+ *
+ * Returns -1 only when the label buffer cannot be allocated.  Every other
+ * outcome returns 0, with '*config' left NULL when no usable label was found
+ * (including when the fstat64() fails).
+ */
+int
+zpool_read_label(int fd, nvlist_t **config)
+{
+	struct stat64 statbuf;
+	int l;
+	vdev_label_t *label;
+	uint64_t state, txg, size;
+
+	*config = NULL;
+
+	if (fstat64(fd, &statbuf) == -1)
+		return (0);
+	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
+
+	if ((label = malloc(sizeof (vdev_label_t))) == NULL)
+		return (-1);
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+		if (pread64(fd, label, sizeof (vdev_label_t),
+		    label_offset(size, l)) != sizeof (vdev_label_t))
+			continue;
+
+		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
+		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0)
+			continue;
+
+		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state > POOL_STATE_L2CACHE) {
+			nvlist_free(*config);
+			continue;
+		}
+
+		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0)) {
+			nvlist_free(*config);
+			continue;
+		}
+
+		free(label);
+		return (0);
+	}
+
+	/* no label qualified; do not hand back a freed nvlist */
+	free(label);
+	*config = NULL;
+	return (0);
+}
+
+/*
+ * One entry in the per-directory cache of device nodes that are probed for
+ * labels.  Entries are kept in an AVL tree ordered by slice_cache_compare().
+ */
+typedef struct rdsk_node {
+	char *rn_name;		/* directory entry name */
+	int rn_dfd;		/* directory fd the name is opened against */
+	libzfs_handle_t *rn_hdl;
+	nvlist_t *rn_config;	/* label config, set by zpool_open_func() */
+	avl_tree_t *rn_avl;	/* tree this node is linked into */
+	avl_node_t rn_node;
+	boolean_t rn_nozpool;	/* known not to hold a pool; skip the open */
+} rdsk_node_t;
+
+/*
+ * AVL comparator for rdsk_node_t.  Names containing "s0" sort first, then
+ * names containing "s2", and everything else falls back to strcmp() order.
+ * Note that strstr() matches the substring anywhere in the name.
+ */
+static int
+slice_cache_compare(const void *arg1, const void *arg2)
+{
+	const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
+	const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
+	char *nm1slice, *nm2slice;
+	int rv;
+
+	/*
+	 * slices zero and two are the most likely to provide results,
+	 * so put those first
+	 */
+	nm1slice = strstr(nm1, "s0");
+	nm2slice = strstr(nm2, "s0");
+	if (nm1slice && !nm2slice) {
+		return (-1);
+	}
+	if (!nm1slice && nm2slice) {
+		return (1);
+	}
+	nm1slice = strstr(nm1, "s2");
+	nm2slice = strstr(nm2, "s2");
+	if (nm1slice && !nm2slice) {
+		return (-1);
+	}
+	if (!nm1slice && nm2slice) {
+		return (1);
+	}
+
+	rv = strcmp(nm1, nm2);
+	if (rv == 0)
+		return (0);
+	return (rv > 0 ? 1 : -1);
+}
+
+/*
+ * Look up the node named "<diskname><partno>" in the slice cache and flag it
+ * as unable to hold a pool when 'size' (in units of 'blksz' bytes) is below
+ * SPA_MINDEVSIZE.  Names that are not in the cache are ignored.
+ */
+static void
+check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
+    diskaddr_t size, uint_t blksz)
+{
+	rdsk_node_t tmpnode;
+	rdsk_node_t *node;
+	char sname[MAXNAMELEN];
+
+	/* only rn_name is filled in; the comparator reads nothing else */
+	tmpnode.rn_name = &sname[0];
+	(void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
+	    diskname, partno);
+	/*
+	 * protect against division by zero for disk labels that
+	 * contain a bogus sector size
+	 */
+	if (blksz == 0)
+		blksz = DEV_BSIZE;
+	/* too small to contain a zpool? */
+	if ((size < (SPA_MINDEVSIZE / blksz)) &&
+	    (node = avl_find(r, &tmpnode, NULL)))
+		node->rn_nozpool = B_TRUE;
+}
+
+/*
+ * Flag every slice (s0 .. s<NDKMAP-1>) and partition (p0 .. p<FD_NUMPART>)
+ * node that shares a disk prefix with 'sname' as unable to hold a pool, by
+ * reporting each of them to check_one_slice() with a size of zero.
+ */
+static void
+nozpool_all_slices(avl_tree_t *r, const char *sname)
+{
+	char diskname[MAXNAMELEN];
+	char *ptr;
+	int i;
+
+	/* strlcpy() always terminates, so the strrchr() below stays in bounds */
+	(void) strlcpy(diskname, sname, sizeof (diskname));
+	if (((ptr = strrchr(diskname, 's')) == NULL) &&
+	    ((ptr = strrchr(diskname, 'p')) == NULL))
+		return;
+	ptr[0] = 's';
+	ptr[1] = '\0';
+	for (i = 0; i < NDKMAP; i++)
+		check_one_slice(r, diskname, i, 0, 1);
+	ptr[0] = 'p';
+	for (i = 0; i <= FD_NUMPART; i++)
+		check_one_slice(r, diskname, i, 0, 1);
+}
+
+/*
+ * Read the disk label behind 'fd' (VTOC first, then EFI) and flag the sibling
+ * slices of 'sname' that are too small to hold a pool.  Nothing is done
+ * unless 'sname' ends in 's' followed by a digit, or when neither label type
+ * can be read.
+ */
+static void
+check_slices(avl_tree_t *r, int fd, const char *sname)
+{
+	struct extvtoc vtoc;
+	struct dk_gpt *gpt;
+	char diskname[MAXNAMELEN];
+	char *ptr;
+	int i;
+
+	/* strlcpy() always terminates, so the strrchr() below stays in bounds */
+	(void) strlcpy(diskname, sname, sizeof (diskname));
+	if ((ptr = strrchr(diskname, 's')) == NULL ||
+	    !isdigit((unsigned char)ptr[1]))
+		return;
+	ptr[1] = '\0';
+
+	if (read_extvtoc(fd, &vtoc) >= 0) {
+		for (i = 0; i < NDKMAP; i++)
+			check_one_slice(r, diskname, i,
+			    vtoc.v_part[i].p_size, vtoc.v_sectorsz);
+	} else if (efi_alloc_and_read(fd, &gpt) >= 0) {
+		/*
+		 * on x86 we'll still have leftover links that point
+		 * to slices s[9-15], so use NDKMAP instead
+		 */
+		for (i = 0; i < NDKMAP; i++)
+			check_one_slice(r, diskname, i,
+			    gpt->efi_parts[i].p_size, gpt->efi_lbasize);
+		/* nodes p[1-4] are never used with EFI labels */
+		ptr[0] = 'p';
+		for (i = 1; i <= FD_NUMPART; i++)
+			check_one_slice(r, diskname, i, 0, 1);
+		efi_free(gpt);
+	}
+}
+
+/*
+ * Thread-pool task: open one cached device node (an rdsk_node_t passed as
+ * 'arg'), read its label and store the resulting config, if any, in
+ * rn_config.  Nodes already flagged rn_nozpool are skipped without opening.
+ */
+static void
+zpool_open_func(void *arg)
+{
+	rdsk_node_t *rn = arg;
+	struct stat64 statbuf;
+	nvlist_t *config;
+	int fd;
+
+	if (rn->rn_nozpool)
+		return;
+	if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
+		/* symlink to a device that's no longer there */
+		if (errno == ENOENT)
+			nozpool_all_slices(rn->rn_avl, rn->rn_name);
+		return;
+	}
+	/*
+	 * Ignore failed stats.  We only want regular
+	 * files, character devs and block devs.
+	 */
+	if (fstat64(fd, &statbuf) != 0 ||
+	    (!S_ISREG(statbuf.st_mode) &&
+	    !S_ISCHR(statbuf.st_mode) &&
+	    !S_ISBLK(statbuf.st_mode))) {
+		(void) close(fd);
+		return;
+	}
+	/* this file is too small to hold a zpool */
+	if (S_ISREG(statbuf.st_mode) &&
+	    statbuf.st_size < SPA_MINDEVSIZE) {
+		(void) close(fd);
+		return;
+	} else if (!S_ISREG(statbuf.st_mode)) {
+		/*
+		 * Try to read the disk label first so we don't have to
+		 * open a bunch of minor nodes that can't have a zpool.
+		 */
+		check_slices(rn->rn_avl, fd, rn->rn_name);
+	}
+
+	/* zpool_read_label() only fails when it cannot allocate memory */
+	if ((zpool_read_label(fd, &config)) != 0) {
+		(void) close(fd);
+		(void) no_memory(rn->rn_hdl);
+		return;
+	}
+	(void) close(fd);
+
+
+	rn->rn_config = config;
+	if (config != NULL) {
+		assert(rn->rn_nozpool == B_FALSE);
+	}
+}
+
+/*
+ * Given a file descriptor, clear (zero) the label information.  This function
+ * is currently only used in the appliance stack as part of the ZFS sysevent
+ * module.
+ *
+ * Returns 0 on success (and also when the fstat64() fails, in which case
+ * nothing is written), or -1 when the zeroed label cannot be allocated or
+ * any of the VDEV_LABELS writes comes up short.
+ */
+int
+zpool_clear_label(int fd)
+{
+	struct stat64 statbuf;
+	int l;
+	vdev_label_t *label;
+	uint64_t size;
+
+	if (fstat64(fd, &statbuf) == -1)
+		return (0);
+	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
+
+	if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL)
+		return (-1);
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+		if (pwrite64(fd, label, sizeof (vdev_label_t),
+		    label_offset(size, l)) != sizeof (vdev_label_t)) {
+			/* do not leak the zeroed label on a failed write */
+			free(label);
+			return (-1);
+		}
+	}
+
+	free(label);
+	return (0);
+}
+
+/*
+ * Given a list of directories to search, find all pools stored on disk.  This
+ * includes partial pools which are not available to import.  If no args are
+ * given (argc is 0), then the default directory (/dev/dsk) is searched.
+ * poolname or guid (but not both) are provided by the caller when trying
+ * to import a specific pool.
+ */
+static nvlist_t *
+zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
+{
+	int i, dirs = iarg->paths;
+	DIR *dirp = NULL;
+	struct dirent64 *dp;
+	char path[MAXPATHLEN];
+	char *end, **dir = iarg->path;
+	size_t pathleft;
+	nvlist_t *ret = NULL;
+	static char *default_dir = "/dev/dsk";
+	pool_list_t pools = { 0 };
+	pool_entry_t *pe, *penext;
+	vdev_entry_t *ve, *venext;
+	config_entry_t *ce, *cenext;
+	name_entry_t *ne, *nenext;
+	avl_tree_t slice_cache;
+	rdsk_node_t *slice;
+	void *cookie;
+
+	if (dirs == 0) {
+		dirs = 1;
+		dir = &default_dir;
+	}
+
+	/*
+	 * Go through and read the label configuration information from every
+	 * possible device, organizing the information according to pool GUID
+	 * and toplevel GUID.
+	 */
+	for (i = 0; i < dirs; i++) {
+		tpool_t *t;
+		char *rdsk;
+		int dfd;
+		boolean_t config_failed = B_FALSE;
+
+		/* use realpath to normalize the path */
+		if (realpath(dir[i], path) == 0) {
+			(void) zfs_error_fmt(hdl, EZFS_BADPATH,
+			    dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]);
+			goto error;
+		}
+		end = &path[strlen(path)];
+		*end++ = '/';
+		*end = 0;
+		pathleft = &path[sizeof (path)] - end;
+
+		/*
+		 * Using raw devices instead of block devices when we're
+		 * reading the labels skips a bunch of slow operations during
+		 * close(2) processing, so we replace /dev/dsk with /dev/rdsk.
+		 */
+		if (strcmp(path, "/dev/dsk/") == 0)
+			rdsk = "/dev/rdsk/";
+		else
+			rdsk = path;
+
+		if ((dfd = open64(rdsk, O_RDONLY)) < 0 ||
+		    (dirp = fdopendir(dfd)) == NULL) {
+			/* record errno before close() can disturb it */
+			zfs_error_aux(hdl, "%s", strerror(errno));
+			/*
+			 * If the open succeeded but fdopendir() did not, the
+			 * descriptor still belongs to us.
+			 */
+			if (dfd >= 0)
+				(void) close(dfd);
+			(void) zfs_error_fmt(hdl, EZFS_BADPATH,
+			    dgettext(TEXT_DOMAIN, "cannot open '%s'"),
+			    rdsk);
+			goto error;
+		}
+
+		avl_create(&slice_cache, slice_cache_compare,
+		    sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
+		/*
+		 * This is not MT-safe, but we have no MT consumers of libzfs
+		 */
+		while ((dp = readdir64(dirp)) != NULL) {
+			const char *name = dp->d_name;
+			if (name[0] == '.' &&
+			    (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
+				continue;
+
+			slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+			slice->rn_name = zfs_strdup(hdl, name);
+			slice->rn_avl = &slice_cache;
+			slice->rn_dfd = dfd;
+			slice->rn_hdl = hdl;
+			/*
+			 * rn_config is examined below even for nodes whose
+			 * open task bailed out early, so start it at NULL.
+			 */
+			slice->rn_config = NULL;
+			slice->rn_nozpool = B_FALSE;
+			avl_add(&slice_cache, slice);
+		}
+		/*
+		 * create a thread pool to do all of this in parallel;
+		 * rn_nozpool is not protected, so this is racy in that
+		 * multiple tasks could decide that the same slice can
+		 * not hold a zpool, which is benign.  Also choose
+		 * double the number of processors; we hold a lot of
+		 * locks in the kernel, so going beyond this doesn't
+		 * buy us much.
+		 */
+		t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN),
+		    0, NULL);
+		for (slice = avl_first(&slice_cache); slice;
+		    (slice = avl_walk(&slice_cache, slice,
+		    AVL_AFTER)))
+			(void) tpool_dispatch(t, zpool_open_func, slice);
+		tpool_wait(t);
+		tpool_destroy(t);
+
+		/*
+		 * Drain the cache.  Every node is freed on every path; a
+		 * failure from add_config() is only acted upon once the
+		 * whole tree has been torn down.
+		 */
+		cookie = NULL;
+		while ((slice = avl_destroy_nodes(&slice_cache,
+		    &cookie)) != NULL) {
+			nvlist_t *config = slice->rn_config;
+
+			if (config != NULL && config_failed) {
+				/* already failing; just discard the config */
+				nvlist_free(config);
+			} else if (config != NULL) {
+				boolean_t matched = B_TRUE;
+
+				if (iarg->poolname != NULL) {
+					char *pname;
+
+					matched = nvlist_lookup_string(config,
+					    ZPOOL_CONFIG_POOL_NAME,
+					    &pname) == 0 &&
+					    strcmp(iarg->poolname, pname) == 0;
+				} else if (iarg->guid != 0) {
+					uint64_t this_guid;
+
+					matched = nvlist_lookup_uint64(config,
+					    ZPOOL_CONFIG_POOL_GUID,
+					    &this_guid) == 0 &&
+					    iarg->guid == this_guid;
+				}
+				if (!matched) {
+					nvlist_free(config);
+				} else {
+					/*
+					 * use the non-raw path for the config
+					 */
+					(void) strlcpy(end, slice->rn_name,
+					    pathleft);
+					if (add_config(hdl, &pools, path,
+					    config) != 0)
+						config_failed = B_TRUE;
+				}
+			}
+			free(slice->rn_name);
+			free(slice);
+		}
+		avl_destroy(&slice_cache);
+
+		(void) closedir(dirp);
+		dirp = NULL;
+
+		if (config_failed)
+			goto error;
+	}
+
+	ret = get_configs(hdl, &pools, iarg->can_be_active);
+
+error:
+	/* the pool list is torn down on success and failure alike */
+	for (pe = pools.pools; pe != NULL; pe = penext) {
+		penext = pe->pe_next;
+		for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
+			venext = ve->ve_next;
+			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
+				cenext = ce->ce_next;
+				if (ce->ce_config)
+					nvlist_free(ce->ce_config);
+				free(ce);
+			}
+			free(ve);
+		}
+		free(pe);
+	}
+
+	for (ne = pools.names; ne != NULL; ne = nenext) {
+		nenext = ne->ne_next;
+		if (ne->ne_name)
+			free(ne->ne_name);
+		free(ne);
+	}
+
+	if (dirp)
+		(void) closedir(dirp);
+
+	return (ret);
+}
+
+/*
+ * Convenience wrapper: search the 'argc' directories in 'argv' (or the
+ * default directory when argc is 0) with no pool name or guid filter.
+ */
+nvlist_t *
+zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
+{
+	importargs_t iarg = { 0 };
+
+	iarg.paths = argc;
+	iarg.path = argv;
+
+	return (zpool_find_import_impl(hdl, &iarg));
+}
+
+/*
+ * Given a cache file, return the contents as a list of importable pools.
+ * poolname or guid (but not both) are provided by the caller when trying
+ * to import a specific pool.
+ *
+ * Pools that are currently active are left out of the result.  Returns NULL
+ * on any failure.
+ */
+nvlist_t *
+zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
+    char *poolname, uint64_t guid)
+{
+	char *buf;
+	int fd;
+	struct stat64 statbuf;
+	nvlist_t *raw, *src, *dst;
+	nvlist_t *pools;
+	nvpair_t *elem;
+	char *name;
+	uint64_t this_guid;
+	boolean_t active;
+
+	verify(poolname == NULL || guid == 0);
+
+	if ((fd = open(cachefile, O_RDONLY)) < 0) {
+		zfs_error_aux(hdl, "%s", strerror(errno));
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "failed to open cache file"));
+		return (NULL);
+	}
+
+	if (fstat64(fd, &statbuf) != 0) {
+		zfs_error_aux(hdl, "%s", strerror(errno));
+		(void) close(fd);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN, "failed to get size of cache file"));
+		return (NULL);
+	}
+
+	if ((buf = zfs_alloc(hdl, statbuf.st_size)) == NULL) {
+		(void) close(fd);
+		return (NULL);
+	}
+
+	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
+		(void) close(fd);
+		free(buf);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN,
+		    "failed to read cache file contents"));
+		return (NULL);
+	}
+
+	(void) close(fd);
+
+	if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
+		free(buf);
+		(void) zfs_error(hdl, EZFS_BADCACHE,
+		    dgettext(TEXT_DOMAIN,
+		    "invalid or corrupt cache file contents"));
+		return (NULL);
+	}
+
+	free(buf);
+
+	/*
+	 * Go through and get the current state of the pools and refresh their
+	 * state.
+	 */
+	if (nvlist_alloc(&pools, 0, 0) != 0) {
+		(void) no_memory(hdl);
+		nvlist_free(raw);
+		return (NULL);
+	}
+
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
+		verify(nvpair_value_nvlist(elem, &src) == 0);
+
+		verify(nvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME,
+		    &name) == 0);
+		if (poolname != NULL && strcmp(poolname, name) != 0)
+			continue;
+
+		/* this_guid is needed by pool_active() even with no filter */
+		verify(nvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID,
+		    &this_guid) == 0);
+		if (guid != 0 && guid != this_guid)
+			continue;
+
+		if (pool_active(hdl, name, this_guid, &active) != 0) {
+			nvlist_free(raw);
+			nvlist_free(pools);
+			return (NULL);
+		}
+
+		if (active)
+			continue;
+
+		if ((dst = refresh_config(hdl, src)) == NULL) {
+			nvlist_free(raw);
+			nvlist_free(pools);
+			return (NULL);
+		}
+
+		if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
+			(void) no_memory(hdl);
+			nvlist_free(dst);
+			nvlist_free(raw);
+			nvlist_free(pools);
+			return (NULL);
+		}
+		nvlist_free(dst);
+	}
+
+	nvlist_free(raw);
+	return (pools);
+}
+
+/*
+ * zpool_iter() callback: returns 1 when the pool matches the name (or, when
+ * no name was given, the guid) held in the importargs_t passed as 'data',
+ * and 0 otherwise.  The handle is always closed here.
+ */
+static int
+name_or_guid_exists(zpool_handle_t *zhp, void *data)
+{
+	importargs_t *import = data;
+	int found = 0;
+
+	if (import->poolname != NULL) {
+		char *pool_name;
+
+		verify(nvlist_lookup_string(zhp->zpool_config,
+		    ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
+		if (strcmp(pool_name, import->poolname) == 0)
+			found = 1;
+	} else {
+		uint64_t pool_guid;
+
+		verify(nvlist_lookup_uint64(zhp->zpool_config,
+		    ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
+		if (pool_guid == import->guid)
+			found = 1;
+	}
+
+	zpool_close(zhp);
+	return (found);
+}
+
+/*
+ * Search for importable pools as described by 'import': from the cache file
+ * when one is given, otherwise by scanning the device directories.  When
+ * import->unique is set, import->exists is first filled in from a
+ * name_or_guid_exists() pass over the pools known to 'hdl'.
+ */
+nvlist_t *
+zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
+{
+	verify(import->poolname == NULL || import->guid == 0);
+
+	if (import->unique)
+		import->exists = zpool_iter(hdl, name_or_guid_exists, import);
+
+	if (import->cachefile != NULL)
+		return (zpool_find_import_cached(hdl, import->cachefile,
+		    import->poolname, import->guid));
+
+	return (zpool_find_import_impl(hdl, import));
+}
+
+/*
+ * Return B_TRUE if 'guid' is the ZPOOL_CONFIG_GUID of 'nv' or of any vdev
+ * found by recursing through its ZPOOL_CONFIG_CHILDREN arrays.
+ */
+boolean_t
+find_guid(nvlist_t *nv, uint64_t guid)
+{
+	uint64_t tmp;
+	nvlist_t **child;
+	uint_t c, children;
+
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0);
+	if (tmp == guid)
+		return (B_TRUE);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			if (find_guid(child[c], guid))
+				return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Search state handed to find_aux() through zpool_iter().
+ */
+typedef struct aux_cbdata {
+	const char *cb_type;	/* nvlist array name to search */
+	uint64_t cb_guid;	/* vdev guid being looked for */
+	zpool_handle_t *cb_zhp;	/* set to the owning pool on a match */
+} aux_cbdata_t;
+
+/*
+ * zpool_iter() callback: look for cb_guid in the pool's cb_type array under
+ * the vdev tree.  On a match the still-open handle is stored in cb_zhp and 1
+ * is returned, leaving it to the caller to close; otherwise the handle is
+ * closed here and 0 is returned.
+ */
+static int
+find_aux(zpool_handle_t *zhp, void *data)
+{
+	aux_cbdata_t *cbp = data;
+	nvlist_t **list;
+	uint_t i, count;
+	uint64_t guid;
+	nvlist_t *nvroot;
+
+	verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type,
+	    &list, &count) == 0) {
+		for (i = 0; i < count; i++) {
+			verify(nvlist_lookup_uint64(list[i],
+			    ZPOOL_CONFIG_GUID, &guid) == 0);
+			if (guid == cbp->cb_guid) {
+				cbp->cb_zhp = zhp;
+				return (1);
+			}
+		}
+	}
+
+	zpool_close(zhp);
+	return (0);
+}
+
+/*
+ * Determines if the pool is in use.  If so, it returns true and the state of
+ * the pool as well as the name of the pool.  The name is allocated and
+ * must be freed by the caller.
+ */
+int
+zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
+    boolean_t *inuse)
+{
+	nvlist_t *config;
+	char *name;
+	boolean_t ret;
+	uint64_t guid, vdev_guid;
+	zpool_handle_t *zhp;
+	nvlist_t *pool_config;
+	uint64_t stateval, isspare;
+	aux_cbdata_t cb = { 0 };
+	boolean_t isactive;
+
+	*inuse = B_FALSE;
+
+	/* zpool_read_label() only fails when it cannot allocate memory */
+	if (zpool_read_label(fd, &config) != 0) {
+		(void) no_memory(hdl);
+		return (-1);
+	}
+
+	/* no label at all: not in use */
+	if (config == NULL)
+		return (0);
+
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &stateval) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
+	    &vdev_guid) == 0);
+
+	/* the pool name and guid are only looked up for regular pool vdevs */
+	if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) {
+		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &name) == 0);
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &guid) == 0);
+	}
+
+	switch (stateval) {
+	case POOL_STATE_EXPORTED:
+		/*
+		 * A pool with an exported state may in fact be imported
+		 * read-only, so check the in-core state to see if it's
+		 * active and imported read-only.  If it is, set
+		 * its state to active.
+		 */
+		if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
+		    (zhp = zpool_open_canfail(hdl, name)) != NULL) {
+			if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
+				stateval = POOL_STATE_ACTIVE;
+
+			/*
+			 * The handle was only needed for the readonly
+			 * property check, so release it here.
+			 */
+			zpool_close(zhp);
+		}
+
+		ret = B_TRUE;
+		break;
+
+	case POOL_STATE_ACTIVE:
+		/*
+		 * For an active pool, we have to determine if it's really part
+		 * of a currently active pool (in which case the pool will exist
+		 * and the guid will be the same), or whether it's part of an
+		 * active pool that was disconnected without being explicitly
+		 * exported.
+		 */
+		if (pool_active(hdl, name, guid, &isactive) != 0) {
+			nvlist_free(config);
+			return (-1);
+		}
+
+		if (isactive) {
+			/*
+			 * Because the device may have been removed while
+			 * offlined, we only report it as active if the vdev is
+			 * still present in the config.  Otherwise, pretend like
+			 * it's not in use.
+			 */
+			if ((zhp = zpool_open_canfail(hdl, name)) != NULL &&
+			    (pool_config = zpool_get_config(zhp, NULL))
+			    != NULL) {
+				nvlist_t *nvroot;
+
+				verify(nvlist_lookup_nvlist(pool_config,
+				    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+				ret = find_guid(nvroot, vdev_guid);
+			} else {
+				ret = B_FALSE;
+			}
+
+			/*
+			 * If this is an active spare within another pool, we
+			 * treat it like an unused hot spare.  This allows the
+			 * user to create a pool with a hot spare that currently
+			 * in use within another pool.  Since we return B_TRUE,
+			 * libdiskmgt will continue to prevent generic consumers
+			 * from using the device.
+			 */
+			if (ret && nvlist_lookup_uint64(config,
+			    ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare)
+				stateval = POOL_STATE_SPARE;
+
+			if (zhp != NULL)
+				zpool_close(zhp);
+		} else {
+			stateval = POOL_STATE_POTENTIALLY_ACTIVE;
+			ret = B_TRUE;
+		}
+		break;
+
+	case POOL_STATE_SPARE:
+		/*
+		 * For a hot spare, it can be either definitively in use, or
+		 * potentially active.  To determine if it's in use, we iterate
+		 * over all pools in the system and search for one with a spare
+		 * with a matching guid.
+		 *
+		 * Due to the shared nature of spares, we don't actually report
+		 * the potentially active case as in use.  This means the user
+		 * can freely create pools on the hot spares of exported pools,
+		 * but to do otherwise makes the resulting code complicated, and
+		 * we end up having to deal with this case anyway.
+		 */
+		cb.cb_zhp = NULL;
+		cb.cb_guid = vdev_guid;
+		cb.cb_type = ZPOOL_CONFIG_SPARES;
+		if (zpool_iter(hdl, find_aux, &cb) == 1) {
+			name = (char *)zpool_get_name(cb.cb_zhp);
+			ret = B_TRUE;
+		} else {
+			ret = B_FALSE;
+		}
+		break;
+
+	case POOL_STATE_L2CACHE:
+
+		/*
+		 * Check if any pool is currently using this l2cache device.
+		 */
+		cb.cb_zhp = NULL;
+		cb.cb_guid = vdev_guid;
+		cb.cb_type = ZPOOL_CONFIG_L2CACHE;
+		if (zpool_iter(hdl, find_aux, &cb) == 1) {
+			name = (char *)zpool_get_name(cb.cb_zhp);
+			ret = B_TRUE;
+		} else {
+			ret = B_FALSE;
+		}
+		break;
+
+	default:
+		ret = B_FALSE;
+	}
+
+
+	if (ret) {
+		/*
+		 * Copy the name before cb.cb_zhp and config are released
+		 * below; 'name' points into one of them.
+		 */
+		if ((*namestr = zfs_strdup(hdl, name)) == NULL) {
+			if (cb.cb_zhp)
+				zpool_close(cb.cb_zhp);
+			nvlist_free(config);
+			return (-1);
+		}
+		*state = (pool_state_t)stateval;
+	}
+
+	/* find_aux() leaves the matching pool open for us to close */
+	if (cb.cb_zhp)
+		zpool_close(cb.cb_zhp);
+
+	nvlist_free(config);
+	*inuse = ret;
+	return (0);
+}
diff --git a/lib/libzfs/common/libzfs_mount.c b/lib/libzfs/common/libzfs_mount.c
new file mode 100644
index 0000000..9222206
--- /dev/null
+++ b/lib/libzfs/common/libzfs_mount.c
@@ -0,0 +1,1266 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Routines to manage ZFS mounts.  We separate all the nasty routines that have
+ * to deal with the OS.  The following functions are the main entry points --
+ * they are used by mount and unmount and when changing a filesystem's
+ * mountpoint.
+ *
+ *	zfs_is_mounted()
+ *	zfs_mount()
+ *	zfs_unmount()
+ *	zfs_unmountall()
+ *
+ * This file also contains the functions used to manage sharing filesystems via
+ * NFS and SMB:
+ *
+ *	zfs_is_shared()
+ *	zfs_share()
+ *	zfs_unshare()
+ *
+ *	zfs_is_shared_nfs()
+ *	zfs_is_shared_smb()
+ *	zfs_share_proto()
+ *	zfs_shareall()
+ *	zfs_unshare_nfs()
+ *	zfs_unshare_smb()
+ *	zfs_unshareall_nfs()
+ *	zfs_unshareall_smb()
+ *	zfs_unshareall()
+ *	zfs_unshareall_bypath()
+ *
+ * The following functions are available for pool consumers, and will
+ * mount/unmount and share/unshare all datasets within pool:
+ *
+ *	zpool_enable_datasets()
+ *	zpool_disable_datasets()
+ */
+
+#include	/* NOTE(review): header names are missing from every bare #include below -- presumably lost in extraction; restore from upstream */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "libzfs_impl.h"
+
+#include
+#include
+#define	MAXISALEN	257	/* based on sysinfo(2) man page */
+
+static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
+zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
+    zfs_share_proto_t);
+
+/*
+ * The share protocols table must be in the same order as the
+ * zfs_share_proto_t enum in libzfs_impl.h, because the table is indexed
+ * directly by protocol value (see is_shared()).
+ */
+typedef struct {
+	zfs_prop_t p_prop;	/* dataset property for this protocol */
+	char *p_name;		/* protocol name as it appears in sharetab */
+	int p_share_err;	/* presumably the error reported when sharing fails */
+	int p_unshare_err;	/* presumably the error reported when unsharing fails */
+} proto_table_t;
+
+proto_table_t proto_table[PROTO_END] = {
+	{ZFS_PROP_SHARENFS, "nfs", EZFS_SHARENFSFAILED, EZFS_UNSHARENFSFAILED},
+	{ZFS_PROP_SHARESMB, "smb", EZFS_SHARESMBFAILED, EZFS_UNSHARESMBFAILED},
+};
+
+zfs_share_proto_t nfs_only[] = {	/* PROTO_END-terminated protocol lists */
+	PROTO_NFS,
+	PROTO_END
+};
+
+zfs_share_proto_t smb_only[] = {
+	PROTO_SMB,
+	PROTO_END
+};
+zfs_share_proto_t share_all_proto[] = {
+	PROTO_NFS,
+	PROTO_SMB,
+	PROTO_END
+};
+
+/*
+ * Search the sharetab for the given mountpoint and protocol, returning
+ * a zfs_share_type_t value.
+ */
+static zfs_share_type_t
+is_shared(libzfs_handle_t *hdl, const char *mountpoint, zfs_share_proto_t proto)
+{
+	char line[MAXPATHLEN];
+	char *sep, *field;
+
+	/* without an open sharetab nothing can be reported as shared */
+	if (hdl->libzfs_sharetab == NULL)
+		return (SHARED_NOT_SHARED);
+
+	/* always scan from the top of the file */
+	(void) fseek(hdl->libzfs_sharetab, 0, SEEK_SET);
+
+	while (fgets(line, sizeof (line), hdl->libzfs_sharetab) != NULL) {
+		/* first tab-separated field: the mountpoint */
+		sep = strchr(line, '\t');
+		if (sep == NULL)
+			continue;
+		*sep = '\0';
+		if (strcmp(line, mountpoint) != 0)
+			continue;
+
+		/* second field: stepped over, its contents are not used */
+		field = sep + 1;
+		sep = strchr(field, '\t');
+		if (sep == NULL)
+			continue;
+
+		/* third field: the protocol name */
+		field = sep + 1;
+		sep = strchr(field, '\t');
+		if (sep == NULL)
+			continue;
+		*sep = '\0';
+		if (strcmp(field, proto_table[proto].p_name) != 0)
+			continue;
+
+		/* mountpoint and protocol both match */
+		if (proto == PROTO_NFS)
+			return (SHARED_NFS);
+		if (proto == PROTO_SMB)
+			return (SHARED_SMB);
+		return (0);
+	}
+
+	return (SHARED_NOT_SHARED);
+}
+
+/*
+ * Report whether a directory holds anything besides "." and "..".  A
+ * directory that cannot be opened is reported as empty, so that the mount
+ * itself is attempted and can fail with a more informative error message.
+ */
+static boolean_t
+dir_is_empty(const char *dirname)
+{
+	DIR *dirp;
+	struct dirent64 *dp;
+	boolean_t empty = B_TRUE;
+
+	if ((dirp = opendir(dirname)) == NULL)
+		return (B_TRUE);
+
+	/* stop reading at the first real entry */
+	while (empty && (dp = readdir64(dirp)) != NULL) {
+		if (strcmp(dp->d_name, ".") != 0 &&
+		    strcmp(dp->d_name, "..") != 0)
+			empty = B_FALSE;
+	}
+
+	(void) closedir(dirp);
+	return (empty);
+}
+
+/*
+ * Checks to see if the mount is active.  If the filesystem is mounted, we fill
+ * in 'where' with the current mountpoint, and return 1.  Otherwise, we return
+ * 0.
+ */
+boolean_t
+is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where)
+{
+	struct mnttab mnt;
+
+	if (libzfs_mnttab_find(zfs_hdl, special, &mnt) == 0) {
+		/* 'where' is optional; when given it receives a copy */
+		if (where != NULL)
+			*where = zfs_strdup(zfs_hdl, mnt.mnt_mountp);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Dataset-handle flavour of is_mounted(): the lookup key is the dataset name.
+ */
+boolean_t
+zfs_is_mounted(zfs_handle_t *zhp, char **where)
+{
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+	return (is_mounted(hdl, zfs_get_name(zhp), where));
+}
+
+/*
+ * Decide whether the dataset may be mounted.  The mountpoint property is
+ * written to 'buf' on the way, and when 'source' is non-NULL and the dataset
+ * is mountable it receives the source of that property.
+ */
+static boolean_t
+zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen,
+    zprop_source_t *source)
+{
+	char srcloc[ZFS_MAXNAMELEN];
+	zprop_source_t srctype;
+
+	if (!zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type))
+		return (B_FALSE);
+
+	verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, buf, buflen,
+	    &srctype, srcloc, sizeof (srcloc), B_FALSE) == 0);
+
+	/*
+	 * Not mountable when the mountpoint is "none" or "legacy", when
+	 * canmount is off, or when a zoned dataset is seen from the global
+	 * zone.
+	 */
+	if (strcmp(buf, ZFS_MOUNTPOINT_NONE) == 0 ||
+	    strcmp(buf, ZFS_MOUNTPOINT_LEGACY) == 0 ||
+	    zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_OFF ||
+	    (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) &&
+	    getzoneid() == GLOBAL_ZONEID))
+		return (B_FALSE);
+
+	if (source != NULL)
+		*source = srctype;
+
+	return (B_TRUE);
+}
+
+/*
+ * Mount the given filesystem.
+ */
+int
+zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
+{
+	struct stat buf;
+	char mountpoint[ZFS_MAXPROPLEN];
+	char mntopts[MNT_LINE_MAX];
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+	if (options == NULL)
+		mntopts[0] = '\0';
+	else
+		(void) strlcpy(mntopts, options, sizeof (mntopts));
+
+	/*
+	 * If the pool is imported read-only then all mounts must be read-only
+	 */
+	if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL))
+		flags |= MS_RDONLY;
+
+	/* a dataset that is not mountable is silently skipped */
+	if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL))
+		return (0);
+
+	/* Create the directory if it doesn't already exist */
+	if (lstat(mountpoint, &buf) != 0) {
+		if (mkdirp(mountpoint, 0755) != 0) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "failed to create mountpoint"));
+			return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
+			    dgettext(TEXT_DOMAIN, "cannot mount '%s'"),
+			    mountpoint));
+		}
+	}
+
+	/*
+	 * Determine if the mountpoint is empty.  If not, refuse to perform the
+	 * mount.  We don't perform this check if MS_OVERLAY is specified, which
+	 * would defeat the point.  We also avoid this check if 'remount' is
+	 * specified.
+	 */
+	if ((flags & MS_OVERLAY) == 0 &&
+	    strstr(mntopts, MNTOPT_REMOUNT) == NULL &&
+	    !dir_is_empty(mountpoint)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "directory is not empty"));
+		return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
+		    dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint));
+	}
+
+	/* perform the mount */
+	if (mount(zfs_get_name(zhp), mountpoint, MS_OPTIONSTR | flags,
+	    MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) {
+		/* capture errno before any helper call can disturb it */
+		int err = errno;
+
+		/*
+		 * Generic errors are nasty, but there are just way too many
+		 * from mount(), and they're well-understood.  We pick a few
+		 * common ones to improve upon.
+		 */
+		if (err == EBUSY) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "mountpoint or dataset is busy"));
+		} else if (err == EPERM) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "Insufficient privileges"));
+		} else if (err == ENOTSUP) {
+			char msg[256];
+			int spa_version;
+
+			VERIFY(zfs_spa_version(zhp, &spa_version) == 0);
+			(void) snprintf(msg, sizeof (msg),
+			    dgettext(TEXT_DOMAIN, "Can't mount a version %lld "
+			    "file system on a version %d pool. Pool must be"
+			    " upgraded to mount this file system."),
+			    (u_longlong_t)zfs_prop_get_int(zhp,
+			    ZFS_PROP_VERSION), spa_version);
+			/*
+			 * The message is already translated and formatted;
+			 * hand it over as data, not as a format string.
+			 */
+			zfs_error_aux(hdl, "%s", msg);
+		} else {
+			zfs_error_aux(hdl, "%s", strerror(err));
+		}
+		return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
+		    dgettext(TEXT_DOMAIN, "cannot mount '%s'"),
+		    zhp->zfs_name));
+	}
+
+	/* add the mounted entry into our cache */
+	libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint,
+	    mntopts);
+	return (0);
+}
+
+/*
+ * Unmount a single filesystem.  Returns 0 on success; on failure the error
+ * is recorded against 'hdl' and the result of zfs_error_fmt() is returned.
+ */
+static int
+unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags)
+{
+	if (umount2(mountpoint, flags) != 0) {
+		zfs_error_aux(hdl, "%s", strerror(errno));
+		return (zfs_error_fmt(hdl, EZFS_UMOUNTFAILED,
+		    dgettext(TEXT_DOMAIN, "cannot unmount '%s'"),
+		    mountpoint));
+	}
+
+	return (0);
+}
+
+/*
+ * Unmount the given filesystem.
+ *
+ * When 'mountpoint' is NULL the mountpoint is looked up in the mnttab cache,
+ * and nothing is done (returning 0) if the dataset is not a mounted
+ * filesystem.  The filesystem is unshared before it is unmounted; if the
+ * unmount then fails, zfs_shareall() is called to share it again.  Returns 0
+ * on success and -1 on failure.
+ */
+int
+zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
+{
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	struct mnttab entry;
+	char *mntpt = NULL;
+
+	/* check to see if we need to unmount the filesystem */
+	if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
+	    libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) {
+		/*
+		 * mountpoint may have come from a call to
+		 * getmnt/getmntany if it isn't NULL. If it is NULL,
+		 * we know it comes from libzfs_mnttab_find which can
+		 * then get freed later. We strdup it to play it safe.
+		 */
+		if (mountpoint == NULL)
+			mntpt = zfs_strdup(hdl, entry.mnt_mountp);
+		else
+			mntpt = zfs_strdup(hdl, mountpoint);
+
+		/* zfs_strdup() can fail; do not pass NULL further down */
+		if (mntpt == NULL)
+			return (-1);
+
+		/*
+		 * Unshare and unmount the filesystem
+		 */
+		if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0) {
+			free(mntpt);
+			return (-1);
+		}
+
+		if (unmount_one(hdl, mntpt, flags) != 0) {
+			free(mntpt);
+			(void) zfs_shareall(zhp);
+			return (-1);
+		}
+		libzfs_mnttab_remove(hdl, zhp->zfs_name);
+		free(mntpt);
+	}
+
+	return (0);
+}
+
+/*
+ * Unmount this filesystem and any children inheriting the mountpoint property.
+ * To do this, just act like we're changing the mountpoint property, but don't
+ * remount the filesystems afterwards.
+ */
+int
+zfs_unmountall(zfs_handle_t *zhp, int flags)
+{
+	prop_changelist_t *clp;
+	int ret;
+
+	clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, flags);
+	if (clp == NULL)
+		return (-1);
+
+	ret = changelist_prefix(clp);
+	changelist_free(clp);
+
+	return (ret);
+}
+
+/*
+ * Return B_TRUE if the filesystem is shared over any protocol listed in
+ * share_all_proto.  Volumes are never reported as shared.
+ */
+boolean_t
+zfs_is_shared(zfs_handle_t *zhp)
+{
+	zfs_share_type_t rc = 0;
+	zfs_share_proto_t *curr_proto;
+
+	if (ZFS_IS_VOLUME(zhp))
+		return (B_FALSE);
+
+	for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
+	    curr_proto++)
+		rc |= zfs_is_shared_proto(zhp, NULL, *curr_proto);
+
+	return (rc ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Share the filesystem over every protocol in share_all_proto.  Must not be
+ * called on a volume.
+ */
+int
+zfs_share(zfs_handle_t *zhp)
+{
+	assert(!ZFS_IS_VOLUME(zhp));
+	return (zfs_share_proto(zhp, share_all_proto));
+}
+
+/*
+ * Unshare the filesystem via zfs_unshareall().  Must not be called on a
+ * volume.
+ */
+int
+zfs_unshare(zfs_handle_t *zhp)
+{
+	assert(!ZFS_IS_VOLUME(zhp));
+	return (zfs_unshareall(zhp));
+}
+
+/*
+ * Check to see if the filesystem is currently shared.
+ */
+zfs_share_type_t
+zfs_is_shared_proto(zfs_handle_t *zhp, char **where, zfs_share_proto_t proto)
+{
+	char *mntpt;
+	zfs_share_type_t type;
+
+	/* a filesystem that is not mounted cannot be shared */
+	if (!zfs_is_mounted(zhp, &mntpt))
+		return (SHARED_NOT_SHARED);
+
+	type = is_shared(zhp->zfs_hdl, mntpt, proto);
+	if (!type) {
+		free(mntpt);
+		return (SHARED_NOT_SHARED);
+	}
+
+	/* shared: pass the mountpoint string to the caller if it is wanted */
+	if (where != NULL)
+		*where = mntpt;
+	else
+		free(mntpt);
+
+	return (type);
+}
+
+/*
+ * NFS-only form of zfs_is_shared_proto(), reduced to a yes/no answer.
+ */
+boolean_t
+zfs_is_shared_nfs(zfs_handle_t *zhp, char **where)
+{
+	zfs_share_type_t type = zfs_is_shared_proto(zhp, where, PROTO_NFS);
+
+	return (type != SHARED_NOT_SHARED);
+}
+
+/*
+ * SMB-only form of zfs_is_shared_proto(), reduced to a yes/no answer.
+ */
+boolean_t
+zfs_is_shared_smb(zfs_handle_t *zhp, char **where)
+{
+	zfs_share_type_t type = zfs_is_shared_proto(zhp, where, PROTO_SMB);
+
+	return (type != SHARED_NOT_SHARED);
+}
+
+/*
+ * libshare may not be installed, so nothing here links against it directly.
+ * Its entry points are reached through the pointers below, which
+ * _zfs_init_libshare() fills in and which must be checked before use.
+ */
+
+static sa_handle_t (*_sa_init)(int);
+static void (*_sa_fini)(sa_handle_t);
+static sa_share_t (*_sa_find_share)(sa_handle_t, char *);
+static int (*_sa_enable_share)(sa_share_t, char *);
+static int (*_sa_disable_share)(sa_share_t, char *);
+static char *(*_sa_errorstr)(int);
+static int (*_sa_parse_legacy_options)(sa_group_t, char *, char *);
+static boolean_t (*_sa_needs_refresh)(sa_handle_t *);
+static libzfs_handle_t *(*_sa_get_zfs_handle)(sa_handle_t);
+static int (*_sa_zfs_process_share)(sa_handle_t, sa_group_t, sa_share_t,
+    char *, char *, zprop_source_t, char *, char *, char *);
+static void (*_sa_update_sharetab_ts)(sa_handle_t);
+
+/*
+ * _zfs_init_libshare()
+ *
+ * Find the libshare.so.1 entry points that we use here and save the
+ * values to be used later. This is triggered by the runtime loader.
+ * Make sure the correct ISA version is loaded.
+ */ + +#pragma init(_zfs_init_libshare) +static void +_zfs_init_libshare(void) +{ + void *libshare; + char path[MAXPATHLEN]; + char isa[MAXISALEN]; + +#if defined(_LP64) + if (sysinfo(SI_ARCHITECTURE_64, isa, MAXISALEN) == -1) + isa[0] = '\0'; +#else + isa[0] = '\0'; +#endif + (void) snprintf(path, MAXPATHLEN, + "/usr/lib/%s/libshare.so.1", isa); + + if ((libshare = dlopen(path, RTLD_LAZY | RTLD_GLOBAL)) != NULL) { + _sa_init = (sa_handle_t (*)(int))dlsym(libshare, "sa_init"); + _sa_fini = (void (*)(sa_handle_t))dlsym(libshare, "sa_fini"); + _sa_find_share = (sa_share_t (*)(sa_handle_t, char *)) + dlsym(libshare, "sa_find_share"); + _sa_enable_share = (int (*)(sa_share_t, char *))dlsym(libshare, + "sa_enable_share"); + _sa_disable_share = (int (*)(sa_share_t, char *))dlsym(libshare, + "sa_disable_share"); + _sa_errorstr = (char *(*)(int))dlsym(libshare, "sa_errorstr"); + _sa_parse_legacy_options = (int (*)(sa_group_t, char *, char *)) + dlsym(libshare, "sa_parse_legacy_options"); + _sa_needs_refresh = (boolean_t (*)(sa_handle_t *)) + dlsym(libshare, "sa_needs_refresh"); + _sa_get_zfs_handle = (libzfs_handle_t *(*)(sa_handle_t)) + dlsym(libshare, "sa_get_zfs_handle"); + _sa_zfs_process_share = (int (*)(sa_handle_t, sa_group_t, + sa_share_t, char *, char *, zprop_source_t, char *, + char *, char *))dlsym(libshare, "sa_zfs_process_share"); + _sa_update_sharetab_ts = (void (*)(sa_handle_t)) + dlsym(libshare, "sa_update_sharetab_ts"); + if (_sa_init == NULL || _sa_fini == NULL || + _sa_find_share == NULL || _sa_enable_share == NULL || + _sa_disable_share == NULL || _sa_errorstr == NULL || + _sa_parse_legacy_options == NULL || + _sa_needs_refresh == NULL || _sa_get_zfs_handle == NULL || + _sa_zfs_process_share == NULL || + _sa_update_sharetab_ts == NULL) { + _sa_init = NULL; + _sa_fini = NULL; + _sa_disable_share = NULL; + _sa_enable_share = NULL; + _sa_errorstr = NULL; + _sa_parse_legacy_options = NULL; + (void) dlclose(libshare); + _sa_needs_refresh = NULL; + 
_sa_get_zfs_handle = NULL; + _sa_zfs_process_share = NULL; + _sa_update_sharetab_ts = NULL; + } + } +} + +/* + * zfs_init_libshare(zhandle, service) + * + * Initialize the libshare API if it hasn't already been initialized. + * In all cases it returns 0 if it succeeded and an error if not. The + * service value is which part(s) of the API to initialize and is a + * direct map to the libshare sa_init(service) interface. + */ +int +zfs_init_libshare(libzfs_handle_t *zhandle, int service) +{ + int ret = SA_OK; + + if (_sa_init == NULL) + ret = SA_CONFIG_ERR; + + if (ret == SA_OK && zhandle->libzfs_shareflags & ZFSSHARE_MISS) { + /* + * We had a cache miss. Most likely it is a new ZFS + * dataset that was just created. We want to make sure + * so check timestamps to see if a different process + * has updated any of the configuration. If there was + * some non-ZFS change, we need to re-initialize the + * internal cache. + */ + zhandle->libzfs_shareflags &= ~ZFSSHARE_MISS; + if (_sa_needs_refresh != NULL && + _sa_needs_refresh(zhandle->libzfs_sharehdl)) { + zfs_uninit_libshare(zhandle); + zhandle->libzfs_sharehdl = _sa_init(service); + } + } + + if (ret == SA_OK && zhandle && zhandle->libzfs_sharehdl == NULL) + zhandle->libzfs_sharehdl = _sa_init(service); + + if (ret == SA_OK && zhandle->libzfs_sharehdl == NULL) + ret = SA_NO_MEMORY; + + return (ret); +} + +/* + * zfs_uninit_libshare(zhandle) + * + * Uninitialize the libshare API if it hasn't already been + * uninitialized. It is OK to call multiple times. + */ +void +zfs_uninit_libshare(libzfs_handle_t *zhandle) +{ + if (zhandle != NULL && zhandle->libzfs_sharehdl != NULL) { + if (_sa_fini != NULL) + _sa_fini(zhandle->libzfs_sharehdl); + zhandle->libzfs_sharehdl = NULL; + } +} + +/* + * zfs_parse_options(options, proto) + * + * Call the legacy parse interface to get the protocol specific + * options using the NULL arg to indicate that this is a "parse" only. 
+ */ +int +zfs_parse_options(char *options, zfs_share_proto_t proto) +{ + if (_sa_parse_legacy_options != NULL) { + return (_sa_parse_legacy_options(NULL, options, + proto_table[proto].p_name)); + } + return (SA_CONFIG_ERR); +} + +/* + * zfs_sa_find_share(handle, path) + * + * wrapper around sa_find_share to find a share path in the + * configuration. + */ +static sa_share_t +zfs_sa_find_share(sa_handle_t handle, char *path) +{ + if (_sa_find_share != NULL) + return (_sa_find_share(handle, path)); + return (NULL); +} + +/* + * zfs_sa_enable_share(share, proto) + * + * Wrapper for sa_enable_share which enables a share for a specified + * protocol. + */ +static int +zfs_sa_enable_share(sa_share_t share, char *proto) +{ + if (_sa_enable_share != NULL) + return (_sa_enable_share(share, proto)); + return (SA_CONFIG_ERR); +} + +/* + * zfs_sa_disable_share(share, proto) + * + * Wrapper for sa_enable_share which disables a share for a specified + * protocol. + */ +static int +zfs_sa_disable_share(sa_share_t share, char *proto) +{ + if (_sa_disable_share != NULL) + return (_sa_disable_share(share, proto)); + return (SA_CONFIG_ERR); +} + +/* + * Share the given filesystem according to the options in the specified + * protocol specific properties (sharenfs, sharesmb). We rely + * on "libshare" to the dirty work for us. + */ +static int +zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) +{ + char mountpoint[ZFS_MAXPROPLEN]; + char shareopts[ZFS_MAXPROPLEN]; + char sourcestr[ZFS_MAXPROPLEN]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + sa_share_t share; + zfs_share_proto_t *curr_proto; + zprop_source_t sourcetype; + int ret; + + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) + return (0); + + if ((ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API)) != SA_OK) { + (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED, + dgettext(TEXT_DOMAIN, "cannot share '%s': %s"), + zfs_get_name(zhp), _sa_errorstr != NULL ? 
+ _sa_errorstr(ret) : ""); + return (-1); + } + + for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { + /* + * Return success if there are no share options. + */ + if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop, + shareopts, sizeof (shareopts), &sourcetype, sourcestr, + ZFS_MAXPROPLEN, B_FALSE) != 0 || + strcmp(shareopts, "off") == 0) + continue; + + /* + * If the 'zoned' property is set, then zfs_is_mountable() + * will have already bailed out if we are in the global zone. + * But local zones cannot be NFS servers, so we ignore it for + * local zones as well. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) + continue; + + share = zfs_sa_find_share(hdl->libzfs_sharehdl, mountpoint); + if (share == NULL) { + /* + * This may be a new file system that was just + * created so isn't in the internal cache + * (second time through). Rather than + * reloading the entire configuration, we can + * assume ZFS has done the checking and it is + * safe to add this to the internal + * configuration. 
+ */ + if (_sa_zfs_process_share(hdl->libzfs_sharehdl, + NULL, NULL, mountpoint, + proto_table[*curr_proto].p_name, sourcetype, + shareopts, sourcestr, zhp->zfs_name) != SA_OK) { + (void) zfs_error_fmt(hdl, + proto_table[*curr_proto].p_share_err, + dgettext(TEXT_DOMAIN, "cannot share '%s'"), + zfs_get_name(zhp)); + return (-1); + } + hdl->libzfs_shareflags |= ZFSSHARE_MISS; + share = zfs_sa_find_share(hdl->libzfs_sharehdl, + mountpoint); + } + if (share != NULL) { + int err; + err = zfs_sa_enable_share(share, + proto_table[*curr_proto].p_name); + if (err != SA_OK) { + (void) zfs_error_fmt(hdl, + proto_table[*curr_proto].p_share_err, + dgettext(TEXT_DOMAIN, "cannot share '%s'"), + zfs_get_name(zhp)); + return (-1); + } + } else { + (void) zfs_error_fmt(hdl, + proto_table[*curr_proto].p_share_err, + dgettext(TEXT_DOMAIN, "cannot share '%s'"), + zfs_get_name(zhp)); + return (-1); + } + + } + return (0); +} + + +int +zfs_share_nfs(zfs_handle_t *zhp) +{ + return (zfs_share_proto(zhp, nfs_only)); +} + +int +zfs_share_smb(zfs_handle_t *zhp) +{ + return (zfs_share_proto(zhp, smb_only)); +} + +int +zfs_shareall(zfs_handle_t *zhp) +{ + return (zfs_share_proto(zhp, share_all_proto)); +} + +/* + * Unshare a filesystem by mountpoint. + */ +static int +unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, + zfs_share_proto_t proto) +{ + sa_share_t share; + int err; + char *mntpt; + /* + * Mountpoint could get trashed if libshare calls getmntany + * which it does during API initialization, so strdup the + * value. 
+ */ + mntpt = zfs_strdup(hdl, mountpoint); + + /* make sure libshare initialized */ + if ((err = zfs_init_libshare(hdl, SA_INIT_SHARE_API)) != SA_OK) { + free(mntpt); /* don't need the copy anymore */ + return (zfs_error_fmt(hdl, EZFS_SHARENFSFAILED, + dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), + name, _sa_errorstr(err))); + } + + share = zfs_sa_find_share(hdl->libzfs_sharehdl, mntpt); + free(mntpt); /* don't need the copy anymore */ + + if (share != NULL) { + err = zfs_sa_disable_share(share, proto_table[proto].p_name); + if (err != SA_OK) { + return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED, + dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"), + name, _sa_errorstr(err))); + } + } else { + return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED, + dgettext(TEXT_DOMAIN, "cannot unshare '%s': not found"), + name)); + } + return (0); +} + +/* + * Unshare the given filesystem. + */ +int +zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, + zfs_share_proto_t *proto) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; + char *mntpt = NULL; + + /* check to see if need to unmount the filesystem */ + rewind(zhp->zfs_hdl->libzfs_mnttab); + if (mountpoint != NULL) + mountpoint = mntpt = zfs_strdup(hdl, mountpoint); + + if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && + libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) { + zfs_share_proto_t *curr_proto; + + if (mountpoint == NULL) + mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp); + + for (curr_proto = proto; *curr_proto != PROTO_END; + curr_proto++) { + + if (is_shared(hdl, mntpt, *curr_proto) && + unshare_one(hdl, zhp->zfs_name, + mntpt, *curr_proto) != 0) { + if (mntpt != NULL) + free(mntpt); + return (-1); + } + } + } + if (mntpt != NULL) + free(mntpt); + + return (0); +} + +int +zfs_unshare_nfs(zfs_handle_t *zhp, const char *mountpoint) +{ + return (zfs_unshare_proto(zhp, mountpoint, nfs_only)); +} + +int +zfs_unshare_smb(zfs_handle_t *zhp, const char *mountpoint) 
+{ + return (zfs_unshare_proto(zhp, mountpoint, smb_only)); +} + +/* + * Same as zfs_unmountall(), but for NFS and SMB unshares. + */ +int +zfs_unshareall_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) +{ + prop_changelist_t *clp; + int ret; + + clp = changelist_gather(zhp, ZFS_PROP_SHARENFS, 0, 0); + if (clp == NULL) + return (-1); + + ret = changelist_unshare(clp, proto); + changelist_free(clp); + + return (ret); +} + +int +zfs_unshareall_nfs(zfs_handle_t *zhp) +{ + return (zfs_unshareall_proto(zhp, nfs_only)); +} + +int +zfs_unshareall_smb(zfs_handle_t *zhp) +{ + return (zfs_unshareall_proto(zhp, smb_only)); +} + +int +zfs_unshareall(zfs_handle_t *zhp) +{ + return (zfs_unshareall_proto(zhp, share_all_proto)); +} + +int +zfs_unshareall_bypath(zfs_handle_t *zhp, const char *mountpoint) +{ + return (zfs_unshare_proto(zhp, mountpoint, share_all_proto)); +} + +/* + * Remove the mountpoint associated with the current dataset, if necessary. + * We only remove the underlying directory if: + * + * - The mountpoint is not 'none' or 'legacy' + * - The mountpoint is non-empty + * - The mountpoint is the default or inherited + * - The 'zoned' property is set, or we're in a local zone + * + * Any other directories we leave alone. + */ +void +remove_mountpoint(zfs_handle_t *zhp) +{ + char mountpoint[ZFS_MAXPROPLEN]; + zprop_source_t source; + + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), + &source)) + return; + + if (source == ZPROP_SRC_DEFAULT || + source == ZPROP_SRC_INHERITED) { + /* + * Try to remove the directory, silently ignoring any errors. + * The filesystem may have since been removed or moved around, + * and this error isn't really useful to the administrator in + * any way. + */ + (void) rmdir(mountpoint); + } +} + +void +libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp) +{ + if (cbp->cb_alloc == cbp->cb_used) { + size_t newsz; + void *ptr; + + newsz = cbp->cb_alloc ? 
cbp->cb_alloc * 2 : 64; + ptr = zfs_realloc(zhp->zfs_hdl, + cbp->cb_handles, cbp->cb_alloc * sizeof (void *), + newsz * sizeof (void *)); + cbp->cb_handles = ptr; + cbp->cb_alloc = newsz; + } + cbp->cb_handles[cbp->cb_used++] = zhp; +} + +static int +mount_cb(zfs_handle_t *zhp, void *data) +{ + get_all_cb_t *cbp = data; + + if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) { + zfs_close(zhp); + return (0); + } + + if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) { + zfs_close(zhp); + return (0); + } + + libzfs_add_handle(cbp, zhp); + if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) { + zfs_close(zhp); + return (-1); + } + return (0); +} + +int +libzfs_dataset_cmp(const void *a, const void *b) +{ + zfs_handle_t **za = (zfs_handle_t **)a; + zfs_handle_t **zb = (zfs_handle_t **)b; + char mounta[MAXPATHLEN]; + char mountb[MAXPATHLEN]; + boolean_t gota, gotb; + + if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0) + verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta, + sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0); + if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0) + verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb, + sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0); + + if (gota && gotb) + return (strcmp(mounta, mountb)); + + if (gota) + return (-1); + if (gotb) + return (1); + + return (strcmp(zfs_get_name(a), zfs_get_name(b))); +} + +/* + * Mount and share all datasets within the given pool. This assumes that no + * datasets within the pool are currently mounted. Because users can create + * complicated nested hierarchies of mountpoints, we first gather all the + * datasets and mountpoints within the pool, and sort them by mountpoint. Once + * we have the list of all filesystems, we iterate over them in order and mount + * and/or share each one. 
+ */ +#pragma weak zpool_mount_datasets = zpool_enable_datasets +int +zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) +{ + get_all_cb_t cb = { 0 }; + libzfs_handle_t *hdl = zhp->zpool_hdl; + zfs_handle_t *zfsp; + int i, ret = -1; + int *good; + + /* + * Gather all non-snap datasets within the pool. + */ + if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL) + goto out; + + libzfs_add_handle(&cb, zfsp); + if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0) + goto out; + /* + * Sort the datasets by mountpoint. + */ + qsort(cb.cb_handles, cb.cb_used, sizeof (void *), + libzfs_dataset_cmp); + + /* + * And mount all the datasets, keeping track of which ones + * succeeded or failed. + */ + if ((good = zfs_alloc(zhp->zpool_hdl, + cb.cb_used * sizeof (int))) == NULL) + goto out; + + ret = 0; + for (i = 0; i < cb.cb_used; i++) { + if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0) + ret = -1; + else + good[i] = 1; + } + + /* + * Then share all the ones that need to be shared. This needs + * to be a separate pass in order to avoid excessive reloading + * of the configuration. Good should never be NULL since + * zfs_alloc is supposed to exit if memory isn't available. + */ + for (i = 0; i < cb.cb_used; i++) { + if (good[i] && zfs_share(cb.cb_handles[i]) != 0) + ret = -1; + } + + free(good); + +out: + for (i = 0; i < cb.cb_used; i++) + zfs_close(cb.cb_handles[i]); + free(cb.cb_handles); + + return (ret); +} + +static int +mountpoint_compare(const void *a, const void *b) +{ + const char *mounta = *((char **)a); + const char *mountb = *((char **)b); + + return (strcmp(mountb, mounta)); +} + +/* alias for 2002/240 */ +#pragma weak zpool_unmount_datasets = zpool_disable_datasets +/* + * Unshare and unmount all datasets within the given pool. 
We don't want to + * rely on traversing the DSL to discover the filesystems within the pool, + * because this may be expensive (if not all of them are mounted), and can fail + * arbitrarily (on I/O error, for example). Instead, we walk /etc/mnttab and + * gather all the filesystems that are currently mounted. + */ +int +zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) +{ + int used, alloc; + struct mnttab entry; + size_t namelen; + char **mountpoints = NULL; + zfs_handle_t **datasets = NULL; + libzfs_handle_t *hdl = zhp->zpool_hdl; + int i; + int ret = -1; + int flags = (force ? MS_FORCE : 0); + + namelen = strlen(zhp->zpool_name); + + rewind(hdl->libzfs_mnttab); + used = alloc = 0; + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + /* + * Ignore non-ZFS entries. + */ + if (entry.mnt_fstype == NULL || + strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + + /* + * Ignore filesystems not within this pool. + */ + if (entry.mnt_mountp == NULL || + strncmp(entry.mnt_special, zhp->zpool_name, namelen) != 0 || + (entry.mnt_special[namelen] != '/' && + entry.mnt_special[namelen] != '\0')) + continue; + + /* + * At this point we've found a filesystem within our pool. Add + * it to our growing list. + */ + if (used == alloc) { + if (alloc == 0) { + if ((mountpoints = zfs_alloc(hdl, + 8 * sizeof (void *))) == NULL) + goto out; + + if ((datasets = zfs_alloc(hdl, + 8 * sizeof (void *))) == NULL) + goto out; + + alloc = 8; + } else { + void *ptr; + + if ((ptr = zfs_realloc(hdl, mountpoints, + alloc * sizeof (void *), + alloc * 2 * sizeof (void *))) == NULL) + goto out; + mountpoints = ptr; + + if ((ptr = zfs_realloc(hdl, datasets, + alloc * sizeof (void *), + alloc * 2 * sizeof (void *))) == NULL) + goto out; + datasets = ptr; + + alloc *= 2; + } + } + + if ((mountpoints[used] = zfs_strdup(hdl, + entry.mnt_mountp)) == NULL) + goto out; + + /* + * This is allowed to fail, in case there is some I/O error. 
It + * is only used to determine if we need to remove the underlying + * mountpoint, so failure is not fatal. + */ + datasets[used] = make_dataset_handle(hdl, entry.mnt_special); + + used++; + } + + /* + * At this point, we have the entire list of filesystems, so sort it by + * mountpoint. + */ + qsort(mountpoints, used, sizeof (char *), mountpoint_compare); + + /* + * Walk through and first unshare everything. + */ + for (i = 0; i < used; i++) { + zfs_share_proto_t *curr_proto; + for (curr_proto = share_all_proto; *curr_proto != PROTO_END; + curr_proto++) { + if (is_shared(hdl, mountpoints[i], *curr_proto) && + unshare_one(hdl, mountpoints[i], + mountpoints[i], *curr_proto) != 0) + goto out; + } + } + + /* + * Now unmount everything, removing the underlying directories as + * appropriate. + */ + for (i = 0; i < used; i++) { + if (unmount_one(hdl, mountpoints[i], flags) != 0) + goto out; + } + + for (i = 0; i < used; i++) { + if (datasets[i]) + remove_mountpoint(datasets[i]); + } + + ret = 0; +out: + for (i = 0; i < used; i++) { + if (datasets[i]) + zfs_close(datasets[i]); + free(mountpoints[i]); + } + free(datasets); + free(mountpoints); + + return (ret); +} diff --git a/lib/libzfs/common/libzfs_pool.c b/lib/libzfs/common/libzfs_pool.c new file mode 100644 index 0000000..7df7e91 --- /dev/null +++ b/lib/libzfs/common/libzfs_pool.c @@ -0,0 +1,3803 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "libzfs_impl.h" +#include "zfs_comutil.h" + +static int read_efi_label(nvlist_t *config, diskaddr_t *sb); + +#define DISK_ROOT "/dev/dsk" +#define RDISK_ROOT "/dev/rdsk" +#define BACKUP_SLICE "s2" + +typedef struct prop_flags { + int create:1; /* Validate property on creation */ + int import:1; /* Validate property on import */ +} prop_flags_t; + +/* + * ==================================================================== + * zpool property functions + * ==================================================================== + */ + +static int +zpool_get_all_props(zpool_handle_t *zhp) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) + return (-1); + + while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { + if (errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } else { + zcmd_free_nvlists(&zc); + return (-1); + } + } + + if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + + zcmd_free_nvlists(&zc); + + return (0); +} + +static int +zpool_props_refresh(zpool_handle_t *zhp) +{ + nvlist_t *old_props; + + old_props = zhp->zpool_props; + + if (zpool_get_all_props(zhp) != 0) + return (-1); + + nvlist_free(old_props); + return (0); +} + +static char * +zpool_get_prop_string(zpool_handle_t *zhp, 
zpool_prop_t prop, + zprop_source_t *src) +{ + nvlist_t *nv, *nvl; + uint64_t ival; + char *value; + zprop_source_t source; + + nvl = zhp->zpool_props; + if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { + verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0); + source = ival; + verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); + } else { + source = ZPROP_SRC_DEFAULT; + if ((value = (char *)zpool_prop_default_string(prop)) == NULL) + value = "-"; + } + + if (src) + *src = source; + + return (value); +} + +uint64_t +zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) +{ + nvlist_t *nv, *nvl; + uint64_t value; + zprop_source_t source; + + if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) { + /* + * zpool_get_all_props() has most likely failed because + * the pool is faulted, but if all we need is the top level + * vdev's guid then get it from the zhp config nvlist. + */ + if ((prop == ZPOOL_PROP_GUID) && + (nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) && + (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) + == 0)) { + return (value); + } + return (zpool_prop_default_numeric(prop)); + } + + nvl = zhp->zpool_props; + if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { + verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0); + source = value; + verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); + } else { + source = ZPROP_SRC_DEFAULT; + value = zpool_prop_default_numeric(prop); + } + + if (src) + *src = source; + + return (value); +} + +/* + * Map VDEV STATE to printed strings. 
+ */ +char * +zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) +{ + switch (state) { + case VDEV_STATE_CLOSED: + case VDEV_STATE_OFFLINE: + return (gettext("OFFLINE")); + case VDEV_STATE_REMOVED: + return (gettext("REMOVED")); + case VDEV_STATE_CANT_OPEN: + if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) + return (gettext("FAULTED")); + else if (aux == VDEV_AUX_SPLIT_POOL) + return (gettext("SPLIT")); + else + return (gettext("UNAVAIL")); + case VDEV_STATE_FAULTED: + return (gettext("FAULTED")); + case VDEV_STATE_DEGRADED: + return (gettext("DEGRADED")); + case VDEV_STATE_HEALTHY: + return (gettext("ONLINE")); + } + + return (gettext("UNKNOWN")); +} + +/* + * Get a zpool property value for 'prop' and return the value in + * a pre-allocated buffer. + */ +int +zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, + zprop_source_t *srctype) +{ + uint64_t intval; + const char *strval; + zprop_source_t src = ZPROP_SRC_NONE; + nvlist_t *nvroot; + vdev_stat_t *vs; + uint_t vsc; + + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + switch (prop) { + case ZPOOL_PROP_NAME: + (void) strlcpy(buf, zpool_get_name(zhp), len); + break; + + case ZPOOL_PROP_HEALTH: + (void) strlcpy(buf, "FAULTED", len); + break; + + case ZPOOL_PROP_GUID: + intval = zpool_get_prop_int(zhp, prop, &src); + (void) snprintf(buf, len, "%llu", intval); + break; + + case ZPOOL_PROP_ALTROOT: + case ZPOOL_PROP_CACHEFILE: + if (zhp->zpool_props != NULL || + zpool_get_all_props(zhp) == 0) { + (void) strlcpy(buf, + zpool_get_prop_string(zhp, prop, &src), + len); + if (srctype != NULL) + *srctype = src; + return (0); + } + /* FALLTHROUGH */ + default: + (void) strlcpy(buf, "-", len); + break; + } + + if (srctype != NULL) + *srctype = src; + return (0); + } + + if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && + prop != ZPOOL_PROP_NAME) + return (-1); + + switch (zpool_prop_get_type(prop)) { + case PROP_TYPE_STRING: + (void) strlcpy(buf, 
zpool_get_prop_string(zhp, prop, &src), + len); + break; + + case PROP_TYPE_NUMBER: + intval = zpool_get_prop_int(zhp, prop, &src); + + switch (prop) { + case ZPOOL_PROP_SIZE: + case ZPOOL_PROP_ALLOCATED: + case ZPOOL_PROP_FREE: + (void) zfs_nicenum(intval, buf, len); + break; + + case ZPOOL_PROP_CAPACITY: + (void) snprintf(buf, len, "%llu%%", + (u_longlong_t)intval); + break; + + case ZPOOL_PROP_DEDUPRATIO: + (void) snprintf(buf, len, "%llu.%02llux", + (u_longlong_t)(intval / 100), + (u_longlong_t)(intval % 100)); + break; + + case ZPOOL_PROP_HEALTH: + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); + + (void) strlcpy(buf, zpool_state_to_name(intval, + vs->vs_aux), len); + break; + default: + (void) snprintf(buf, len, "%llu", intval); + } + break; + + case PROP_TYPE_INDEX: + intval = zpool_get_prop_int(zhp, prop, &src); + if (zpool_prop_index_to_string(prop, intval, &strval) + != 0) + return (-1); + (void) strlcpy(buf, strval, len); + break; + + default: + abort(); + } + + if (srctype) + *srctype = src; + + return (0); +} + +/* + * Check if the bootfs name has the same pool name as it is set to. + * Assuming bootfs is a valid dataset name. + */ +static boolean_t +bootfs_name_valid(const char *pool, char *bootfs) +{ + int len = strlen(pool); + + if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT)) + return (B_FALSE); + + if (strncmp(pool, bootfs, len) == 0 && + (bootfs[len] == '/' || bootfs[len] == '\0')) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Inspect the configuration to determine if any of the devices contain + * an EFI label. 
+ */ +static boolean_t +pool_uses_efi(nvlist_t *config) +{ + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (read_efi_label(config, NULL) >= 0); + + for (c = 0; c < children; c++) { + if (pool_uses_efi(child[c])) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +pool_is_bootable(zpool_handle_t *zhp) +{ + char bootfs[ZPOOL_MAXNAMELEN]; + + return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs, + sizeof (bootfs), NULL) == 0 && strncmp(bootfs, "-", + sizeof (bootfs)) != 0); +} + + +/* + * Given an nvlist of zpool properties to be set, validate that they are + * correct, and parse any numeric properties (index, boolean, etc) if they are + * specified as strings. + */ +static nvlist_t * +zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, + nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) +{ + nvpair_t *elem; + nvlist_t *retprops; + zpool_prop_t prop; + char *strval; + uint64_t intval; + char *slash; + struct stat64 statbuf; + zpool_handle_t *zhp; + nvlist_t *nvroot; + + if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + const char *propname = nvpair_name(elem); + + /* + * Make sure this property is valid and applies to this type. + */ + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (zpool_prop_readonly(prop)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " + "is readonly"), propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); + goto error; + } + + if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops, + &strval, &intval, errbuf) != 0) + goto error; + + /* + * Perform additional checking for specific properties. 
+ */ + switch (prop) { + case ZPOOL_PROP_VERSION: + if (intval < version || intval > SPA_VERSION) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' number %d is invalid."), + propname, intval); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + goto error; + } + break; + + case ZPOOL_PROP_BOOTFS: + if (flags.create || flags.import) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' cannot be set at creation " + "or import time"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (version < SPA_VERSION_BOOTFS) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to support " + "'%s' property"), propname); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + goto error; + } + + /* + * bootfs property value has to be a dataset name and + * the dataset has to be in the same pool as it sets to. + */ + if (strval[0] != '\0' && !bootfs_name_valid(poolname, + strval)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " + "is an invalid name"), strval); + (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto error; + } + + if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "could not open pool '%s'"), poolname); + (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); + goto error; + } + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + + /* + * bootfs property cannot be set on a disk which has + * been EFI labeled. 
+ */ + if (pool_uses_efi(nvroot)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' not supported on " + "EFI labeled devices"), propname); + (void) zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf); + zpool_close(zhp); + goto error; + } + zpool_close(zhp); + break; + + case ZPOOL_PROP_ALTROOT: + if (!flags.create && !flags.import) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set during pool " + "creation or import"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (strval[0] != '/') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "bad alternate root '%s'"), strval); + (void) zfs_error(hdl, EZFS_BADPATH, errbuf); + goto error; + } + break; + + case ZPOOL_PROP_CACHEFILE: + if (strval[0] == '\0') + break; + + if (strcmp(strval, "none") == 0) + break; + + if (strval[0] != '/') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' must be empty, an " + "absolute path, or 'none'"), propname); + (void) zfs_error(hdl, EZFS_BADPATH, errbuf); + goto error; + } + + slash = strrchr(strval, '/'); + + if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || + strcmp(slash, "/..") == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is not a valid file"), strval); + (void) zfs_error(hdl, EZFS_BADPATH, errbuf); + goto error; + } + + *slash = '\0'; + + if (strval[0] != '\0' && + (stat64(strval, &statbuf) != 0 || + !S_ISDIR(statbuf.st_mode))) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is not a valid directory"), + strval); + (void) zfs_error(hdl, EZFS_BADPATH, errbuf); + goto error; + } + + *slash = '/'; + break; + + case ZPOOL_PROP_READONLY: + if (!flags.import) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set at " + "import time"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + } + } + + return (retprops); +error: + nvlist_free(retprops); + return (NULL); +} + +/* + * Set zpool property : propname=propval. 
+ */ +int +zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) +{ + zfs_cmd_t zc = { 0 }; + int ret = -1; + char errbuf[1024]; + nvlist_t *nvl = NULL; + nvlist_t *realprops; + uint64_t version; + prop_flags_t flags = { 0 }; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), + zhp->zpool_name); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (no_memory(zhp->zpool_hdl)); + + if (nvlist_add_string(nvl, propname, propval) != 0) { + nvlist_free(nvl); + return (no_memory(zhp->zpool_hdl)); + } + + version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); + if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, + zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { + nvlist_free(nvl); + return (-1); + } + + nvlist_free(nvl); + nvl = realprops; + + /* + * Execute the corresponding ioctl() to set this property. + */ + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + + if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) { + nvlist_free(nvl); + return (-1); + } + + ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc); + + zcmd_free_nvlists(&zc); + nvlist_free(nvl); + + if (ret) + (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); + else + (void) zpool_props_refresh(zhp); + + return (ret); +} + +int +zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) +{ + libzfs_handle_t *hdl = zhp->zpool_hdl; + zprop_list_t *entry; + char buf[ZFS_MAXPROPLEN]; + + if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0) + return (-1); + + for (entry = *plp; entry != NULL; entry = entry->pl_next) { + + if (entry->pl_fixed) + continue; + + if (entry->pl_prop != ZPROP_INVAL && + zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), + NULL) == 0) { + if (strlen(buf) > entry->pl_width) + entry->pl_width = strlen(buf); + } + } + + return (0); +} + + +/* + * Don't start the slice at the default block of 34; many storage + * devices will use a stripe width 
of 128k, so start there instead. + */ +#define NEW_START_BLOCK 256 + +/* + * Validate the given pool name, optionally putting an extended error message in + * 'buf'. + */ +boolean_t +zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) +{ + namecheck_err_t why; + char what; + int ret; + + ret = pool_namecheck(pool, &why, &what); + + /* + * The rules for reserved pool names were extended at a later point. + * But we need to support users with existing pools that may now be + * invalid. So we only check for this expanded set of names during a + * create (or import), and only in userland. + */ + if (ret == 0 && !isopen && + (strncmp(pool, "mirror", 6) == 0 || + strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "spare", 5) == 0 || + strcmp(pool, "log") == 0)) { + if (hdl != NULL) + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "name is reserved")); + return (B_FALSE); + } + + + if (ret != 0) { + if (hdl != NULL) { + switch (why) { + case NAME_ERR_TOOLONG: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "name is too long")); + break; + + case NAME_ERR_INVALCHAR: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "invalid character " + "'%c' in pool name"), what); + break; + + case NAME_ERR_NOLETTER: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "name must begin with a letter")); + break; + + case NAME_ERR_RESERVED: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "name is reserved")); + break; + + case NAME_ERR_DISKLIKE: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool name is reserved")); + break; + + case NAME_ERR_LEADING_SLASH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "leading slash in name")); + break; + + case NAME_ERR_EMPTY_COMPONENT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "empty component in name")); + break; + + case NAME_ERR_TRAILING_SLASH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "trailing slash in name")); + break; + + case NAME_ERR_MULTIPLE_AT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "multiple '@' delimiters in name")); + break; + + 
} + } + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Open a handle to the given pool, even if the pool is currently in the FAULTED + * state. + */ +zpool_handle_t * +zpool_open_canfail(libzfs_handle_t *hdl, const char *pool) +{ + zpool_handle_t *zhp; + boolean_t missing; + + /* + * Make sure the pool name is valid. + */ + if (!zpool_name_valid(hdl, B_TRUE, pool)) { + (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME, + dgettext(TEXT_DOMAIN, "cannot open '%s'"), + pool); + return (NULL); + } + + if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) + return (NULL); + + zhp->zpool_hdl = hdl; + (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); + + if (zpool_refresh_stats(zhp, &missing) != 0) { + zpool_close(zhp); + return (NULL); + } + + if (missing) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool")); + (void) zfs_error_fmt(hdl, EZFS_NOENT, + dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); + zpool_close(zhp); + return (NULL); + } + + return (zhp); +} + +/* + * Like the above, but silent on error. Used when iterating over pools (because + * the configuration cache may be out of date). + */ +int +zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret) +{ + zpool_handle_t *zhp; + boolean_t missing; + + if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) + return (-1); + + zhp->zpool_hdl = hdl; + (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); + + if (zpool_refresh_stats(zhp, &missing) != 0) { + zpool_close(zhp); + return (-1); + } + + if (missing) { + zpool_close(zhp); + *ret = NULL; + return (0); + } + + *ret = zhp; + return (0); +} + +/* + * Similar to zpool_open_canfail(), but refuses to open pools in the faulted + * state. 
+ */ +zpool_handle_t * +zpool_open(libzfs_handle_t *hdl, const char *pool) +{ + zpool_handle_t *zhp; + + if ((zhp = zpool_open_canfail(hdl, pool)) == NULL) + return (NULL); + + if (zhp->zpool_state == POOL_STATE_UNAVAIL) { + (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, + dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name); + zpool_close(zhp); + return (NULL); + } + + return (zhp); +} + +/* + * Close the handle. Simply frees the memory associated with the handle. + */ +void +zpool_close(zpool_handle_t *zhp) +{ + if (zhp->zpool_config) + nvlist_free(zhp->zpool_config); + if (zhp->zpool_old_config) + nvlist_free(zhp->zpool_old_config); + if (zhp->zpool_props) + nvlist_free(zhp->zpool_props); + free(zhp); +} + +/* + * Return the name of the pool. + */ +const char * +zpool_get_name(zpool_handle_t *zhp) +{ + return (zhp->zpool_name); +} + + +/* + * Return the state of the pool (ACTIVE or UNAVAILABLE) + */ +int +zpool_get_state(zpool_handle_t *zhp) +{ + return (zhp->zpool_state); +} + +/* + * Create the named pool, using the provided vdev list. It is assumed + * that the consumer has already validated the contents of the nvlist, so we + * don't have to worry about error semantics. 
+ */ +int +zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, + nvlist_t *props, nvlist_t *fsprops) +{ + zfs_cmd_t zc = { 0 }; + nvlist_t *zc_fsprops = NULL; + nvlist_t *zc_props = NULL; + char msg[1024]; + char *altroot; + int ret = -1; + + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot create '%s'"), pool); + + if (!zpool_name_valid(hdl, B_FALSE, pool)) + return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); + + if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) + return (-1); + + if (props) { + prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; + + if ((zc_props = zpool_valid_proplist(hdl, pool, props, + SPA_VERSION_1, flags, msg)) == NULL) { + goto create_failed; + } + } + + if (fsprops) { + uint64_t zoned; + char *zonestr; + + zoned = ((nvlist_lookup_string(fsprops, + zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && + strcmp(zonestr, "on") == 0); + + if ((zc_fsprops = zfs_valid_proplist(hdl, + ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, msg)) == NULL) { + goto create_failed; + } + if (!zc_props && + (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { + goto create_failed; + } + if (nvlist_add_nvlist(zc_props, + ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) { + goto create_failed; + } + } + + if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) + goto create_failed; + + (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); + + if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) { + + zcmd_free_nvlists(&zc); + nvlist_free(zc_props); + nvlist_free(zc_fsprops); + + switch (errno) { + case EBUSY: + /* + * This can happen if the user has specified the same + * device multiple times. We can't reliably detect this + * until we try to add it and see we already have a + * label. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more vdevs refer to the same device")); + return (zfs_error(hdl, EZFS_BADDEV, msg)); + + case EOVERFLOW: + /* + * This occurs when one of the devices is below + * SPA_MINDEVSIZE. 
Unfortunately, we can't detect which + * device was the problem device since there's no + * reliable way to determine device size from userland. + */ + { + char buf[64]; + + zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf)); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is less than the " + "minimum size (%s)"), buf); + } + return (zfs_error(hdl, EZFS_BADDEV, msg)); + + case ENOSPC: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is out of space")); + return (zfs_error(hdl, EZFS_BADDEV, msg)); + + case ENOTBLK: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cache device must be a disk or disk slice")); + return (zfs_error(hdl, EZFS_BADDEV, msg)); + + default: + return (zpool_standard_error(hdl, errno, msg)); + } + } + + /* + * If this is an alternate root pool, then we automatically set the + * mountpoint of the root dataset to be '/'. + */ + if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), + &altroot) == 0) { + zfs_handle_t *zhp; + + verify((zhp = zfs_open(hdl, pool, ZFS_TYPE_DATASET)) != NULL); + verify(zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), + "/") == 0); + + zfs_close(zhp); + } + +create_failed: + zcmd_free_nvlists(&zc); + nvlist_free(zc_props); + nvlist_free(zc_fsprops); + return (ret); +} + +/* + * Destroy the given pool. It is up to the caller to ensure that there are no + * datasets left in the pool. 
+ *
+ * For an ACTIVE pool the root filesystem is opened first so that its
+ * mountpoint can be removed once the pool is gone.  Returns 0 on success and
+ * -1 on failure, with an error recorded on the library handle.
+ */
+int
+zpool_destroy(zpool_handle_t *zhp)
+{
+	zfs_cmd_t zc = { 0 };
+	zfs_handle_t *zfp = NULL;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	char msg[1024];
+
+	if (zhp->zpool_state == POOL_STATE_ACTIVE &&
+	    (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (-1);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
+		/*
+		 * Remember why the ioctl failed before snprintf()/dgettext()
+		 * get a chance to overwrite errno.
+		 */
+		int err = errno;
+
+		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+		    "cannot destroy '%s'"), zhp->zpool_name);
+
+		if (err == EROFS) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "one or more devices is read only"));
+			(void) zfs_error(hdl, EZFS_BADDEV, msg);
+		} else {
+			(void) zpool_standard_error(hdl, err, msg);
+		}
+
+		if (zfp)
+			zfs_close(zfp);
+		return (-1);
+	}
+
+	if (zfp) {
+		remove_mountpoint(zfp);
+		zfs_close(zfp);
+	}
+
+	return (0);
+}
+
+/*
+ * Add the given vdevs to the pool.  The caller must have already performed the
+ * necessary verification to ensure that the vdev specification is well-formed.
+ */ +int +zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + zfs_cmd_t zc = { 0 }; + int ret; + libzfs_handle_t *hdl = zhp->zpool_hdl; + char msg[1024]; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot add to '%s'"), zhp->zpool_name); + + if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < + SPA_VERSION_SPARES && + nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " + "upgraded to add hot spares")); + return (zfs_error(hdl, EZFS_BADVERSION, msg)); + } + + if (pool_is_bootable(zhp) && nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { + uint64_t s; + + for (s = 0; s < nspares; s++) { + char *path; + + if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, + &path) == 0 && pool_uses_efi(spares[s])) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "device '%s' contains an EFI label and " + "cannot be used on root pools."), + zpool_vdev_name(hdl, NULL, spares[s], + B_FALSE)); + return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); + } + } + } + + if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < + SPA_VERSION_L2CACHE && + nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " + "upgraded to add cache devices")); + return (zfs_error(hdl, EZFS_BADVERSION, msg)); + } + + if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) + return (-1); + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { + switch (errno) { + case EBUSY: + /* + * This can happen if the user has specified the same + * device multiple times. We can't reliably detect this + * until we try to add it and see we already have a + * label. 
+ */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more vdevs refer to the same device")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + case EOVERFLOW: + /* + * This occurrs when one of the devices is below + * SPA_MINDEVSIZE. Unfortunately, we can't detect which + * device was the problem device since there's no + * reliable way to determine device size from userland. + */ + { + char buf[64]; + + zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf)); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "device is less than the minimum " + "size (%s)"), buf); + } + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to add these vdevs")); + (void) zfs_error(hdl, EZFS_BADVERSION, msg); + break; + + case EDOM: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "root pool can not have multiple vdevs" + " or separate logs")); + (void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg); + break; + + case ENOTBLK: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cache device must be a disk or disk slice")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + default: + (void) zpool_standard_error(hdl, errno, msg); + } + + ret = -1; + } else { + ret = 0; + } + + zcmd_free_nvlists(&zc); + + return (ret); +} + +/* + * Exports the pool from the system. The caller must ensure that there are no + * mounted datasets in the pool. 
+ */ +int +zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot export '%s'"), zhp->zpool_name); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_cookie = force; + zc.zc_guid = hardforce; + + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { + switch (errno) { + case EXDEV: + zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, + "use '-f' to override the following errors:\n" + "'%s' has an active shared spare which could be" + " used by other pools once '%s' is exported."), + zhp->zpool_name, zhp->zpool_name); + return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, + msg)); + default: + return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, + msg)); + } + } + + return (0); +} + +int +zpool_export(zpool_handle_t *zhp, boolean_t force) +{ + return (zpool_export_common(zhp, force, B_FALSE)); +} + +int +zpool_export_force(zpool_handle_t *zhp) +{ + return (zpool_export_common(zhp, B_TRUE, B_TRUE)); +} + +static void +zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, + nvlist_t *config) +{ + nvlist_t *nv = NULL; + uint64_t rewindto; + int64_t loss = -1; + struct tm t; + char timestr[128]; + + if (!hdl->libzfs_printerr || config == NULL) + return; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0) + return; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + return; + (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); + + if (localtime_r((time_t *)&rewindto, &t) != NULL && + strftime(timestr, 128, 0, &t) != 0) { + if (dryrun) { + (void) printf(dgettext(TEXT_DOMAIN, + "Would be able to return %s " + "to its state as of %s.\n"), + name, timestr); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "Pool %s returned to its state as of %s.\n"), + name, timestr); + } + if (loss > 120) { + (void) 
printf(dgettext(TEXT_DOMAIN, + "%s approximately %lld "), + dryrun ? "Would discard" : "Discarded", + (loss + 30) / 60); + (void) printf(dgettext(TEXT_DOMAIN, + "minutes of transactions.\n")); + } else if (loss > 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s approximately %lld "), + dryrun ? "Would discard" : "Discarded", loss); + (void) printf(dgettext(TEXT_DOMAIN, + "seconds of transactions.\n")); + } + } +} + +void +zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, + nvlist_t *config) +{ + nvlist_t *nv = NULL; + int64_t loss = -1; + uint64_t edata = UINT64_MAX; + uint64_t rewindto; + struct tm t; + char timestr[128]; + + if (!hdl->libzfs_printerr) + return; + + if (reason >= 0) + (void) printf(dgettext(TEXT_DOMAIN, "action: ")); + else + (void) printf(dgettext(TEXT_DOMAIN, "\t")); + + /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + goto no_info; + + (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, + &edata); + + (void) printf(dgettext(TEXT_DOMAIN, + "Recovery is possible, but will result in some data loss.\n")); + + if (localtime_r((time_t *)&rewindto, &t) != NULL && + strftime(timestr, 128, 0, &t) != 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "\tReturning the pool to its state as of %s\n" + "\tshould correct the problem. "), + timestr); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "\tReverting the pool to an earlier state " + "should correct the problem.\n\t")); + } + + if (loss > 120) { + (void) printf(dgettext(TEXT_DOMAIN, + "Approximately %lld minutes of data\n" + "\tmust be discarded, irreversibly. "), (loss + 30) / 60); + } else if (loss > 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "Approximately %lld seconds of data\n" + "\tmust be discarded, irreversibly. 
"), loss); + } + if (edata != 0 && edata != UINT64_MAX) { + if (edata == 1) { + (void) printf(dgettext(TEXT_DOMAIN, + "After rewind, at least\n" + "\tone persistent user-data error will remain. ")); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "After rewind, several\n" + "\tpersistent user-data errors will remain. ")); + } + } + (void) printf(dgettext(TEXT_DOMAIN, + "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), + reason >= 0 ? "clear" : "import", name); + + (void) printf(dgettext(TEXT_DOMAIN, + "A scrub of the pool\n" + "\tis strongly recommended after recovery.\n")); + return; + +no_info: + (void) printf(dgettext(TEXT_DOMAIN, + "Destroy and re-create the pool from\n\ta backup source.\n")); +} + +/* + * zpool_import() is a contracted interface. Should be kept the same + * if possible. + * + * Applications should use zpool_import_props() to import a pool with + * new properties value to be set. + */ +int +zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, + char *altroot) +{ + nvlist_t *props = NULL; + int ret; + + if (altroot != NULL) { + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { + return (zfs_error_fmt(hdl, EZFS_NOMEM, + dgettext(TEXT_DOMAIN, "cannot import '%s'"), + newname)); + } + + if (nvlist_add_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 || + nvlist_add_string(props, + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) { + nvlist_free(props); + return (zfs_error_fmt(hdl, EZFS_NOMEM, + dgettext(TEXT_DOMAIN, "cannot import '%s'"), + newname)); + } + } + + ret = zpool_import_props(hdl, config, newname, props, + ZFS_IMPORT_NORMAL); + if (props) + nvlist_free(props); + return (ret); +} + +static void +print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, + int indent) +{ + nvlist_t **child; + uint_t c, children; + char *vname; + uint64_t is_log = 0; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, + &is_log); + + if (name != NULL) + (void) 
printf("\t%*s%s%s\n", indent, "", name, + is_log ? " [log]" : ""); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE); + print_vdev_tree(hdl, vname, child[c], indent + 2); + free(vname); + } +} + +/* + * Import the given pool using the known configuration and a list of + * properties to be set. The configuration should have come from + * zpool_find_import(). The 'newname' parameters control whether the pool + * is imported with a different name. + */ +int +zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, + nvlist_t *props, int flags) +{ + zfs_cmd_t zc = { 0 }; + zpool_rewind_policy_t policy; + nvlist_t *nv = NULL; + nvlist_t *nvinfo = NULL; + nvlist_t *missing = NULL; + char *thename; + char *origname; + int ret; + int error = 0; + char errbuf[1024]; + + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &origname) == 0); + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot import pool '%s'"), origname); + + if (newname != NULL) { + if (!zpool_name_valid(hdl, B_FALSE, newname)) + return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, + dgettext(TEXT_DOMAIN, "cannot import '%s'"), + newname)); + thename = (char *)newname; + } else { + thename = origname; + } + + if (props) { + uint64_t version; + prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + if ((props = zpool_valid_proplist(hdl, origname, + props, version, flags, errbuf)) == NULL) { + return (-1); + } else if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { + nvlist_free(props); + return (-1); + } + } + + (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &zc.zc_guid) == 0); + + if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) { + nvlist_free(props); 
+ return (-1); + } + if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) { + nvlist_free(props); + return (-1); + } + + zc.zc_cookie = flags; + while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && + errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } + if (ret != 0) + error = errno; + + (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); + zpool_get_rewind_policy(config, &policy); + + if (error) { + char desc[1024]; + + /* + * Dry-run failed, but we print out what success + * looks like if we found a best txg + */ + if (policy.zrp_request & ZPOOL_TRY_REWIND) { + zpool_rewind_exclaim(hdl, newname ? origname : thename, + B_TRUE, nv); + nvlist_free(nv); + return (-1); + } + + if (newname == NULL) + (void) snprintf(desc, sizeof (desc), + dgettext(TEXT_DOMAIN, "cannot import '%s'"), + thename); + else + (void) snprintf(desc, sizeof (desc), + dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), + origname, thename); + + switch (error) { + case ENOTSUP: + /* + * Unsupported version. + */ + (void) zfs_error(hdl, EZFS_BADVERSION, desc); + break; + + case EINVAL: + (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); + break; + + case EROFS: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is read only")); + (void) zfs_error(hdl, EZFS_BADDEV, desc); + break; + + case ENXIO: + if (nv && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && + nvlist_lookup_nvlist(nvinfo, + ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "The devices below are missing, use " + "'-m' to import the pool anyway:\n")); + print_vdev_tree(hdl, NULL, missing, 2); + (void) printf("\n"); + } + (void) zpool_standard_error(hdl, error, desc); + break; + + case EEXIST: + (void) zpool_standard_error(hdl, error, desc); + break; + + default: + (void) zpool_standard_error(hdl, error, desc); + zpool_explain_recover(hdl, + newname ? 
origname : thename, -error, nv); + break; + } + + nvlist_free(nv); + ret = -1; + } else { + zpool_handle_t *zhp; + + /* + * This should never fail, but play it safe anyway. + */ + if (zpool_open_silent(hdl, thename, &zhp) != 0) + ret = -1; + else if (zhp != NULL) + zpool_close(zhp); + if (policy.zrp_request & + (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { + zpool_rewind_exclaim(hdl, newname ? origname : thename, + ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv); + } + nvlist_free(nv); + return (0); + } + + zcmd_free_nvlists(&zc); + nvlist_free(props); + + return (ret); +} + +/* + * Scan the pool. + */ +int +zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_cookie = func; + + if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 || + (errno == ENOENT && func != POOL_SCAN_NONE)) + return (0); + + if (func == POOL_SCAN_SCRUB) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name); + } else if (func == POOL_SCAN_NONE) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), + zc.zc_name); + } else { + assert(!"unexpected result"); + } + + if (errno == EBUSY) { + nvlist_t *nvroot; + pool_scan_stat_t *ps = NULL; + uint_t psc; + + verify(nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); + if (ps && ps->pss_func == POOL_SCAN_SCRUB) + return (zfs_error(hdl, EZFS_SCRUBBING, msg)); + else + return (zfs_error(hdl, EZFS_RESILVERING, msg)); + } else if (errno == ENOENT) { + return (zfs_error(hdl, EZFS_NO_SCRUB, msg)); + } else { + return (zpool_standard_error(hdl, errno, msg)); + } +} + +/* + * This provides a very minimal check whether a given string is likely a + * c#t#d# style string. 
Users of this are expected to do their own
+ * verification of the s# part.
+ *
+ * The macro evaluates 'str' more than once, so the argument must be free of
+ * side effects.  A NULL 'str' yields 0.
+ */
+#define	CTD_CHECK(str)  (str && str[0] == 'c' && isdigit(str[1]))
+
+/*
+ * More elaborate version for ones which may start with "/dev/dsk/"
+ * and the like.
+ *
+ * For a path beginning with '/', CTD_CHECK() is applied to the last
+ * component, or to the second-to-last component when the path ends in
+ * "/old".  Any other string (including NULL) is handed to CTD_CHECK()
+ * unchanged.
+ */
+static int
+ctd_check_path(char *str) {
+	/*
+	 * If it starts with a slash, check the last component.
+	 */
+	if (str && str[0] == '/') {
+		char *tmp = strrchr(str, '/');
+
+		/*
+		 * If it ends in "/old", check the second-to-last
+		 * component of the string instead.  The backwards scan
+		 * stops at the leading '/' at the latest, because
+		 * tmp != str is checked first.
+		 */
+		if (tmp != str && strcmp(tmp, "/old") == 0) {
+			for (tmp--; *tmp != '/'; tmp--)
+				;
+		}
+		str = tmp + 1;
+	}
+	return (CTD_CHECK(str));
+}
+
+/*
+ * Find a vdev that matches the search criteria specified. We use the
+ * the nvpair name to determine how we should look for the device.
+ * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
+ * spare; but FALSE if its an INUSE spare.
+ *
+ * Only the first pair in 'search' is used.  A uint64 pair named
+ * ZPOOL_CONFIG_GUID is compared against the guid of 'nv'; a string pair is
+ * compared against the attribute of the same name in 'nv' (see the special
+ * cases below).  If 'nv' itself does not match, the search recurses into its
+ * children, then its spares, then its l2cache devices.
+ *
+ * '*avail_spare' and '*l2cache' are set to B_TRUE when the match is found
+ * under the spares or l2cache array respectively; they are never cleared
+ * here, so callers initialize them.  '*log' is set only when 'log' is
+ * non-NULL and the child subtree holding the match is flagged
+ * ZPOOL_CONFIG_IS_LOG; the recursive calls pass NULL for it.
+ *
+ * Returns the matching nvlist, or NULL if nothing matches, 'search' is empty,
+ * strdup() fails, or strtoull() sets errno while parsing a vdev id.
+ */
+static nvlist_t *
+vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
+    boolean_t *l2cache, boolean_t *log)
+{
+	uint_t c, children;
+	nvlist_t **child;
+	nvlist_t *ret;
+	uint64_t is_log;
+	char *srchkey;
+	nvpair_t *pair = nvlist_next_nvpair(search, NULL);
+
+	/* Nothing to look for */
+	if (search == NULL || pair == NULL)
+		return (NULL);
+
+	/* Obtain the key we will use to search */
+	srchkey = nvpair_name(pair);
+
+	switch (nvpair_type(pair)) {
+	case DATA_TYPE_UINT64:
+		if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
+			uint64_t srchval, theguid;
+
+			verify(nvpair_value_uint64(pair, &srchval) == 0);
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+			    &theguid) == 0);
+			if (theguid == srchval)
+				return (nv);
+		}
+		break;
+
+	case DATA_TYPE_STRING: {
+		char *srchval, *val;
+
+		verify(nvpair_value_string(pair, &srchval) == 0);
+		if (nvlist_lookup_string(nv, srchkey, &val) != 0)
+			break;
+
+		/*
+		 * Search for the requested value. Special cases:
+		 *
+		 * - ZPOOL_CONFIG_PATH for whole disk entries.  These end in
+		 *   "s0" or "s0/old".  The "s0" part is hidden from the user,
+		 *   but included in the string, so this matches around it.
+		 * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
+		 *
+		 * Otherwise, all other searches are simple string compares.
+		 */
+		if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 &&
+		    ctd_check_path(val)) {
+			uint64_t wholedisk = 0;
+
+			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+			    &wholedisk);
+			if (wholedisk) {
+				int slen = strlen(srchval);
+				int vlen = strlen(val);
+
+				if (slen != vlen - 2)
+					break;
+
+				/*
+				 * make_leaf_vdev() should only set
+				 * wholedisk for ZPOOL_CONFIG_PATHs which
+				 * will include "/dev/dsk/", giving plenty of
+				 * room for the indices used next.
+				 */
+				ASSERT(vlen >= 6);
+
+				/*
+				 * strings identical except trailing "s0"
+				 */
+				if (strcmp(&val[vlen - 2], "s0") == 0 &&
+				    strncmp(srchval, val, slen) == 0)
+					return (nv);
+
+				/*
+				 * strings identical except trailing "s0/old"
+				 */
+				if (strcmp(&val[vlen - 6], "s0/old") == 0 &&
+				    strcmp(&srchval[slen - 4], "/old") == 0 &&
+				    strncmp(srchval, val, slen - 4) == 0)
+					return (nv);
+
+				break;
+			}
+		} else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
+			char *type, *idx, *end, *p;
+			uint64_t id, vdev_id;
+
+			/*
+			 * Determine our vdev type, keeping in mind
+			 * that the srchval is composed of a type and
+			 * vdev id pair (i.e. mirror-4).  The copy is split
+			 * in place at the last '-'.
+			 */
+			if ((type = strdup(srchval)) == NULL)
+				return (NULL);
+
+			if ((p = strrchr(type, '-')) == NULL) {
+				free(type);
+				break;
+			}
+			idx = p + 1;
+			*p = '\0';
+
+			/*
+			 * If the types don't match then keep looking.
+			 */
+			if (strncmp(val, type, strlen(val)) != 0) {
+				free(type);
+				break;
+			}
+
+			verify(strncmp(type, VDEV_TYPE_RAIDZ,
+			    strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+			    strncmp(type, VDEV_TYPE_MIRROR,
+			    strlen(VDEV_TYPE_MIRROR)) == 0);
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+			    &id) == 0);
+
+			errno = 0;
+			vdev_id = strtoull(idx, &end, 10);
+
+			free(type);
+			if (errno != 0)
+				return (NULL);
+
+			/*
+			 * Now verify that we have the correct vdev id.
+			 */
+			if (vdev_id == id)
+				return (nv);
+		}
+
+		/*
+		 * Common case
+		 */
+		if (strcmp(srchval, val) == 0)
+			return (nv);
+		break;
+	}
+
+	default:
+		break;
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return (NULL);
+
+	for (c = 0; c < children; c++) {
+		if ((ret = vdev_to_nvlist_iter(child[c], search,
+		    avail_spare, l2cache, NULL)) != NULL) {
+			/*
+			 * The 'is_log' value is only set for the toplevel
+			 * vdev, not the leaf vdevs.  So we always lookup the
+			 * log device from the root of the vdev tree (where
+			 * 'log' is non-NULL).
+			 */
+			if (log != NULL &&
+			    nvlist_lookup_uint64(child[c],
+			    ZPOOL_CONFIG_IS_LOG, &is_log) == 0 &&
+			    is_log) {
+				*log = B_TRUE;
+			}
+			return (ret);
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if ((ret = vdev_to_nvlist_iter(child[c], search,
+			    avail_spare, l2cache, NULL)) != NULL) {
+				*avail_spare = B_TRUE;
+				return (ret);
+			}
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if ((ret = vdev_to_nvlist_iter(child[c], search,
+			    avail_spare, l2cache, NULL)) != NULL) {
+				*l2cache = B_TRUE;
+				return (ret);
+			}
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Given a physical path (minus the "/devices" prefix), find the
+ * associated vdev.
+ */
+nvlist_t *
+zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
+    boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
+{
+	nvlist_t *search, *nvroot, *ret;
+
+	/* The search key is a one-entry nvlist holding the physical path. */
+	verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
+
+	verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	/*
+	 * avail_spare and l2cache are written unconditionally; 'log' is the
+	 * only out-parameter that may be NULL.
+	 */
+	*avail_spare = B_FALSE;
+	*l2cache = B_FALSE;
+	if (log != NULL)
+		*log = B_FALSE;
+	ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
+	nvlist_free(search);
+
+	return (ret);
+}
+
+/*
+ * Determine if we have an "interior" top-level vdev (i.e. mirror/raidz).
+ * The comparison is a prefix match, so any name that merely begins with
+ * one of the two type strings is treated as interior.
+ */
+boolean_t
+zpool_vdev_is_interior(const char *name)
+{
+	if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+	    strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ * Look up a vdev in the pool's vdev tree from a user-supplied string.
+ * The string is turned into a single-entry search nvlist as follows:
+ *   - entirely a non-zero decimal number: searched as a GUID
+ *   - accepted by zpool_vdev_is_interior(): searched as a vdev type
+ *   - not starting with '/': prefixed with "/dev/dsk/" and searched as a path
+ *   - otherwise: searched as a path, as given
+ * The result of vdev_to_nvlist_iter() is returned unchanged.  avail_spare
+ * and l2cache must be non-NULL; log may be NULL.
+ */
+nvlist_t *
+zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
+    boolean_t *l2cache, boolean_t *log)
+{
+	char buf[MAXPATHLEN];
+	char *end;
+	nvlist_t *nvroot, *search, *ret;
+	uint64_t guid;
+
+	verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	/* A GUID is only accepted when the whole string parsed as a number. */
+	guid = strtoull(path, &end, 10);
+	if (guid != 0 && *end == '\0') {
+		verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+	} else if (zpool_vdev_is_interior(path)) {
+		verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
+	} else if (path[0] != '/') {
+		(void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path);
+		verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
+	} else {
+		verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
+	}
+
+	verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	*avail_spare = B_FALSE;
+	*l2cache = B_FALSE;
+	if (log != NULL)
+		*log = B_FALSE;
+	ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
+	nvlist_free(search);
+
+	return (ret);
+}
+
+/*
+ * Returns 0 if the vdev nvlist carries any of the OFFLINE, FAULTED or
+ * REMOVED keys, and 1 otherwise.  Only the presence of a key is tested;
+ * the value read into 'ival' is not examined.
+ */
+static int
+vdev_online(nvlist_t *nv)
+{
+	uint64_t ival;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
+	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
+	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * Helper function for zpool_get_physpaths().
+ *
+ * Appends the vdev's ZPOOL_CONFIG_PHYS_PATH string to 'physpath' at offset
+ * *bytes_written, separating entries with a single space.  *bytes_written
+ * is advanced by the length snprintf() reports, even when the output was
+ * truncated.  Returns 0 on success, EZFS_NODEVICE if the vdev has no
+ * physical path, or EZFS_NOSPC if the entry did not fit (in which case the
+ * partial entry is cut off again).
+ */
+static int
+vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
+    size_t *bytes_written)
+{
+	size_t bytes_left, pos, rsz;
+	char *tmppath;
+	const char *format;
+
+	if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
+	    &tmppath) != 0)
+		return (EZFS_NODEVICE);
+
+	pos = *bytes_written;
+	bytes_left = physpath_size - pos;
+	/* Every entry after the first is preceded by a space. */
+	format = (pos == 0) ? "%s" : " %s";
+
+	rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
+	*bytes_written += rsz;
+
+	if (rsz >= bytes_left) {
+		/* if physpath was not copied properly, clear it */
+		if (bytes_left != 0) {
+			physpath[pos] = 0;
+		}
+		return (EZFS_NOSPC);
+	}
+	return (0);
+}
+
+/*
+ * Recursively collect the physical paths of the online disks below 'nv'
+ * into 'physpath'; *rsz accumulates the number of bytes written.
+ *
+ * Only disk, mirror, replacing and spare vdevs are visited.  Note that the
+ * function returns EZFS_POOL_INVALARG whenever it reaches its end, including
+ * after a disk was appended successfully; the recursive call only reacts to
+ * EZFS_NOSPC, so progress is observed through *rsz rather than through the
+ * return value.
+ */
+static int
+vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
+    size_t *rsz, boolean_t is_spare)
+{
+	char *type;
+	int ret;
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+		return (EZFS_INVALCONFIG);
+
+	if (strcmp(type, VDEV_TYPE_DISK) == 0) {
+		/*
+		 * An active spare device has ZPOOL_CONFIG_IS_SPARE set.
+		 * For a spare vdev, we only want to boot from the active
+		 * spare device.
+		 */
+		if (is_spare) {
+			uint64_t spare = 0;
+			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+			    &spare);
+			if (!spare)
+				return (EZFS_INVALCONFIG);
+		}
+
+		if (vdev_online(nv)) {
+			if ((ret = vdev_get_one_physpath(nv, physpath,
+			    phypath_size, rsz)) != 0)
+				return (ret);
+		}
+	/*
+	 * The third operand assigns is_spare as a side effect.  It is only
+	 * evaluated when the type is neither mirror nor replacing, so for
+	 * those two types the caller's is_spare value is passed down as is.
+	 */
+	} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
+	    strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
+	    (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
+		nvlist_t **child;
+		uint_t count;
+		int i, ret;	/* this 'ret' shadows the outer one */
+
+		if (nvlist_lookup_nvlist_array(nv,
+		    ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
+			return (EZFS_INVALCONFIG);
+
+		/* Only running out of buffer space stops the walk early. */
+		for (i = 0; i < count; i++) {
+			ret = vdev_get_physpaths(child[i], physpath,
+			    phypath_size, rsz, is_spare);
+			if (ret == EZFS_NOSPC)
+				return (ret);
+		}
+	}
+
+	return (EZFS_POOL_INVALARG);
+}
+
+/*
+ * Get phys_path for a root pool config.
+ * Return 0 on success; non-zero on failure.
+ *
+ * Success here means that at least one byte was written to 'physpath';
+ * the return value of vdev_get_physpaths() is deliberately ignored.
+ */
+static int
+zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
+{
+	size_t rsz;
+	nvlist_t *vdev_root;
+	nvlist_t **child;
+	uint_t count;
+	char *type;
+
+	rsz = 0;
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &vdev_root) != 0)
+		return (EZFS_INVALCONFIG);
+
+	if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
+	    nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
+	    &child, &count) != 0)
+		return (EZFS_INVALCONFIG);
+
+	/*
+	 * root pool can not have EFI labeled disks and can only have
+	 * a single top-level vdev.
+	 */
+	if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 ||
+	    pool_uses_efi(vdev_root))
+		return (EZFS_POOL_INVALARG);
+
+	(void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
+	    B_FALSE);
+
+	/* No online devices */
+	if (rsz == 0)
+		return (EZFS_NODEVICE);
+
+	return (0);
+}
+
+/*
+ * Get phys_path for a root pool
+ * Return 0 on success; non-zero on failure.
+ */
+int
+zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
+{
+	return (zpool_get_config_physpath(zhp->zpool_config, physpath,
+	    phypath_size));
+}
+
+/*
+ * If the device has been dynamically expanded then we need to relabel
+ * the disk to use the new unallocated space.
+ *
+ * 'name' is the device name relative to RDISK_ROOT.  Returns 0 on success
+ * (or when the disk has no unallocated space left), -1 if the
+ * efi_use_whole_disk() symbol cannot be resolved at run time, and the
+ * result of zfs_error() otherwise.
+ */
+static int
+zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
+{
+	char path[MAXPATHLEN];
+	char errbuf[1024];
+	int fd, error;
+	int (*_efi_use_whole_disk)(int);
+
+	/* The EFI routine is looked up dynamically rather than linked. */
+	if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT,
+	    "efi_use_whole_disk")) == NULL)
+		return (-1);
+
+	/*
+	 * errbuf is handed to zfs_error() on both failure paths below, so it
+	 * has to hold a valid string before either of them can be taken.
+	 */
+	(void) snprintf(errbuf, sizeof (errbuf),
+	    dgettext(TEXT_DOMAIN, "cannot relabel '%s'"), name);
+
+	(void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name);
+
+	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+		    "relabel '%s': unable to open device"), name);
+		return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
+	}
+
+	/*
+	 * It's possible that we might encounter an error if the device
+	 * does not have any unallocated space left. If so, we simply
+	 * ignore that error and continue on.
+	 */
+	error = _efi_use_whole_disk(fd);
+	(void) close(fd);
+	if (error && error != VT_ENOSPC) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+		    "relabel '%s': unable to read disk capacity"), name);
+		return (zfs_error(hdl, EZFS_NOCAP, errbuf));
+	}
+	return (0);
+}
+
+/*
+ * Bring the specified vdev online. The 'flags' parameter is a set of the
+ * ZFS_ONLINE_* flags.
+ */
+int
+zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
+    vdev_state_t *newstate)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache, islog;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	/* The error prefix depends on whether an expansion was requested. */
+	if (flags & ZFS_ONLINE_EXPAND) {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
+	} else {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot online %s"), path);
+	}
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+	    &islog)) == NULL)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+
+	if (avail_spare)
+		return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+	/*
+	 * The relabel step runs for an explicit expand request and also
+	 * whenever the pool's autoexpand property is set.
+	 */
+	if (flags & ZFS_ONLINE_EXPAND ||
+	    zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
+		char *pathname = NULL;
+		uint64_t wholedisk = 0;
+
+		(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
+		    &wholedisk);
+		verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
+		    &pathname) == 0);
+
+		/*
+		 * XXX - L2ARC 1.0 devices can't support expansion.
+		 */
+		if (l2cache) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "cannot expand cache devices"));
+			return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
+		}
+
+		if (wholedisk) {
+			/*
+			 * Skip the DISK_ROOT prefix and the separator that
+			 * follows it; a relabel failure is ignored.
+			 */
+			pathname += strlen(DISK_ROOT) + 1;
+			(void) zpool_relabel_disk(hdl, pathname);
+		}
+	}
+
+	zc.zc_cookie = VDEV_STATE_ONLINE;
+	zc.zc_obj = flags;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
+		if (errno == EINVAL) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
+			    "from this pool into a new one. Use '%s' "
+			    "instead"), "zpool detach");
+			return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
+		}
+		return (zpool_standard_error(hdl, errno, msg));
+	}
+
+	/* On success the ioctl leaves the resulting state in zc_cookie. */
+	*newstate = zc.zc_cookie;
+	return (0);
+}
+
+/*
+ * Take the specified vdev offline
+ *
+ * When 'istmp' is set the request is flagged ZFS_OFFLINE_TEMPORARY.
+ * Returns 0 on success, otherwise the result of zfs_error() or
+ * zpool_standard_error().
+ */
+int
+zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot offline %s"), path);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+	    NULL)) == NULL)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+
+	if (avail_spare)
+		return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+	zc.zc_cookie = VDEV_STATE_OFFLINE;
+	zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+
+	switch (errno) {
+	case EBUSY:
+
+		/*
+		 * There are no other replicas of this device.
+		 */
+		return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
+
+	case EEXIST:
+		/*
+		 * The log device has unplayed logs
+		 */
+		return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
+
+	default:
+		return (zpool_standard_error(hdl, errno, msg));
+	}
+}
+
+/*
+ * Mark the given vdev faulted.
+ *
+ * 'guid' identifies the vdev and 'aux' is passed to the kernel in zc_obj.
+ * Returns 0 on success, otherwise the result of zfs_error() or
+ * zpool_standard_error().
+ */
+int
+zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	/* %llu needs an explicit cast; uint64_t need not be long long. */
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_guid = guid;
+	zc.zc_cookie = VDEV_STATE_FAULTED;
+	zc.zc_obj = aux;
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+
+	switch (errno) {
+	case EBUSY:
+
+		/*
+		 * There are no other replicas of this device.
+		 */
+		return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
+
+	default:
+		return (zpool_standard_error(hdl, errno, msg));
+	}
+
+}
+
+/*
+ * Mark the given vdev degraded.
+ *
+ * 'guid' identifies the vdev and 'aux' is passed to the kernel in zc_obj.
+ * Returns 0 on success, otherwise the result of zpool_standard_error().
+ */
+int
+zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	/* %llu needs an explicit cast; uint64_t need not be long long. */
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_guid = guid;
+	zc.zc_cookie = VDEV_STATE_DEGRADED;
+	zc.zc_obj = aux;
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Returns TRUE if the given nvlist is a vdev that was originally swapped in as
+ * a hot spare.
+ *
+ * The tree rooted at 'search' is walked depth-first.  A match is a spare
+ * vdev with exactly two children whose child at index 'which' is 'tgt'.
+ */
+static boolean_t
+is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *type;
+
+	/* Nodes without a children array are leaves and cannot match. */
+	if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) == 0) {
+		verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
+		    &type) == 0);
+
+		if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
+		    children == 2 && child[which] == tgt)
+			return (B_TRUE);
+
+		for (c = 0; c < children; c++)
+			if (is_replacing_spare(child[c], tgt, which))
+				return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Attach new_disk (fully described by nvroot) to old_disk.
+ * If 'replacing' is specified, the new disk will replace the old one.
+ *
+ * Returns 0 on success.  Validation failures return the result of
+ * zfs_error(); a failed ioctl records an error on the handle and
+ * returns -1.
+ */
+int
+zpool_vdev_attach(zpool_handle_t *zhp,
+    const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	int ret;
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache, islog;
+	uint64_t val;
+	char *newname;
+	nvlist_t **child;
+	uint_t children;
+	nvlist_t *config_root;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	boolean_t rootpool = pool_is_bootable(zhp);
+
+	if (replacing)
+		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+		    "cannot replace %s with %s"), old_disk, new_disk);
+	else
+		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+		    "cannot attach %s to %s"), new_disk, old_disk);
+
+	/*
+	 * If this is a root pool, make sure that we're not attaching an
+	 * EFI labeled device.
+	 */
+	if (rootpool && pool_uses_efi(nvroot)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "EFI labeled devices are not supported on root pools."));
+		return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
+	}
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache,
+	    &islog)) == 0)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+	if (avail_spare)
+		return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+	if (l2cache)
+		return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+	zc.zc_cookie = replacing;
+
+	/* nvroot must describe exactly one child vdev. */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0 || children != 1) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "new device must be a single disk"));
+		return (zfs_error(hdl, EZFS_INVALCONFIG, msg));
+	}
+
+	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+	    ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
+
+	/* The returned name is allocated; it is freed on every path below. */
+	if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
+		return (-1);
+
+	/*
+	 * If the target is a hot spare that has been swapped in, we can only
+	 * replace it with another hot spare.
+	 *
+	 * Note that the zpool_find_vdev() call below overwrites avail_spare
+	 * and l2cache with the values for the new device.
+	 */
+	if (replacing &&
+	    nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
+	    (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache,
+	    NULL) == NULL || !avail_spare) &&
+	    is_replacing_spare(config_root, tgt, 1)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "can only be replaced by another hot spare"));
+		free(newname);
+		return (zfs_error(hdl, EZFS_BADTARGET, msg));
+	}
+
+	free(newname);
+
+	if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
+		return (-1);
+
+	ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
+
+	zcmd_free_nvlists(&zc);
+
+	if (ret == 0) {
+		if (rootpool) {
+			/*
+			 * XXX need a better way to prevent user from
+			 * booting up a half-baked vdev.
+			 */
+			(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
+			    "sure to wait until resilver is done "
+			    "before rebooting.\n"));
+		}
+		return (0);
+	}
+
+	/* Map the ioctl's errno to a libzfs error; every case returns -1. */
+	switch (errno) {
+	case ENOTSUP:
+		/*
+		 * Can't attach to or replace this type of vdev.
+		 */
+		if (replacing) {
+			uint64_t version = zpool_get_prop_int(zhp,
+			    ZPOOL_PROP_VERSION, NULL);
+
+			if (islog)
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "cannot replace a log with a spare"));
+			else if (version >= SPA_VERSION_MULTI_REPLACE)
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "already in replacing/spare config; wait "
+				    "for completion or use 'zpool detach'"));
+			else
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "cannot replace a replacing device"));
+		} else {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "can only attach to mirrors and top-level "
+			    "disks"));
+		}
+		(void) zfs_error(hdl, EZFS_BADTARGET, msg);
+		break;
+
+	case EINVAL:
+		/*
+		 * The new device must be a single disk.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "new device must be a single disk"));
+		(void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
+		break;
+
+	case EBUSY:
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"),
+		    new_disk);
+		(void) zfs_error(hdl, EZFS_BADDEV, msg);
+		break;
+
+	case EOVERFLOW:
+		/*
+		 * The new device is too small.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "device is too small"));
+		(void) zfs_error(hdl, EZFS_BADDEV, msg);
+		break;
+
+	case EDOM:
+		/*
+		 * The new device has a different alignment requirement.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "devices have different sector alignment"));
+		(void) zfs_error(hdl, EZFS_BADDEV, msg);
+		break;
+
+	case ENAMETOOLONG:
+		/*
+		 * The resulting top-level vdev spec won't fit in the label.
+		 */
+		(void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg);
+		break;
+
+	default:
+		(void) zpool_standard_error(hdl, errno, msg);
+	}
+
+	return (-1);
+}
+
+/*
+ * Detach the specified device.
+ *
+ * Available spares and l2cache devices are rejected up front.  Returns 0
+ * on success; lookup and validation failures return the result of
+ * zfs_error(), and a failed ioctl records an error and returns -1.
+ */
+int
+zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
+{
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	zfs_cmd_t zc = { 0 };
+	boolean_t is_spare, is_l2cache;
+	nvlist_t *vdev;
+	char msg[1024];
+	int err;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot detach %s"), path);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+
+	vdev = zpool_find_vdev(zhp, path, &is_spare, &is_l2cache, NULL);
+	if (vdev == NULL)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+	if (is_spare)
+		return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+	if (is_l2cache)
+		return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
+
+	verify(nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0)
+		return (0);
+
+	err = errno;
+	if (err == ENOTSUP) {
+		/* Can't detach from this type of vdev. */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
+		    "applicable to mirror and replacing vdevs"));
+		(void) zfs_error(hdl, EZFS_BADTARGET, msg);
+	} else if (err == EBUSY) {
+		/* There are no other replicas of this device. */
+		(void) zfs_error(hdl, EZFS_NOREPLICAS, msg);
+	} else {
+		(void) zpool_standard_error(hdl, err, msg);
+	}
+
+	return (-1);
+}
+
+/*
+ * Find a mirror vdev in the source nvlist.
+ *
+ * The mchild array contains a list of disks in one of the top-level mirrors
+ * of the source pool. The schild array contains a list of disks that the
+ * user specified on the command line. We loop over the mchild array to
+ * see if any entry in the schild array matches.
+ *
+ * If a disk in the mchild array is found in the schild array, we return
+ * the index of that entry. Otherwise we return -1.
+ *
+ * Entries whose display name cannot be allocated are skipped rather than
+ * compared.
+ */
+static int
+find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
+    nvlist_t **schild, uint_t schildren)
+{
+	uint_t mc;
+
+	for (mc = 0; mc < mchildren; mc++) {
+		uint_t sc;
+		char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
+		    mchild[mc], B_FALSE);
+
+		/* zpool_vdev_name() returns NULL when allocation fails. */
+		if (mpath == NULL)
+			continue;
+
+		for (sc = 0; sc < schildren; sc++) {
+			char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
+			    schild[sc], B_FALSE);
+			boolean_t result;
+
+			if (spath == NULL)
+				continue;
+
+			result = (strcmp(mpath, spath) == 0);
+
+			free(spath);
+			if (result) {
+				free(mpath);
+				return (mc);
+			}
+		}
+
+		free(mpath);
+	}
+
+	return (-1);
+}
+
+/*
+ * Split a mirror pool.  If newroot points to null, then a new nvlist
+ * is generated and it is the responsibility of the caller to free it.
+ *
+ * One disk is taken from every top-level mirror: the one the caller listed
+ * in *newroot if there is one, otherwise the mirror's last child.  Log and
+ * hole vdevs are replaced by hole placeholders, and trailing placeholders
+ * are dropped from the new root.  With flags.dryrun set, *newroot is
+ * populated but no ioctl is issued.
+ *
+ * Returns 0 on success and a non-zero libzfs error otherwise.
+ */
+int
+zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
+    nvlist_t *props, splitflags_t flags)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
+	nvlist_t **varray = NULL, *zc_props = NULL;
+	uint_t c, children, newchildren, lastlog = 0, vcount = 0, found = 0;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	uint64_t vers;
+	boolean_t freelist = B_FALSE, memory_err = B_TRUE;
+	int retval = 0;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);
+
+	if (!zpool_name_valid(hdl, B_FALSE, newname))
+		return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
+
+	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+		(void) fprintf(stderr, gettext("Internal error: unable to "
+		    "retrieve pool configuration\n"));
+		return (-1);
+	}
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
+	    == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
+
+	if (props) {
+		prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
+		if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
+		    props, vers, flags, msg)) == NULL)
+			return (-1);
+	}
+
+	if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) != 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "Source pool is missing vdev tree"));
+		if (zc_props)
+			nvlist_free(zc_props);
+		return (-1);
+	}
+
+	/*
+	 * varray collects one nvlist per top-level vdev of the source pool.
+	 * The array is indexed below, so an allocation failure has to be
+	 * caught here; the common exit path releases zc_props.
+	 */
+	varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
+	if (varray == NULL) {
+		retval = -1;
+		goto out;
+	}
+
+	if (*newroot == NULL ||
+	    nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
+	    &newchild, &newchildren) != 0)
+		newchildren = 0;
+
+	for (c = 0; c < children; c++) {
+		uint64_t is_log = B_FALSE, is_hole = B_FALSE;
+		char *type;
+		nvlist_t **mchild, *vdev;
+		uint_t mchildren;
+		int entry;
+
+		/*
+		 * Unlike cache & spares, slogs are stored in the
+		 * ZPOOL_CONFIG_CHILDREN array.  We filter them out here.
+		 */
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+		    &is_log);
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+		    &is_hole);
+		if (is_log || is_hole) {
+			/*
+			 * Create a hole vdev and put it in the config.
+			 * Until it is stored in varray the new nvlist is
+			 * owned here and must be freed on failure.
+			 */
+			if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
+				goto out;
+			if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
+			    VDEV_TYPE_HOLE) != 0 ||
+			    nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
+			    1) != 0) {
+				nvlist_free(vdev);
+				goto out;
+			}
+			/* Remember where the current run of holes began. */
+			if (lastlog == 0)
+				lastlog = vcount;
+			varray[vcount++] = vdev;
+			continue;
+		}
+		lastlog = 0;
+		verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
+		    == 0);
+		if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "Source pool must be composed only of mirrors\n"));
+			retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
+			goto out;
+		}
+
+		verify(nvlist_lookup_nvlist_array(child[c],
+		    ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+
+		/* find or add an entry for this top-level vdev */
+		if (newchildren > 0 &&
+		    (entry = find_vdev_entry(zhp, mchild, mchildren,
+		    newchild, newchildren)) >= 0) {
+			/* We found a disk that the user specified. */
+			vdev = mchild[entry];
+			++found;
+		} else {
+			/* User didn't specify a disk for this vdev. */
+			vdev = mchild[mchildren - 1];
+		}
+
+		if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
+			goto out;
+	}
+
+	/* did we find every disk the user specified? */
+	if (found != newchildren) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
+		    "include at most one disk from each mirror"));
+		retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
+		goto out;
+	}
+
+	/* Prepare the nvlist for populating. */
+	if (*newroot == NULL) {
+		if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
+			goto out;
+		freelist = B_TRUE;
+		if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
+		    VDEV_TYPE_ROOT) != 0)
+			goto out;
+	} else {
+		verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
+	}
+
+	/* Add all the children we found */
+	if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
+	    lastlog == 0 ? vcount : lastlog) != 0)
+		goto out;
+
+	/*
+	 * If we're just doing a dry run, exit now with success.
+	 */
+	if (flags.dryrun) {
+		memory_err = B_FALSE;
+		freelist = B_FALSE;
+		goto out;
+	}
+
+	/* now build up the config list & call the ioctl */
+	if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
+		goto out;
+
+	if (nvlist_add_nvlist(newconfig,
+	    ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 ||
+	    nvlist_add_string(newconfig,
+	    ZPOOL_CONFIG_POOL_NAME, newname) != 0 ||
+	    nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
+		goto out;
+
+	/*
+	 * The new pool is automatically part of the namespace unless we
+	 * explicitly export it.
+	 */
+	if (!flags.import)
+		zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
+	if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
+		goto out;
+	if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
+		goto out;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
+		retval = zpool_standard_error(hdl, errno, msg);
+		goto out;
+	}
+
+	freelist = B_FALSE;
+	memory_err = B_FALSE;
+
+out:
+	/*
+	 * Common exit.  Reaching this label with retval == 0 and memory_err
+	 * still set means one of the nvlist operations above failed.
+	 */
+	if (varray != NULL) {
+		uint_t v;
+
+		for (v = 0; v < vcount; v++)
+			nvlist_free(varray[v]);
+		free(varray);
+	}
+	zcmd_free_nvlists(&zc);
+	if (zc_props)
+		nvlist_free(zc_props);
+	if (newconfig)
+		nvlist_free(newconfig);
+	if (freelist) {
+		nvlist_free(*newroot);
+		*newroot = NULL;
+	}
+
+	if (retval != 0)
+		return (retval);
+
+	if (memory_err)
+		return (no_memory(hdl));
+
+	return (0);
+}
+
+/*
+ * Remove the given device.  Currently, this is supported only for hot spares
+ * and level 2 cache devices.
+ *
+ * Log devices are also accepted, provided the pool version is at least
+ * SPA_VERSION_HOLES.
+ */
+int
+zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache, islog;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	uint64_t version;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+	    &islog)) == NULL)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+	/*
+	 * XXX - this should just go away.
+	 */
+	if (!avail_spare && !l2cache && !islog) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "only inactive hot spares, cache, top-level, "
+		    "or log devices can be removed"));
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+	}
+
+	version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
+	if (islog && version < SPA_VERSION_HOLES) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "pool must be upgraded to support log removal"));
+		return (zfs_error(hdl, EZFS_BADVERSION, msg));
+	}
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
+		return (0);
+
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Clear the errors for the pool, or the particular device if specified.
+ *
+ * 'path' may be NULL to address the whole pool.  'rewindnvl' supplies the
+ * rewind policy and is also passed to the kernel as the source nvlist.
+ */
+int
+zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	nvlist_t *tgt;
+	zpool_rewind_policy_t policy;
+	boolean_t avail_spare, l2cache;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	nvlist_t *nvi = NULL;
+	int error;
+
+	if (path)
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
+		    path);
+	else
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
+		    zhp->zpool_name);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if (path) {
+		if ((tgt = zpool_find_vdev(zhp, path, &avail_spare,
+		    &l2cache, NULL)) == NULL)
+			return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+		/*
+		 * Don't allow error clearing for hot spares.  Do allow
+		 * error clearing for l2cache devices.
+		 */
+		if (avail_spare)
+			return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
+		    &zc.zc_guid) == 0);
+	}
+
+	zpool_get_rewind_policy(rewindnvl, &policy);
+	zc.zc_cookie = policy.zrp_request;
+
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
+		return (-1);
+
+	/* The destination buffer is allocated by now; don't leak it. */
+	if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0) {
+		zcmd_free_nvlists(&zc);
+		return (-1);
+	}
+
+	/* Grow the destination buffer for as long as the kernel asks. */
+	while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
+	    errno == ENOMEM) {
+		if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
+	}
+
+	/*
+	 * A failed "try rewind" request is still reported as success unless
+	 * the failure was a permission error.
+	 */
+	if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
+	    errno != EPERM && errno != EACCES)) {
+		if (policy.zrp_request &
+		    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+			(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+			zpool_rewind_exclaim(hdl, zc.zc_name,
+			    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
+			    nvi);
+			nvlist_free(nvi);
+		}
+		zcmd_free_nvlists(&zc);
+		return (0);
+	}
+
+	zcmd_free_nvlists(&zc);
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Similar to zpool_clear(), but takes a GUID (used by fmd).
+ */
+int
+zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	/* %llx needs an explicit cast; uint64_t need not be long long. */
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"),
+	    (u_longlong_t)guid);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_guid = guid;
+	zc.zc_cookie = ZPOOL_NO_REWIND;
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
+		return (0);
+
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Convert from a devid string to a path.
+ *
+ * Returns a newly allocated copy of the first name found for the devid,
+ * or NULL on any failure.  The caller frees the result.
+ */
+static char *
+devid_to_path(char *devid_str)
+{
+	ddi_devid_t devid;
+	char *minor;
+	char *path;
+	devid_nmlist_t *list = NULL;
+	int ret;
+
+	if (devid_str_decode(devid_str, &devid, &minor) != 0)
+		return (NULL);
+
+	ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list);
+
+	devid_str_free(minor);
+	devid_free(devid);
+
+	if (ret != 0)
+		return (NULL);
+
+	/*
+	 * The name list is released whether or not strdup() succeeded; on
+	 * failure 'path' is NULL and that is what the caller gets.
+	 */
+	path = strdup(list[0].devname);
+
+	devid_free_nmlist(list);
+
+	return (path);
+}
+
+/*
+ * Convert from a path to a devid string.
+ *
+ * Returns the string produced by devid_str_encode(), or NULL if the device
+ * cannot be opened or either devid lookup fails.
+ */
+static char *
+path_to_devid(const char *path)
+{
+	int fd;
+	ddi_devid_t devid;
+	char *minor, *ret;
+
+	if ((fd = open(path, O_RDONLY)) < 0)
+		return (NULL);
+
+	minor = NULL;
+	ret = NULL;
+	if (devid_get(fd, &devid) == 0) {
+		if (devid_get_minor_name(fd, &minor) == 0)
+			ret = devid_str_encode(devid, minor);
+		if (minor != NULL)
+			devid_str_free(minor);
+		devid_free(devid);
+	}
+	(void) close(fd);
+
+	return (ret);
+}
+
+/*
+ * Issue the necessary ioctl() to update the stored path value for the vdev.  We
+ * ignore any failure here, since a common case is for an unprivileged user to
+ * type 'zpool status', and we'll display the correct information anyway.
+ */
+static void
+set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
+{
+	zfs_cmd_t zc = { 0 };
+
+	/*
+	 * strlcpy() always terminates the destination; strncpy() would leave
+	 * it unterminated for a source that fills the whole buffer.
+	 */
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+	    &zc.zc_guid) == 0);
+
+	(void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc);
+}
+
+/*
+ * Given a vdev, return the name to display in iostat.  If the vdev has a path,
+ * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type.
+ * We also check if this is a whole disk, in which case we strip off the
+ * trailing 's0' slice name.
+ *
+ * This routine is also responsible for identifying when disks have been
+ * reconfigured in a new location.
The kernel will have opened the device by
+ * devid, but the path will still refer to the old location.  To catch this, we
+ * first do a path -> devid translation (which is fast for the common case).  If
+ * the devid matches, we're done.  If not, we do a reverse devid -> path
+ * translation and issue the appropriate ioctl() to update the path of the vdev.
+ * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any
+ * of these checks.
+ *
+ * The returned string is allocated and must be freed by the caller; NULL is
+ * returned if the allocation fails.
+ */
+char *
+zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
+    boolean_t verbose)
+{
+	char *path, *devid;
+	uint64_t value;
+	char buf[64];
+	vdev_stat_t *vs;
+	uint_t vsc;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+	    &value) == 0) {
+		/* A vdev marked not present is named by its GUID. */
+		verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+		    &value) == 0);
+		(void) snprintf(buf, sizeof (buf), "%llu",
+		    (u_longlong_t)value);
+		path = buf;
+	} else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
+
+		/*
+		 * If the device is dead (faulted, offline, etc) then don't
+		 * bother opening it.  Otherwise we may be forcing the user to
+		 * open a misbehaving device, which can have undesirable
+		 * effects.
+		 */
+		if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+		    (uint64_t **)&vs, &vsc) != 0 ||
+		    vs->vs_state >= VDEV_STATE_DEGRADED) &&
+		    zhp != NULL &&
+		    nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) {
+			/*
+			 * Determine if the current path is correct.
+			 */
+			char *newdevid = path_to_devid(path);
+
+			if (newdevid == NULL ||
+			    strcmp(devid, newdevid) != 0) {
+				char *newpath;
+
+				if ((newpath = devid_to_path(devid)) != NULL) {
+					/*
+					 * Update the path appropriately.
+					 */
+					set_path(zhp, nv, newpath);
+					if (nvlist_add_string(nv,
+					    ZPOOL_CONFIG_PATH, newpath) == 0)
+						verify(nvlist_lookup_string(nv,
+						    ZPOOL_CONFIG_PATH,
+						    &path) == 0);
+					free(newpath);
+				}
+			}
+
+			if (newdevid)
+				devid_str_free(newdevid);
+		}
+
+		if (strncmp(path, "/dev/dsk/", 9) == 0)
+			path += 9;
+
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+		    &value) == 0 && value) {
+			int pathlen = strlen(path);
+			char *tmp = zfs_strdup(hdl, path);
+
+			/* The copy is edited in place below. */
+			if (tmp == NULL)
+				return (NULL);
+
+			/*
+			 * If it starts with c#, and ends with "s0", chop
+			 * the "s0" off, or if it ends with "s0/old", remove
+			 * the "s0" from the middle.
+			 */
+			if (CTD_CHECK(tmp)) {
+				if (strcmp(&tmp[pathlen - 2], "s0") == 0) {
+					tmp[pathlen - 2] = '\0';
+				} else if (pathlen > 6 &&
+				    strcmp(&tmp[pathlen - 6], "s0/old") == 0) {
+					(void) strcpy(&tmp[pathlen - 6],
+					    "/old");
+				}
+			}
+			return (tmp);
+		}
+	} else {
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
+
+		/*
+		 * If it's a raidz device, we need to stick in the parity level.
+		 */
+		if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+			    &value) == 0);
+			(void) snprintf(buf, sizeof (buf), "%s%llu", path,
+			    (u_longlong_t)value);
+			path = buf;
+		}
+
+		/*
+		 * We identify each top-level vdev by using a <type-id>
+		 * naming convention.
+		 */
+		if (verbose) {
+			uint64_t id;
+
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+			    &id) == 0);
+			(void) snprintf(buf, sizeof (buf), "%s-%llu", path,
+			    (u_longlong_t)id);
+			path = buf;
+		}
+	}
+
+	return (zfs_strdup(hdl, path));
+}
+
+/*
+ * qsort() comparator: orders bookmarks by their raw bytes.
+ */
+static int
+zbookmark_compare(const void *a, const void *b)
+{
+	return (memcmp(a, b, sizeof (zbookmark_t)));
+}
+
+/*
+ * Retrieve the persistent error log, uniquify the members, and return to the
+ * caller.
+ *
+ * On success *nverrlistp holds one nvlist per distinct (dataset, object)
+ * pair and 0 is returned.  If the pool reports no errors, 0 is returned and
+ * *nverrlistp is left untouched.  On failure a non-zero value is returned;
+ * if the list had already been allocated it is freed and *nverrlistp is
+ * set to NULL.
+ */
+int
+zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
+{
+	zfs_cmd_t zc = { 0 };
+	uint64_t count;
+	zbookmark_t *zb = NULL;
+	uint64_t i;
+
+	/*
+	 * Retrieve the raw error list from the kernel.  If the number of errors
+	 * has increased, allocate more space and continue until we get the
+	 * entire list.
+	 */
+	verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
+	    &count) == 0);
+	if (count == 0)
+		return (0);
+	if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
+	    count * sizeof (zbookmark_t))) == (uintptr_t)NULL)
+		return (-1);
+	zc.zc_nvlist_dst_size = count;
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	for (;;) {
+		if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG,
+		    &zc) != 0) {
+			/* Capture errno first; free() may overwrite it. */
+			int err = errno;
+
+			free((void *)(uintptr_t)zc.zc_nvlist_dst);
+			if (err == ENOMEM) {
+				/* The kernel reported the size it needs. */
+				count = zc.zc_nvlist_dst_size;
+				if ((zc.zc_nvlist_dst = (uintptr_t)
+				    zfs_alloc(zhp->zpool_hdl, count *
+				    sizeof (zbookmark_t))) == (uintptr_t)NULL)
+					return (-1);
+			} else {
+				return (-1);
+			}
+		} else {
+			break;
+		}
+	}
+
+	/*
+	 * Sort the resulting bookmarks.  This is a little confusing due to the
+	 * implementation of ZFS_IOC_ERROR_LOG.  The bookmarks are copied last
+	 * to first, and 'zc_nvlist_dst_size' indicates the number of bookmarks
+	 * _not_ copied as part of the process.  So we point the start of our
+	 * array appropriately and decrement the total number of elements.
+	 */
+	zb = ((zbookmark_t *)(uintptr_t)zc.zc_nvlist_dst) +
+	    zc.zc_nvlist_dst_size;
+	count -= zc.zc_nvlist_dst_size;
+
+	qsort(zb, count, sizeof (zbookmark_t), zbookmark_compare);
+
+	verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
+
+	/*
+	 * Fill in the nverrlistp with nvlist's of dataset and object numbers.
+	 */
+	for (i = 0; i < count; i++) {
+		nvlist_t *nv;
+
+		/* ignoring zb_blkid and zb_level for now */
+		if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset &&
+		    zb[i-1].zb_object == zb[i].zb_object)
+			continue;
+
+		if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0)
+			goto nomem;
+		if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET,
+		    zb[i].zb_objset) != 0) {
+			nvlist_free(nv);
+			goto nomem;
+		}
+		if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT,
+		    zb[i].zb_object) != 0) {
+			nvlist_free(nv);
+			goto nomem;
+		}
+		if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) {
+			nvlist_free(nv);
+			goto nomem;
+		}
+		nvlist_free(nv);
+	}
+
+	free((void *)(uintptr_t)zc.zc_nvlist_dst);
+	return (0);
+
+nomem:
+	/* Don't hand a half-built list back to the caller. */
+	nvlist_free(*nverrlistp);
+	*nverrlistp = NULL;
+	free((void *)(uintptr_t)zc.zc_nvlist_dst);
+	return (no_memory(zhp->zpool_hdl));
+}
+
+/*
+ * Upgrade a ZFS pool to the latest on-disk version.
+ *
+ * 'new_version' is passed to the kernel in zc_cookie.
+ */
+int
+zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
+{
+	zfs_cmd_t zc = { 0 };
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_cookie = new_version;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0)
+		return (zpool_standard_error_fmt(hdl, errno,
+		    dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"),
+		    zhp->zpool_name));
+	return (0);
+}
+
+/*
+ * Build a history record in 'history_str' (a buffer of at least
+ * HIS_MAX_RECORD_LEN bytes) from 'subcommand' followed by argv[1..argc-1],
+ * separated by single spaces.  Arguments are added only while the whole
+ * argument still fits; the first one that does not fit ends the record.
+ */
+void
+zpool_set_history_str(const char *subcommand, int argc, char **argv,
+    char *history_str)
+{
+	int i;
+
+	(void) strlcpy(history_str, subcommand, HIS_MAX_RECORD_LEN);
+	for (i = 1; i < argc; i++) {
+		/*
+		 * The record plus " " plus the argument must leave room for
+		 * the terminating NUL, hence '>=' rather than '>'.
+		 */
+		if (strlen(history_str) + 1 + strlen(argv[i]) >=
+		    HIS_MAX_RECORD_LEN)
+			break;
+		(void) strlcat(history_str, " ", HIS_MAX_RECORD_LEN);
+		(void) strlcat(history_str, argv[i], HIS_MAX_RECORD_LEN);
+	}
+}
+
+/*
+ * Stage command history for logging.
+ */
+int
+zpool_stage_history(libzfs_handle_t *hdl, const char *history_str)
+{
+	/* Reject a missing or over-long record before touching the handle. */
+	if (history_str == NULL ||
+	    strlen(history_str) > HIS_MAX_RECORD_LEN)
+		return (EINVAL);
+
+	/* Drop any previously staged string; free(NULL) is a no-op. */
+	free(hdl->libzfs_log_str);
+
+	hdl->libzfs_log_str = strdup(history_str);
+	if (hdl->libzfs_log_str == NULL)
+		return (no_memory(hdl));
+
+	return (0);
+}
+
+/*
+ * Perform ioctl to get some command history of a pool.
+ *
+ * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the
+ * logical offset of the history buffer to start reading from.
+ *
+ * Upon return, 'off' is the next logical offset to read from and
+ * 'len' is the actual amount of bytes read into 'buf'.
+ */
+static int
+get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
+{
+	zfs_cmd_t zc = { 0 };
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_history = (uint64_t)(uintptr_t)buf;
+	zc.zc_history_len = *len;
+	zc.zc_history_offset = *off;
+
+	/* Success: report how much was read and where to resume. */
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) == 0) {
+		*len = zc.zc_history_len;
+		*off = zc.zc_history_offset;
+		return (0);
+	}
+
+	/* Failure: translate errno into a libzfs error. */
+	switch (errno) {
+	case EPERM:
+		return (zfs_error_fmt(hdl, EZFS_PERM,
+		    dgettext(TEXT_DOMAIN,
+		    "cannot show history for pool '%s'"),
+		    zhp->zpool_name));
+	case ENOENT:
+		return (zfs_error_fmt(hdl, EZFS_NOHISTORY,
+		    dgettext(TEXT_DOMAIN, "cannot get history for pool "
+		    "'%s'"), zhp->zpool_name));
+	case ENOTSUP:
+		return (zfs_error_fmt(hdl, EZFS_BADVERSION,
+		    dgettext(TEXT_DOMAIN, "cannot get history for pool "
+		    "'%s', pool must be upgraded"), zhp->zpool_name));
+	default:
+		return (zpool_standard_error_fmt(hdl, errno,
+		    dgettext(TEXT_DOMAIN,
+		    "cannot get history for '%s'"), zhp->zpool_name));
+	}
+}
+
+/*
+ * Process the buffer of nvlists, unpacking and storing each nvlist record
+ * into 'records'. 'leftover' is set to the number of bytes that weren't
+ * processed as there wasn't a complete record.
+ */
+int
+zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
+    nvlist_t ***records, uint_t *numrecords)
+{
+	uint64_t reclen;
+	nvlist_t *nv;
+	int i;
+
+	while (bytes_read > sizeof (reclen)) {
+
+		/* get length of packed record (stored as little endian) */
+		for (i = 0, reclen = 0; i < sizeof (reclen); i++)
+			reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
+
+		/*
+		 * Stop at an incomplete record. Written as a subtraction
+		 * (safe: bytes_read > sizeof (reclen) here) so that a huge
+		 * 'reclen' cannot wrap the comparison.
+		 */
+		if (bytes_read - sizeof (reclen) < reclen)
+			break;
+
+		/* unpack record */
+		if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
+			return (ENOMEM);
+		bytes_read -= sizeof (reclen) + reclen;
+		buf += sizeof (reclen) + reclen;
+
+		/*
+		 * add record to nvlist array, growing it whenever the new
+		 * record count plus one is a power of two. The realloc()
+		 * result goes through a temporary so that on failure the
+		 * caller's array (and count) are left intact and can still be
+		 * freed.
+		 */
+		if (ISP2(*numrecords + 2)) {
+			nvlist_t **grown = realloc(*records,
+			    (*numrecords + 1) * 2 * sizeof (nvlist_t *));
+
+			if (grown == NULL) {
+				nvlist_free(nv);
+				return (ENOMEM);
+			}
+			*records = grown;
+		}
+		(*records)[(*numrecords)++] = nv;
+	}
+
+	*leftover = bytes_read;
+	return (0);
+}
+
+#define	HIS_BUF_LEN	(128*1024)
+
+/*
+ * Retrieve the command history of a pool.
+ *
+ * On success '*nvhisp' is a newly allocated nvlist holding the records under
+ * ZPOOL_HIST_RECORD. Returns 0, or the error from get_history() /
+ * zpool_history_unpack().
+ */
+int
+zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp)
+{
+	char buf[HIS_BUF_LEN];
+	uint64_t off = 0;
+	nvlist_t **records = NULL;
+	uint_t numrecords = 0;
+	uint_t i;
+	int err;
+
+	do {
+		uint64_t bytes_read = sizeof (buf);
+		uint64_t leftover;
+
+		if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0)
+			break;
+
+		/* if nothing else was read in, we're at EOF, just return */
+		if (!bytes_read)
+			break;
+
+		if ((err = zpool_history_unpack(buf, bytes_read,
+		    &leftover, &records, &numrecords)) != 0)
+			break;
+		/* re-read the trailing partial record on the next pass */
+		off -= leftover;
+
+		/* CONSTCOND */
+	} while (1);
+
+	if (!err) {
+		verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
+		verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
+		    records, numrecords) == 0);
+	}
+	for (i = 0; i < numrecords; i++)
+		nvlist_free(records[i]);
+	free(records);
+
+	return (err);
+}
+
+/*
+ * Format a human-readable name for object 'obj' of dataset 'dsobj' into
+ * 'pathname' (at most 'len' bytes). Falls back to raw object numbers when
+ * the dataset name or the object's path cannot be obtained.
+ */
+void
+zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
+    char *pathname, size_t len)
+{
+	zfs_cmd_t zc = { 0 };
+	boolean_t mounted = B_FALSE;
+	char *mntpnt = NULL;
+	char dsname[MAXNAMELEN];
+
+	if (dsobj == 0) {
+		/* special case for the MOS */
+		(void) snprintf(pathname, len, "<metadata>:<0x%llx>",
+		    (u_longlong_t)obj);
+		return;
+	}
+
+	/* get the dataset's name */
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_obj = dsobj;
+	if (ioctl(zhp->zpool_hdl->libzfs_fd,
+	    ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) {
+		/* just write out a path of two object numbers */
+		(void) snprintf(pathname, len, "<0x%llx>:<0x%llx>",
+		    (u_longlong_t)dsobj, (u_longlong_t)obj);
+		return;
+	}
+	(void) strlcpy(dsname, zc.zc_value, sizeof (dsname));
+
+	/* find out if the dataset is mounted */
+	mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt);
+
+	/* get the corrupted object's path */
+	(void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
+	zc.zc_obj = obj;
+	if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH,
+	    &zc) == 0) {
+		if (mounted) {
+			(void) snprintf(pathname, len, "%s%s", mntpnt,
+			    zc.zc_value);
+		} else {
+			(void) snprintf(pathname, len, "%s:%s",
+			    dsname, zc.zc_value);
+		}
+	} else {
+		(void) snprintf(pathname, len, "%s:<0x%llx>", dsname,
+		    (u_longlong_t)obj);
+	}
+	free(mntpnt);
+}
+
+/*
+ * Read the EFI label from the config, if a label does not exist then
+ * pass back the error to the caller. If the caller has passed a non-NULL
+ * diskaddr argument then we set it to the starting address of the EFI
+ * partition.
+ */
+static int
+read_efi_label(nvlist_t *config, diskaddr_t *sb)
+{
+	char *path;
+	char *slash;
+	int fd;
+	char diskname[MAXPATHLEN];
+	int err = -1;
+
+	if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
+		return (err);
+
+	/*
+	 * The raw device name is RDISK_ROOT plus the last path component
+	 * (including its leading '/'). A path without any '/' cannot be
+	 * mapped, and passing the NULL from strrchr() to "%s" is undefined.
+	 */
+	if ((slash = strrchr(path, '/')) == NULL)
+		return (err);
+
+	(void) snprintf(diskname, sizeof (diskname), "%s%s", RDISK_ROOT,
+	    slash);
+	if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) {
+		struct dk_gpt *vtoc;
+
+		if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
+			if (sb != NULL)
+				*sb = vtoc->efi_parts[0].p_start;
+			efi_free(vtoc);
+		}
+		(void) close(fd);
+	}
+	return (err);
+}
+
+/*
+ * determine where a partition starts on a disk in the current
+ * configuration
+ *
+ * Walks the vdev tree depth-first and returns the start block of the first
+ * whole-disk leaf whose EFI label can be read, or MAXOFFSET_T if none is
+ * found.
+ */
+static diskaddr_t
+find_start_block(nvlist_t *config)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	diskaddr_t sb = MAXOFFSET_T;
+	uint64_t wholedisk;
+
+	if (nvlist_lookup_nvlist_array(config,
+	    ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) {
+		/* leaf vdev: only whole disks carry a label we wrote */
+		if (nvlist_lookup_uint64(config,
+		    ZPOOL_CONFIG_WHOLE_DISK,
+		    &wholedisk) != 0 || !wholedisk) {
+			return (MAXOFFSET_T);
+		}
+		if (read_efi_label(config, &sb) < 0)
+			sb = MAXOFFSET_T;
+		return (sb);
+	}
+
+	for (c = 0; c < children; c++) {
+		sb = find_start_block(child[c]);
+		if (sb != MAXOFFSET_T) {
+			return (sb);
+		}
+	}
+	return (MAXOFFSET_T);
+}
+
+/*
+ * Label an individual disk. The name provided is the short name,
+ * stripped of any leading /dev path.
+ */
+int
+zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
+{
+	char path[MAXPATHLEN];
+	struct dk_gpt *vtoc;
+	int fd;
+	size_t resv = EFI_MIN_RESV_SIZE;
+	uint64_t slice_size;
+	diskaddr_t start_block;
+	char errbuf[1024];
+
+	/* prepare an error message just in case */
+	(void) snprintf(errbuf, sizeof (errbuf),
+	    dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
+
+	if (zhp) {
+		nvlist_t *nvroot;
+
+		if (pool_is_bootable(zhp)) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "EFI labeled devices are not supported on root "
+			    "pools."));
+			return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf));
+		}
+
+		verify(nvlist_lookup_nvlist(zhp->zpool_config,
+		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+
+		/* look the start block up once and cache it on the handle */
+		if (zhp->zpool_start_block == 0)
+			start_block = find_start_block(nvroot);
+		else
+			start_block = zhp->zpool_start_block;
+		zhp->zpool_start_block = start_block;
+	} else {
+		/* new pool */
+		start_block = NEW_START_BLOCK;
+	}
+
+	(void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name,
+	    BACKUP_SLICE);
+
+	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
+		/*
+		 * This shouldn't happen. We've long since verified that this
+		 * is a valid device.
+		 */
+		zfs_error_aux(hdl,
+		    dgettext(TEXT_DOMAIN, "unable to open device"));
+		return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
+	}
+
+	if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
+		/*
+		 * The only way this can fail is if we run out of memory, or we
+		 * were unable to read the disk's capacity
+		 */
+		if (errno == ENOMEM)
+			(void) no_memory(hdl);
+
+		(void) close(fd);
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "unable to read disk capacity"), name);
+
+		return (zfs_error(hdl, EZFS_NOCAP, errbuf));
+	}
+
+	slice_size = vtoc->efi_last_u_lba + 1;
+	slice_size -= EFI_MIN_RESV_SIZE;
+	if (start_block == MAXOFFSET_T)
+		start_block = NEW_START_BLOCK;
+	slice_size -= start_block;
+
+	vtoc->efi_parts[0].p_start = start_block;
+	vtoc->efi_parts[0].p_size = slice_size;
+
+	/*
+	 * Why we use V_USR: V_BACKUP confuses users, and is considered
+	 * disposable by some EFI utilities (since EFI doesn't have a backup
+	 * slice). V_UNASSIGNED is supposed to be used only for zero size
+	 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT,
+	 * etc. were all pretty specific. V_USR is as close to reality as we
+	 * can get, in the absence of V_OTHER.
+	 */
+	vtoc->efi_parts[0].p_tag = V_USR;
+	(void) strcpy(vtoc->efi_parts[0].p_name, "zfs");
+
+	vtoc->efi_parts[8].p_start = slice_size + start_block;
+	vtoc->efi_parts[8].p_size = resv;
+	vtoc->efi_parts[8].p_tag = V_RESERVED;
+
+	if (efi_write(fd, vtoc) != 0) {
+		/*
+		 * Some block drivers (like pcata) may not support EFI
+		 * GPT labels. Print out a helpful error message dir-
+		 * ecting the user to manually label the disk and give
+		 * a specific slice.
+		 */
+		(void) close(fd);
+		efi_free(vtoc);
+
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "try using fdisk(1M) and then provide a specific slice"));
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
+	}
+
+	(void) close(fd);
+	efi_free(vtoc);
+	return (0);
+}
+
+/*
+ * Recursively check that 'config' and all of its children are vdev types
+ * that may back a dump device. Reports EZFS_VDEVNOTSUP and returns B_FALSE
+ * on the first unsupported type.
+ */
+static boolean_t
+supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf)
+{
+	char *type;
+	nvlist_t **child;
+	uint_t children, c;
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0);
+	if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
+	    strcmp(type, VDEV_TYPE_FILE) == 0 ||
+	    strcmp(type, VDEV_TYPE_LOG) == 0 ||
+	    strcmp(type, VDEV_TYPE_HOLE) == 0 ||
+	    strcmp(type, VDEV_TYPE_MISSING) == 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "vdev type '%s' is not supported"), type);
+		(void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf);
+		return (B_FALSE);
+	}
+	if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if (!supported_dump_vdev_type(hdl, child[c], errbuf))
+				return (B_FALSE);
+		}
+	}
+	return (B_TRUE);
+}
+
+/*
+ * check if this zvol is allowable for use as a dump device; zero if
+ * it is, > 0 if it isn't, < 0 if it isn't a zvol
+ */
+int
+zvol_check_dump_config(char *arg)
+{
+	zpool_handle_t *zhp = NULL;
+	nvlist_t *config, *nvroot;
+	char *p, *volname;
+	nvlist_t **top;
+	uint_t toplevels;
+	libzfs_handle_t *hdl;
+	char errbuf[1024];
+	char poolname[ZPOOL_MAXNAMELEN];
+	int pathlen = strlen(ZVOL_FULL_DEV_DIR);
+	int ret = 1;
+
+	if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) {
+		return (-1);
+	}
+
+	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+	    "dump is not supported on device '%s'"), arg);
+
+	if ((hdl = libzfs_init()) == NULL)
+		return (1);
+	libzfs_print_on_error(hdl, B_TRUE);
+
+	volname = arg + pathlen;
+
+	/*
+	 * check the configuration of the pool
+	 *
+	 * Every failure from here on leaves through 'out' so that the libzfs
+	 * handle opened above is always released.
+	 */
+	if ((p = strchr(volname, '/')) == NULL) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "malformed dataset name"));
+		(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
+		goto out;
+	}
+	/* the pool component must also fit in 'poolname' with its NUL */
+	if (p - volname >= ZFS_MAXNAMELEN ||
+	    (size_t)(p - volname) >= sizeof (poolname)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "dataset name is too long"));
+		(void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf);
+		goto out;
+	}
+	(void) strlcpy(poolname, volname, p - volname + 1);
+
+	if ((zhp = zpool_open(hdl, poolname)) == NULL) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "could not open pool '%s'"), poolname);
+		(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
+		goto out;
+	}
+	config = zpool_get_config(zhp, NULL);
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) != 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "could not obtain vdev configuration for '%s'"), poolname);
+		(void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf);
+		goto out;
+	}
+
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &top, &toplevels) == 0);
+	if (toplevels != 1) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "'%s' has multiple top level vdevs"), poolname);
+		(void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf);
+		goto out;
+	}
+
+	if (!supported_dump_vdev_type(hdl, top[0], errbuf)) {
+		goto out;
+	}
+	ret = 0;
+
+out:
+	if (zhp)
+		zpool_close(zhp);
+	libzfs_fini(hdl);
+	return (ret);
+}
diff --git a/lib/libzfs/common/libzfs_sendrecv.c b/lib/libzfs/common/libzfs_sendrecv.c
new file mode 100644
index 0000000..3093ab9
--- /dev/null
+++ b/lib/libzfs/common/libzfs_sendrecv.c
@@ -0,0 +1,3021 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+#include "zfs_fletcher.h"
+#include "libzfs_impl.h"
+#include
+#include
+#include
+
+/* in libzfs_dataset.c */
+extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
+
+static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t,
+    int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *);
+
+static const zio_cksum_t zero_cksum = { 0 };
+
+typedef struct dedup_arg {
+	int	inputfd;
+	int	outputfd;
+	libzfs_handle_t  *dedup_hdl;
+} dedup_arg_t;
+
+typedef struct dataref {
+	uint64_t ref_guid;
+	uint64_t ref_object;
+	uint64_t ref_offset;
+} dataref_t;
+
+typedef struct dedup_entry {
+	struct dedup_entry	*dde_next;
+	zio_cksum_t dde_chksum;
+	uint64_t dde_prop;
+	dataref_t dde_ref;
+} dedup_entry_t;
+
+#define	MAX_DDT_PHYSMEM_PERCENT		20
+#define	SMALLEST_POSSIBLE_MAX_DDT_MB		128
+
+typedef struct dedup_table {
+	dedup_entry_t	**dedup_hash_array;
+	umem_cache_t	*ddecache;
+	uint64_t	max_ddt_size;  /* max dedup table size in bytes */
+	uint64_t	cur_ddt_size;  /* current dedup table size in bytes */
+	uint64_t	ddt_count;
+	int		numhashbits;
+	boolean_t	ddt_full;
+} dedup_table_t;
+
+/*
+ * Return the 1-based position of the most significant set bit of 'n',
+ * or 0 when 'n' is 0.
+ */
+static int
+high_order_bit(uint64_t n)
+{
+	int bits = 0;
+
+	while (n != 0) {
+		n >>= 1;
+		bits++;
+	}
+	return (bits);
+}
+
+/*
+ * Read one item of 'len' bytes from 'stream' into 'buf'. Returns the
+ * fread() item count: 1 if the whole item was read, 0 otherwise.
+ */
+static size_t
+ssread(void *buf, size_t len, FILE *stream)
+{
+	return (fread(buf, len, 1, stream));
+}
+
+/*
+ * Allocate a new entry and hang it off '*ddepp' (the NULL link at the end
+ * of a hash chain). Does nothing once the table has reached its size limit
+ * (an auxiliary error is recorded the first time that happens) or when the
+ * entry cannot be allocated.
+ */
+static void
+ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp,
+    zio_cksum_t *cs, uint64_t prop, dataref_t *dr)
+{
+	dedup_entry_t *entry;
+
+	if (ddt->cur_ddt_size >= ddt->max_ddt_size) {
+		if (ddt->ddt_full == B_FALSE) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "Dedup table full. Deduplication will continue "
+			    "with existing table entries"));
+			ddt->ddt_full = B_TRUE;
+		}
+		return;
+	}
+
+	entry = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT);
+	if (entry == NULL)
+		return;
+
+	assert(*ddepp == NULL);
+	entry->dde_next = NULL;
+	entry->dde_chksum = *cs;
+	entry->dde_prop = prop;
+	entry->dde_ref = *dr;
+	*ddepp = entry;
+	ddt->cur_ddt_size += sizeof (dedup_entry_t);
+	ddt->ddt_count++;
+}
+
+/*
+ * Using the specified dedup table, do a lookup for an entry with
+ * the checksum cs. If found, return the block's reference info
+ * in *dr. Otherwise, insert a new entry in the dedup table, using
+ * the reference information specified by *dr.
+ *
+ * return value: true - entry was found
+ *		 false - entry was not found
+ */
+static boolean_t
+ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs,
+    uint64_t prop, dataref_t *dr)
+{
+	uint32_t bucket;
+	dedup_entry_t **linkp;
+
+	/* the low 'numhashbits' bits of the checksum select the bucket */
+	bucket = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits);
+
+	linkp = &(ddt->dedup_hash_array[bucket]);
+	while (*linkp != NULL) {
+		if (ZIO_CHECKSUM_EQUAL(((*linkp)->dde_chksum), *cs) &&
+		    (*linkp)->dde_prop == prop) {
+			*dr = (*linkp)->dde_ref;
+			return (B_TRUE);
+		}
+		linkp = &((*linkp)->dde_next);
+	}
+
+	/* not found: 'linkp' now addresses the chain's terminating link */
+	ddt_hash_append(hdl, ddt, linkp, cs, prop, dr);
+	return (B_FALSE);
+}
+
+/*
+ * Fold 'buf' into the running stream checksum 'zc', then write it to
+ * 'outfd'. Returns the result of write().
+ */
+static int
+cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd)
+{
+	int written;
+
+	fletcher_4_incremental_native(buf, len, zc);
+	written = write(outfd, buf, len);
+	return (written);
+}
+
+/*
+ * This function is started in a separate thread when the dedup option
+ * has been requested.
The main send thread determines the list of
+ * snapshots to be included in the send stream and makes the ioctl calls
+ * for each one. But instead of having the ioctl send the output to
+ * the output fd specified by the caller of zfs_send(), the
+ * ioctl is told to direct the output to a pipe, which is read by the
+ * alternate thread running THIS function. This function does the
+ * dedup'ing by:
+ *  1. building a dedup table (the DDT)
+ *  2. doing checksums on each data block and inserting a record in the DDT
+ *  3. looking for matching checksums, and
+ *  4. sending a DRR_WRITE_BYREF record instead of a write record whenever
+ *     a duplicate block is found.
+ * The output of this function then goes to the output fd requested
+ * by the caller of zfs_send().
+ *
+ * 'arg' is a dedup_arg_t: 'inputfd' is the read end of the pipe, 'outputfd'
+ * the final destination. The stream checksum is recomputed over everything
+ * written and stored in the DRR_END record. Always returns NULL.
+ */
+static void *
+cksummer(void *arg)
+{
+	dedup_arg_t *dda = arg;
+	char *buf = malloc(1<<20);
+	dmu_replay_record_t thedrr;
+	dmu_replay_record_t *drr = &thedrr;
+	struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
+	struct drr_end *drre = &thedrr.drr_u.drr_end;
+	struct drr_object *drro = &thedrr.drr_u.drr_object;
+	struct drr_write *drrw = &thedrr.drr_u.drr_write;
+	struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+	FILE *ofp;
+	int outfd;
+	dmu_replay_record_t wbr_drr = {0};
+	struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref;
+	dedup_table_t ddt;
+	zio_cksum_t stream_cksum;
+	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
+	uint64_t numbuckets;
+
+	ddt.max_ddt_size =
+	    MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100,
+	    SMALLEST_POSSIBLE_MAX_DDT_MB<<20);
+
+	numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t));
+
+	/*
+	 * numbuckets must be a power of 2. Increase number to
+	 * a power of 2 if necessary. The shift is done in 64 bits: with
+	 * enough physical memory the bit index reaches 31 or more, which
+	 * would overflow a plain int constant.
+	 */
+	if (!ISP2(numbuckets))
+		numbuckets = 1ULL << high_order_bit(numbuckets);
+
+	ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *));
+	ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0,
+	    NULL, NULL, NULL, NULL, NULL, 0);
+	ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *);
+	ddt.numhashbits = high_order_bit(numbuckets) - 1;
+	ddt.ddt_count = 0;	/* incremented by ddt_hash_append() */
+	ddt.ddt_full = B_FALSE;
+
+	/* Initialize the write-by-reference block. */
+	wbr_drr.drr_type = DRR_WRITE_BYREF;
+	wbr_drr.drr_payloadlen = 0;
+
+	outfd = dda->outputfd;
+	ofp = fdopen(dda->inputfd, "r");
+	while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) {
+
+		switch (drr->drr_type) {
+		case DRR_BEGIN:
+		{
+			int	fflags;
+			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+
+			/* set the DEDUP feature flag for this stream */
+			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+			fflags |= (DMU_BACKUP_FEATURE_DEDUP |
+			    DMU_BACKUP_FEATURE_DEDUPPROPS);
+			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
+
+			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+			    &stream_cksum, outfd) == -1)
+				goto out;
+			if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+			    DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
+				int sz = drr->drr_payloadlen;
+
+				/* grow the bounce buffer for a big payload */
+				if (sz > 1<<20) {
+					free(buf);
+					buf = malloc(sz);
+					if (buf == NULL)
+						goto out;
+				}
+				(void) ssread(buf, sz, ofp);
+				/* check the stream we actually read from */
+				if (ferror(ofp))
+					perror("fread");
+				if (cksum_and_write(buf, sz, &stream_cksum,
+				    outfd) == -1)
+					goto out;
+			}
+			break;
+		}
+
+		case DRR_END:
+		{
+			/* use the recalculated checksum */
+			ZIO_SET_CHECKSUM(&drre->drr_checksum,
+			    stream_cksum.zc_word[0], stream_cksum.zc_word[1],
+			    stream_cksum.zc_word[2], stream_cksum.zc_word[3]);
+			if ((write(outfd, drr,
+			    sizeof (dmu_replay_record_t))) == -1)
+				goto out;
+			break;
+		}
+
+		case DRR_OBJECT:
+		{
+			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+			    &stream_cksum, outfd) == -1)
+				goto out;
+			if (drro->drr_bonuslen > 0) {
+				(void) ssread(buf,
+				    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
+				    ofp);
+				if (cksum_and_write(buf,
+				    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
+				    &stream_cksum, outfd) == -1)
+					goto out;
+			}
+			break;
+		}
+
+		case DRR_SPILL:
+		{
+			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+			    &stream_cksum, outfd) == -1)
+				goto out;
+			(void) ssread(buf, drrs->drr_length, ofp);
+			if (cksum_and_write(buf, drrs->drr_length,
+			    &stream_cksum, outfd) == -1)
+				goto out;
+			break;
+		}
+
+		case DRR_FREEOBJECTS:
+		{
+			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+			    &stream_cksum, outfd) == -1)
+				goto out;
+			break;
+		}
+
+		case DRR_WRITE:
+		{
+			dataref_t	dataref;
+
+			(void) ssread(buf, drrw->drr_length, ofp);
+
+			/*
+			 * Use the existing checksum if it's dedup-capable,
+			 * else calculate a SHA256 checksum for it.
+			 */
+
+			if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
+			    zero_cksum) ||
+			    !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
+				SHA256_CTX	ctx;
+				zio_cksum_t	tmpsha256;
+
+				SHA256Init(&ctx);
+				SHA256Update(&ctx, buf, drrw->drr_length);
+				SHA256Final(&tmpsha256, &ctx);
+				drrw->drr_key.ddk_cksum.zc_word[0] =
+				    BE_64(tmpsha256.zc_word[0]);
+				drrw->drr_key.ddk_cksum.zc_word[1] =
+				    BE_64(tmpsha256.zc_word[1]);
+				drrw->drr_key.ddk_cksum.zc_word[2] =
+				    BE_64(tmpsha256.zc_word[2]);
+				drrw->drr_key.ddk_cksum.zc_word[3] =
+				    BE_64(tmpsha256.zc_word[3]);
+				drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256;
+				drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP;
+			}
+
+			dataref.ref_guid = drrw->drr_toguid;
+			dataref.ref_object = drrw->drr_object;
+			dataref.ref_offset = drrw->drr_offset;
+
+			if (ddt_update(dda->dedup_hdl, &ddt,
+			    &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop,
+			    &dataref)) {
+				/* block already present in stream */
+				wbr_drrr->drr_object = drrw->drr_object;
+				wbr_drrr->drr_offset = drrw->drr_offset;
+				wbr_drrr->drr_length = drrw->drr_length;
+				wbr_drrr->drr_toguid = drrw->drr_toguid;
+				wbr_drrr->drr_refguid = dataref.ref_guid;
+				wbr_drrr->drr_refobject =
+				    dataref.ref_object;
+				wbr_drrr->drr_refoffset =
+				    dataref.ref_offset;
+
+				/* carry type and flags over field-for-field */
+				wbr_drrr->drr_checksumtype =
+				    drrw->drr_checksumtype;
+				wbr_drrr->drr_checksumflags =
+				    drrw->drr_checksumflags;
+				wbr_drrr->drr_key.ddk_cksum =
+				    drrw->drr_key.ddk_cksum;
+				wbr_drrr->drr_key.ddk_prop =
+				    drrw->drr_key.ddk_prop;
+
+				if (cksum_and_write(&wbr_drr,
+				    sizeof (dmu_replay_record_t), &stream_cksum,
+				    outfd) == -1)
+					goto out;
+			} else {
+				/* block not previously seen */
+				if (cksum_and_write(drr,
+				    sizeof (dmu_replay_record_t), &stream_cksum,
+				    outfd) == -1)
+					goto out;
+				if (cksum_and_write(buf,
+				    drrw->drr_length,
+				    &stream_cksum, outfd) == -1)
+					goto out;
+			}
+			break;
+		}
+
+		case DRR_FREE:
+		{
+			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+			    &stream_cksum, outfd) == -1)
+				goto out;
+			break;
+		}
+
+		default:
+			(void) printf("INVALID record type 0x%x\n",
+			    drr->drr_type);
+			/* should never happen, so assert */
+			assert(B_FALSE);
+		}
+	}
+out:
+	umem_cache_destroy(ddt.ddecache);
+	free(ddt.dedup_hash_array);
+	free(buf);
+	(void) fclose(ofp);
+
+	return (NULL);
+}
+
+/*
+ * Routines for dealing with the AVL tree of fs-nvlists
+ */
+typedef struct fsavl_node {
+	avl_node_t fn_node;
+	nvlist_t *fn_nvfs;
+	char *fn_snapname;
+	uint64_t fn_guid;
+} fsavl_node_t;
+
+/* AVL comparator: order fsavl nodes by snapshot guid. */
+static int
+fsavl_compare(const void *arg1, const void *arg2)
+{
+	const fsavl_node_t *fn1 = arg1;
+	const fsavl_node_t *fn2 = arg2;
+
+	if (fn1->fn_guid > fn2->fn_guid)
+		return (+1);
+	else if (fn1->fn_guid < fn2->fn_guid)
+		return (-1);
+	else
+		return (0);
+}
+
+/*
+ * Given the GUID of a snapshot, find its containing filesystem and
+ * (optionally) name.
+ */
+static nvlist_t *
+fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname)
+{
+	fsavl_node_t key;
+	fsavl_node_t *match;
+
+	/* only the guid takes part in the comparison */
+	key.fn_guid = snapguid;
+	match = avl_find(avl, &key, NULL);
+	if (match == NULL)
+		return (NULL);
+
+	if (snapname != NULL)
+		*snapname = match->fn_snapname;
+	return (match->fn_nvfs);
+}
+
+/*
+ * Free every node of the tree and then the tree itself. A NULL tree is
+ * accepted and ignored.
+ */
+static void
+fsavl_destroy(avl_tree_t *avl)
+{
+	void *cookie = NULL;
+	fsavl_node_t *node;
+
+	if (avl == NULL)
+		return;
+
+	for (node = avl_destroy_nodes(avl, &cookie); node != NULL;
+	    node = avl_destroy_nodes(avl, &cookie))
+		free(node);
+
+	avl_destroy(avl);
+	free(avl);
+}
+
+/*
+ * Given an nvlist, produce an avl tree of snapshots, ordered by guid
+ *
+ * Each element of 'fss' is an nvlist with a "snaps" member mapping snapshot
+ * names to guids. The nodes point into 'fss' (nvlist and name are not
+ * copied). Returns NULL if memory cannot be allocated.
+ */
+static avl_tree_t *
+fsavl_create(nvlist_t *fss)
+{
+	avl_tree_t *tree;
+	nvpair_t *fspair;
+
+	tree = malloc(sizeof (avl_tree_t));
+	if (tree == NULL)
+		return (NULL);
+
+	avl_create(tree, fsavl_compare, sizeof (fsavl_node_t),
+	    offsetof(fsavl_node_t, fn_node));
+
+	for (fspair = nvlist_next_nvpair(fss, NULL); fspair != NULL;
+	    fspair = nvlist_next_nvpair(fss, fspair)) {
+		nvlist_t *nvfs, *snaps;
+		nvpair_t *snappair;
+
+		VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs));
+		VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps));
+
+		for (snappair = nvlist_next_nvpair(snaps, NULL);
+		    snappair != NULL;
+		    snappair = nvlist_next_nvpair(snaps, snappair)) {
+			fsavl_node_t *node;
+			uint64_t guid;
+
+			VERIFY(0 == nvpair_value_uint64(snappair, &guid));
+
+			node = malloc(sizeof (fsavl_node_t));
+			if (node == NULL) {
+				fsavl_destroy(tree);
+				return (NULL);
+			}
+			node->fn_nvfs = nvfs;
+			node->fn_snapname = nvpair_name(snappair);
+			node->fn_guid = guid;
+
+			/*
+			 * Note: if there are multiple snaps with the
+			 * same GUID, we ignore all but one.
+			 */
+			if (avl_find(tree, node, NULL) != NULL)
+				free(node);
+			else
+				avl_add(tree, node);
+		}
+	}
+
+	return (tree);
+}
+
+/*
+ * Routines for dealing with the giant nvlist of fs-nvlists, etc.
+ */ +typedef struct send_data { + uint64_t parent_fromsnap_guid; + nvlist_t *parent_snaps; + nvlist_t *fss; + nvlist_t *snapprops; + const char *fromsnap; + const char *tosnap; + boolean_t recursive; + + /* + * The header nvlist is of the following format: + * { + * "tosnap" -> string + * "fromsnap" -> string (if incremental) + * "fss" -> { + * id -> { + * + * "name" -> string (full name; for debugging) + * "parentfromsnap" -> number (guid of fromsnap in parent) + * + * "props" -> { name -> value (only if set here) } + * "snaps" -> { name (lastname) -> number (guid) } + * "snapprops" -> { name (lastname) -> { name -> value } } + * + * "origin" -> number (guid) (if clone) + * "sent" -> boolean (not on-disk) + * } + * } + * } + * + */ +} send_data_t; + +static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv); + +static int +send_iterate_snap(zfs_handle_t *zhp, void *arg) +{ + send_data_t *sd = arg; + uint64_t guid = zhp->zfs_dmustats.dds_guid; + char *snapname; + nvlist_t *nv; + + snapname = strrchr(zhp->zfs_name, '@')+1; + + VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid)); + /* + * NB: if there is no fromsnap here (it's a newly created fs in + * an incremental replication), we will substitute the tosnap. 
+ */ + if ((sd->fromsnap && strcmp(snapname, sd->fromsnap) == 0) || + (sd->parent_fromsnap_guid == 0 && sd->tosnap && + strcmp(snapname, sd->tosnap) == 0)) { + sd->parent_fromsnap_guid = guid; + } + + VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); + send_iterate_prop(zhp, nv); + VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv)); + nvlist_free(nv); + + zfs_close(zhp); + return (0); +} + +static void +send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) +{ + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) { + char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); + nvlist_t *propnv; + + if (!zfs_prop_user(propname)) { + /* + * Realistically, this should never happen. However, + * we want the ability to add DSL properties without + * needing to make incompatible version changes. We + * need to ignore unknown properties to allow older + * software to still send datasets containing these + * properties, with the unknown properties elided. + */ + if (prop == ZPROP_INVAL) + continue; + + if (zfs_prop_readonly(prop)) + continue; + } + + verify(nvpair_value_nvlist(elem, &propnv) == 0); + if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || + prop == ZFS_PROP_REFQUOTA || + prop == ZFS_PROP_REFRESERVATION) { + char *source; + uint64_t value; + verify(nvlist_lookup_uint64(propnv, + ZPROP_VALUE, &value) == 0); + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) + continue; + /* + * May have no source before SPA_VERSION_RECVD_PROPS, + * but is still modifiable. 
+			 */
+			if (nvlist_lookup_string(propnv,
+			    ZPROP_SOURCE, &source) == 0) {
+				if ((strcmp(source, zhp->zfs_name) != 0) &&
+				    (strcmp(source,
+				    ZPROP_SOURCE_VAL_RECVD) != 0))
+					continue;
+			}
+		} else {
+			char *source;
+			if (nvlist_lookup_string(propnv,
+			    ZPROP_SOURCE, &source) != 0)
+				continue;
+			if ((strcmp(source, zhp->zfs_name) != 0) &&
+			    (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0))
+				continue;
+		}
+
+		if (zfs_prop_user(propname) ||
+		    zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+			char *value;
+			verify(nvlist_lookup_string(propnv,
+			    ZPROP_VALUE, &value) == 0);
+			VERIFY(0 == nvlist_add_string(nv, propname, value));
+		} else {
+			uint64_t value;
+			verify(nvlist_lookup_uint64(propnv,
+			    ZPROP_VALUE, &value) == 0);
+			VERIFY(0 == nvlist_add_uint64(nv, propname, value));
+		}
+	}
+}
+
+/*
+ * Recursively generate nvlists describing datasets.  See the comment
+ * for the data structure send_data_t above for a description of the
+ * contents of the nvlist.
+ *
+ * 'arg' is the send_data_t being filled in.  Returns -1 if the clone
+ * origin snapshot cannot be opened; in that case the partially built
+ * nvlist is freed and zhp is closed before returning, matching the
+ * normal exit, which also closes zhp.
+ */
+static int
+send_iterate_fs(zfs_handle_t *zhp, void *arg)
+{
+	send_data_t *sd = arg;
+	nvlist_t *nvfs, *nv;
+	int rv = 0;
+	uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid;
+	uint64_t guid = zhp->zfs_dmustats.dds_guid;
+	char guidstring[64];
+
+	VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0));
+	VERIFY(0 == nvlist_add_string(nvfs, "name", zhp->zfs_name));
+	VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap",
+	    sd->parent_fromsnap_guid));
+
+	if (zhp->zfs_dmustats.dds_origin[0]) {
+		zfs_handle_t *origin = zfs_open(zhp->zfs_hdl,
+		    zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
+		if (origin == NULL) {
+			/* release everything acquired so far */
+			nvlist_free(nvfs);
+			zfs_close(zhp);
+			return (-1);
+		}
+		VERIFY(0 == nvlist_add_uint64(nvfs, "origin",
+		    origin->zfs_dmustats.dds_guid));
+		/* only the guid was needed; don't leak the handle */
+		zfs_close(origin);
+	}
+
+	/* iterate over props */
+	VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0));
+	send_iterate_prop(zhp, nv);
+	VERIFY(0 == nvlist_add_nvlist(nvfs, "props", nv));
+	nvlist_free(nv);
+
+	/* iterate over snaps, and set sd->parent_fromsnap_guid */
+	sd->parent_fromsnap_guid = 0;
+	VERIFY(0 ==
nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0)); + VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0)); + (void) zfs_iter_snapshots(zhp, send_iterate_snap, sd); + VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps)); + VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops)); + nvlist_free(sd->parent_snaps); + nvlist_free(sd->snapprops); + + /* add this fs to nvlist */ + (void) snprintf(guidstring, sizeof (guidstring), + "0x%llx", (longlong_t)guid); + VERIFY(0 == nvlist_add_nvlist(sd->fss, guidstring, nvfs)); + nvlist_free(nvfs); + + /* iterate over children */ + if (sd->recursive) + rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); + + sd->parent_fromsnap_guid = parent_fromsnap_guid_save; + + zfs_close(zhp); + return (rv); +} + +static int +gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, + const char *tosnap, boolean_t recursive, nvlist_t **nvlp, avl_tree_t **avlp) +{ + zfs_handle_t *zhp; + send_data_t sd = { 0 }; + int error; + + zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return (EZFS_BADTYPE); + + VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); + sd.fromsnap = fromsnap; + sd.tosnap = tosnap; + sd.recursive = recursive; + + if ((error = send_iterate_fs(zhp, &sd)) != 0) { + nvlist_free(sd.fss); + if (avlp != NULL) + *avlp = NULL; + *nvlp = NULL; + return (error); + } + + if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { + nvlist_free(sd.fss); + *nvlp = NULL; + return (EZFS_NOMEM); + } + + *nvlp = sd.fss; + return (0); +} + +/* + * Routines for dealing with the sorted snapshot functionality + */ +typedef struct zfs_node { + zfs_handle_t *zn_handle; + avl_node_t zn_avlnode; +} zfs_node_t; + +static int +zfs_sort_snaps(zfs_handle_t *zhp, void *data) +{ + avl_tree_t *avl = data; + zfs_node_t *node; + zfs_node_t search; + + search.zn_handle = zhp; + node = avl_find(avl, &search, NULL); + if (node) { + /* + * If this snapshot was renamed 
while we were creating the + * AVL tree, it's possible that we already inserted it under + * its old name. Remove the old handle before adding the new + * one. + */ + zfs_close(node->zn_handle); + avl_remove(avl, node); + free(node); + } + + node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t)); + node->zn_handle = zhp; + avl_add(avl, node); + + return (0); +} + +static int +zfs_snapshot_compare(const void *larg, const void *rarg) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + uint64_t lcreate, rcreate; + + /* + * Sort them according to creation time. We use the hidden + * CREATETXG property to get an absolute ordering of snapshots. + */ + lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); + rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); + + if (lcreate < rcreate) + return (-1); + else if (lcreate > rcreate) + return (+1); + else + return (0); +} + +int +zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data) +{ + int ret = 0; + zfs_node_t *node; + avl_tree_t avl; + void *cookie = NULL; + + avl_create(&avl, zfs_snapshot_compare, + sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode)); + + ret = zfs_iter_snapshots(zhp, zfs_sort_snaps, &avl); + + for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node)) + ret |= callback(node->zn_handle, data); + + while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL) + free(node); + + avl_destroy(&avl); + + return (ret); +} + +/* + * Routines specific to "zfs send" + */ +typedef struct send_dump_data { + /* these are all just the short snapname (the part after the @) */ + const char *fromsnap; + const char *tosnap; + char prevsnap[ZFS_MAXNAMELEN]; + uint64_t prevsnap_obj; + boolean_t seenfrom, seento, replicate, doall, fromorigin; + boolean_t verbose; + int outfd; + boolean_t err; + nvlist_t *fss; + avl_tree_t *fsavl; + snapfilter_cb_t *filter_cb; + void *filter_cb_arg; + nvlist_t *debugnv; + char holdtag[ZFS_MAXNAMELEN]; + int 
cleanup_fd; +} send_dump_data_t; + +/* + * Dumps a backup of the given snapshot (incremental from fromsnap if it's not + * NULL) to the file descriptor specified by outfd. + */ +static int +dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, + boolean_t fromorigin, int outfd, nvlist_t *debugnv) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *thisdbg; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + assert(fromsnap_obj == 0 || !fromorigin); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_cookie = outfd; + zc.zc_obj = fromorigin; + zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + zc.zc_fromobj = fromsnap_obj; + + VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); + if (fromsnap && fromsnap[0] != '\0') { + VERIFY(0 == nvlist_add_string(thisdbg, + "fromsnap", fromsnap)); + } + + if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + + VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); + if (debugnv) { + VERIFY(0 == nvlist_add_nvlist(debugnv, + zhp->zfs_name, thisdbg)); + } + nvlist_free(thisdbg); + + switch (errno) { + + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + + case ENOENT: + if (zfs_dataset_exists(hdl, zc.zc_name, + ZFS_TYPE_SNAPSHOT)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source (@%s) does not exist"), + zc.zc_value); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: + case ENOSTR: + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + 
+ if (debugnv) + VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); + nvlist_free(thisdbg); + + return (0); +} + +static int +hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd) +{ + zfs_handle_t *pzhp; + int error = 0; + char *thissnap; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + + /* + * zfs_send() only opens a cleanup_fd for sends that need it, + * e.g. replication and doall. + */ + if (sdd->cleanup_fd == -1) + return (0); + + thissnap = strchr(zhp->zfs_name, '@') + 1; + *(thissnap - 1) = '\0'; + pzhp = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET); + *(thissnap - 1) = '@'; + + /* + * It's OK if the parent no longer exists. The send code will + * handle that error. + */ + if (pzhp) { + error = zfs_hold(pzhp, thissnap, sdd->holdtag, + B_FALSE, B_TRUE, B_TRUE, sdd->cleanup_fd, + zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID), + zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG)); + zfs_close(pzhp); + } + + return (error); +} + +static int +dump_snapshot(zfs_handle_t *zhp, void *arg) +{ + send_dump_data_t *sdd = arg; + char *thissnap; + int err; + boolean_t isfromsnap, istosnap; + boolean_t exclude = B_FALSE; + + thissnap = strchr(zhp->zfs_name, '@') + 1; + isfromsnap = (sdd->fromsnap != NULL && + strcmp(sdd->fromsnap, thissnap) == 0); + + if (!sdd->seenfrom && isfromsnap) { + err = hold_for_send(zhp, sdd); + if (err == 0) { + sdd->seenfrom = B_TRUE; + (void) strcpy(sdd->prevsnap, thissnap); + sdd->prevsnap_obj = zfs_prop_get_int(zhp, + ZFS_PROP_OBJSETID); + } else if (err == ENOENT) { + err = 0; + } + zfs_close(zhp); + return (err); + } + + if (sdd->seento || !sdd->seenfrom) { + zfs_close(zhp); + return (0); + } + + istosnap = (strcmp(sdd->tosnap, thissnap) == 0); + if (istosnap) + sdd->seento = B_TRUE; + + if (!sdd->doall && !isfromsnap && !istosnap) { + if (sdd->replicate) { + char *snapname; + nvlist_t *snapprops; + /* + * Filter out all intermediate snapshots except origin + * snapshots needed to replicate clones. 
+ */ + nvlist_t *nvfs = fsavl_find(sdd->fsavl, + zhp->zfs_dmustats.dds_guid, &snapname); + + VERIFY(0 == nvlist_lookup_nvlist(nvfs, + "snapprops", &snapprops)); + VERIFY(0 == nvlist_lookup_nvlist(snapprops, + thissnap, &snapprops)); + exclude = !nvlist_exists(snapprops, "is_clone_origin"); + } else { + exclude = B_TRUE; + } + } + + /* + * If a filter function exists, call it to determine whether + * this snapshot will be sent. + */ + if (exclude || (sdd->filter_cb != NULL && + sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { + /* + * This snapshot is filtered out. Don't send it, and don't + * set prevsnap_obj, so it will be as if this snapshot didn't + * exist, and the next accepted snapshot will be sent as + * an incremental from the last accepted one, or as the + * first (and full) snapshot in the case of a replication, + * non-incremental send. + */ + zfs_close(zhp); + return (0); + } + + err = hold_for_send(zhp, sdd); + if (err) { + if (err == ENOENT) + err = 0; + zfs_close(zhp); + return (err); + } + + /* send it */ + if (sdd->verbose) { + (void) fprintf(stderr, "sending from @%s to %s\n", + sdd->prevsnap, zhp->zfs_name); + } + + err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, + sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate), + sdd->outfd, sdd->debugnv); + + (void) strcpy(sdd->prevsnap, thissnap); + sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + zfs_close(zhp); + return (err); +} + +static int +dump_filesystem(zfs_handle_t *zhp, void *arg) +{ + int rv = 0; + send_dump_data_t *sdd = arg; + boolean_t missingfrom = B_FALSE; + zfs_cmd_t zc = { 0 }; + + (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", + zhp->zfs_name, sdd->tosnap); + if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { + (void) fprintf(stderr, "WARNING: " + "could not send %s@%s: does not exist\n", + zhp->zfs_name, sdd->tosnap); + sdd->err = B_TRUE; + return (0); + } + + if (sdd->replicate && sdd->fromsnap) { + /* + * If this 
fs does not have fromsnap, and we're doing + * recursive, we need to send a full stream from the + * beginning (or an incremental from the origin if this + * is a clone). If we're doing non-recursive, then let + * them get the error. + */ + (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", + zhp->zfs_name, sdd->fromsnap); + if (ioctl(zhp->zfs_hdl->libzfs_fd, + ZFS_IOC_OBJSET_STATS, &zc) != 0) { + missingfrom = B_TRUE; + } + } + + sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; + sdd->prevsnap_obj = 0; + if (sdd->fromsnap == NULL || missingfrom) + sdd->seenfrom = B_TRUE; + + rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg); + if (!sdd->seenfrom) { + (void) fprintf(stderr, + "WARNING: could not send %s@%s:\n" + "incremental source (%s@%s) does not exist\n", + zhp->zfs_name, sdd->tosnap, + zhp->zfs_name, sdd->fromsnap); + sdd->err = B_TRUE; + } else if (!sdd->seento) { + if (sdd->fromsnap) { + (void) fprintf(stderr, + "WARNING: could not send %s@%s:\n" + "incremental source (%s@%s) " + "is not earlier than it\n", + zhp->zfs_name, sdd->tosnap, + zhp->zfs_name, sdd->fromsnap); + } else { + (void) fprintf(stderr, "WARNING: " + "could not send %s@%s: does not exist\n", + zhp->zfs_name, sdd->tosnap); + } + sdd->err = B_TRUE; + } + + return (rv); +} + +static int +dump_filesystems(zfs_handle_t *rzhp, void *arg) +{ + send_dump_data_t *sdd = arg; + nvpair_t *fspair; + boolean_t needagain, progress; + + if (!sdd->replicate) + return (dump_filesystem(rzhp, sdd)); + + /* Mark the clone origin snapshots. 
*/ + for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; + fspair = nvlist_next_nvpair(sdd->fss, fspair)) { + nvlist_t *nvfs; + uint64_t origin_guid = 0; + + VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs)); + (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); + if (origin_guid != 0) { + char *snapname; + nvlist_t *origin_nv = fsavl_find(sdd->fsavl, + origin_guid, &snapname); + if (origin_nv != NULL) { + nvlist_t *snapprops; + VERIFY(0 == nvlist_lookup_nvlist(origin_nv, + "snapprops", &snapprops)); + VERIFY(0 == nvlist_lookup_nvlist(snapprops, + snapname, &snapprops)); + VERIFY(0 == nvlist_add_boolean( + snapprops, "is_clone_origin")); + } + } + } +again: + needagain = progress = B_FALSE; + for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; + fspair = nvlist_next_nvpair(sdd->fss, fspair)) { + nvlist_t *fslist; + char *fsname; + zfs_handle_t *zhp; + int err; + uint64_t origin_guid = 0; + + VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); + if (nvlist_lookup_boolean(fslist, "sent") == 0) + continue; + + VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); + (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); + + if (origin_guid != 0) { + nvlist_t *origin_nv = fsavl_find(sdd->fsavl, + origin_guid, NULL); + if (origin_nv != NULL && + nvlist_lookup_boolean(origin_nv, + "sent") == ENOENT) { + /* + * origin has not been sent yet; + * skip this clone. + */ + needagain = B_TRUE; + continue; + } + } + + zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); + if (zhp == NULL) + return (-1); + err = dump_filesystem(zhp, sdd); + VERIFY(nvlist_add_boolean(fslist, "sent") == 0); + progress = B_TRUE; + zfs_close(zhp); + if (err) + return (err); + } + if (needagain) { + assert(progress); + goto again; + } + return (0); +} + +/* + * Generate a send stream for the dataset identified by the argument zhp. + * + * The content of the send stream is the snapshot identified by + * 'tosnap'. 
Incremental streams are requested in two ways: + * - from the snapshot identified by "fromsnap" (if non-null) or + * - from the origin of the dataset identified by zhp, which must + * be a clone. In this case, "fromsnap" is null and "fromorigin" + * is TRUE. + * + * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and + * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) + * if "replicate" is set. If "doall" is set, dump all the intermediate + * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" + * case too. If "props" is set, send properties. + */ +int +zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, + sendflags_t flags, int outfd, snapfilter_cb_t filter_func, + void *cb_arg, nvlist_t **debugnvp) +{ + char errbuf[1024]; + send_dump_data_t sdd = { 0 }; + int err; + nvlist_t *fss = NULL; + avl_tree_t *fsavl = NULL; + static uint64_t holdseq; + int spa_version; + boolean_t holdsnaps = B_FALSE; + pthread_t tid; + int pipefd[2]; + dedup_arg_t dda = { 0 }; + int featureflags = 0; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot send '%s'"), zhp->zfs_name); + + if (fromsnap && fromsnap[0] == '\0') { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "zero-length incremental source")); + return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); + } + + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { + uint64_t version; + version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (version >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + } + + if (zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS && + (flags.doall || flags.replicate)) + holdsnaps = B_TRUE; + + if (flags.dedup) { + featureflags |= (DMU_BACKUP_FEATURE_DEDUP | + DMU_BACKUP_FEATURE_DEDUPPROPS); + if (err = pipe(pipefd)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, + errbuf)); + } + dda.outputfd = outfd; + 
dda.inputfd = pipefd[1]; + dda.dedup_hdl = zhp->zfs_hdl; + if (err = pthread_create(&tid, NULL, cksummer, &dda)) { + (void) close(pipefd[0]); + (void) close(pipefd[1]); + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + } + + if (flags.replicate || flags.doall || flags.props) { + dmu_replay_record_t drr = { 0 }; + char *packbuf = NULL; + size_t buflen = 0; + zio_cksum_t zc = { 0 }; + + if (flags.replicate || flags.props) { + nvlist_t *hdrnv; + + VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); + if (fromsnap) { + VERIFY(0 == nvlist_add_string(hdrnv, + "fromsnap", fromsnap)); + } + VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap)); + if (!flags.replicate) { + VERIFY(0 == nvlist_add_boolean(hdrnv, + "not_recursive")); + } + + err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, + fromsnap, tosnap, flags.replicate, &fss, &fsavl); + if (err) + goto err_out; + VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); + err = nvlist_pack(hdrnv, &packbuf, &buflen, + NV_ENCODE_XDR, 0); + if (debugnvp) + *debugnvp = hdrnv; + else + nvlist_free(hdrnv); + if (err) { + fsavl_destroy(fsavl); + nvlist_free(fss); + goto stderr_out; + } + } + + /* write first begin record */ + drr.drr_type = DRR_BEGIN; + drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo, + DMU_COMPOUNDSTREAM); + DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo, + featureflags); + (void) snprintf(drr.drr_u.drr_begin.drr_toname, + sizeof (drr.drr_u.drr_begin.drr_toname), + "%s@%s", zhp->zfs_name, tosnap); + drr.drr_payloadlen = buflen; + err = cksum_and_write(&drr, sizeof (drr), &zc, outfd); + + /* write header nvlist */ + if (err != -1 && packbuf != NULL) { + err = cksum_and_write(packbuf, buflen, &zc, outfd); + } + free(packbuf); + if (err == -1) { + fsavl_destroy(fsavl); + nvlist_free(fss); + err = errno; + goto stderr_out; + } + + /* write end record */ + if (err != -1) { 
+ bzero(&drr, sizeof (drr)); + drr.drr_type = DRR_END; + drr.drr_u.drr_end.drr_checksum = zc; + err = write(outfd, &drr, sizeof (drr)); + if (err == -1) { + fsavl_destroy(fsavl); + nvlist_free(fss); + err = errno; + goto stderr_out; + } + } + } + + /* dump each stream */ + sdd.fromsnap = fromsnap; + sdd.tosnap = tosnap; + if (flags.dedup) + sdd.outfd = pipefd[0]; + else + sdd.outfd = outfd; + sdd.replicate = flags.replicate; + sdd.doall = flags.doall; + sdd.fromorigin = flags.fromorigin; + sdd.fss = fss; + sdd.fsavl = fsavl; + sdd.verbose = flags.verbose; + sdd.filter_cb = filter_func; + sdd.filter_cb_arg = cb_arg; + if (debugnvp) + sdd.debugnv = *debugnvp; + if (holdsnaps) { + ++holdseq; + (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), + ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); + sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); + if (sdd.cleanup_fd < 0) { + err = errno; + goto stderr_out; + } + } else { + sdd.cleanup_fd = -1; + } + err = dump_filesystems(zhp, &sdd); + fsavl_destroy(fsavl); + nvlist_free(fss); + + if (flags.dedup) { + (void) close(pipefd[0]); + (void) pthread_join(tid, NULL); + } + + if (sdd.cleanup_fd != -1) { + VERIFY(0 == close(sdd.cleanup_fd)); + sdd.cleanup_fd = -1; + } + + if (flags.replicate || flags.doall || flags.props) { + /* + * write final end record. NB: want to do this even if + * there was some error, because it might not be totally + * failed. 
+ */ + dmu_replay_record_t drr = { 0 }; + drr.drr_type = DRR_END; + if (write(outfd, &drr, sizeof (drr)) == -1) { + return (zfs_standard_error(zhp->zfs_hdl, + errno, errbuf)); + } + } + + return (err || sdd.err); + +stderr_out: + err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); +err_out: + if (sdd.cleanup_fd != -1) + VERIFY(0 == close(sdd.cleanup_fd)); + if (flags.dedup) { + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + (void) close(pipefd[0]); + } + return (err); +} + +/* + * Routines specific to "zfs recv" + */ + +static int +recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, + boolean_t byteswap, zio_cksum_t *zc) +{ + char *cp = buf; + int rv; + int len = ilen; + + do { + rv = read(fd, cp, len); + cp += rv; + len -= rv; + } while (rv > 0); + + if (rv < 0 || len != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "failed to read from stream")); + return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, + "cannot receive"))); + } + + if (zc) { + if (byteswap) + fletcher_4_incremental_byteswap(buf, ilen, zc); + else + fletcher_4_incremental_native(buf, ilen, zc); + } + return (0); +} + +static int +recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, + boolean_t byteswap, zio_cksum_t *zc) +{ + char *buf; + int err; + + buf = zfs_alloc(hdl, len); + if (buf == NULL) + return (ENOMEM); + + err = recv_read(hdl, fd, buf, len, byteswap, zc); + if (err != 0) { + free(buf); + return (err); + } + + err = nvlist_unpack(buf, len, nvp, 0); + free(buf); + if (err != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "stream (malformed nvlist)")); + return (EINVAL); + } + return (0); +} + +static int +recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, + int baselen, char *newname, recvflags_t flags) +{ + static int seq; + zfs_cmd_t zc = { 0 }; + int err; + prop_changelist_t *clp; + zfs_handle_t *zhp; + + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); + if (zhp == NULL) + return (-1); + clp = 
changelist_gather(zhp, ZFS_PROP_NAME, 0, + flags.force ? MS_FORCE : 0); + zfs_close(zhp); + if (clp == NULL) + return (-1); + err = changelist_prefix(clp); + if (err) + return (err); + + zc.zc_objset_type = DMU_OST_ZFS; + (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); + + if (tryname) { + (void) strcpy(newname, tryname); + + (void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value)); + + if (flags.verbose) { + (void) printf("attempting rename %s to %s\n", + zc.zc_name, zc.zc_value); + } + err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc); + if (err == 0) + changelist_rename(clp, name, tryname); + } else { + err = ENOENT; + } + + if (err != 0 && strncmp(name+baselen, "recv-", 5) != 0) { + seq++; + + (void) strncpy(newname, name, baselen); + (void) snprintf(newname+baselen, ZFS_MAXNAMELEN-baselen, + "recv-%u-%u", getpid(), seq); + (void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value)); + + if (flags.verbose) { + (void) printf("failed - trying rename %s to %s\n", + zc.zc_name, zc.zc_value); + } + err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc); + if (err == 0) + changelist_rename(clp, name, newname); + if (err && flags.verbose) { + (void) printf("failed (%u) - " + "will try again on next pass\n", errno); + } + err = EAGAIN; + } else if (flags.verbose) { + if (err == 0) + (void) printf("success\n"); + else + (void) printf("failed (%u)\n", errno); + } + + (void) changelist_postfix(clp); + changelist_free(clp); + + return (err); +} + +static int +recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, + char *newname, recvflags_t flags) +{ + zfs_cmd_t zc = { 0 }; + int err = 0; + prop_changelist_t *clp; + zfs_handle_t *zhp; + boolean_t defer = B_FALSE; + int spa_version; + + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); + if (zhp == NULL) + return (-1); + clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, + flags.force ? 
MS_FORCE : 0);
+	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
+	    zfs_spa_version(zhp, &spa_version) == 0 &&
+	    spa_version >= SPA_VERSION_USERREFS)
+		defer = B_TRUE;
+	zfs_close(zhp);
+	if (clp == NULL)
+		return (-1);
+	err = changelist_prefix(clp);
+	if (err) {
+		/* Don't leak the changelist on the early return. */
+		changelist_free(clp);
+		return (err);
+	}
+
+	zc.zc_objset_type = DMU_OST_ZFS;
+	zc.zc_defer_destroy = defer;
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
+	if (flags.verbose)
+		(void) printf("attempting destroy %s\n", zc.zc_name);
+	err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc);
+	if (err == 0) {
+		if (flags.verbose)
+			(void) printf("success\n");
+		changelist_remove(clp, zc.zc_name);
+	}
+
+	(void) changelist_postfix(clp);
+	changelist_free(clp);
+
+	/*
+	 * Deferred destroy might destroy the snapshot or only mark it to be
+	 * destroyed later, and it returns success in either case.
+	 */
+	if (err != 0 || (defer && zfs_dataset_exists(hdl, name,
+	    ZFS_TYPE_SNAPSHOT))) {
+		err = recv_rename(hdl, name, NULL, baselen, newname, flags);
+	}
+
+	return (err);
+}
+
+/* Search state for guid_to_name(): the guid sought and the output buffer. */
+typedef struct guid_to_name_data {
+	uint64_t guid;
+	char *name;
+} guid_to_name_data_t;
+
+/*
+ * Iterator callback for guid_to_name().  If zhp's guid matches, copy its
+ * name into gtnd->name and return EEXIST to stop the walk; otherwise
+ * recurse into zhp's children.  zhp is closed on every path.
+ */
+static int
+guid_to_name_cb(zfs_handle_t *zhp, void *arg)
+{
+	guid_to_name_data_t *gtnd = arg;
+	int err;
+
+	if (zhp->zfs_dmustats.dds_guid == gtnd->guid) {
+		(void) strcpy(gtnd->name, zhp->zfs_name);
+		zfs_close(zhp);
+		return (EEXIST);
+	}
+	err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
+	zfs_close(zhp);
+	return (err);
+}
+
+static int
+guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid,
+    char *name)
+{
+	/* exhaustive search all local snapshots */
+	guid_to_name_data_t gtnd;
+	int err = 0;
+	zfs_handle_t *zhp;
+	char *cp;
+
+	gtnd.guid = guid;
+	gtnd.name = name;
+
+	/* First search beneath 'parent' itself, if it is not a snapshot. */
+	if (strchr(parent, '@') == NULL) {
+		zhp = make_dataset_handle(hdl, parent);
+		if (zhp != NULL) {
+			err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
+			zfs_close(zhp);
+			if (err == EEXIST)
+				return (0);
+		}
+	}
+
+	/* Temporarily truncate 'parent' at its first '/'. */
+	cp = strchr(parent, '/');
+	if (cp)
+		*cp = '\0';
+	zhp =
make_dataset_handle(hdl, parent); + if (cp) + *cp = '/'; + + if (zhp) { + err = zfs_iter_children(zhp, guid_to_name_cb, >nd); + zfs_close(zhp); + } + + return (err == EEXIST ? 0 : ENOENT); + +} + +/* + * Return true if dataset guid1 is created before guid2. + */ +static int +created_before(libzfs_handle_t *hdl, avl_tree_t *avl, + uint64_t guid1, uint64_t guid2) +{ + nvlist_t *nvfs; + char *fsname, *snapname; + char buf[ZFS_MAXNAMELEN]; + int rv; + zfs_node_t zn1, zn2; + + if (guid2 == 0) + return (0); + if (guid1 == 0) + return (1); + + nvfs = fsavl_find(avl, guid1, &snapname); + VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); + (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); + zn1.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); + if (zn1.zn_handle == NULL) + return (-1); + + nvfs = fsavl_find(avl, guid2, &snapname); + VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); + (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); + zn2.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); + if (zn2.zn_handle == NULL) { + zfs_close(zn2.zn_handle); + return (-1); + } + + rv = (zfs_snapshot_compare(&zn1, &zn2) == -1); + + zfs_close(zn1.zn_handle); + zfs_close(zn2.zn_handle); + + return (rv); +} + +static int +recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, + recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, + nvlist_t *renamed) +{ + nvlist_t *local_nv; + avl_tree_t *local_avl; + nvpair_t *fselem, *nextfselem; + char *fromsnap; + char newname[ZFS_MAXNAMELEN]; + int error; + boolean_t needagain, progress, recursive; + char *s1, *s2; + + VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); + + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + + if (flags.dryrun) + return (0); + +again: + needagain = progress = B_FALSE; + + if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, + recursive, &local_nv, &local_avl)) != 0) + return (error); + + /* + * Process 
deletes and renames + */ + for (fselem = nvlist_next_nvpair(local_nv, NULL); + fselem; fselem = nextfselem) { + nvlist_t *nvfs, *snaps; + nvlist_t *stream_nvfs = NULL; + nvpair_t *snapelem, *nextsnapelem; + uint64_t fromguid = 0; + uint64_t originguid = 0; + uint64_t stream_originguid = 0; + uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; + char *fsname, *stream_fsname; + + nextfselem = nvlist_next_nvpair(local_nv, fselem); + + VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); + VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); + VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); + VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap", + &parent_fromsnap_guid)); + (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); + + /* + * First find the stream's fs, so we can check for + * a different origin (due to "zfs promote") + */ + for (snapelem = nvlist_next_nvpair(snaps, NULL); + snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { + uint64_t thisguid; + + VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); + stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); + + if (stream_nvfs != NULL) + break; + } + + /* check for promote */ + (void) nvlist_lookup_uint64(stream_nvfs, "origin", + &stream_originguid); + if (stream_nvfs && originguid != stream_originguid) { + switch (created_before(hdl, local_avl, + stream_originguid, originguid)) { + case 1: { + /* promote it! 
*/ + zfs_cmd_t zc = { 0 }; + nvlist_t *origin_nvfs; + char *origin_fsname; + + if (flags.verbose) + (void) printf("promoting %s\n", fsname); + + origin_nvfs = fsavl_find(local_avl, originguid, + NULL); + VERIFY(0 == nvlist_lookup_string(origin_nvfs, + "name", &origin_fsname)); + (void) strlcpy(zc.zc_value, origin_fsname, + sizeof (zc.zc_value)); + (void) strlcpy(zc.zc_name, fsname, + sizeof (zc.zc_name)); + error = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); + if (error == 0) + progress = B_TRUE; + break; + } + default: + break; + case -1: + fsavl_destroy(local_avl); + nvlist_free(local_nv); + return (-1); + } + /* + * We had/have the wrong origin, therefore our + * list of snapshots is wrong. Need to handle + * them on the next pass. + */ + needagain = B_TRUE; + continue; + } + + for (snapelem = nvlist_next_nvpair(snaps, NULL); + snapelem; snapelem = nextsnapelem) { + uint64_t thisguid; + char *stream_snapname; + nvlist_t *found, *props; + + nextsnapelem = nvlist_next_nvpair(snaps, snapelem); + + VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); + found = fsavl_find(stream_avl, thisguid, + &stream_snapname); + + /* check for delete */ + if (found == NULL) { + char name[ZFS_MAXNAMELEN]; + + if (!flags.force) + continue; + + (void) snprintf(name, sizeof (name), "%s@%s", + fsname, nvpair_name(snapelem)); + + error = recv_destroy(hdl, name, + strlen(fsname)+1, newname, flags); + if (error) + needagain = B_TRUE; + else + progress = B_TRUE; + continue; + } + + stream_nvfs = found; + + if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", + &props) && 0 == nvlist_lookup_nvlist(props, + stream_snapname, &props)) { + zfs_cmd_t zc = { 0 }; + + zc.zc_cookie = B_TRUE; /* received */ + (void) snprintf(zc.zc_name, sizeof (zc.zc_name), + "%s@%s", fsname, nvpair_name(snapelem)); + if (zcmd_write_src_nvlist(hdl, &zc, + props) == 0) { + (void) zfs_ioctl(hdl, + ZFS_IOC_SET_PROP, &zc); + zcmd_free_nvlists(&zc); + } + } + + /* check for different snapname */ + if 
(strcmp(nvpair_name(snapelem), + stream_snapname) != 0) { + char name[ZFS_MAXNAMELEN]; + char tryname[ZFS_MAXNAMELEN]; + + (void) snprintf(name, sizeof (name), "%s@%s", + fsname, nvpair_name(snapelem)); + (void) snprintf(tryname, sizeof (name), "%s@%s", + fsname, stream_snapname); + + error = recv_rename(hdl, name, tryname, + strlen(fsname)+1, newname, flags); + if (error) + needagain = B_TRUE; + else + progress = B_TRUE; + } + + if (strcmp(stream_snapname, fromsnap) == 0) + fromguid = thisguid; + } + + /* check for delete */ + if (stream_nvfs == NULL) { + if (!flags.force) + continue; + + error = recv_destroy(hdl, fsname, strlen(tofs)+1, + newname, flags); + if (error) + needagain = B_TRUE; + else + progress = B_TRUE; + continue; + } + + if (fromguid == 0) { + if (flags.verbose) { + (void) printf("local fs %s does not have " + "fromsnap (%s in stream); must have " + "been deleted locally; ignoring\n", + fsname, fromsnap); + } + continue; + } + + VERIFY(0 == nvlist_lookup_string(stream_nvfs, + "name", &stream_fsname)); + VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, + "parentfromsnap", &stream_parent_fromsnap_guid)); + + s1 = strrchr(fsname, '/'); + s2 = strrchr(stream_fsname, '/'); + + /* + * Check for rename. If the exact receive path is specified, it + * does not count as a rename, but we still need to check the + * datasets beneath it. + */ + if ((stream_parent_fromsnap_guid != 0 && + parent_fromsnap_guid != 0 && + stream_parent_fromsnap_guid != parent_fromsnap_guid) || + ((flags.isprefix || strcmp(tofs, fsname) != 0) && + (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { + nvlist_t *parent; + char tryname[ZFS_MAXNAMELEN]; + + parent = fsavl_find(local_avl, + stream_parent_fromsnap_guid, NULL); + /* + * NB: parent might not be found if we used the + * tosnap for stream_parent_fromsnap_guid, + * because the parent is a newly-created fs; + * we'll be able to rename it after we recv the + * new fs. 
+ */ + if (parent != NULL) { + char *pname; + + VERIFY(0 == nvlist_lookup_string(parent, "name", + &pname)); + (void) snprintf(tryname, sizeof (tryname), + "%s%s", pname, strrchr(stream_fsname, '/')); + } else { + tryname[0] = '\0'; + if (flags.verbose) { + (void) printf("local fs %s new parent " + "not found\n", fsname); + } + } + + newname[0] = '\0'; + + error = recv_rename(hdl, fsname, tryname, + strlen(tofs)+1, newname, flags); + + if (renamed != NULL && newname[0] != '\0') { + VERIFY(0 == nvlist_add_boolean(renamed, + newname)); + } + + if (error) + needagain = B_TRUE; + else + progress = B_TRUE; + } + } + + fsavl_destroy(local_avl); + nvlist_free(local_nv); + + if (needagain && progress) { + /* do another pass to fix up temporary names */ + if (flags.verbose) + (void) printf("another pass:\n"); + goto again; + } + + return (needagain); +} + +static int +zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, + recvflags_t flags, dmu_replay_record_t *drr, zio_cksum_t *zc, + char **top_zfs, int cleanup_fd, uint64_t *action_handlep) +{ + nvlist_t *stream_nv = NULL; + avl_tree_t *stream_avl = NULL; + char *fromsnap = NULL; + char *cp; + char tofs[ZFS_MAXNAMELEN]; + char sendfs[ZFS_MAXNAMELEN]; + char errbuf[1024]; + dmu_replay_record_t drre; + int error; + boolean_t anyerr = B_FALSE; + boolean_t softerr = B_FALSE; + boolean_t recursive; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive")); + + assert(drr->drr_type == DRR_BEGIN); + assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); + assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == + DMU_COMPOUNDSTREAM); + + /* + * Read in the nvlist from the stream. 
+ */ + if (drr->drr_payloadlen != 0) { + error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, + &stream_nv, flags.byteswap, zc); + if (error) { + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + } + + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + + if (recursive && strchr(destname, '@')) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot specify snapshot name for multi-snapshot stream")); + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + + /* + * Read in the end record and verify checksum. + */ + if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), + flags.byteswap, NULL))) + goto out; + if (flags.byteswap) { + drre.drr_type = BSWAP_32(drre.drr_type); + drre.drr_u.drr_end.drr_checksum.zc_word[0] = + BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); + drre.drr_u.drr_end.drr_checksum.zc_word[1] = + BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); + drre.drr_u.drr_end.drr_checksum.zc_word[2] = + BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); + drre.drr_u.drr_end.drr_checksum.zc_word[3] = + BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); + } + if (drre.drr_type != DRR_END) { + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incorrect header checksum")); + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + + (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); + + if (drr->drr_payloadlen != 0) { + nvlist_t *stream_fss; + + VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", + &stream_fss)); + if ((stream_avl = fsavl_create(stream_fss)) == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "couldn't allocate avl tree")); + error = zfs_error(hdl, EZFS_NOMEM, errbuf); + goto out; + } + + if (fromsnap != NULL) { + nvlist_t *renamed = NULL; + nvpair_t *pair = NULL; + + (void) strlcpy(tofs, destname, ZFS_MAXNAMELEN); + if 
(flags.isprefix) { + struct drr_begin *drrb = &drr->drr_u.drr_begin; + int i; + + if (flags.istail) { + cp = strrchr(drrb->drr_toname, '/'); + if (cp == NULL) { + (void) strlcat(tofs, "/", + ZFS_MAXNAMELEN); + i = 0; + } else { + i = (cp - drrb->drr_toname); + } + } else { + i = strcspn(drrb->drr_toname, "/@"); + } + /* zfs_receive_one() will create_parents() */ + (void) strlcat(tofs, &drrb->drr_toname[i], + ZFS_MAXNAMELEN); + *strchr(tofs, '@') = '\0'; + } + + if (recursive && !flags.dryrun && !flags.nomount) { + VERIFY(0 == nvlist_alloc(&renamed, + NV_UNIQUE_NAME, 0)); + } + + softerr = recv_incremental_replication(hdl, tofs, flags, + stream_nv, stream_avl, renamed); + + /* Unmount renamed filesystems before receiving. */ + while ((pair = nvlist_next_nvpair(renamed, + pair)) != NULL) { + zfs_handle_t *zhp; + prop_changelist_t *clp = NULL; + + zhp = zfs_open(hdl, nvpair_name(pair), + ZFS_TYPE_FILESYSTEM); + if (zhp != NULL) { + clp = changelist_gather(zhp, + ZFS_PROP_MOUNTPOINT, 0, 0); + zfs_close(zhp); + if (clp != NULL) { + softerr |= + changelist_prefix(clp); + changelist_free(clp); + } + } + } + + nvlist_free(renamed); + } + } + + /* + * Get the fs specified by the first path in the stream (the top level + * specified by 'zfs send') and pass it to each invocation of + * zfs_receive_one(). + */ + (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, + ZFS_MAXNAMELEN); + if ((cp = strchr(sendfs, '@')) != NULL) + *cp = '\0'; + + /* Finally, receive each contained stream */ + do { + /* + * we should figure out if it has a recoverable + * error, in which case do a recv_skip() and drive on. + * Note, if we fail due to already having this guid, + * zfs_receive_one() will take care of it (ie, + * recv_skip() and return 0). 
+ */ + error = zfs_receive_impl(hdl, destname, flags, fd, + sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, + action_handlep); + if (error == ENODATA) { + error = 0; + break; + } + anyerr |= error; + } while (error == 0); + + if (drr->drr_payloadlen != 0 && fromsnap != NULL) { + /* + * Now that we have the fs's they sent us, try the + * renames again. + */ + softerr = recv_incremental_replication(hdl, tofs, flags, + stream_nv, stream_avl, NULL); + } + +out: + fsavl_destroy(stream_avl); + if (stream_nv) + nvlist_free(stream_nv); + if (softerr) + error = -2; + if (anyerr) + error = -1; + return (error); +} + +static void +trunc_prop_errs(int truncated) +{ + ASSERT(truncated != 0); + + if (truncated == 1) + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "1 more property could not be set\n")); + else + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%d more properties could not be set\n"), truncated); +} + +static int +recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) +{ + dmu_replay_record_t *drr; + void *buf = malloc(1<<20); + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive:")); + + /* XXX would be great to use lseek if possible... */ + drr = buf; + + while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), + byteswap, NULL) == 0) { + if (byteswap) + drr->drr_type = BSWAP_32(drr->drr_type); + + switch (drr->drr_type) { + case DRR_BEGIN: + /* NB: not to be used on v2 stream packages */ + if (drr->drr_payloadlen != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid substream header")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + break; + + case DRR_END: + free(buf); + return (0); + + case DRR_OBJECT: + if (byteswap) { + drr->drr_u.drr_object.drr_bonuslen = + BSWAP_32(drr->drr_u.drr_object. 
+ drr_bonuslen); + } + (void) recv_read(hdl, fd, buf, + P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8), + B_FALSE, NULL); + break; + + case DRR_WRITE: + if (byteswap) { + drr->drr_u.drr_write.drr_length = + BSWAP_64(drr->drr_u.drr_write.drr_length); + } + (void) recv_read(hdl, fd, buf, + drr->drr_u.drr_write.drr_length, B_FALSE, NULL); + break; + case DRR_SPILL: + if (byteswap) { + drr->drr_u.drr_write.drr_length = + BSWAP_64(drr->drr_u.drr_spill.drr_length); + } + (void) recv_read(hdl, fd, buf, + drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); + break; + case DRR_WRITE_BYREF: + case DRR_FREEOBJECTS: + case DRR_FREE: + break; + + default: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid record type")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + } + + free(buf); + return (-1); +} + +/* + * Restores a backup of tosnap from the file descriptor specified by infd. + */ +static int +zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, + recvflags_t flags, dmu_replay_record_t *drr, + dmu_replay_record_t *drr_noswap, const char *sendfs, + nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, + uint64_t *action_handlep) +{ + zfs_cmd_t zc = { 0 }; + time_t begin_time; + int ioctl_err, ioctl_errno, err; + char *cp; + struct drr_begin *drrb = &drr->drr_u.drr_begin; + char errbuf[1024]; + char prop_errbuf[1024]; + const char *chopprefix; + boolean_t newfs = B_FALSE; + boolean_t stream_wantsnewfs; + uint64_t parent_snapguid = 0; + prop_changelist_t *clp = NULL; + nvlist_t *snapprops_nvlist = NULL; + zprop_errflags_t prop_errflags; + boolean_t recursive; + + begin_time = time(NULL); + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive")); + + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + + if (stream_avl != NULL) { + char *snapname; + nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, + &snapname); + nvlist_t *props; + int ret; + + (void) 
nvlist_lookup_uint64(fs, "parentfromsnap", + &parent_snapguid); + err = nvlist_lookup_nvlist(fs, "props", &props); + if (err) + VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); + + if (flags.canmountoff) { + VERIFY(0 == nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); + } + ret = zcmd_write_src_nvlist(hdl, &zc, props); + if (err) + nvlist_free(props); + + if (0 == nvlist_lookup_nvlist(fs, "snapprops", &props)) { + VERIFY(0 == nvlist_lookup_nvlist(props, + snapname, &snapprops_nvlist)); + } + + if (ret != 0) + return (-1); + } + + cp = NULL; + + /* + * Determine how much of the snapshot name stored in the stream + * we are going to tack on to the name they specified on the + * command line, and how much we are going to chop off. + * + * If they specified a snapshot, chop the entire name stored in + * the stream. + */ + if (flags.istail) { + /* + * A filesystem was specified with -e. We want to tack on only + * the tail of the sent snapshot path. + */ + if (strchr(tosnap, '@')) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "argument - snapshot not allowed with -e")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + + chopprefix = strrchr(sendfs, '/'); + + if (chopprefix == NULL) { + /* + * The tail is the poolname, so we need to + * prepend a path separator. + */ + int len = strlen(drrb->drr_toname); + cp = malloc(len + 2); + cp[0] = '/'; + (void) strcpy(&cp[1], drrb->drr_toname); + chopprefix = cp; + } else { + chopprefix = drrb->drr_toname + (chopprefix - sendfs); + } + } else if (flags.isprefix) { + /* + * A filesystem was specified with -d. We want to tack on + * everything but the first element of the sent snapshot path + * (all but the pool name). 
+ */ + if (strchr(tosnap, '@')) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "argument - snapshot not allowed with -d")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + + chopprefix = strchr(drrb->drr_toname, '/'); + if (chopprefix == NULL) + chopprefix = strchr(drrb->drr_toname, '@'); + } else if (strchr(tosnap, '@') == NULL) { + /* + * If a filesystem was specified without -d or -e, we want to + * tack on everything after the fs specified by 'zfs send'. + */ + chopprefix = drrb->drr_toname + strlen(sendfs); + } else { + /* A snapshot was specified as an exact path (no -d or -e). */ + if (recursive) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot specify snapshot name for multi-snapshot " + "stream")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); + } + + ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); + ASSERT(chopprefix > drrb->drr_toname); + ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname)); + ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || + chopprefix[0] == '\0'); + + /* + * Determine name of destination snapshot, store in zc_value. + */ + (void) strcpy(zc.zc_top_ds, tosnap); + (void) strcpy(zc.zc_value, tosnap); + (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value)); + free(cp); + if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) { + zcmd_free_nvlists(&zc); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + + /* + * Determine the name of the origin snapshot, store in zc_string. 
+ */ + if (drrb->drr_flags & DRR_FLAG_CLONE) { + if (guid_to_name(hdl, tosnap, + drrb->drr_fromguid, zc.zc_string) != 0) { + zcmd_free_nvlists(&zc); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "local origin for clone %s does not exist"), + zc.zc_value); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + if (flags.verbose) + (void) printf("found clone origin %s\n", zc.zc_string); + } + + stream_wantsnewfs = (drrb->drr_fromguid == NULL || + (drrb->drr_flags & DRR_FLAG_CLONE)); + + if (stream_wantsnewfs) { + /* + * if the parent fs does not exist, look for it based on + * the parent snap GUID + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive new filesystem stream")); + + (void) strcpy(zc.zc_name, zc.zc_value); + cp = strrchr(zc.zc_name, '/'); + if (cp) + *cp = '\0'; + if (cp && + !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { + char suffix[ZFS_MAXNAMELEN]; + (void) strcpy(suffix, strrchr(zc.zc_value, '/')); + if (guid_to_name(hdl, tosnap, parent_snapguid, + zc.zc_value) == 0) { + *strchr(zc.zc_value, '@') = '\0'; + (void) strcat(zc.zc_value, suffix); + } + } + } else { + /* + * if the fs does not exist, look for it based on the + * fromsnap GUID + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive incremental stream")); + + (void) strcpy(zc.zc_name, zc.zc_value); + *strchr(zc.zc_name, '@') = '\0'; + + /* + * If the exact receive path was specified and this is the + * topmost path in the stream, then if the fs does not exist we + * should look no further. 
+ */ + if ((flags.isprefix || (*(chopprefix = drrb->drr_toname + + strlen(sendfs)) != '\0' && *chopprefix != '@')) && + !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { + char snap[ZFS_MAXNAMELEN]; + (void) strcpy(snap, strchr(zc.zc_value, '@')); + if (guid_to_name(hdl, tosnap, drrb->drr_fromguid, + zc.zc_value) == 0) { + *strchr(zc.zc_value, '@') = '\0'; + (void) strcat(zc.zc_value, snap); + } + } + } + + (void) strcpy(zc.zc_name, zc.zc_value); + *strchr(zc.zc_name, '@') = '\0'; + + if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { + zfs_handle_t *zhp; + + /* + * Destination fs exists. Therefore this should either + * be an incremental, or the stream specifies a new fs + * (full stream or clone) and they want us to blow it + * away (and have therefore specified -F and removed any + * snapshots). + */ + if (stream_wantsnewfs) { + if (!flags.force) { + zcmd_free_nvlists(&zc); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination '%s' exists\n" + "must specify -F to overwrite it"), + zc.zc_name); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + } + if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, + &zc) == 0) { + zcmd_free_nvlists(&zc); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination has snapshots (eg. 
%s)\n" + "must destroy them to overwrite it"), + zc.zc_name); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + } + } + + if ((zhp = zfs_open(hdl, zc.zc_name, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { + zcmd_free_nvlists(&zc); + return (-1); + } + + if (stream_wantsnewfs && + zhp->zfs_dmustats.dds_origin[0]) { + zcmd_free_nvlists(&zc); + zfs_close(zhp); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination '%s' is a clone\n" + "must destroy it to overwrite it"), + zc.zc_name); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + } + + if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && + stream_wantsnewfs) { + /* We can't do online recv in this case */ + clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); + if (clp == NULL) { + zfs_close(zhp); + zcmd_free_nvlists(&zc); + return (-1); + } + if (changelist_prefix(clp) != 0) { + changelist_free(clp); + zfs_close(zhp); + zcmd_free_nvlists(&zc); + return (-1); + } + } + zfs_close(zhp); + } else { + /* + * Destination filesystem does not exist. Therefore we better + * be creating a new filesystem (either from a full backup, or + * a clone). It would therefore be invalid if the user + * specified only the pool name (i.e. if the destination name + * contained no slash character). + */ + if (!stream_wantsnewfs || + (cp = strrchr(zc.zc_name, '/')) == NULL) { + zcmd_free_nvlists(&zc); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination '%s' does not exist"), zc.zc_name); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + + /* + * Trim off the final dataset component so we perform the + * recvbackup ioctl to the filesystems's parent. 
+ */ + *cp = '\0'; + + if (flags.isprefix && !flags.istail && !flags.dryrun && + create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) { + zcmd_free_nvlists(&zc); + return (zfs_error(hdl, EZFS_BADRESTORE, errbuf)); + } + + newfs = B_TRUE; + } + + zc.zc_begin_record = drr_noswap->drr_u.drr_begin; + zc.zc_cookie = infd; + zc.zc_guid = flags.force; + if (flags.verbose) { + (void) printf("%s %s stream of %s into %s\n", + flags.dryrun ? "would receive" : "receiving", + drrb->drr_fromguid ? "incremental" : "full", + drrb->drr_toname, zc.zc_value); + (void) fflush(stdout); + } + + if (flags.dryrun) { + zcmd_free_nvlists(&zc); + return (recv_skip(hdl, infd, flags.byteswap)); + } + + zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; + zc.zc_nvlist_dst_size = sizeof (prop_errbuf); + zc.zc_cleanup_fd = cleanup_fd; + zc.zc_action_handle = *action_handlep; + + err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc); + ioctl_errno = errno; + prop_errflags = (zprop_errflags_t)zc.zc_obj; + + if (err == 0) { + nvlist_t *prop_errors; + VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, + zc.zc_nvlist_dst_size, &prop_errors, 0)); + + nvpair_t *prop_err = NULL; + + while ((prop_err = nvlist_next_nvpair(prop_errors, + prop_err)) != NULL) { + char tbuf[1024]; + zfs_prop_t prop; + int intval; + + prop = zfs_name_to_prop(nvpair_name(prop_err)); + (void) nvpair_value_int32(prop_err, &intval); + if (strcmp(nvpair_name(prop_err), + ZPROP_N_MORE_ERRORS) == 0) { + trunc_prop_errs(intval); + break; + } else { + (void) snprintf(tbuf, sizeof (tbuf), + dgettext(TEXT_DOMAIN, + "cannot receive %s property on %s"), + nvpair_name(prop_err), zc.zc_name); + zfs_setprop_error(hdl, prop, intval, tbuf); + } + } + nvlist_free(prop_errors); + } + + zc.zc_nvlist_dst = 0; + zc.zc_nvlist_dst_size = 0; + zcmd_free_nvlists(&zc); + + if (err == 0 && snapprops_nvlist) { + zfs_cmd_t zc2 = { 0 }; + + (void) strcpy(zc2.zc_name, zc.zc_value); + zc2.zc_cookie = B_TRUE; /* received */ + if 
(zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) { + (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2); + zcmd_free_nvlists(&zc2); + } + } + + if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { + /* + * It may be that this snapshot already exists, + * in which case we want to consume & ignore it + * rather than failing. + */ + avl_tree_t *local_avl; + nvlist_t *local_nv, *fs; + cp = strchr(zc.zc_value, '@'); + + /* + * XXX Do this faster by just iterating over snaps in + * this fs. Also if zc_value does not exist, we will + * get a strange "does not exist" error message. + */ + *cp = '\0'; + if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE, + &local_nv, &local_avl) == 0) { + *cp = '@'; + fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); + fsavl_destroy(local_avl); + nvlist_free(local_nv); + + if (fs != NULL) { + if (flags.verbose) { + (void) printf("snap %s already exists; " + "ignoring\n", zc.zc_value); + } + err = ioctl_err = recv_skip(hdl, infd, + flags.byteswap); + } + } + *cp = '@'; + } + + if (ioctl_err != 0) { + switch (ioctl_errno) { + case ENODEV: + cp = strchr(zc.zc_value, '@'); + *cp = '\0'; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "most recent snapshot of %s does not\n" + "match incremental source"), zc.zc_value); + (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); + *cp = '@'; + break; + case ETXTBSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination %s has been modified\n" + "since most recent snapshot"), zc.zc_name); + (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); + break; + case EEXIST: + cp = strchr(zc.zc_value, '@'); + if (newfs) { + /* it's the containing fs that exists */ + *cp = '\0'; + } + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination already exists")); + (void) zfs_error_fmt(hdl, EZFS_EXISTS, + dgettext(TEXT_DOMAIN, "cannot restore to %s"), + zc.zc_value); + *cp = '@'; + break; + case EINVAL: + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case ECKSUM: + zfs_error_aux(hdl, 
dgettext(TEXT_DOMAIN, + "invalid stream (checksum mismatch)")); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to receive this stream.")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EDQUOT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination %s space quota exceeded"), zc.zc_name); + (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); + break; + default: + (void) zfs_standard_error(hdl, ioctl_errno, errbuf); + } + } + + /* + * Mount the target filesystem (if created). Also mount any + * children of the target filesystem if we did a replication + * receive (indicated by stream_avl being non-NULL). + */ + cp = strchr(zc.zc_value, '@'); + if (cp && (ioctl_err == 0 || !newfs)) { + zfs_handle_t *h; + + *cp = '\0'; + h = zfs_open(hdl, zc.zc_value, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (h != NULL) { + if (h->zfs_type == ZFS_TYPE_VOLUME) { + *cp = '@'; + } else if (newfs || stream_avl) { + /* + * Track the first/top of hierarchy fs, + * for mounting and sharing later. 
+ */ + if (top_zfs && *top_zfs == NULL) + *top_zfs = zfs_strdup(hdl, zc.zc_value); + } + zfs_close(h); + } + *cp = '@'; + } + + if (clp) { + err |= changelist_postfix(clp); + changelist_free(clp); + } + + if (prop_errflags & ZPROP_ERR_NOCLEAR) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " + "failed to clear unreceived properties on %s"), + zc.zc_name); + (void) fprintf(stderr, "\n"); + } + if (prop_errflags & ZPROP_ERR_NORESTORE) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " + "failed to restore original properties on %s"), + zc.zc_name); + (void) fprintf(stderr, "\n"); + } + + if (err || ioctl_err) + return (-1); + + *action_handlep = zc.zc_action_handle; + + if (flags.verbose) { + char buf1[64]; + char buf2[64]; + uint64_t bytes = zc.zc_cookie; + time_t delta = time(NULL) - begin_time; + if (delta == 0) + delta = 1; + zfs_nicenum(bytes, buf1, sizeof (buf1)); + zfs_nicenum(bytes/delta, buf2, sizeof (buf1)); + + (void) printf("received %sB stream in %lu seconds (%sB/sec)\n", + buf1, delta, buf2); + } + + return (0); +} + +static int +zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, + int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, + char **top_zfs, int cleanup_fd, uint64_t *action_handlep) +{ + int err; + dmu_replay_record_t drr, drr_noswap; + struct drr_begin *drrb = &drr.drr_u.drr_begin; + char errbuf[1024]; + zio_cksum_t zcksum = { 0 }; + uint64_t featureflags; + int hdrtype; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive")); + + if (flags.isprefix && + !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " + "(%s) does not exist"), tosnap); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + + /* read in the BEGIN record */ + if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, + &zcksum))) + return (err); + + if (drr.drr_type == DRR_END || drr.drr_type == 
BSWAP_32(DRR_END)) { + /* It's the double end record at the end of a package */ + return (ENODATA); + } + + /* the kernel needs the non-byteswapped begin record */ + drr_noswap = drr; + + flags.byteswap = B_FALSE; + if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + /* + * We computed the checksum in the wrong byteorder in + * recv_read() above; do it again correctly. + */ + bzero(&zcksum, sizeof (zio_cksum_t)); + fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); + flags.byteswap = B_TRUE; + + drr.drr_type = BSWAP_32(drr.drr_type); + drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); + drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_flags = BSWAP_32(drrb->drr_flags); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } + + if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "stream (bad magic number)")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + + featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); + + if (!DMU_STREAM_SUPPORTED(featureflags) || + (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "stream has unsupported feature, feature flags = %lx"), + featureflags); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + + if (strchr(drrb->drr_toname, '@') == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "stream (bad snapshot name)")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { + char nonpackage_sendfs[ZFS_MAXNAMELEN]; + if (sendfs == NULL) { + /* + * We were not called from zfs_receive_package(). 
Get + * the fs specified by 'zfs send'. + */ + char *cp; + (void) strlcpy(nonpackage_sendfs, + drr.drr_u.drr_begin.drr_toname, ZFS_MAXNAMELEN); + if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) + *cp = '\0'; + sendfs = nonpackage_sendfs; + } + return (zfs_receive_one(hdl, infd, tosnap, flags, + &drr, &drr_noswap, sendfs, stream_nv, stream_avl, + top_zfs, cleanup_fd, action_handlep)); + } else { + assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM); + return (zfs_receive_package(hdl, infd, tosnap, flags, + &drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); + } +} + +/* + * Restores a backup of tosnap from the file descriptor specified by infd. + * Return 0 on total success, -2 if some things couldn't be + * destroyed/renamed/promoted, -1 if some things couldn't be received. + * (-1 will override -2). + */ +int +zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, + int infd, avl_tree_t *stream_avl) +{ + char *top_zfs = NULL; + int err; + int cleanup_fd; + uint64_t action_handle = 0; + + cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); + VERIFY(cleanup_fd >= 0); + + err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL, + stream_avl, &top_zfs, cleanup_fd, &action_handle); + + VERIFY(0 == close(cleanup_fd)); + + if (err == 0 && !flags.nomount && top_zfs) { + zfs_handle_t *zhp; + prop_changelist_t *clp; + + zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM); + if (zhp != NULL) { + clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, + CL_GATHER_MOUNT_ALWAYS, 0); + zfs_close(zhp); + if (clp != NULL) { + /* mount and share received datasets */ + err = changelist_postfix(clp); + changelist_free(clp); + } + } + if (zhp == NULL || clp == NULL || err) + err = -1; + } + if (top_zfs) + free(top_zfs); + + return (err); +} diff --git a/lib/libzfs/common/libzfs_status.c b/lib/libzfs/common/libzfs_status.c new file mode 100644 index 0000000..24725ec --- /dev/null +++ b/lib/libzfs/common/libzfs_status.c @@ -0,0 +1,398 @@ +/* + * CDDL 
HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * This file contains the functions which analyze the status of a pool. This + * includes both the status of an active pool, as well as the status of exported + * pools. Returns one of the ZPOOL_STATUS_* defines describing the status of + * the pool. This status is independent (to a certain degree) from the state of + * the pool. A pool's state describes only whether or not it is capable of + * providing the necessary fault tolerance for data. The status describes the + * overall status of devices. A pool that is online can still have a device + * that is experiencing errors. + * + * Only a subset of the possible faults can be detected using 'zpool status', + * and not all possible errors correspond to an FMA message ID. The explanation + * is left up to the caller, depending on whether it is a live pool or an + * import. + */ + +#include +#include +#include +#include "libzfs_impl.h" + +/* + * Message ID table. This must be kept in sync with the ZPOOL_STATUS_* defines + * in libzfs.h. 
Note that there are some status results which go past the end + * of this table, and hence have no associated message ID. + */ +static char *zfs_msgid_table[] = { + "ZFS-8000-14", + "ZFS-8000-2Q", + "ZFS-8000-3C", + "ZFS-8000-4J", + "ZFS-8000-5E", + "ZFS-8000-6X", + "ZFS-8000-72", + "ZFS-8000-8A", + "ZFS-8000-9P", + "ZFS-8000-A5", + "ZFS-8000-EY", + "ZFS-8000-HC", + "ZFS-8000-JQ", + "ZFS-8000-K4", +}; + +#define NMSGID (sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0])) + +/* ARGSUSED */ +static int +vdev_missing(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_CANT_OPEN && + aux == VDEV_AUX_OPEN_FAILED); +} + +/* ARGSUSED */ +static int +vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_FAULTED); +} + +/* ARGSUSED */ +static int +vdev_errors(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_DEGRADED || errs != 0); +} + +/* ARGSUSED */ +static int +vdev_broken(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_CANT_OPEN); +} + +/* ARGSUSED */ +static int +vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_OFFLINE); +} + +/* ARGSUSED */ +static int +vdev_removed(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_REMOVED); +} + +/* + * Detect if any leaf devices that have seen errors or could not be opened. + */ +static boolean_t +find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) +{ + nvlist_t **child; + vdev_stat_t *vs; + uint_t c, children; + char *type; + + /* + * Ignore problems within a 'replacing' vdev, since we're presumably in + * the process of repairing any such errors, and don't want to call them + * out again. We'll pick up the fact that a resilver is happening + * later. 
+ */ + verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0); + if (strcmp(type, VDEV_TYPE_REPLACING) == 0) + return (B_FALSE); + + if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev_problem(child[c], func)) + return (B_TRUE); + } else { + verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + if (func(vs->vs_state, vs->vs_aux, + vs->vs_read_errors + + vs->vs_write_errors + + vs->vs_checksum_errors)) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Active pool health status. + * + * To determine the status for a pool, we make several passes over the config, + * picking the most egregious error we find. In order of importance, we do the + * following: + * + * - Check for a complete and valid configuration + * - Look for any faulted or missing devices in a non-replicated config + * - Check for any data errors + * - Check for any faulted or missing devices in a replicated config + * - Look for any devices showing errors + * - Check for any resilvering devices + * + * There can obviously be multiple errors within a single pool, so this routine + * only picks the most damaging of all the current errors to report. 
*/
static zpool_status_t
check_status(nvlist_t *config, boolean_t isimport)
{
	nvlist_t *nvroot;
	vdev_stat_t *vs;
	pool_scan_stat_t *ps = NULL;
	uint_t vsc, psc;
	uint64_t nerr;
	uint64_t version;
	uint64_t stateval;
	uint64_t suspended;
	uint64_t hostid = 0;

	/*
	 * These four entries are required; verify() enforces that each
	 * lookup succeeds.  'vs' ends up pointing at the root vdev's stats.
	 */
	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &version) == 0);
	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &vsc) == 0);
	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &stateval) == 0);

	/*
	 * Currently resilvering a vdev.  The scan stats are optional: the
	 * lookup result is ignored and 'ps' stays NULL when they are absent.
	 */
	(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
	    (uint64_t **)&ps, &psc);
	if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
	    ps->pss_state == DSS_SCANNING)
		return (ZPOOL_STATUS_RESILVERING);

	/*
	 * Pool last accessed by another system.  'hostid' keeps its initial
	 * value of 0 (and the check is skipped) when the config has no
	 * hostid entry.
	 */
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
	    stateval == POOL_STATE_ACTIVE)
		return (ZPOOL_STATUS_HOSTID_MISMATCH);

	/*
	 * Newer on-disk version.
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    vs->vs_aux == VDEV_AUX_VERSION_NEWER)
		return (ZPOOL_STATUS_VERSION_NEWER);

	/*
	 * Check that the config is complete.
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
		return (ZPOOL_STATUS_BAD_GUID_SUM);

	/*
	 * Check whether the pool has suspended due to failed I/O.  The mere
	 * presence of the entry means the pool is suspended; its value
	 * selects between the 'continue' and 'wait' flavours.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED,
	    &suspended) == 0) {
		if (suspended == ZIO_FAILURE_MODE_CONTINUE)
			return (ZPOOL_STATUS_IO_FAILURE_CONTINUE);
		return (ZPOOL_STATUS_IO_FAILURE_WAIT);
	}

	/*
	 * Could not read a log.
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    vs->vs_aux == VDEV_AUX_BAD_LOG) {
		return (ZPOOL_STATUS_BAD_LOG);
	}

	/*
	 * Bad devices in non-replicated config.  These are only reported
	 * when the root vdev itself cannot be opened.
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    find_vdev_problem(nvroot, vdev_faulted))
		return (ZPOOL_STATUS_FAULTED_DEV_NR);

	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    find_vdev_problem(nvroot, vdev_missing))
		return (ZPOOL_STATUS_MISSING_DEV_NR);

	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    find_vdev_problem(nvroot, vdev_broken))
		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);

	/*
	 * Corrupted pool metadata
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
		return (ZPOOL_STATUS_CORRUPT_POOL);

	/*
	 * Persistent data errors.  Not checked when examining a pool for
	 * import.
	 */
	if (!isimport) {
		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
		    &nerr) == 0 && nerr != 0)
			return (ZPOOL_STATUS_CORRUPT_DATA);
	}

	/*
	 * Missing devices in a replicated config.  Same predicates as the
	 * non-replicated checks above, but without requiring that the root
	 * vdev be unopenable.
	 */
	if (find_vdev_problem(nvroot, vdev_faulted))
		return (ZPOOL_STATUS_FAULTED_DEV_R);
	if (find_vdev_problem(nvroot, vdev_missing))
		return (ZPOOL_STATUS_MISSING_DEV_R);
	if (find_vdev_problem(nvroot, vdev_broken))
		return (ZPOOL_STATUS_CORRUPT_LABEL_R);

	/*
	 * Devices with errors.  Not checked when examining a pool for import.
	 */
	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
		return (ZPOOL_STATUS_FAILING_DEV);

	/*
	 * Offlined devices
	 */
	if (find_vdev_problem(nvroot, vdev_offlined))
		return (ZPOOL_STATUS_OFFLINE_DEV);

	/*
	 * Removed device
	 */
	if (find_vdev_problem(nvroot, vdev_removed))
		return (ZPOOL_STATUS_REMOVED_DEV);

	/*
	 * Outdated, but usable, version
	 */
	if (version < SPA_VERSION)
		return (ZPOOL_STATUS_VERSION_OLDER);

	return (ZPOOL_STATUS_OK);
}

/*
 * Return the health status of an active pool, computed from the handle's
 * cached config.  '*msgid' is set to the matching entry of zfs_msgid_table,
 * or to NULL for status values that lie beyond the end of that table.
 */
zpool_status_t
zpool_get_status(zpool_handle_t *zhp, char **msgid)
{
	zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE);

	if (ret >= NMSGID)
		*msgid = NULL;
	else
		*msgid = zfs_msgid_table[ret];

	return (ret);
}

/*
 * Same as zpool_get_status(), but for the config of a pool being considered
 * for import: check_status() is called with isimport set, which skips the
 * persistent-data-error and device-error checks.
 */
zpool_status_t
zpool_import_status(nvlist_t *config, char **msgid)
{
	zpool_status_t ret = check_status(config, B_TRUE);

	if (ret >= NMSGID)
		*msgid = NULL;
	else
		*msgid = zfs_msgid_table[ret];

	return (ret);
}

/*
 * Print one row of the DDT table.  'h' is the histogram bucket index (the row
 * is labelled with 2^h), or -1 for the totals row.  Nothing is printed when
 * 'dds' is NULL or describes zero blocks.
 */
static void
dump_ddt_stat(const ddt_stat_t *dds, int h)
{
	/* Each buffer holds at most five characters plus the terminator. */
	char refcnt[6];
	char blocks[6], lsize[6], psize[6], dsize[6];
	char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6];

	if (dds == NULL || dds->dds_blocks == 0)
		return;

	if (h == -1)
		(void) strcpy(refcnt, "Total");	/* exactly fills refcnt[] */
	else
		zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt));

	zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks));
	zfs_nicenum(dds->dds_lsize, lsize, sizeof (lsize));
	zfs_nicenum(dds->dds_psize, psize, sizeof (psize));
	zfs_nicenum(dds->dds_dsize, dsize, sizeof (dsize));
	zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks));
	zfs_nicenum(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize));
	zfs_nicenum(dds->dds_ref_psize, ref_psize, sizeof (ref_psize));
	zfs_nicenum(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize));

	(void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
	    refcnt,
	    blocks, lsize, psize, dsize,
	    ref_blocks, ref_lsize, ref_psize, ref_dsize);
}

/*
 * Print the DDT histogram and the column totals.
+ */ +void +zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh) +{ + int h; + + (void) printf("\n"); + + (void) printf("bucket " + " allocated " + " referenced \n"); + (void) printf("______ " + "______________________________ " + "______________________________\n"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "refcnt", + "blocks", "LSIZE", "PSIZE", "DSIZE", + "blocks", "LSIZE", "PSIZE", "DSIZE"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "------", + "------", "-----", "-----", "-----", + "------", "-----", "-----", "-----"); + + for (h = 0; h < 64; h++) + dump_ddt_stat(&ddh->ddh_stat[h], h); + + dump_ddt_stat(dds_total, -1); + + (void) printf("\n"); +} diff --git a/lib/libzfs/common/libzfs_util.c b/lib/libzfs/common/libzfs_util.c new file mode 100644 index 0000000..01b7c87 --- /dev/null +++ b/lib/libzfs/common/libzfs_util.c @@ -0,0 +1,1482 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Internal utility routines for the ZFS library. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "libzfs_impl.h" +#include "zfs_prop.h" + +int +libzfs_errno(libzfs_handle_t *hdl) +{ + return (hdl->libzfs_error); +} + +const char * +libzfs_error_action(libzfs_handle_t *hdl) +{ + return (hdl->libzfs_action); +} + +const char * +libzfs_error_description(libzfs_handle_t *hdl) +{ + if (hdl->libzfs_desc[0] != '\0') + return (hdl->libzfs_desc); + + switch (hdl->libzfs_error) { + case EZFS_NOMEM: + return (dgettext(TEXT_DOMAIN, "out of memory")); + case EZFS_BADPROP: + return (dgettext(TEXT_DOMAIN, "invalid property value")); + case EZFS_PROPREADONLY: + return (dgettext(TEXT_DOMAIN, "read-only property")); + case EZFS_PROPTYPE: + return (dgettext(TEXT_DOMAIN, "property doesn't apply to " + "datasets of this type")); + case EZFS_PROPNONINHERIT: + return (dgettext(TEXT_DOMAIN, "property cannot be inherited")); + case EZFS_PROPSPACE: + return (dgettext(TEXT_DOMAIN, "invalid quota or reservation")); + case EZFS_BADTYPE: + return (dgettext(TEXT_DOMAIN, "operation not applicable to " + "datasets of this type")); + case EZFS_BUSY: + return (dgettext(TEXT_DOMAIN, "pool or dataset is busy")); + case EZFS_EXISTS: + return (dgettext(TEXT_DOMAIN, "pool or dataset exists")); + case EZFS_NOENT: + return (dgettext(TEXT_DOMAIN, "no such pool or dataset")); + case EZFS_BADSTREAM: + return (dgettext(TEXT_DOMAIN, "invalid backup stream")); + case EZFS_DSREADONLY: + return (dgettext(TEXT_DOMAIN, "dataset is read-only")); + case EZFS_VOLTOOBIG: + return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " + "this system")); + case EZFS_INVALIDNAME: + return (dgettext(TEXT_DOMAIN, "invalid name")); + case EZFS_BADRESTORE: + return (dgettext(TEXT_DOMAIN, "unable to restore to " + "destination")); + case EZFS_BADBACKUP: + return (dgettext(TEXT_DOMAIN, "backup failed")); + case EZFS_BADTARGET: + return (dgettext(TEXT_DOMAIN, 
"invalid target vdev")); + case EZFS_NODEVICE: + return (dgettext(TEXT_DOMAIN, "no such device in pool")); + case EZFS_BADDEV: + return (dgettext(TEXT_DOMAIN, "invalid device")); + case EZFS_NOREPLICAS: + return (dgettext(TEXT_DOMAIN, "no valid replicas")); + case EZFS_RESILVERING: + return (dgettext(TEXT_DOMAIN, "currently resilvering")); + case EZFS_BADVERSION: + return (dgettext(TEXT_DOMAIN, "unsupported version")); + case EZFS_POOLUNAVAIL: + return (dgettext(TEXT_DOMAIN, "pool is unavailable")); + case EZFS_DEVOVERFLOW: + return (dgettext(TEXT_DOMAIN, "too many devices in one vdev")); + case EZFS_BADPATH: + return (dgettext(TEXT_DOMAIN, "must be an absolute path")); + case EZFS_CROSSTARGET: + return (dgettext(TEXT_DOMAIN, "operation crosses datasets or " + "pools")); + case EZFS_ZONED: + return (dgettext(TEXT_DOMAIN, "dataset in use by local zone")); + case EZFS_MOUNTFAILED: + return (dgettext(TEXT_DOMAIN, "mount failed")); + case EZFS_UMOUNTFAILED: + return (dgettext(TEXT_DOMAIN, "umount failed")); + case EZFS_UNSHARENFSFAILED: + return (dgettext(TEXT_DOMAIN, "unshare(1M) failed")); + case EZFS_SHARENFSFAILED: + return (dgettext(TEXT_DOMAIN, "share(1M) failed")); + case EZFS_UNSHARESMBFAILED: + return (dgettext(TEXT_DOMAIN, "smb remove share failed")); + case EZFS_SHARESMBFAILED: + return (dgettext(TEXT_DOMAIN, "smb add share failed")); + case EZFS_PERM: + return (dgettext(TEXT_DOMAIN, "permission denied")); + case EZFS_NOSPC: + return (dgettext(TEXT_DOMAIN, "out of space")); + case EZFS_FAULT: + return (dgettext(TEXT_DOMAIN, "bad address")); + case EZFS_IO: + return (dgettext(TEXT_DOMAIN, "I/O error")); + case EZFS_INTR: + return (dgettext(TEXT_DOMAIN, "signal received")); + case EZFS_ISSPARE: + return (dgettext(TEXT_DOMAIN, "device is reserved as a hot " + "spare")); + case EZFS_INVALCONFIG: + return (dgettext(TEXT_DOMAIN, "invalid vdev configuration")); + case EZFS_RECURSIVE: + return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); + case 
EZFS_NOHISTORY: + return (dgettext(TEXT_DOMAIN, "no history available")); + case EZFS_POOLPROPS: + return (dgettext(TEXT_DOMAIN, "failed to retrieve " + "pool properties")); + case EZFS_POOL_NOTSUP: + return (dgettext(TEXT_DOMAIN, "operation not supported " + "on this type of pool")); + case EZFS_POOL_INVALARG: + return (dgettext(TEXT_DOMAIN, "invalid argument for " + "this pool operation")); + case EZFS_NAMETOOLONG: + return (dgettext(TEXT_DOMAIN, "dataset name is too long")); + case EZFS_OPENFAILED: + return (dgettext(TEXT_DOMAIN, "open failed")); + case EZFS_NOCAP: + return (dgettext(TEXT_DOMAIN, + "disk capacity information could not be retrieved")); + case EZFS_LABELFAILED: + return (dgettext(TEXT_DOMAIN, "write of label failed")); + case EZFS_BADWHO: + return (dgettext(TEXT_DOMAIN, "invalid user/group")); + case EZFS_BADPERM: + return (dgettext(TEXT_DOMAIN, "invalid permission")); + case EZFS_BADPERMSET: + return (dgettext(TEXT_DOMAIN, "invalid permission set name")); + case EZFS_NODELEGATION: + return (dgettext(TEXT_DOMAIN, "delegated administration is " + "disabled on pool")); + case EZFS_BADCACHE: + return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); + case EZFS_ISL2CACHE: + return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); + case EZFS_VDEVNOTSUP: + return (dgettext(TEXT_DOMAIN, "vdev specification is not " + "supported")); + case EZFS_NOTSUP: + return (dgettext(TEXT_DOMAIN, "operation not supported " + "on this dataset")); + case EZFS_ACTIVE_SPARE: + return (dgettext(TEXT_DOMAIN, "pool has active shared spare " + "device")); + case EZFS_UNPLAYED_LOGS: + return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " + "logs")); + case EZFS_REFTAG_RELE: + return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); + case EZFS_REFTAG_HOLD: + return (dgettext(TEXT_DOMAIN, "tag already exists on this " + "dataset")); + case EZFS_TAGTOOLONG: + return (dgettext(TEXT_DOMAIN, "tag too long")); + case EZFS_PIPEFAILED: + return 
(dgettext(TEXT_DOMAIN, "pipe create failed")); + case EZFS_THREADCREATEFAILED: + return (dgettext(TEXT_DOMAIN, "thread create failed")); + case EZFS_POSTSPLIT_ONLINE: + return (dgettext(TEXT_DOMAIN, "disk was split from this pool " + "into a new one")); + case EZFS_SCRUBBING: + return (dgettext(TEXT_DOMAIN, "currently scrubbing; " + "use 'zpool scrub -s' to cancel current scrub")); + case EZFS_NO_SCRUB: + return (dgettext(TEXT_DOMAIN, "there is no active scrub")); + case EZFS_DIFF: + return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); + case EZFS_DIFFDATA: + return (dgettext(TEXT_DOMAIN, "invalid diff data")); + case EZFS_POOLREADONLY: + return (dgettext(TEXT_DOMAIN, "pool is read-only")); + case EZFS_UNKNOWN: + return (dgettext(TEXT_DOMAIN, "unknown error")); + default: + assert(hdl->libzfs_error == 0); + return (dgettext(TEXT_DOMAIN, "no error")); + } +} + +/*PRINTFLIKE2*/ +void +zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc), + fmt, ap); + hdl->libzfs_desc_active = 1; + + va_end(ap); +} + +static void +zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) +{ + (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action), + fmt, ap); + hdl->libzfs_error = error; + + if (hdl->libzfs_desc_active) + hdl->libzfs_desc_active = 0; + else + hdl->libzfs_desc[0] = '\0'; + + if (hdl->libzfs_printerr) { + if (error == EZFS_UNKNOWN) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " + "error: %s\n"), libzfs_error_description(hdl)); + abort(); + } + + (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action, + libzfs_error_description(hdl)); + if (error == EZFS_NOMEM) + exit(1); + } +} + +int +zfs_error(libzfs_handle_t *hdl, int error, const char *msg) +{ + return (zfs_error_fmt(hdl, error, "%s", msg)); +} + +/*PRINTFLIKE3*/ +int +zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + + zfs_verror(hdl, error, fmt, ap); + + va_end(ap); + + return (-1); +} + +static int +zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, + va_list ap) +{ + switch (error) { + case EPERM: + case EACCES: + zfs_verror(hdl, EZFS_PERM, fmt, ap); + return (-1); + + case ECANCELED: + zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap); + return (-1); + + case EIO: + zfs_verror(hdl, EZFS_IO, fmt, ap); + return (-1); + + case EFAULT: + zfs_verror(hdl, EZFS_FAULT, fmt, ap); + return (-1); + + case EINTR: + zfs_verror(hdl, EZFS_INTR, fmt, ap); + return (-1); + } + + return (0); +} + +int +zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) +{ + return (zfs_standard_error_fmt(hdl, error, "%s", msg)); +} + +/*PRINTFLIKE3*/ +int +zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + if (zfs_common_error(hdl, error, fmt, ap) != 0) { + va_end(ap); + return (-1); + } + + switch (error) { + case ENXIO: + case ENODEV: + zfs_verror(hdl, EZFS_IO, fmt, ap); + break; + + case ENOENT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset does not exist")); + zfs_verror(hdl, EZFS_NOENT, fmt, ap); + break; + + case ENOSPC: + case EDQUOT: + zfs_verror(hdl, EZFS_NOSPC, fmt, ap); + return (-1); + + case EEXIST: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset already exists")); + zfs_verror(hdl, EZFS_EXISTS, fmt, ap); + break; + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset is busy")); + zfs_verror(hdl, EZFS_BUSY, fmt, ap); + break; + case EROFS: + zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); + break; + case ENAMETOOLONG: + zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); + break; + case ENOTSUP: + zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); + break; + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; + default: + zfs_error_aux(hdl, 
strerror(error)); + zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); + break; + } + + va_end(ap); + return (-1); +} + +int +zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) +{ + return (zpool_standard_error_fmt(hdl, error, "%s", msg)); +} + +/*PRINTFLIKE3*/ +int +zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + if (zfs_common_error(hdl, error, fmt, ap) != 0) { + va_end(ap); + return (-1); + } + + switch (error) { + case ENODEV: + zfs_verror(hdl, EZFS_NODEVICE, fmt, ap); + break; + + case ENOENT: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "no such pool or dataset")); + zfs_verror(hdl, EZFS_NOENT, fmt, ap); + break; + + case EEXIST: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool already exists")); + zfs_verror(hdl, EZFS_EXISTS, fmt, ap); + break; + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy")); + zfs_verror(hdl, EZFS_BUSY, fmt, ap); + break; + + case ENXIO: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is currently unavailable")); + zfs_verror(hdl, EZFS_BADDEV, fmt, ap); + break; + + case ENAMETOOLONG: + zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap); + break; + + case ENOTSUP: + zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap); + break; + + case EINVAL: + zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap); + break; + + case ENOSPC: + case EDQUOT: + zfs_verror(hdl, EZFS_NOSPC, fmt, ap); + return (-1); + + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; + + case EROFS: + zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); + break; + + default: + zfs_error_aux(hdl, strerror(error)); + zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); + } + + va_end(ap); + return (-1); +} + +/* + * Display an out of memory error message and abort the current program. 
+ */ +int +no_memory(libzfs_handle_t *hdl) +{ + return (zfs_error(hdl, EZFS_NOMEM, "internal error")); +} + +/* + * A safe form of malloc() which will die if the allocation fails. + */ +void * +zfs_alloc(libzfs_handle_t *hdl, size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) + (void) no_memory(hdl); + + return (data); +} + +/* + * A safe form of asprintf() which will die if the allocation fails. + */ +/*PRINTFLIKE2*/ +char * +zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) +{ + va_list ap; + char *ret; + int err; + + va_start(ap, fmt); + + err = vasprintf(&ret, fmt, ap); + + va_end(ap); + + if (err < 0) + (void) no_memory(hdl); + + return (ret); +} + +/* + * A safe form of realloc(), which also zeroes newly allocated space. + */ +void * +zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) +{ + void *ret; + + if ((ret = realloc(ptr, newsize)) == NULL) { + (void) no_memory(hdl); + return (NULL); + } + + bzero((char *)ret + oldsize, (newsize - oldsize)); + return (ret); +} + +/* + * A safe form of strdup() which will die if the allocation fails. + */ +char * +zfs_strdup(libzfs_handle_t *hdl, const char *str) +{ + char *ret; + + if ((ret = strdup(str)) == NULL) + (void) no_memory(hdl); + + return (ret); +} + +/* + * Convert a number to an appropriately human-readable output. + */ +void +zfs_nicenum(uint64_t num, char *buf, size_t buflen) +{ + uint64_t n = num; + int index = 0; + char u; + + while (n >= 1024) { + n /= 1024; + index++; + } + + u = " KMGTPE"[index]; + + if (index == 0) { + (void) snprintf(buf, buflen, "%llu", n); + } else if ((num & ((1ULL << 10 * index) - 1)) == 0) { + /* + * If this is an even multiple of the base, always display + * without any decimal precision. + */ + (void) snprintf(buf, buflen, "%llu%c", n, u); + } else { + /* + * We want to choose a precision that reflects the best choice + * for fitting in 5 characters. 
This can get rather tricky when + * we have numbers that are very close to an order of magnitude. + * For example, when displaying 10239 (which is really 9.999K), + * we want only a single place of precision for 10.0K. We could + * develop some complex heuristics for this, but it's much + * easier just to try each combination in turn. + */ + int i; + for (i = 2; i >= 0; i--) { + if (snprintf(buf, buflen, "%.*f%c", i, + (double)num / (1ULL << 10 * index), u) <= 5) + break; + } + } +} + +void +libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) +{ + hdl->libzfs_printerr = printerr; +} + +libzfs_handle_t * +libzfs_init(void) +{ + libzfs_handle_t *hdl; + + if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { + return (NULL); + } + + if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { + free(hdl); + return (NULL); + } + + if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) { + (void) close(hdl->libzfs_fd); + free(hdl); + return (NULL); + } + + hdl->libzfs_sharetab = fopen("/etc/dfs/sharetab", "r"); + + zfs_prop_init(); + zpool_prop_init(); + libzfs_mnttab_init(hdl); + + return (hdl); +} + +void +libzfs_fini(libzfs_handle_t *hdl) +{ + (void) close(hdl->libzfs_fd); + if (hdl->libzfs_mnttab) + (void) fclose(hdl->libzfs_mnttab); + if (hdl->libzfs_sharetab) + (void) fclose(hdl->libzfs_sharetab); + zfs_uninit_libshare(hdl); + if (hdl->libzfs_log_str) + (void) free(hdl->libzfs_log_str); + zpool_free_handles(hdl); + libzfs_fru_clear(hdl, B_TRUE); + namespace_clear(hdl); + libzfs_mnttab_fini(hdl); + free(hdl); +} + +libzfs_handle_t * +zpool_get_handle(zpool_handle_t *zhp) +{ + return (zhp->zpool_hdl); +} + +libzfs_handle_t * +zfs_get_handle(zfs_handle_t *zhp) +{ + return (zhp->zfs_hdl); +} + +zpool_handle_t * +zfs_get_pool_handle(const zfs_handle_t *zhp) +{ + return (zhp->zpool_hdl); +} + +/* + * Given a name, determine whether or not it's a valid path + * (starts with '/' or "./"). If so, walk the mnttab trying + * to match the device number. 
If not, treat the path as an + * fs/vol/snap name. + */ +zfs_handle_t * +zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) +{ + struct stat64 statbuf; + struct extmnttab entry; + int ret; + + if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { + /* + * It's not a valid path, assume it's a name of type 'argtype'. + */ + return (zfs_open(hdl, path, argtype)); + } + + if (stat64(path, &statbuf) != 0) { + (void) fprintf(stderr, "%s: %s\n", path, strerror(errno)); + return (NULL); + } + + rewind(hdl->libzfs_mnttab); + while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) { + if (makedevice(entry.mnt_major, entry.mnt_minor) == + statbuf.st_dev) { + break; + } + } + if (ret != 0) { + return (NULL); + } + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { + (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), + path); + return (NULL); + } + + return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); +} + +/* + * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from + * an ioctl(). + */ +int +zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) +{ + if (len == 0) + len = 16 * 1024; + zc->zc_nvlist_dst_size = len; + if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) + zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL) + return (-1); + + return (0); +} + +/* + * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will + * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was + * filled in by the kernel to indicate the actual required size. + */ +int +zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) +{ + free((void *)(uintptr_t)zc->zc_nvlist_dst); + if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) + zfs_alloc(hdl, zc->zc_nvlist_dst_size)) + == NULL) + return (-1); + + return (0); +} + +/* + * Called to free the src and dst nvlists stored in the command structure. 
+ */ +void +zcmd_free_nvlists(zfs_cmd_t *zc) +{ + free((void *)(uintptr_t)zc->zc_nvlist_conf); + free((void *)(uintptr_t)zc->zc_nvlist_src); + free((void *)(uintptr_t)zc->zc_nvlist_dst); +} + +static int +zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen, + nvlist_t *nvl) +{ + char *packed; + size_t len; + + verify(nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0); + + if ((packed = zfs_alloc(hdl, len)) == NULL) + return (-1); + + verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0); + + *outnv = (uint64_t)(uintptr_t)packed; + *outlen = len; + + return (0); +} + +int +zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) +{ + return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf, + &zc->zc_nvlist_conf_size, nvl)); +} + +int +zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) +{ + return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src, + &zc->zc_nvlist_src_size, nvl)); +} + +/* + * Unpacks an nvlist from the ZFS ioctl command structure. 
+ */ +int +zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) +{ + if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, nvlp, 0) != 0) + return (no_memory(hdl)); + + return (0); +} + +int +zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) +{ + int error; + + zc->zc_history = (uint64_t)(uintptr_t)hdl->libzfs_log_str; + error = ioctl(hdl->libzfs_fd, request, zc); + if (hdl->libzfs_log_str) { + free(hdl->libzfs_log_str); + hdl->libzfs_log_str = NULL; + } + zc->zc_history = 0; + + return (error); +} + +/* + * ================================================================ + * API shared by zfs and zpool property management + * ================================================================ + */ + +static void +zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) +{ + zprop_list_t *pl = cbp->cb_proplist; + int i; + char *title; + size_t len; + + cbp->cb_first = B_FALSE; + if (cbp->cb_scripted) + return; + + /* + * Start with the length of the column headers. + */ + cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME")); + cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN, + "PROPERTY")); + cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, + "VALUE")); + cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, + "RECEIVED")); + cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, + "SOURCE")); + + /* first property is always NAME */ + assert(cbp->cb_proplist->pl_prop == + ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME)); + + /* + * Go through and calculate the widths for each column. For the + * 'source' column, we kludge it up by taking the worst-case scenario of + * inheriting from the longest name. This is acceptable because in the + * majority of cases 'SOURCE' is the last column displayed, and we don't + * use the width anyway. 
Note that the 'VALUE' column can be oversized, + * if the name of the property is much longer than any values we find. + */ + for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { + /* + * 'PROPERTY' column + */ + if (pl->pl_prop != ZPROP_INVAL) { + const char *propname = (type == ZFS_TYPE_POOL) ? + zpool_prop_to_name(pl->pl_prop) : + zfs_prop_to_name(pl->pl_prop); + + len = strlen(propname); + if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) + cbp->cb_colwidths[GET_COL_PROPERTY] = len; + } else { + len = strlen(pl->pl_user_prop); + if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) + cbp->cb_colwidths[GET_COL_PROPERTY] = len; + } + + /* + * 'VALUE' column. The first property is always the 'name' + * property that was tacked on either by /sbin/zfs's + * zfs_do_get() or when calling zprop_expand_list(), so we + * ignore its width. If the user specified the name property + * to display, then it will be later in the list in any case. + */ + if (pl != cbp->cb_proplist && + pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) + cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; + + /* 'RECEIVED' column. */ + if (pl != cbp->cb_proplist && + pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) + cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; + + /* + * 'NAME' and 'SOURCE' columns + */ + if (pl->pl_prop == (type == ZFS_TYPE_POOL ? ZPOOL_PROP_NAME : + ZFS_PROP_NAME) && + pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) { + cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width; + cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width + + strlen(dgettext(TEXT_DOMAIN, "inherited from")); + } + } + + /* + * Now go through and print the headers. 
+	 */
+	for (i = 0; i < ZFS_GET_NCOLS; i++) {
+		switch (cbp->cb_columns[i]) {
+		case GET_COL_NAME:
+			title = dgettext(TEXT_DOMAIN, "NAME");
+			break;
+		case GET_COL_PROPERTY:
+			title = dgettext(TEXT_DOMAIN, "PROPERTY");
+			break;
+		case GET_COL_VALUE:
+			title = dgettext(TEXT_DOMAIN, "VALUE");
+			break;
+		case GET_COL_RECVD:
+			title = dgettext(TEXT_DOMAIN, "RECEIVED");
+			break;
+		case GET_COL_SOURCE:
+			title = dgettext(TEXT_DOMAIN, "SOURCE");
+			break;
+		default:
+			title = NULL;
+		}
+
+		if (title != NULL) {
+			if (i == (ZFS_GET_NCOLS - 1) ||
+			    cbp->cb_columns[i + 1] == GET_COL_NONE)
+				(void) printf("%s", title);
+			else
+				(void) printf("%-*s ",
+				    cbp->cb_colwidths[cbp->cb_columns[i]],
+				    title);
+		}
+	}
+	(void) printf("\n");
+}
+
+/*
+ * Display a single line of output, according to the settings in the callback
+ * structure.
+ *
+ *	name		text for the NAME column
+ *	cbp		column selection, widths, source filter and flags
+ *	propname	text for the PROPERTY column
+ *	value		text for the VALUE column
+ *	sourcetype	selects the SOURCE text; the line is suppressed
+ *			entirely when it has no bit in common with
+ *			cbp->cb_sources
+ *	source		only used for ZPROP_SRC_INHERITED ("inherited from %s")
+ *	recvd_value	text for the RECEIVED column; NULL prints "-"
+ *
+ * The headers are printed first when cbp->cb_first is set.  Entries of
+ * cbp->cb_columns[] that are not one of the five known columns are skipped.
+ * A cell is printed bare when it occupies the last slot or is followed by
+ * GET_COL_NONE, tab-terminated when cbp->cb_scripted is set, and otherwise
+ * padded to the width recorded in cbp->cb_colwidths[].
+ */
+void
+zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
+    const char *propname, const char *value, zprop_source_t sourcetype,
+    const char *source, const char *recvd_value)
+{
+	int i;
+	const char *str;
+	char buf[128];
+
+	/*
+	 * Ignore those source types that the user has chosen to ignore.
+	 */
+	if ((sourcetype & cbp->cb_sources) == 0)
+		return;
+
+	if (cbp->cb_first)
+		zprop_print_headers(cbp, cbp->cb_type);
+
+	for (i = 0; i < ZFS_GET_NCOLS; i++) {
+		switch (cbp->cb_columns[i]) {
+		case GET_COL_NAME:
+			str = name;
+			break;
+
+		case GET_COL_PROPERTY:
+			str = propname;
+			break;
+
+		case GET_COL_VALUE:
+			str = value;
+			break;
+
+		case GET_COL_SOURCE:
+			switch (sourcetype) {
+			case ZPROP_SRC_NONE:
+				str = "-";
+				break;
+
+			case ZPROP_SRC_DEFAULT:
+				str = "default";
+				break;
+
+			case ZPROP_SRC_LOCAL:
+				str = "local";
+				break;
+
+			case ZPROP_SRC_TEMPORARY:
+				str = "temporary";
+				break;
+
+			case ZPROP_SRC_INHERITED:
+				(void) snprintf(buf, sizeof (buf),
+				    "inherited from %s", source);
+				str = buf;
+				break;
+			case ZPROP_SRC_RECEIVED:
+				str = "received";
+				break;
+
+			default:
+				/*
+				 * Unknown or combined source bits: print a
+				 * placeholder rather than passing an
+				 * uninitialized pointer to printf().
+				 */
+				str = "-";
+				break;
+			}
+			break;
+
+		case GET_COL_RECVD:
+			str = (recvd_value == NULL ? "-" : recvd_value);
+			break;
+
+		default:
+			continue;
+		}
+
+		/*
+		 * Test for the last slot before peeking at the next one, as
+		 * zprop_print_headers() does, so that cb_columns[] is never
+		 * indexed with ZFS_GET_NCOLS.
+		 */
+		if (i == (ZFS_GET_NCOLS - 1) ||
+		    cbp->cb_columns[i + 1] == GET_COL_NONE)
+			(void) printf("%s", str);
+		else if (cbp->cb_scripted)
+			(void) printf("%s\t", str);
+		else
+			(void) printf("%-*s ",
+			    cbp->cb_colwidths[cbp->cb_columns[i]],
+			    str);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Given a numeric suffix, convert the value into a number of bits that the
+ * resulting value must be shifted.
+ *
+ * An empty suffix yields 0.  Otherwise the first character is looked up,
+ * case-insensitively, in "BKMGTPEZ" and the result is ten times its index
+ * (so 'B' is 0, 'K' is 10, ... 'Z' is 70).  Returns -1 for an unrecognised
+ * or malformed suffix; the extended error message is recorded only when
+ * 'hdl' is non-NULL, matching the way zfs_nicestrtonum() treats its handle.
+ */
+static int
+str2shift(libzfs_handle_t *hdl, const char *buf)
+{
+	const char *ends = "BKMGTPEZ";
+	const size_t nends = strlen(ends);
+	size_t i;
+
+	if (buf[0] == '\0')
+		return (0);
+	for (i = 0; i < nends; i++) {
+		/* <ctype.h> functions require an unsigned char value */
+		if (toupper((unsigned char)buf[0]) == ends[i])
+			break;
+	}
+	if (i == nends) {
+		if (hdl)
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "invalid numeric suffix '%s'"), buf);
+		return (-1);
+	}
+
+	/*
+	 * We want to allow trailing 'b' characters for 'GB' or 'Mb'. But don't
+	 * allow 'BB' - that's just weird.
+	 */
+	if (buf[1] == '\0' || (toupper((unsigned char)buf[1]) == 'B' &&
+	    buf[2] == '\0' && toupper((unsigned char)buf[0]) != 'B'))
+		return ((int)(10 * i));
+
+	if (hdl)
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "invalid numeric suffix '%s'"), buf);
+	return (-1);
+}
+
+/*
+ * Convert a string of the form '100G' into a real number. Used when setting
+ * properties or creating a volume. 'buf' is used to place an extended error
+ * message for the caller to use.
+ */
+int
+zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
+{
+	char *end;
+	int shift;
+
+	*num = 0;
+
+	/* Check to see if this looks like a number. */
+	if ((value[0] < '0' || value[0] > '9') && value[0] != '.') {
+		if (hdl)
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "bad numeric value '%s'"), value);
+		return (-1);
+	}
+
+	/* Rely on strtoull() to process the numeric portion. */
+	errno = 0;
+	*num = strtoull(value, &end, 10);
+
+	/*
+	 * Check for ERANGE, which indicates that the value is too large to fit
+	 * in a 64-bit value.
+	 */
+	if (errno == ERANGE) {
+		if (hdl)
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "numeric value is too large"));
+		return (-1);
+	}
+
+	/*
+	 * If we have a decimal value, then do the computation with floating
+	 * point arithmetic. Otherwise, use standard arithmetic.
+	 */
+	if (*end == '.') {
+		double fval = strtod(value, &end);
+
+		if ((shift = str2shift(hdl, end)) == -1)
+			return (-1);
+
+		fval *= pow(2, shift);
+
+		/*
+		 * UINT64_MAX is not representable as a double and converts
+		 * to exactly 2^64, which itself does not fit in a uint64_t.
+		 * The limit must therefore be rejected as well; converting an
+		 * out-of-range double to an integer is undefined behaviour.
+		 */
+		if (fval >= (double)UINT64_MAX) {
+			if (hdl)
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "numeric value is too large"));
+			return (-1);
+		}
+
+		*num = (uint64_t)fval;
+	} else {
+		if ((shift = str2shift(hdl, end)) == -1)
+			return (-1);
+
+		/* Check for overflow */
+		if (shift >= 64 || (*num << shift) >> shift != *num) {
+			if (hdl)
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "numeric value is too large"));
+			return (-1);
+		}
+
+		*num <<= shift;
+	}
+
+	return (0);
+}
+
+/*
+ * Given a propname=value nvpair to set, parse any numeric properties
+ * (index, boolean, etc) if they are specified as strings and add the
+ * resulting nvpair to the returned nvlist.
+ *
+ * At the DSL layer, all properties are either 64-bit numbers or strings.
+ * We want the user to be able to ignore this fact and specify properties
+ * as native values (numbers, for example) or as strings (to simplify
+ * command line utilities). This also handles converting index types
+ * (compression, checksum, etc) from strings to their on-disk index.
+ */
+int
+zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
+    zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp,
+    const char *errbuf)
+{
+	data_type_t datatype = nvpair_type(elem);
+	zprop_type_t proptype;
+	const char *propname;
+	char *value;
+	boolean_t isnone = B_FALSE;
+
+	if (type == ZFS_TYPE_POOL) {
+		proptype = zpool_prop_get_type(prop);
+		propname = zpool_prop_to_name(prop);
+	} else {
+		proptype = zfs_prop_get_type(prop);
+		propname = zfs_prop_to_name(prop);
+	}
+
+	/*
+	 * Convert any properties to the internal DSL value types.
+	 */
+	*svalp = NULL;
+	*ivalp = 0;
+
+	switch (proptype) {
+	case PROP_TYPE_STRING:
+		if (datatype != DATA_TYPE_STRING) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "'%s' must be a string"), nvpair_name(elem));
+			goto error;
+		}
+		(void) nvpair_value_string(elem, svalp);
+		if (strlen(*svalp) >= ZFS_MAXPROPLEN) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "'%s' is too long"), nvpair_name(elem));
+			goto error;
+		}
+		break;
+
+	case PROP_TYPE_NUMBER:
+		if (datatype == DATA_TYPE_STRING) {
+			(void) nvpair_value_string(elem, &value);
+			if (strcmp(value, "none") == 0) {
+				/* 'none' is stored as 0 (*ivalp stays 0) */
+				isnone = B_TRUE;
+			} else if (zfs_nicestrtonum(hdl, value, ivalp)
+			    != 0) {
+				goto error;
+			}
+		} else if (datatype == DATA_TYPE_UINT64) {
+			(void) nvpair_value_uint64(elem, ivalp);
+		} else {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "'%s' must be a number"), nvpair_name(elem));
+			goto error;
+		}
+
+		/*
+		 * Quota special: force 'none' and don't allow 0.
+		 */
+		if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone &&
+		    (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "use 'none' to disable quota/refquota"));
+			goto error;
+		}
+		break;
+
+	case PROP_TYPE_INDEX:
+		if (datatype != DATA_TYPE_STRING) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "'%s' must be a string"), nvpair_name(elem));
+			goto error;
+		}
+
+		(void) nvpair_value_string(elem, &value);
+
+		if (zprop_string_to_index(prop, value, ivalp, type) != 0) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "'%s' must be one of '%s'"), propname,
+			    zprop_values(prop, type));
+			goto error;
+		}
+		break;
+
+	default:
+		abort();
+	}
+
+	/*
+	 * Add the result to our return set of properties.
+	 */
+	if (*svalp != NULL) {
+		if (nvlist_add_string(ret, propname, *svalp) != 0) {
+			(void) no_memory(hdl);
+			return (-1);
+		}
+	} else {
+		if (nvlist_add_uint64(ret, propname, *ivalp) != 0) {
+			(void) no_memory(hdl);
+			return (-1);
+		}
+	}
+
+	return (0);
+error:
+	(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+	return (-1);
+}
+
+/*
+ * Allocate one property-list entry for 'propname' and store it in *listp.
+ *
+ * A name that maps to no native property valid for 'type' is accepted only
+ * for datasets, and only if it is a user property or a userquota property;
+ * it is then recorded by name in pl_user_prop.  Returns 0 on success and
+ * non-zero on failure.  pl_next is not assigned here -- presumably
+ * zfs_alloc() returns zeroed memory; TODO confirm.
+ */
+static int
+addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp,
+    zfs_type_t type)
+{
+	int prop;
+	zprop_list_t *entry;
+
+	prop = zprop_name_to_prop(propname, type);
+
+	if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type))
+		prop = ZPROP_INVAL;
+
+	/*
+	 * When no property table entry can be found, return failure if
+	 * this is a pool property or if this isn't a user-defined
+	 * dataset property,
+	 */
+	if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL ||
+	    (!zfs_prop_user(propname) && !zfs_prop_userquota(propname)))) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "invalid property '%s'"), propname);
+		return (zfs_error(hdl, EZFS_BADPROP,
+		    dgettext(TEXT_DOMAIN, "bad property list")));
+	}
+
+	if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL)
+		return (-1);
+
+	entry->pl_prop = prop;
+	if (prop == ZPROP_INVAL) {
+		if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) {
+			free(entry);
+			return (-1);
+		}
+		entry->pl_width = strlen(propname);
+	} else {
+		entry->pl_width = zprop_width(prop, &entry->pl_fixed,
+		    type);
+	}
+
+	*listp = entry;
+
+	return (0);
+}
+
+/*
+ * Given a comma-separated list of properties, construct a property list
+ * containing both user-defined and native properties. This function will
+ * return a NULL list if 'all' is specified, which can later be expanded
+ * by zprop_expand_list().
+ *
+ * Note that 'props' is modified in place: each separating comma is
+ * overwritten with '\0' and is not restored.  On failure the entries added
+ * so far remain linked from *listp.
+ */
+int
+zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp,
+    zfs_type_t type)
+{
+	*listp = NULL;
+
+	/*
+	 * If 'all' is specified, return a NULL list.
+	 */
+	if (strcmp(props, "all") == 0)
+		return (0);
+
+	/*
+	 * If no props were specified, return an error.
+	 */
+	if (props[0] == '\0') {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "no properties specified"));
+		return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN,
+		    "bad property list")));
+	}
+
+	/*
+	 * It would be nice to use getsubopt() here, but the inclusion of column
+	 * aliases makes this more effort than it's worth.
+	 */
+	while (*props != '\0') {
+		size_t len;
+		char *p;
+		char c;
+
+		if ((p = strchr(props, ',')) == NULL) {
+			len = strlen(props);
+			p = props + len;
+		} else {
+			len = p - props;
+		}
+
+		/*
+		 * Check for empty options.
+		 */
+		if (len == 0) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "empty property name"));
+			return (zfs_error(hdl, EZFS_BADPROP,
+			    dgettext(TEXT_DOMAIN, "bad property list")));
+		}
+
+		/*
+		 * Check all regular property names.
+		 */
+		c = props[len];
+		props[len] = '\0';
+
+		if (strcmp(props, "space") == 0) {
+			/* 'space' is shorthand for this fixed set */
+			static char *spaceprops[] = {
+				"name", "avail", "used", "usedbysnapshots",
+				"usedbydataset", "usedbyrefreservation",
+				"usedbychildren", NULL
+			};
+			int i;
+
+			for (i = 0; spaceprops[i]; i++) {
+				if (addlist(hdl, spaceprops[i], listp, type))
+					return (-1);
+				listp = &(*listp)->pl_next;
+			}
+		} else {
+			if (addlist(hdl, props, listp, type))
+				return (-1);
+			listp = &(*listp)->pl_next;
+		}
+
+		props = p;
+		if (c == ',')
+			props++;
+	}
+
+	return (0);
+}
+
+/*
+ * Free every entry of a property list, together with the user property
+ * name each entry may own.  A NULL list is accepted.
+ */
+void
+zprop_free_list(zprop_list_t *pl)
+{
+	zprop_list_t *next;
+
+	while (pl != NULL) {
+		next = pl->pl_next;
+		free(pl->pl_user_prop);
+		free(pl);
+		pl = next;
+	}
+}
+
+/*
+ * State handed to zprop_expand_list_cb(): 'last' is the link to store the
+ * next new entry in, 'hdl' and 'type' are passed through to the allocator
+ * and to zprop_width().
+ */
+typedef struct expand_data {
+	zprop_list_t	**last;
+	libzfs_handle_t	*hdl;
+	zfs_type_t	type;
+} expand_data_t;
+
+/*
+ * Property iteration callback: append an entry for 'prop', flagged pl_all,
+ * to the list being built.  Returns ZPROP_CONT, or ZPROP_INVAL when the
+ * allocation fails.
+ */
+int
+zprop_expand_list_cb(int prop, void *cb)
+{
+	zprop_list_t *entry;
+	expand_data_t *edp = cb;
+
+	if ((entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t))) == NULL)
+		return (ZPROP_INVAL);
+
+	entry->pl_prop = prop;
+	entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type);
+	entry->pl_all = B_TRUE;
+
+	*(edp->last) = entry;
+	edp->last = &entry->pl_next;
+
+	return (ZPROP_CONT);
+}
+
+/*
+ * Expand an empty (NULL) list, as produced by zprop_get_list() for 'all',
+ * into one entry per native property with the name property placed first.
+ * A non-empty list is left untouched.  Returns 0 on success, -1 on failure.
+ */
+int
+zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type)
+{
+	zprop_list_t *entry;
+	zprop_list_t **last;
+	expand_data_t exp;
+
+	if (*plp == NULL) {
+		/*
+		 * If this is the very first time we've been called for an 'all'
+		 * specification, expand the list to include all native
+		 * properties.
+		 */
+		last = plp;
+
+		exp.last = last;
+		exp.hdl = hdl;
+		exp.type = type;
+
+		if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE,
+		    B_FALSE, type) == ZPROP_INVAL)
+			return (-1);
+
+		/*
+		 * Add 'name' to the beginning of the list, which is handled
+		 * specially.
+		 */
+		if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL)
+			return (-1);
+
+		entry->pl_prop = (type == ZFS_TYPE_POOL) ?
ZPOOL_PROP_NAME : + ZFS_PROP_NAME; + entry->pl_width = zprop_width(entry->pl_prop, + &entry->pl_fixed, type); + entry->pl_all = B_TRUE; + entry->pl_next = *plp; + *plp = entry; + } + return (0); +} + +int +zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, + zfs_type_t type) +{ + return (zprop_iter_common(func, cb, show_all, ordered, type)); +} diff --git a/lib/libzpool/common/kernel.c b/lib/libzpool/common/kernel.c new file mode 100644 index 0000000..f323bf6 --- /dev/null +++ b/lib/libzpool/common/kernel.c @@ -0,0 +1,981 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Emulation of kernel services in userland. 
+ */
+
+int aok;			/* non-zero: failed zverify()/VERIFY3 checks are ignored */
+uint64_t physmem;		/* page count, set by kernel_init() */
+vnode_t *rootdir = (vnode_t *)0xabcd1234;	/* sentinel; only compared, see vn_openat() */
+char hw_serial[HW_HOSTID_LEN];	/* formatted by kernel_init() */
+
+struct utsname utsname = {
+	"userland", "libzpool", "1", "1", "na"
+};
+
+/* this only exists to have its address taken */
+struct proc p0;
+
+/*
+ * =========================================================================
+ * threads
+ * =========================================================================
+ *
+ * zk_thread_create() starts 'func(arg)' on a detached thread and returns
+ * the thread id disguised as a kthread_t pointer.  Failure to create the
+ * thread trips VERIFY.
+ */
+/*ARGSUSED*/
+kthread_t *
+zk_thread_create(void (*func)(), void *arg)
+{
+	thread_t tid;
+
+	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
+	    &tid) == 0);
+
+	return ((void *)(uintptr_t)tid);
+}
+
+/*
+ * =========================================================================
+ * kstats
+ * =========================================================================
+ *
+ * kstats are not emulated: kstat_create() always returns NULL and
+ * kstat_install()/kstat_delete() do nothing.
+ */
+/*ARGSUSED*/
+kstat_t *
+kstat_create(char *module, int instance, char *name, char *class,
+    uchar_t type, ulong_t ndata, uchar_t ks_flag)
+{
+	return (NULL);
+}
+
+/*ARGSUSED*/
+void
+kstat_install(kstat_t *ksp)
+{}
+
+/*ARGSUSED*/
+void
+kstat_delete(kstat_t *ksp)
+{}
+
+/*
+ * =========================================================================
+ * mutexes
+ * =========================================================================
+ *
+ * A kmutex_t wraps a thread-library mutex (m_lock) with two pieces of
+ * bookkeeping used purely for assertions:
+ *
+ *	initialized	B_TRUE between zmutex_init() and zmutex_destroy()
+ *	m_owner		NULL while unlocked, curthread while held, and
+ *			(void *)-1UL once the mutex has been destroyed
+ *
+ * m_owner is only written while m_lock is held (or before/after the mutex
+ * is live), which is why the statement order in each routine matters.
+ */
+void
+zmutex_init(kmutex_t *mp)
+{
+	mp->m_owner = NULL;
+	mp->initialized = B_TRUE;
+	(void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
+}
+
+void
+zmutex_destroy(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mp->m_owner == NULL);	/* must not be held */
+	(void) _mutex_destroy(&(mp)->m_lock);
+	mp->m_owner = (void *)-1UL;	/* poison: catches use after destroy */
+	mp->initialized = B_FALSE;
+}
+
+void
+mutex_enter(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mp->m_owner != (void *)-1UL);
+	ASSERT(mp->m_owner != curthread);	/* recursive entry is a bug */
+	VERIFY(mutex_lock(&mp->m_lock) == 0);
+	ASSERT(mp->m_owner == NULL);
+	mp->m_owner = curthread;
+}
+
+int
+mutex_tryenter(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mp->m_owner != (void *)-1UL);
+	if (0 == mutex_trylock(&mp->m_lock)) {
+		ASSERT(mp->m_owner == NULL);
+		mp->m_owner = curthread;
+		return (1);	/* acquired */
+	} else {
+		return (0);	/* not acquired */
+	}
+}
+
+void
+mutex_exit(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;	/* cleared before the lock is dropped */
+	VERIFY(mutex_unlock(&mp->m_lock) == 0);
+}
+
+void *
+mutex_owner(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	return (mp->m_owner);
+}
+
+/*
+ * =========================================================================
+ * rwlocks
+ * =========================================================================
+ *
+ * A krwlock_t wraps a thread-library rwlock (rw_lock).  rw_owner is set to
+ * curthread by every successful rw_enter()/rw_tryenter(), for readers as
+ * well as writers, cleared by rw_exit(), and set to (void *)-1UL by
+ * rw_destroy().  rw_tryupgrade() never upgrades; it always returns 0.
+ */
+/*ARGSUSED*/
+void
+rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
+{
+	rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
+	rwlp->rw_owner = NULL;
+	rwlp->initialized = B_TRUE;
+}
+
+void
+rw_destroy(krwlock_t *rwlp)
+{
+	rwlock_destroy(&rwlp->rw_lock);
+	rwlp->rw_owner = (void *)-1UL;	/* poison: catches use after destroy */
+	rwlp->initialized = B_FALSE;
+}
+
+void
+rw_enter(krwlock_t *rwlp, krw_t rw)
+{
+	ASSERT(!RW_LOCK_HELD(rwlp));
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+	ASSERT(rwlp->rw_owner != curthread);
+
+	if (rw == RW_READER)
+		VERIFY(rw_rdlock(&rwlp->rw_lock) == 0);
+	else
+		VERIFY(rw_wrlock(&rwlp->rw_lock) == 0);
+
+	rwlp->rw_owner = curthread;
+}
+
+void
+rw_exit(krwlock_t *rwlp)
+{
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	rwlp->rw_owner = NULL;
+	VERIFY(rw_unlock(&rwlp->rw_lock) == 0);
+}
+
+int
+rw_tryenter(krwlock_t *rwlp, krw_t rw)
+{
+	int rv;
+
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	if (rw == RW_READER)
+		rv = rw_tryrdlock(&rwlp->rw_lock);
+	else
+		rv = rw_trywrlock(&rwlp->rw_lock);
+
+	if (rv == 0) {
+		rwlp->rw_owner = curthread;
+		return (1);	/* acquired */
+	}
+
+	return (0);	/* not acquired */
+}
+
+/*ARGSUSED*/
+int
+rw_tryupgrade(krwlock_t *rwlp)
+{
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	return (0);	/* upgrade is never granted */
+}
+
+/*
+ *
=========================================================================
+ * condition variables
+ * =========================================================================
+ */
+/*ARGSUSED*/
+void
+cv_init(kcondvar_t *cv, char *name, int type, void *arg)
+{
+	VERIFY(cond_init(cv, type, NULL) == 0);	/* 'name' and 'arg' are unused */
+}
+
+void
+cv_destroy(kcondvar_t *cv)
+{
+	VERIFY(cond_destroy(cv) == 0);
+}
+
+void
+cv_wait(kcondvar_t *cv, kmutex_t *mp)
+{
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;	/* ownership is given up for the duration of the wait */
+	int ret = cond_wait(cv, &mp->m_lock);
+	VERIFY(ret == 0 || ret == EINTR);	/* EINTR is accepted as a wakeup */
+	mp->m_owner = curthread;
+}
+
+clock_t
+cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
+{
+	int error;
+	timestruc_t ts;
+	clock_t delta;
+
+top:
+	delta = abstime - ddi_get_lbolt();	/* ticks left until the deadline */
+	if (delta <= 0)
+		return (-1);	/* deadline already reached */
+
+	ts.tv_sec = delta / hz;
+	ts.tv_nsec = (delta % hz) * (NANOSEC / hz);
+
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;
+	error = cond_reltimedwait(cv, &mp->m_lock, &ts);
+	mp->m_owner = curthread;
+
+	if (error == ETIME)
+		return (-1);	/* timed out */
+
+	if (error == EINTR)
+		goto top;	/* interrupted: recompute what is left and wait again */
+
+	ASSERT(error == 0);
+
+	return (1);	/* woken before the deadline */
+}
+
+void
+cv_signal(kcondvar_t *cv)
+{
+	VERIFY(cond_signal(cv) == 0);
+}
+
+void
+cv_broadcast(kcondvar_t *cv)
+{
+	VERIFY(cond_broadcast(cv) == 0);
+}
+
+/*
+ * =========================================================================
+ * vnode operations
+ * =========================================================================
+ */
+/*
+ * Note: for the xxxat() versions of these functions, we assume that the
+ * starting vp is always rootdir (which is true for spa_directory.c, the only
+ * ZFS consumer of these interfaces). We assert this is true, and then emulate
+ * them by adding '/' in front of the path.
+ */
+
+/*
+ * Open 'path' and, on success, store a newly allocated vnode in *vpp that
+ * records the descriptor, the file size and a copy of 'path'.
+ *
+ *	flags	kernel-style FREAD/FWRITE/FCREAT flags; 'flags - FREAD' is
+ *		what is handed to open64()
+ *	mode	creation mode; with FCREAT the umask is cleared around the
+ *		open so that 'mode' is applied as given
+ *	x1-x3	unused
+ *
+ * Returns 0 on success or an errno value.  ENAMETOOLONG is returned when
+ * the (possibly rewritten) path would not fit in MAXPATHLEN bytes.
+ */
+/*ARGSUSED*/
+int
+vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
+{
+	int fd;
+	int err;
+	vnode_t *vp;
+	int old_umask = 0;
+	size_t pathlen = strlen(path);
+	char realpath[MAXPATHLEN];
+	struct stat64 st;
+
+	if (pathlen >= sizeof (realpath))
+		return (ENAMETOOLONG);
+
+	/*
+	 * If we're accessing a real disk from userland, we need to use
+	 * the character interface to avoid caching. This is particularly
+	 * important if we're trying to look at a real in-kernel storage
+	 * pool from userland, e.g. via zdb, because otherwise we won't
+	 * see the changes occurring under the segmap cache.
+	 * On the other hand, the stupid character device returns zero
+	 * for its size. So -- gag -- we open the block device to get
+	 * its size, and remember it for subsequent VOP_GETATTR().
+	 */
+	if (strncmp(path, "/dev/", 5) == 0) {
+		char *dsk;
+		fd = open64(path, O_RDONLY);
+		if (fd == -1)
+			return (errno);
+		if (fstat64(fd, &st) == -1) {
+			err = errno;	/* close() may overwrite errno */
+			(void) close(fd);
+			return (err);
+		}
+		(void) close(fd);
+		(void) snprintf(realpath, sizeof (realpath), "%s", path);
+		dsk = strstr(path, "/dsk/");
+		if (dsk != NULL) {
+			/* ".../dsk/..." becomes ".../rdsk/...": one more byte */
+			size_t off = (size_t)(dsk - path) + 1;
+
+			if (pathlen + 1 >= sizeof (realpath))
+				return (ENAMETOOLONG);
+			(void) snprintf(realpath + off,
+			    sizeof (realpath) - off, "r%s", dsk + 1);
+		}
+	} else {
+		(void) snprintf(realpath, sizeof (realpath), "%s", path);
+		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
+			return (errno);
+	}
+
+	if (flags & FCREAT)
+		old_umask = umask(0);
+
+	/*
+	 * The construct 'flags - FREAD' conveniently maps combinations of
+	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
+	 */
+	fd = open64(realpath, flags - FREAD, mode);
+	err = errno;	/* captured before any further library call */
+
+	if (flags & FCREAT)
+		(void) umask(old_umask);
+
+	if (fd == -1)
+		return (err);
+
+	if (fstat64(fd, &st) == -1) {
+		err = errno;
+		(void) close(fd);
+		return (err);
+	}
+
+	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
+
+	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
+
+	vp->v_fd = fd;
+	vp->v_size = st.st_size;
+	vp->v_path = spa_strdup(path);
+
+	return (0);
+}
+
+/*
+ * Emulate an open relative to 'startvp', which must be rootdir: the path is
+ * prefixed with '/' and handed to vn_open().  'fd' is currently ignored.
+ * Returns whatever vn_open() returns.
+ */
+/*ARGSUSED*/
+int
+vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
+    int x3, vnode_t *startvp, int fd)
+{
+	size_t size = strlen(path) + 2;	/* leading '/' and trailing NUL */
+	char *realpath = umem_alloc(size, UMEM_NOFAIL);
+	int ret;
+
+	ASSERT(startvp == rootdir);
+	(void) snprintf(realpath, size, "/%s", path);
+
+	/* fd ignored for now, need if want to simulate nbmand support */
+	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
+
+	umem_free(realpath, size);
+
+	return (ret);
+}
+
+/*
+ * Read ('uio' == UIO_READ) or write 'len' bytes at 'offset' of the file
+ * behind 'vp'.
+ *
+ * Returns 0 on success or an errno value if a transfer fails.  When 'residp'
+ * is non-NULL it receives the number of bytes not transferred; when it is
+ * NULL a short transfer is reported as EIO.  x1-x4 are unused.
+ */
+/*ARGSUSED*/
+int
+vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
+    int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
+{
+	ssize_t iolen, split;
+
+	if (uio == UIO_READ) {
+		iolen = pread64(vp->v_fd, addr, len, offset);
+	} else {
+		/*
+		 * To simulate partial disk writes, we split writes into two
+		 * system calls so that the process can be killed in between.
+		 */
+		split = (len > 0 ? rand() % len : 0);
+		iolen = pwrite64(vp->v_fd, addr, split, offset);
+		if (iolen != -1) {
+			/*
+			 * Only add the second result when both calls
+			 * succeeded; summing a -1 into the byte count would
+			 * hide the failure from the check below.
+			 */
+			ssize_t rest = pwrite64(vp->v_fd,
+			    (char *)addr + split, len - split,
+			    offset + split);
+
+			if (rest == -1)
+				iolen = -1;
+			else
+				iolen += rest;
+		}
+	}
+
+	if (iolen == -1)
+		return (errno);
+	if (residp)
+		*residp = len - iolen;
+	else if (iolen != len)
+		return (EIO);
+	return (0);
+}
+
+/*
+ * Close the descriptor held by 'vp' and release the vnode together with
+ * its path copy.  'vp' must not be used afterwards.
+ */
+void
+vn_close(vnode_t *vp)
+{
+	close(vp->v_fd);
+	spa_strfree(vp->v_path);
+	umem_free(vp, sizeof (vnode_t));
+}
+
+/*
+ * At a minimum we need to update the size since vdev_reopen()
+ * will no longer call vn_openat().
+ *
+ * Only va_size is filled in.  Returns 0 on success or the errno value from
+ * fstat64().  The descriptor is left open on failure: it still belongs to
+ * 'vp' and is closed by vn_close(), so closing it here would lead to a
+ * second close of the same descriptor number.
+ */
+int
+fop_getattr(vnode_t *vp, vattr_t *vap)
+{
+	struct stat64 st;
+
+	if (fstat64(vp->v_fd, &st) == -1)
+		return (errno);
+
+	vap->va_size = st.st_size;
+	return (0);
+}
+
+#ifdef ZFS_DEBUG
+
+/*
+ * =========================================================================
+ * Figure out which debugging statements to print
+ * =========================================================================
+ */
+
+static char *dprintf_string;	/* comma-separated selectors, or NULL */
+static int dprintf_print_all;	/* set when the selector list contains "on" */
+
+/*
+ * Return 1 if 'string' is one of the comma-separated items of
+ * dprintf_string, 0 otherwise (including when no selector list is set).
+ */
+int
+dprintf_find_string(const char *string)
+{
+	char *tmp_str = dprintf_string;
+	int len = strlen(string);
+
+	/*
+	 * Find out if this is a string we want to print.
+	 * String format: file1.c,function_name1,file2.c,file3.c
+	 */
+
+	while (tmp_str != NULL) {
+		if (strncmp(tmp_str, string, len) == 0 &&
+		    (tmp_str[len] == ',' || tmp_str[len] == '\0'))
+			return (1);
+		tmp_str = strchr(tmp_str, ',');
+		if (tmp_str != NULL)
+			tmp_str++; /* Get rid of , */
+	}
+	return (0);
+}
+
+/*
+ * Pick up the debug selector list and strip every "debug=..." argument
+ * from argv, adjusting *argc.  When several are given the last one wins.
+ */
+void
+dprintf_setup(int *argc, char **argv)
+{
+	int i, j;
+
+	/*
+	 * Debugging can be specified two ways: by setting the
+	 * environment variable ZFS_DEBUG, or by including a
+	 * "debug=..." argument on the command line. The command
+	 * line setting overrides the environment variable.
+	 */
+
+	for (i = 1; i < *argc; i++) {
+		int len = strlen("debug=");
+		/* First look for a command line argument */
+		if (strncmp("debug=", argv[i], len) == 0) {
+			dprintf_string = argv[i] + len;
+			/* Remove from args */
+			for (j = i; j < *argc; j++)
+				argv[j] = argv[j+1];
+			argv[j] = NULL;
+			(*argc)--;
+			/*
+			 * The following argument now occupies slot i;
+			 * examine it on the next pass instead of skipping it.
+			 */
+			i--;
+		}
+	}
+
+	if (dprintf_string == NULL) {
+		/* Look for ZFS_DEBUG environment variable */
+		dprintf_string = getenv("ZFS_DEBUG");
+	}
+
+	/*
+	 * Are we just turning on all debugging?
+	 */
+	if (dprintf_find_string("on"))
+		dprintf_print_all = 1;
+}
+
+/*
+ * =========================================================================
+ * debug printfs
+ * =========================================================================
+ *
+ * A message is printed when all debugging is on, or when the selector list
+ * names the calling file (basename) or function.  The selectors "pid",
+ * "tid", "cpu", "time" and "long" each add a prefix to the output.
+ */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+	const char *newfile;
+	va_list adx;
+
+	/*
+	 * Get rid of annoying "../common/" prefix to filename.
+	 */
+	newfile = strrchr(file, '/');
+	if (newfile != NULL) {
+		newfile = newfile + 1; /* Get rid of leading / */
+	} else {
+		newfile = file;
+	}
+
+	if (dprintf_print_all ||
+	    dprintf_find_string(newfile) ||
+	    dprintf_find_string(func)) {
+		/* Print out just the function name if requested */
+		flockfile(stdout);
+		if (dprintf_find_string("pid"))
+			(void) printf("%d ", getpid());
+		if (dprintf_find_string("tid"))
+			(void) printf("%u ", thr_self());
+		if (dprintf_find_string("cpu"))
+			(void) printf("%u ", getcpuid());
+		if (dprintf_find_string("time"))
+			(void) printf("%llu ", gethrtime());
+		if (dprintf_find_string("long"))
+			(void) printf("%s, line %d: ", newfile, line);
+		(void) printf("%s: ", func);
+		va_start(adx, fmt);
+		(void) vprintf(fmt, adx);
+		va_end(adx);
+		funlockfile(stdout);
+	}
+}
+
+#endif /* ZFS_DEBUG */
+
+/*
+ * =========================================================================
+ * cmn_err() and panic()
+ * =========================================================================
+ */
+/* Indexed by severity; valid indices are 0 .. CE_IGNORE - 1. */
+static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
+static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
+
+/*
+ * Print the message to stderr and abort the process.  Does not return.
+ */
+void
+vpanic(const char *fmt, va_list adx)
+{
+	(void) fprintf(stderr, "error: ");
+	(void) vfprintf(stderr, fmt, adx);
+	(void) fprintf(stderr, "\n");
+
+	abort();	/* think of it as a "user-level crash dump" */
+}
+
+void
+panic(const char *fmt, ...)
+{
+	va_list adx;
+
+	va_start(adx, fmt);
+	vpanic(fmt, adx);
+	va_end(adx);
+}
+
+/*
+ * Print a message of severity 'ce' to stderr.  CE_PANIC aborts via
+ * vpanic(), CE_NOTE is suppressed, and CE_IGNORE or any value outside the
+ * prefix/suffix tables prints nothing.
+ */
+void
+vcmn_err(int ce, const char *fmt, va_list adx)
+{
+	if (ce == CE_PANIC)
+		vpanic(fmt, adx);
+	/* the tables have CE_IGNORE entries; never index beyond them */
+	if (ce < 0 || ce >= CE_IGNORE)
+		return;
+	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
+		(void) fprintf(stderr, "%s", ce_prefix[ce]);
+		(void) vfprintf(stderr, fmt, adx);
+		(void) fprintf(stderr, "%s", ce_suffix[ce]);
+	}
+}
+
+/*PRINTFLIKE2*/
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+	va_list adx;
+
+	va_start(adx, fmt);
+	vcmn_err(ce, fmt, adx);
+	va_end(adx);
+}
+
+/*
+ * =========================================================================
+ * kobj interfaces
+ * =========================================================================
+ *
+ * A struct _buf owns the vnode stored in its _fd field from
+ * kobj_open_file() until kobj_close_file(); no other routine releases it.
+ */
+
+/*
+ * Open 'name' (relative to rootdir) for reading.  Returns the new handle,
+ * or (void *)-1UL if the file cannot be opened.
+ */
+struct _buf *
+kobj_open_file(char *name)
+{
+	struct _buf *file;
+	vnode_t *vp;
+
+	/* set vp as the _fd field of the file */
+	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir,
+	    -1) != 0)
+		return ((void *)-1UL);
+
+	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
+	file->_fd = (intptr_t)vp;
+	return (file);
+}
+
+/*
+ * Read up to 'size' bytes at offset 'off' into 'buf'.  Returns the number
+ * of bytes read, or -1 if the read fails.
+ */
+int
+kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
+{
+	ssize_t resid = 0;
+
+	/* vn_rdwr() leaves 'resid' unset when it fails */
+	if (vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
+	    UIO_SYSSPACE, 0, 0, 0, &resid) != 0)
+		return (-1);
+
+	return (size - resid);
+}
+
+void
+kobj_close_file(struct _buf *file)
+{
+	vn_close((vnode_t *)file->_fd);
+	umem_free(file, sizeof (struct _buf));
+}
+
+/*
+ * Store the current size of the file in *size.  Returns 0 on success or
+ * the errno value from fstat64().  The handle stays valid on failure and
+ * must still be released with kobj_close_file(); releasing the vnode here
+ * would leave 'file' pointing at freed memory.
+ */
+int
+kobj_get_filesize(struct _buf *file, uint64_t *size)
+{
+	struct stat64 st;
+	vnode_t *vp = (vnode_t *)file->_fd;
+
+	if (fstat64(vp->v_fd, &st) == -1)
+		return (errno);
+	*size = st.st_size;
+	return (0);
+}
+
+/*
+ * =========================================================================
+ * misc routines
+ * =========================================================================
+ */
+
+/* Sleep for 'ticks' clock ticks, converted to milliseconds using hz. */
+void
+delay(clock_t ticks)
+{
+	poll(0, 0, ticks * (1000 / hz));
+}
+
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+int
+highbit(ulong_t i)
+{
+	register int h = 1;
+
+	if (i == 0)
+		return (0);
+#ifdef _LP64
+	if (i & 0xffffffff00000000ul) {
+		h += 32; i >>= 32;
+	}
+#endif
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+}
+
+/* Descriptors for /dev/random and /dev/urandom; -1 while not open. */
+static int random_fd = -1, urandom_fd = -1;
+
+/*
+ * Fill 'ptr' with 'len' bytes read from 'fd', looping over short reads.
+ * Returns 0 on success.  A read that fails or hits end-of-file trips the
+ * assertion; if assertions are being ignored (aok) an errno value is
+ * returned instead of looping forever or walking the buffer backwards.
+ */
+static int
+random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
+{
+	size_t resid = len;
+	ssize_t bytes;
+
+	ASSERT(fd != -1);
+
+	while (resid != 0) {
+		bytes = read(fd, ptr, resid);
+		if (bytes == -1 && errno == EINTR)
+			continue;	/* interrupted before any data; retry */
+		if (bytes <= 0) {
+			int err = (bytes == 0) ? EIO : errno;
+
+			ASSERT3S(bytes, >, 0);
+			return (err);
+		}
+		ptr += bytes;
+		resid -= bytes;
+	}
+
+	return (0);
+}
+
+int
+random_get_bytes(uint8_t *ptr, size_t len)
+{
+	return (random_get_bytes_common(ptr, len, random_fd));
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+	return (random_get_bytes_common(ptr, len, urandom_fd));
+}
+
+/*
+ * Convert the leading number in 'hw_serial' using 'base' and store it in
+ * *result.  When 'nptr' is non-NULL it receives a pointer to the first
+ * character that was not converted.  Returns 0 on success, or the errno
+ * value set by strtoul() (for example ERANGE on overflow).
+ */
+int
+ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
+{
+	char *end;
+
+	/* strtoul() only sets errno on failure; start from a known state */
+	errno = 0;
+	*result = strtoul(hw_serial, &end, base);
+	if (nptr != NULL)
+		*nptr = end;
+	return (errno);
+}
+
+/*
+ * As ddi_strtoul(), for unsigned long long values.
+ */
+int
+ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
+{
+	char *end;
+
+	errno = 0;
+	*result = strtoull(str, &end, base);
+	if (nptr != NULL)
+		*nptr = end;
+	return (errno);
+}
+
+/*
+ * =========================================================================
+ * kernel emulation setup & teardown
+ * =========================================================================
+ */
+
+/*
+ * Callback registered with umem_nofail_callback(): report the condition on
+ * stderr and abort.
+ */
+static int
+umem_out_of_memory(void)
+{
+	char errmsg[] = "out of memory -- generating core dump\n";
+
+	/* sizeof counts the terminating NUL, which must not be written */
+	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
+	abort();
+	return (0);
+}
+
+/*
+ * Bring up the emulated kernel environment: install the out-of-memory
+ * handler, record physmem, format hw_serial (the host id when 'mode'
+ * includes FWRITE, otherwise 0), open the random devices, start the system
+ * taskq and initialise the SPA with 'mode'.
+ */
+void
+kernel_init(int mode)
+{
+	umem_nofail_callback(umem_out_of_memory);
+
+	physmem = sysconf(_SC_PHYS_PAGES);
+
+	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
+	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
+
+	(void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
+	    (mode & FWRITE) ? gethostid() : 0);
+
+	VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
+	VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
+
+	system_taskq_init();
+
+	spa_init(mode);
+}
+
+/*
+ * Tear down what kernel_init() set up, in reverse order.
+ */
+void
+kernel_fini(void)
+{
+	spa_fini();
+
+	system_taskq_fini();
+
+	close(random_fd);
+	close(urandom_fd);
+
+	random_fd = -1;
+	urandom_fd = -1;
+}
+
+/*
+ * Wrapper around uncompress(): *dstlen is the capacity of 'dst' on entry
+ * and is updated to the produced length only when Z_OK is returned.
+ */
+int
+z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
+{
+	int ret;
+	uLongf len = *dstlen;
+
+	if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
+		*dstlen = (size_t)len;
+
+	return (ret);
+}
+
+/*
+ * Wrapper around compress2() with the same *dstlen convention as
+ * z_uncompress().
+ */
+int
+z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
+    int level)
+{
+	int ret;
+	uLongf len = *dstlen;
+
+	if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
+		*dstlen = (size_t)len;
+
+	return (ret);
+}
+
+/*
+ * Credential and security-policy stubs: every credential reports uid 0,
+ * gid 0 and no supplementary groups, and every policy check succeeds.
+ */
+uid_t
+crgetuid(cred_t *cr)
+{
+	return (0);
+}
+
+gid_t
+crgetgid(cred_t *cr)
+{
+	return (0);
+}
+
+int
+crgetngroups(cred_t *cr)
+{
+	return (0);
+}
+
+gid_t *
+crgetgroups(cred_t *cr)
+{
+	return (NULL);
+}
+
+int
+zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
+{
+	return (0);
+}
+
+int
+zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
+{
+	return (0);
+}
+
+int
+zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Allocate a ksiddomain_t holding a copy of 'dom'.  Release it with
+ * ksiddomain_rele().
+ */
+ksiddomain_t *
+ksid_lookupdomain(const char *dom)
+{
+	ksiddomain_t *kd;
+
+	kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
+	kd->kd_name = spa_strdup(dom);
+	return (kd);
+}
+
+void
+ksiddomain_rele(ksiddomain_t *ksid)
+{
+	spa_strfree(ksid->kd_name);
+	umem_free(ksid, sizeof (ksiddomain_t));
+}
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{ + int size; + va_list adx; + char *buf; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx) + 1; + va_end(adx); + + buf = kmem_alloc(size, KM_SLEEP); + + va_start(adx, fmt); + size = vsnprintf(buf, size, fmt, adx); + va_end(adx); + + return (buf); +} + +/* ARGSUSED */ +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + *minorp = 0; + return (0); +} + +/* ARGSUSED */ +void +zfs_onexit_fd_rele(int fd) +{ +} + +/* ARGSUSED */ +int +zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) +{ + return (0); +} diff --git a/lib/libzpool/common/sys/zfs_context.h b/lib/libzpool/common/sys/zfs_context.h new file mode 100644 index 0000000..3b0390d --- /dev/null +++ b/lib/libzpool/common/sys/zfs_context.h @@ -0,0 +1,611 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_ZFS_CONTEXT_H +#define _SYS_ZFS_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define _SYS_MUTEX_H +#define _SYS_RWLOCK_H +#define _SYS_CONDVAR_H +#define _SYS_SYSTM_H +#define _SYS_DEBUG_H +#define _SYS_T_LOCK_H +#define _SYS_VNODE_H +#define _SYS_VFS_H +#define _SYS_SUNDDI_H +#define _SYS_CALLB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Debugging + */ + +/* + * Note that we are not using the debugging levels. + */ + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +/* + * ZFS debugging + */ + +#ifdef ZFS_DEBUG +extern void dprintf_setup(int *argc, char **argv); +#endif /* ZFS_DEBUG */ + +extern void cmn_err(int, const char *, ...); +extern void vcmn_err(int, const char *, __va_list); +extern void panic(const char *, ...); +extern void vpanic(const char *, __va_list); + +#define fm_panic panic + +extern int aok; + +/* This definition is copied from assert.h. 
*/ +#if defined(__STDC__) +#if __STDC_VERSION__ - 0 >= 199901L +#define zverify(EX) (void)((EX) || (aok) || \ + (__assert_c99(#EX, __FILE__, __LINE__, __func__), 0)) +#else +#define zverify(EX) (void)((EX) || (aok) || \ + (__assert(#EX, __FILE__, __LINE__), 0)) +#endif /* __STDC_VERSION__ - 0 >= 199901L */ +#else +#define zverify(EX) (void)((EX) || (aok) || \ + (_assert("EX", __FILE__, __LINE__), 0)) +#endif /* __STDC__ */ + + +#define VERIFY zverify +#define ASSERT zverify +#undef assert +#define assert zverify + +extern void __assert(const char *, const char *, int); + +#ifdef lint +#define VERIFY3_IMPL(x, y, z, t) if (x == z) ((void)0) +#else +/* BEGIN CSTYLED */ +#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \ + const TYPE __left = (TYPE)(LEFT); \ + const TYPE __right = (TYPE)(RIGHT); \ + if (!(__left OP __right) && (!aok)) { \ + char *__buf = alloca(256); \ + (void) snprintf(__buf, 256, "%s %s %s (0x%llx %s 0x%llx)", \ + #LEFT, #OP, #RIGHT, \ + (u_longlong_t)__left, #OP, (u_longlong_t)__right); \ + __assert(__buf, __FILE__, __LINE__); \ + } \ +_NOTE(CONSTCOND) } while (0) +/* END CSTYLED */ +#endif /* lint */ + +#define VERIFY3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t) +#define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) +#define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) + +#ifdef NDEBUG +#define ASSERT3S(x, y, z) ((void)0) +#define ASSERT3U(x, y, z) ((void)0) +#define ASSERT3P(x, y, z) ((void)0) +#else +#define ASSERT3S(x, y, z) VERIFY3S(x, y, z) +#define ASSERT3U(x, y, z) VERIFY3U(x, y, z) +#define ASSERT3P(x, y, z) VERIFY3P(x, y, z) +#endif + +/* + * DTrace SDT probes have different signatures in userland than they do in + * kernel. If they're being used in kernel code, re-define them out of + * existence for their counterparts in libzpool. 
+ */ + +#ifdef DTRACE_PROBE +#undef DTRACE_PROBE +#define DTRACE_PROBE(a) ((void)0) +#endif /* DTRACE_PROBE */ + +#ifdef DTRACE_PROBE1 +#undef DTRACE_PROBE1 +#define DTRACE_PROBE1(a, b, c) ((void)0) +#endif /* DTRACE_PROBE1 */ + +#ifdef DTRACE_PROBE2 +#undef DTRACE_PROBE2 +#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) +#endif /* DTRACE_PROBE2 */ + +#ifdef DTRACE_PROBE3 +#undef DTRACE_PROBE3 +#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) +#endif /* DTRACE_PROBE3 */ + +#ifdef DTRACE_PROBE4 +#undef DTRACE_PROBE4 +#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) +#endif /* DTRACE_PROBE4 */ + +/* + * Threads + */ +#define curthread ((void *)(uintptr_t)thr_self()) + +typedef struct kthread kthread_t; + +#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ + zk_thread_create(func, arg) +#define thread_exit() thr_exit(NULL) +#define thread_join(t) panic("libzpool cannot join threads") + +#define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) + +/* in libzpool, p0 exists only to have its address taken */ +struct proc { + uintptr_t this_is_never_used_dont_dereference_it; +}; + +extern struct proc p0; + +#define PS_NONE -1 + +extern kthread_t *zk_thread_create(void (*func)(), void *arg); + +#define issig(why) (FALSE) +#define ISSIG(thr, why) (FALSE) + +/* + * Mutexes + */ +typedef struct kmutex { + void *m_owner; + boolean_t initialized; + mutex_t m_lock; +} kmutex_t; + +#define MUTEX_DEFAULT USYNC_THREAD +#undef MUTEX_HELD +#undef MUTEX_NOT_HELD +#define MUTEX_HELD(m) _mutex_held(&(m)->m_lock) +#define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m)) + +/* + * Argh -- we have to get cheesy here because the kernel and userland + * have different signatures for the same routine. 
+ */ +extern int _mutex_init(mutex_t *mp, int type, void *arg); +extern int _mutex_destroy(mutex_t *mp); + +#define mutex_init(mp, b, c, d) zmutex_init((kmutex_t *)(mp)) +#define mutex_destroy(mp) zmutex_destroy((kmutex_t *)(mp)) + +extern void zmutex_init(kmutex_t *mp); +extern void zmutex_destroy(kmutex_t *mp); +extern void mutex_enter(kmutex_t *mp); +extern void mutex_exit(kmutex_t *mp); +extern int mutex_tryenter(kmutex_t *mp); +extern void *mutex_owner(kmutex_t *mp); + +/* + * RW locks + */ +typedef struct krwlock { + void *rw_owner; + boolean_t initialized; + rwlock_t rw_lock; +} krwlock_t; + +typedef int krw_t; + +#define RW_READER 0 +#define RW_WRITER 1 +#define RW_DEFAULT USYNC_THREAD + +#undef RW_READ_HELD +#define RW_READ_HELD(x) _rw_read_held(&(x)->rw_lock) + +#undef RW_WRITE_HELD +#define RW_WRITE_HELD(x) _rw_write_held(&(x)->rw_lock) + +extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); +extern void rw_destroy(krwlock_t *rwlp); +extern void rw_enter(krwlock_t *rwlp, krw_t rw); +extern int rw_tryenter(krwlock_t *rwlp, krw_t rw); +extern int rw_tryupgrade(krwlock_t *rwlp); +extern void rw_exit(krwlock_t *rwlp); +#define rw_downgrade(rwlp) do { } while (0) + +extern uid_t crgetuid(cred_t *cr); +extern gid_t crgetgid(cred_t *cr); +extern int crgetngroups(cred_t *cr); +extern gid_t *crgetgroups(cred_t *cr); + +/* + * Condition variables + */ +typedef cond_t kcondvar_t; + +#define CV_DEFAULT USYNC_THREAD + +extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); +extern void cv_destroy(kcondvar_t *cv); +extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); +extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); +extern void cv_signal(kcondvar_t *cv); +extern void cv_broadcast(kcondvar_t *cv); + +/* + * kstat creation, installation and deletion + */ +extern kstat_t *kstat_create(char *, int, + char *, char *, uchar_t, ulong_t, uchar_t); +extern void kstat_install(kstat_t *); +extern void 
kstat_delete(kstat_t *); + +/* + * Kernel memory + */ +#define KM_SLEEP UMEM_NOFAIL +#define KM_PUSHPAGE KM_SLEEP +#define KM_NOSLEEP UMEM_DEFAULT +#define KMC_NODEBUG UMC_NODEBUG +#define KMC_NOTOUCH 0 /* not needed for userland caches */ +#define kmem_alloc(_s, _f) umem_alloc(_s, _f) +#define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) +#define kmem_free(_b, _s) umem_free(_b, _s) +#define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \ + umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) +#define kmem_cache_destroy(_c) umem_cache_destroy(_c) +#define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) +#define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) +#define kmem_debugging() 0 +#define kmem_cache_reap_now(_c) /* nothing */ +#define kmem_cache_set_move(_c, _cb) /* nothing */ +#define POINTER_INVALIDATE(_pp) /* nothing */ +#define POINTER_IS_VALID(_p) 0 + +typedef umem_cache_t kmem_cache_t; + +typedef enum kmem_cbrc { + KMEM_CBRC_YES, + KMEM_CBRC_NO, + KMEM_CBRC_LATER, + KMEM_CBRC_DONT_NEED, + KMEM_CBRC_DONT_KNOW +} kmem_cbrc_t; + +/* + * Task queues + */ +typedef struct taskq taskq_t; +typedef uintptr_t taskqid_t; +typedef void (task_func_t)(void *); + +#define TASKQ_PREPOPULATE 0x0001 +#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ +#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */ +#define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */ + +#define TQ_SLEEP KM_SLEEP /* Can block for memory */ +#define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ +#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +#define TQ_FRONT 0x08 /* Queue in front */ + +extern taskq_t *system_taskq; + +extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +#define taskq_create_proc(a, b, c, d, e, p, f) \ + (taskq_create(a, b, c, d, e, f)) +#define taskq_create_sysdc(a, b, d, e, p, dc, f) \ + (taskq_create(a, b, maxclsyspri, d, e, f)) 
+extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait(taskq_t *); +extern int taskq_member(taskq_t *, void *); +extern void system_taskq_init(void); +extern void system_taskq_fini(void); + +#define XVA_MAPSIZE 3 +#define XVA_MAGIC 0x78766174 + +/* + * vnodes + */ +typedef struct vnode { + uint64_t v_size; + int v_fd; + char *v_path; +} vnode_t; + +#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ + +typedef struct xoptattr { + timestruc_t xoa_createtime; /* Create time of file */ + uint8_t xoa_archive; + uint8_t xoa_system; + uint8_t xoa_readonly; + uint8_t xoa_hidden; + uint8_t xoa_nounlink; + uint8_t xoa_immutable; + uint8_t xoa_appendonly; + uint8_t xoa_nodump; + uint8_t xoa_settable; + uint8_t xoa_opaque; + uint8_t xoa_av_quarantined; + uint8_t xoa_av_modified; + uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; + uint8_t xoa_reparse; + uint8_t xoa_offline; + uint8_t xoa_sparse; +} xoptattr_t; + +typedef struct vattr { + uint_t va_mask; /* bit-mask of attributes */ + u_offset_t va_size; /* file size in bytes */ +} vattr_t; + + +typedef struct xvattr { + vattr_t xva_vattr; /* Embedded vattr structure */ + uint32_t xva_magic; /* Magic Number */ + uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ + uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ + uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ + uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ + xoptattr_t xva_xoptattrs; /* Optional attributes */ +} xvattr_t; + +typedef struct vsecattr { + uint_t vsa_mask; /* See below */ + int vsa_aclcnt; /* ACL entry count */ + void *vsa_aclentp; /* pointer to ACL entries */ + int vsa_dfaclcnt; /* default ACL entry count */ + void *vsa_dfaclentp; /* pointer to default ACL entries */ + size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ +} vsecattr_t; + +#define AT_TYPE 0x00001 +#define AT_MODE 0x00002 +#define AT_UID 0x00004 +#define 
AT_GID 0x00008 +#define AT_FSID 0x00010 +#define AT_NODEID 0x00020 +#define AT_NLINK 0x00040 +#define AT_SIZE 0x00080 +#define AT_ATIME 0x00100 +#define AT_MTIME 0x00200 +#define AT_CTIME 0x00400 +#define AT_RDEV 0x00800 +#define AT_BLKSIZE 0x01000 +#define AT_NBLOCKS 0x02000 +#define AT_SEQ 0x08000 +#define AT_XVATTR 0x10000 + +#define CRCREAT 0 + +extern int fop_getattr(vnode_t *vp, vattr_t *vap); + +#define VOP_CLOSE(vp, f, c, o, cr, ct) 0 +#define VOP_PUTPAGE(vp, of, sz, fl, cr, ct) 0 +#define VOP_GETATTR(vp, vap, fl, cr, ct) fop_getattr((vp), (vap)) /* expression macro: no trailing ';' so it also works inside conditions and assignments */ + +#define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd) + +#define VN_RELE(vp) vn_close(vp) + +extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp, + int x2, int x3); +extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp, + int x2, int x3, vnode_t *vp, int fd); +extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, + offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp); +extern void vn_close(vnode_t *vp); + +#define vn_remove(path, x1, x2) remove(path) +#define vn_rename(from, to, seg) rename((from), (to)) +#define vn_is_readonly(vp) B_FALSE + +extern vnode_t *rootdir; + +#include /* for FREAD, FWRITE, etc */ + +/* + * Random stuff + */ +#define ddi_get_lbolt() (gethrtime() >> 23) +#define ddi_get_lbolt64() (gethrtime() >> 23) +#define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */ + +extern void delay(clock_t ticks); + +#define gethrestime_sec() time(NULL) +#define gethrestime(t) \ + do {\ + (t)->tv_sec = gethrestime_sec();\ + (t)->tv_nsec = 0;\ + } while (0) /* no trailing ';': the caller's own ';' ends the statement, which keeps an unbraced if/else intact; tv_nsec is always 0 here */ + +#define max_ncpus 64 + +#define minclsyspri 60 +#define maxclsyspri 99 + +#define CPU_SEQID (thr_self() & (max_ncpus - 1)) + +#define kcred NULL +#define CRED() NULL + +#define ptob(x) ((x) * PAGESIZE) + +extern uint64_t physmem; + +extern int highbit(ulong_t i); +extern int random_get_bytes(uint8_t *ptr, size_t len); +extern int random_get_pseudo_bytes(uint8_t
*ptr, size_t len); + +extern void kernel_init(int); +extern void kernel_fini(void); + +struct spa; +extern void nicenum(uint64_t num, char *buf); +extern void show_pool_stats(struct spa *); + +typedef struct callb_cpr { + kmutex_t *cc_lockp; +} callb_cpr_t; + +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + (cp)->cc_lockp = lockp; \ +} + +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ +} + +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ +} + +#define CALLB_CPR_EXIT(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ + mutex_exit((cp)->cc_lockp); \ +} + +#define zone_dataset_visible(x, y) (1) +#define INGLOBALZONE(z) (1) + +extern char *kmem_asprintf(const char *fmt, ...); +#define strfree(str) kmem_free((str), strlen(str)+1) + +/* + * Hostname information + */ +extern char hw_serial[]; /* for userland-emulated hostid access */ +extern int ddi_strtoul(const char *str, char **nptr, int base, + unsigned long *result); + +extern int ddi_strtoull(const char *str, char **nptr, int base, + u_longlong_t *result); + +/* ZFS Boot Related stuff. 
*/ + +struct _buf { + intptr_t _fd; +}; + +struct bootstat { + uint64_t st_size; +}; + +typedef struct ace_object { + uid_t a_who; + uint32_t a_access_mask; + uint16_t a_flags; + uint16_t a_type; + uint8_t a_obj_type[16]; + uint8_t a_inherit_obj_type[16]; +} ace_object_t; + + +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 + +extern struct _buf *kobj_open_file(char *name); +extern int kobj_read_file(struct _buf *file, char *buf, unsigned size, + unsigned off); +extern void kobj_close_file(struct _buf *file); +extern int kobj_get_filesize(struct _buf *file, uint64_t *size); +extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); +extern int zfs_secpolicy_rename_perms(const char *from, const char *to, + cred_t *cr); +extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); +extern zoneid_t getzoneid(void); + +/* SID stuff */ +typedef struct ksiddomain { + uint_t kd_ref; + uint_t kd_len; + char *kd_name; +} ksiddomain_t; + +ksiddomain_t *ksid_lookupdomain(const char *); +void ksiddomain_rele(ksiddomain_t *); + +#define DDI_SLEEP KM_SLEEP +#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \ + sysevent_post_event(_c, _d, _b, "libzpool", _e, _f) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/lib/libzpool/common/taskq.c b/lib/libzpool/common/taskq.c new file mode 100644 index 0000000..8db5d11 --- /dev/null +++ b/lib/libzpool/common/taskq.c @@ -0,0 +1,303 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include + +int taskq_now; +taskq_t *system_taskq; + +typedef struct task { + struct task *task_next; + struct task *task_prev; + task_func_t *task_func; + void *task_arg; +} task_t; + +#define TASKQ_ACTIVE 0x00010000 + +struct taskq { + kmutex_t tq_lock; + krwlock_t tq_threadlock; + kcondvar_t tq_dispatch_cv; + kcondvar_t tq_wait_cv; + thread_t *tq_threadlist; + int tq_flags; + int tq_active; + int tq_nthreads; + int tq_nalloc; + int tq_minalloc; + int tq_maxalloc; + kcondvar_t tq_maxalloc_cv; + int tq_maxalloc_wait; + task_t *tq_freelist; + task_t tq_task; +}; + +static task_t * +task_alloc(taskq_t *tq, int tqflags) +{ + task_t *t; + int rv; + +again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { + tq->tq_freelist = t->task_next; + } else { + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (!(tqflags & KM_SLEEP)) + return (NULL); + + /* + * We don't want to exceed tq_maxalloc, but we can't + * wait for other tasks to complete (and thus free up + * task structures) without risking deadlock with + * the caller. So, we just delay for one second + * to throttle the allocation rate. If we have tasks + * complete before one second timeout expires then + * taskq_ent_free will signal us and we will + * immediately retry the allocation. 
+ */ + tq->tq_maxalloc_wait++; + rv = cv_timedwait(&tq->tq_maxalloc_cv, + &tq->tq_lock, ddi_get_lbolt() + hz); + tq->tq_maxalloc_wait--; + if (rv > 0) + goto again; /* signaled */ + } + mutex_exit(&tq->tq_lock); + + t = kmem_alloc(sizeof (task_t), tqflags); + + mutex_enter(&tq->tq_lock); + if (t != NULL) + tq->tq_nalloc++; + } + return (t); +} + +static void +task_free(taskq_t *tq, task_t *t) +{ + if (tq->tq_nalloc <= tq->tq_minalloc) { + t->task_next = tq->tq_freelist; + tq->tq_freelist = t; + } else { + tq->tq_nalloc--; + mutex_exit(&tq->tq_lock); + kmem_free(t, sizeof (task_t)); + mutex_enter(&tq->tq_lock); + } + + if (tq->tq_maxalloc_wait) + cv_signal(&tq->tq_maxalloc_cv); +} + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) +{ + task_t *t; + + if (taskq_now) { + func(arg); + return (1); + } + + mutex_enter(&tq->tq_lock); + ASSERT(tq->tq_flags & TASKQ_ACTIVE); + if ((t = task_alloc(tq, tqflags)) == NULL) { + mutex_exit(&tq->tq_lock); + return (0); + } + if (tqflags & TQ_FRONT) { + t->task_next = tq->tq_task.task_next; + t->task_prev = &tq->tq_task; + } else { + t->task_next = &tq->tq_task; + t->task_prev = tq->tq_task.task_prev; + } + t->task_next->task_prev = t; + t->task_prev->task_next = t; + t->task_func = func; + t->task_arg = arg; + cv_signal(&tq->tq_dispatch_cv); + mutex_exit(&tq->tq_lock); + return (1); +} + +void +taskq_wait(taskq_t *tq) +{ + mutex_enter(&tq->tq_lock); + while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + mutex_exit(&tq->tq_lock); +} + +static void * +taskq_thread(void *arg) +{ + taskq_t *tq = arg; + task_t *t; + + mutex_enter(&tq->tq_lock); + while (tq->tq_flags & TASKQ_ACTIVE) { + if ((t = tq->tq_task.task_next) == &tq->tq_task) { + if (--tq->tq_active == 0) + cv_broadcast(&tq->tq_wait_cv); + cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); + tq->tq_active++; + continue; + } + t->task_prev->task_next = t->task_next; + t->task_next->task_prev = 
t->task_prev; + mutex_exit(&tq->tq_lock); + + rw_enter(&tq->tq_threadlock, RW_READER); + t->task_func(t->task_arg); + rw_exit(&tq->tq_threadlock); + + mutex_enter(&tq->tq_lock); + task_free(tq, t); + } + tq->tq_nthreads--; + cv_broadcast(&tq->tq_wait_cv); + mutex_exit(&tq->tq_lock); + return (NULL); +} + +/*ARGSUSED*/ +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); + int t; + + if (flags & TASKQ_THREADS_CPU_PCT) { + int pct; + ASSERT3S(nthreads, >=, 0); + ASSERT3S(nthreads, <=, 100); + pct = MIN(nthreads, 100); + pct = MAX(pct, 0); + + nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; + nthreads = MAX(nthreads, 1); /* need at least 1 thread */ + } else { + ASSERT3S(nthreads, >=, 1); + } + + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); + mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); + tq->tq_flags = flags | TASKQ_ACTIVE; + tq->tq_active = nthreads; + tq->tq_nthreads = nthreads; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_task.task_next = &tq->tq_task; + tq->tq_task.task_prev = &tq->tq_task; + tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP); + + if (flags & TASKQ_PREPOPULATE) { + mutex_enter(&tq->tq_lock); + while (minalloc-- > 0) + task_free(tq, task_alloc(tq, KM_SLEEP)); + mutex_exit(&tq->tq_lock); + } + + for (t = 0; t < nthreads; t++) + (void) thr_create(0, 0, taskq_thread, + tq, THR_BOUND, &tq->tq_threadlist[t]); + + return (tq); +} + +void +taskq_destroy(taskq_t *tq) +{ + int t; + int nthreads = tq->tq_nthreads; + + taskq_wait(tq); + + mutex_enter(&tq->tq_lock); + + tq->tq_flags &= ~TASKQ_ACTIVE; + cv_broadcast(&tq->tq_dispatch_cv); + + while (tq->tq_nthreads != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + + 
tq->tq_minalloc = 0; + while (tq->tq_nalloc != 0) { + ASSERT(tq->tq_freelist != NULL); + task_free(tq, task_alloc(tq, KM_SLEEP)); + } + + mutex_exit(&tq->tq_lock); + + for (t = 0; t < nthreads; t++) + (void) thr_join(tq->tq_threadlist[t], NULL, NULL); + + kmem_free(tq->tq_threadlist, nthreads * sizeof (thread_t)); + + rw_destroy(&tq->tq_threadlock); + mutex_destroy(&tq->tq_lock); + cv_destroy(&tq->tq_dispatch_cv); + cv_destroy(&tq->tq_wait_cv); + cv_destroy(&tq->tq_maxalloc_cv); + + kmem_free(tq, sizeof (taskq_t)); +} + +int +taskq_member(taskq_t *tq, void *t) +{ + int i; + + if (taskq_now) + return (1); + + for (i = 0; i < tq->tq_nthreads; i++) + if (tq->tq_threadlist[i] == (thread_t)(uintptr_t)t) + return (1); + + return (0); +} + +void +system_taskq_init(void) +{ + system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512, + TASKQ_DYNAMIC | TASKQ_PREPOPULATE); +} + +void +system_taskq_fini(void) +{ + taskq_destroy(system_taskq); + system_taskq = NULL; /* defensive */ +} diff --git a/lib/libzpool/common/util.c b/lib/libzpool/common/util.c new file mode 100644 index 0000000..9b99531 --- /dev/null +++ b/lib/libzpool/common/util.c @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Routines needed by more than one client of libzpool. + */ + +void +nicenum(uint64_t num, char *buf) +{ + uint64_t n = num; + int index = 0; + char u; + + while (n >= 1024) { + n = (n + (1024 / 2)) / 1024; /* Round up or down */ + index++; + } + + u = " KMGTPE"[index]; + + if (index == 0) { + (void) sprintf(buf, "%llu", (u_longlong_t)n); + } else if (n < 10 && (num & (num - 1)) != 0) { + (void) sprintf(buf, "%.2f%c", + (double)num / (1ULL << 10 * index), u); + } else if (n < 100 && (num & (num - 1)) != 0) { + (void) sprintf(buf, "%.1f%c", + (double)num / (1ULL << 10 * index), u); + } else { + (void) sprintf(buf, "%llu%c", (u_longlong_t)n, u); + } +} + +static void +show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) +{ + vdev_stat_t *vs; + vdev_stat_t v0 = { 0 }; + uint64_t sec; + uint64_t is_log = 0; + nvlist_t **child; + uint_t c, children; + char used[6], avail[6]; + char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6]; + char *prefix = ""; + + if (indent == 0 && desc != NULL) { + (void) printf(" " + " capacity operations bandwidth ---- errors ----\n"); + (void) printf("description " + "used avail read write read write read write cksum\n"); + } + + if (desc != NULL) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); + + if (is_log) + prefix = "log "; + + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) != 0) + vs = &v0; + + sec = MAX(1, vs->vs_timestamp / NANOSEC); + + nicenum(vs->vs_alloc, used); + nicenum(vs->vs_space - vs->vs_alloc, avail); + 
nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops); + nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops); + nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes); + nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes); + nicenum(vs->vs_read_errors, rerr); + nicenum(vs->vs_write_errors, werr); + nicenum(vs->vs_checksum_errors, cerr); + + (void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n", + indent, "", + prefix, + indent + strlen(prefix) - 25 - (vs->vs_space ? 0 : 12), + desc, + vs->vs_space ? 6 : 0, vs->vs_space ? used : "", + vs->vs_space ? 6 : 0, vs->vs_space ? avail : "", + rops, wops, rbytes, wbytes, rerr, werr, cerr); + } + + if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + nvlist_t *cnv = child[c]; + char *cname, *tname; + uint64_t np; + if (nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &cname) && + nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &cname)) + cname = ""; + tname = calloc(1, strlen(cname) + 2); + (void) strcpy(tname, cname); + if (nvlist_lookup_uint64(cnv, ZPOOL_CONFIG_NPARITY, &np) == 0) + tname[strlen(tname)] = '0' + np; + show_vdev_stats(tname, ctype, cnv, indent + 2); + free(tname); + } +} + +void +show_pool_stats(spa_t *spa) +{ + nvlist_t *config, *nvroot; + char *name; + + VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0); + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + + show_vdev_stats(name, ZPOOL_CONFIG_CHILDREN, nvroot, 0); + show_vdev_stats(NULL, ZPOOL_CONFIG_L2CACHE, nvroot, 0); + show_vdev_stats(NULL, ZPOOL_CONFIG_SPARES, nvroot, 0); + + nvlist_free(config); +} diff --git a/lib/pyzfs/common/__init__.py b/lib/pyzfs/common/__init__.py new file mode 100644 index 0000000..76b0998 --- /dev/null +++ b/lib/pyzfs/common/__init__.py @@ -0,0 +1,27 @@ +#! 
/usr/bin/python2.6 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. +# + +""" +package which provides an administrative interface to ZFS +""" diff --git a/lib/pyzfs/common/allow.py b/lib/pyzfs/common/allow.py new file mode 100644 index 0000000..fa8209f --- /dev/null +++ b/lib/pyzfs/common/allow.py @@ -0,0 +1,396 @@ +#! /usr/bin/python2.6 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. +# + +"""This module implements the "zfs allow" and "zfs unallow" subcommands. +The only public interface is the zfs.allow.do_allow() function.""" + +import zfs.util +import zfs.dataset +import optparse +import sys +import pwd +import grp +import errno + +_ = zfs.util._ + +class FSPerms(object): + """This class represents all the permissions that are set on a + particular filesystem (not including those inherited).""" + + __slots__ = "create", "sets", "local", "descend", "ld" + __repr__ = zfs.util.default_repr + + def __init__(self, raw): + """Create a FSPerms based on the dict of raw permissions + from zfs.ioctl.get_fsacl().""" + # set of perms + self.create = set() + + # below are { "Ntype name": set(perms) } + # where N is a number that we just use for sorting, + # type is "user", "group", "everyone", or "" (for sets) + # name is a user, group, or set name, or "" (for everyone) + self.sets = dict() + self.local = dict() + self.descend = dict() + self.ld = dict() + + # see the comment in dsl_deleg.c for the definition of whokey + for whokey in raw.keys(): + perms = raw[whokey].keys() + whotypechr = whokey[0].lower() + ws = whokey[3:] + if whotypechr == "c": + self.create.update(perms) + elif whotypechr == "s": + nwho = "1" + ws + self.sets.setdefault(nwho, set()).update(perms) + else: + if whotypechr == "u": + try: + name = pwd.getpwuid(int(ws)).pw_name + except KeyError: + name = ws + nwho = "1user " + name + elif whotypechr == "g": + try: + name = grp.getgrgid(int(ws)).gr_name + except KeyError: + name = ws + nwho = "2group " + name + elif whotypechr == "e": + nwho = "3everyone" + else: + raise ValueError(whotypechr) + + if whokey[1] == "l": + d = 
self.local + elif whokey[1] == "d": + d = self.descend + else: + raise ValueError(whokey[1]) + + d.setdefault(nwho, set()).update(perms) + + # Find perms that are in both local and descend, and + # move them to ld. + for nwho in self.local: + if nwho not in self.descend: + continue + # note: these are set operations + self.ld[nwho] = self.local[nwho] & self.descend[nwho] + self.local[nwho] -= self.ld[nwho] + self.descend[nwho] -= self.ld[nwho] + + @staticmethod + def __ldstr(d, header): + s = "" + for (nwho, perms) in sorted(d.items()): + # local and descend may have entries where perms + # is an empty set, due to consolidating all + # permissions into ld + if perms: + s += "\t%s %s\n" % \ + (nwho[1:], ",".join(sorted(perms))) + if s: + s = header + s + return s + + def __str__(self): + s = self.__ldstr(self.sets, _("Permission sets:\n")) + + if self.create: + s += _("Create time permissions:\n") + s += "\t%s\n" % ",".join(sorted(self.create)) + + s += self.__ldstr(self.local, _("Local permissions:\n")) + s += self.__ldstr(self.descend, _("Descendent permissions:\n")) + s += self.__ldstr(self.ld, _("Local+Descendent permissions:\n")) + return s.rstrip() + +def args_to_perms(parser, options, who, perms): + """Return a dict of raw perms {"whostr" -> {"perm" -> None}} + based on the command-line input.""" + + # perms is not set if we are doing a "zfs unallow " to + # remove all of someone's permissions + if perms: + setperms = dict(((p, None) for p in perms if p[0] == "@")) + baseperms = dict(((canonicalized_perm(p), None) + for p in perms if p[0] != "@")) + else: + setperms = None + baseperms = None + + d = dict() + + def storeperm(typechr, inheritchr, arg): + assert typechr in "ugecs" + assert inheritchr in "ld-" + + def mkwhokey(t): + return "%c%c$%s" % (t, inheritchr, arg) + + if baseperms or not perms: + d[mkwhokey(typechr)] = baseperms + if setperms or not perms: + d[mkwhokey(typechr.upper())] = setperms + + def decodeid(w, toidfunc, fmt): + try: + return int(w) 
+ except ValueError: + try: + return toidfunc(w)[2] + except KeyError: + parser.error(fmt % w) + + if options.set: + storeperm("s", "-", who) + elif options.create: + storeperm("c", "-", "") + else: + for w in who: + if options.user: + id = decodeid(w, pwd.getpwnam, + _("invalid user %s")) + typechr = "u" + elif options.group: + id = decodeid(w, grp.getgrnam, + _("invalid group %s")) + typechr = "g" + elif w == "everyone": + id = "" + typechr = "e" + else: + try: + id = pwd.getpwnam(w)[2] + typechr = "u" + except KeyError: + try: + id = grp.getgrnam(w)[2] + typechr = "g" + except KeyError: + parser.error(_("invalid user/group %s") % w) + if options.local: + storeperm(typechr, "l", id) + if options.descend: + storeperm(typechr, "d", id) + return d + +perms_subcmd = dict( + create=_("Must also have the 'mount' ability"), + destroy=_("Must also have the 'mount' ability"), + snapshot="", + rollback="", + clone=_("""Must also have the 'create' ability and 'mount' +\t\t\t\tability in the origin file system"""), + promote=_("""Must also have the 'mount' +\t\t\t\tand 'promote' ability in the origin file system"""), + rename=_("""Must also have the 'mount' and 'create' +\t\t\t\tability in the new parent"""), + receive=_("Must also have the 'mount' and 'create' ability"), + allow=_("Must also have the permission that is being\n\t\t\t\tallowed"), + mount=_("Allows mount/umount of ZFS datasets"), + share=_("Allows sharing file systems over NFS or SMB\n\t\t\t\tprotocols"), + send="", + hold=_("Allows adding a user hold to a snapshot"), + release=_("Allows releasing a user hold which\n\t\t\t\tmight destroy the snapshot"), + diff=_("Allows lookup of paths within a dataset,\n\t\t\t\tgiven an object number. Ordinary users need this\n\t\t\t\tin order to use zfs diff"), +) + +perms_other = dict( + userprop=_("Allows changing any user property"), + userquota=_("Allows accessing any userquota@... property"), + groupquota=_("Allows accessing any groupquota@... 
property"), + userused=_("Allows reading any userused@... property"), + groupused=_("Allows reading any groupused@... property"), +) + +def hasset(ds, setname): + """Return True if the given setname (string) is defined for this + ds (Dataset).""" + # It would be nice to cache the result of get_fsacl(). + for raw in ds.get_fsacl().values(): + for whokey in raw.keys(): + if whokey[0].lower() == "s" and whokey[3:] == setname: + return True + return False + +def canonicalized_perm(permname): + """Return the canonical name (string) for this permission (string). + Raises ZFSError if it is not a valid permission.""" + if permname in perms_subcmd.keys() or permname in perms_other.keys(): + return permname + try: + return zfs.dataset.getpropobj(permname).name + except KeyError: + raise zfs.util.ZFSError(errno.EINVAL, permname, + _("invalid permission")) + +def print_perms(): + """Print the set of supported permissions.""" + print(_("\nThe following permissions are supported:\n")) + fmt = "%-16s %-14s\t%s" + print(fmt % (_("NAME"), _("TYPE"), _("NOTES"))) + + for (name, note) in sorted(perms_subcmd.iteritems()): + print(fmt % (name, _("subcommand"), note)) + + for (name, note) in sorted(perms_other.iteritems()): + print(fmt % (name, _("other"), note)) + + for (name, prop) in sorted(zfs.dataset.proptable.iteritems()): + if prop.visible and prop.delegatable(): + print(fmt % (name, _("property"), "")) + +def do_allow(): + """Implements the "zfs allow" and "zfs unallow" subcommands.""" + un = (sys.argv[1] == "unallow") + + def usage(msg=None): + parser.print_help() + print_perms() + if msg: + print + parser.exit("zfs: error: " + msg) + else: + parser.exit() + + if un: + u = _("""unallow [-rldug] <"everyone"|user|group>[,...] + [[,...]] + unallow [-rld] -e [[,...]] + unallow [-r] -c [[,...]] + unallow [-r] -s @setname [[,...]] """) + verb = _("remove") + sstr = _("undefine permission set") + else: + u = _("""allow + allow [-ldug] <"everyone"|user|group>[,...] [,...] 
+ + allow [-ld] -e [,...] + allow -c [,...] + allow -s @setname [,...] """) + verb = _("set") + sstr = _("define permission set") + + parser = optparse.OptionParser(usage=u, prog="zfs") + + parser.add_option("-l", action="store_true", dest="local", + help=_("%s permission locally") % verb) + parser.add_option("-d", action="store_true", dest="descend", + help=_("%s permission for descendents") % verb) + parser.add_option("-u", action="store_true", dest="user", + help=_("%s permission for user") % verb) + parser.add_option("-g", action="store_true", dest="group", + help=_("%s permission for group") % verb) + parser.add_option("-e", action="store_true", dest="everyone", + help=_("%s permission for everyone") % verb) + parser.add_option("-c", action="store_true", dest="create", + help=_("%s create time permissions") % verb) + parser.add_option("-s", action="store_true", dest="set", help=sstr) + if un: + parser.add_option("-r", action="store_true", dest="recursive", + help=_("remove permissions recursively")) + + if len(sys.argv) == 3 and not un: + # just print the permissions on this fs + + if sys.argv[2] == "-h": + # hack to make "zfs allow -h" work + usage() + ds = zfs.dataset.Dataset(sys.argv[2], snaps=False) + + p = dict() + for (fs, raw) in ds.get_fsacl().items(): + p[fs] = FSPerms(raw) + + for fs in sorted(p.keys(), reverse=True): + s = _("---- Permissions on %s ") % fs + print(s + "-" * (70-len(s))) + print(p[fs]) + return + + + (options, args) = parser.parse_args(sys.argv[2:]) + + if sum((bool(options.everyone), bool(options.user), + bool(options.group))) > 1: + parser.error(_("-u, -g, and -e are mutually exclusive")) + + def mungeargs(expected_len): + if un and len(args) == expected_len-1: + return (None, args[expected_len-2]) + elif len(args) == expected_len: + return (args[expected_len-2].split(","), + args[expected_len-1]) + else: + usage(_("wrong number of parameters")) + + if options.set: + if options.local or options.descend or options.user or \ + 
options.group or options.everyone or options.create: + parser.error(_("invalid option combined with -s")) + if args[0][0] != "@": + parser.error(_("invalid set name: missing '@' prefix")) + + (perms, fsname) = mungeargs(3) + who = args[0] + elif options.create: + if options.local or options.descend or options.user or \ + options.group or options.everyone or options.set: + parser.error(_("invalid option combined with -c")) + + (perms, fsname) = mungeargs(2) + who = None + elif options.everyone: + if options.user or options.group or \ + options.create or options.set: + parser.error(_("invalid option combined with -e")) + + (perms, fsname) = mungeargs(2) + who = ["everyone"] + else: + (perms, fsname) = mungeargs(3) + who = args[0].split(",") + + if not options.local and not options.descend: + options.local = True + options.descend = True + + d = args_to_perms(parser, options, who, perms) + + ds = zfs.dataset.Dataset(fsname, snaps=False) + + if not un and perms: + for p in perms: + if p[0] == "@" and not hasset(ds, p): + parser.error(_("set %s is not defined") % p) + + ds.set_fsacl(un, d) + if un and options.recursive: + for child in ds.descendents(): + child.set_fsacl(un, d) diff --git a/lib/pyzfs/common/dataset.py b/lib/pyzfs/common/dataset.py new file mode 100644 index 0000000..26192e4 --- /dev/null +++ b/lib/pyzfs/common/dataset.py @@ -0,0 +1,234 @@ +#! /usr/bin/python2.6 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+"""Implements the Dataset class, providing methods for manipulating ZFS
+datasets. Also implements the Property class, which describes ZFS
+properties."""
+
+import zfs.ioctl
+import zfs.util
+import errno
+
+_ = zfs.util._
+
+class Property(object):
+    """This class represents a ZFS property. It contains
+    information about the property -- if it's readonly, a number vs
+    string vs index, etc. Only native properties are represented by
+    this class -- not user properties (eg "user:prop") or userspace
+    properties (eg "userquota@joe")."""
+
+    __slots__ = "name", "number", "type", "default", "attr", "validtypes", \
+        "values", "colname", "rightalign", "visible", "indextable"
+    __repr__ = zfs.util.default_repr
+
+    def __init__(self, t):
+        """t is the tuple of information about this property
+        from zfs.ioctl.get_proptable, which should match the
+        members of zprop_desc_t (see zfs_prop.h)."""
+
+        self.name = t[0]
+        self.number = t[1]
+        self.type = t[2]
+        # The tuple carries two candidate defaults; t[3] is kept for
+        # "string" properties and t[4] for every other type.
+        if self.type == "string":
+            self.default = t[3]
+        else:
+            self.default = t[4]
+        self.attr = t[5]
+        self.validtypes = t[6]
+        self.values = t[7]
+        self.colname = t[8]
+        self.rightalign = t[9]
+        self.visible = t[10]
+        self.indextable = t[11]
+
+    def delegatable(self):
+        """Return True if this property can be delegated with
+        "zfs allow"."""
+        # Every attribute value other than "readonly" counts as delegatable.
+        return self.attr != "readonly"
+
+# Module-level map of property name -> Property, built once at import time
+# from the table returned by zfs.ioctl.get_proptable().
+proptable = dict()
+for name, t in zfs.ioctl.get_proptable().iteritems():
+    proptable[name] = Property(t)
+# Drop the loop variables so they do not linger as module attributes.
+del name, t
+
+def getpropobj(name):
+    """Return the Property object that is identified by the given
+    name string. It can be the full name, or the column name.
+
+    Raises KeyError if neither lookup finds a match."""
+    try:
+        return proptable[name]
+    except KeyError:
+        # Fall back to a linear scan by column name.  Only the column
+        # name is lower-cased; name is compared exactly as passed in.
+        for p in proptable.itervalues():
+            if p.colname and p.colname.lower() == name:
+                return p
+        # No match: re-raise the original KeyError.
+        raise
+
+class Dataset(object):
+    """Represents a ZFS dataset (filesystem, snapshot, zvol, clone, etc).
+
+    Generally, this class provides interfaces to the C functions in
+    zfs.ioctl which actually interface with the kernel to manipulate
+    datasets.
+
+    Unless otherwise noted, any method can raise a ZFSError to
+    indicate failure."""
+
+    __slots__ = "name", "__props"
+    __repr__ = zfs.util.default_repr
+
+    def __init__(self, name, props=None,
+        types=("filesystem", "volume"), snaps=True):
+        """Open the named dataset, checking that it exists and
+        is of the specified type.
+
+        name is the string name of this dataset.
+
+        props is the property settings dict from zfs.ioctl.next_dataset.
+
+        types is an iterable of strings specifying which types
+        of datasets are permitted. Accepted strings are
+        "filesystem" and "volume". Defaults to accepting all
+        types.
+
+        snaps is a boolean specifying if snapshots are acceptable.
+
+        Raises a ZFSError if the dataset can't be accessed (eg
+        doesn't exist) or is not of the specified type.
+        """
+
+        self.name = name
+
+        # Built up front because all three rejection paths below raise
+        # this same error.
+        e = zfs.util.ZFSError(errno.EINVAL,
+            _("cannot open %s") % name,
+            _("operation not applicable to datasets of this type"))
+        if "@" in name and not snaps:
+            raise e
+        # A missing or empty props dict triggers a lookup through the ioctl
+        # layer.
+        if not props:
+            props = zfs.ioctl.dataset_props(name)
+        self.__props = props
+        # NOTE(review): 3 and 2 are presumably the numeric "type" property
+        # values for volumes and filesystems -- confirm against the
+        # kernel's objset type enum.
+        if "volume" not in types and self.getprop("type") == 3:
+            raise e
+        if "filesystem" not in types and self.getprop("type") == 2:
+            raise e
+
+    def getprop(self, propname):
+        """Return the value of the given property for this dataset.
+
+        Currently only works for native properties (those with a
+        Property object.)
+
+        Raises KeyError if propname does not specify a native property.
+        Does not raise ZFSError.
+        """
+
+        p = getpropobj(propname)
+        try:
+            return self.__props[p.name]["value"]
+        except KeyError:
+            # Property not present in this dataset's settings: fall back
+            # to the table default.
+            return p.default
+
+    def parent(self):
+        """Return a Dataset representing the parent of this one."""
+        # rindex() raises ValueError when the name contains no "/".
+        return Dataset(self.name[:self.name.rindex("/")])
+
+    def descendents(self):
+        """A generator function which iterates over all
+        descendent Datasets (not including snapshots)."""
+
+        cookie = 0
+        while True:
+            # next_dataset raises StopIteration when done
+            (name, cookie, props) = \
+                zfs.ioctl.next_dataset(self.name, False, cookie)
+            ds = Dataset(name, props)
+            yield ds
+            # Depth-first: emit the whole subtree of each child before
+            # moving on to the next sibling.
+            for child in ds.descendents():
+                yield child
+
+    def userspace(self, prop):
+        """A generator function which iterates over a
+        userspace-type property.
+
+        prop specifies which property ("userused@",
+        "userquota@", "groupused@", or "groupquota@").
+
+        returns 3-tuple of domain (string), rid (int), and space (int).
+        """
+
+        d = zfs.ioctl.userspace_many(self.name, prop)
+        # Flatten each ((domain, rid), space) entry into one 3-tuple.
+        for ((domain, rid), space) in d.iteritems():
+            yield (domain, rid, space)
+
+    def userspace_upgrade(self):
+        """Initialize the accounting information for
+        userused@... and groupused@... properties."""
+        return zfs.ioctl.userspace_upgrade(self.name)
+
+    def set_fsacl(self, un, d):
+        """Add to the "zfs allow"-ed permissions on this Dataset.
+
+        un is True if the specified permissions should be removed.
+
+        d is a dict specifying which permissions to add/remove:
+        { "whostr" -> None # remove all perms for this entity
+          "whostr" -> { "perm" -> None} # add/remove these perms
+        } """
+        return zfs.ioctl.set_fsacl(self.name, un, d)
+
+    def get_fsacl(self):
+        """Get the "zfs allow"-ed permissions on the Dataset.
+
+        Return a dict("whostr": { "perm" -> None })."""
+
+        return zfs.ioctl.get_fsacl(self.name)
+
+    def get_holds(self):
+        """Get the user holds on this Dataset.
+
+        Return a dict("tag": timestamp)."""
+
+        return zfs.ioctl.get_holds(self.name)
+
+def snapshots_fromcmdline(dsnames, recursive):
+    """A generator function which yields a Dataset for each snapshot
+    named in dsnames.
+
+    Every name must contain "@"; otherwise a ZFSError (EINVAL) is raised.
+
+    If recursive is true, the same-named snapshot of every descendent
+    of the base dataset is yielded as well, and snapshots that do not
+    exist (ENOENT) are skipped instead of raising."""
+    for dsname in dsnames:
+        if not "@" in dsname:
+            raise zfs.util.ZFSError(errno.EINVAL,
+                _("cannot open %s") % dsname,
+                _("operation only applies to snapshots"))
+        try:
+            ds = Dataset(dsname)
+            yield ds
+        except zfs.util.ZFSError, e:
+            # In recursive mode a missing top-level snapshot is tolerated,
+            # since descendents may still have one with this name.
+            if not recursive or e.errno != errno.ENOENT:
+                raise
+        if recursive:
+            (base, snapname) = dsname.split('@')
+            parent = Dataset(base)
+            for child in parent.descendents():
+                try:
+                    yield Dataset(child.name + "@" +
+                        snapname)
+                except zfs.util.ZFSError, e:
+                    # Descendents without this snapshot are skipped.
+                    if e.errno != errno.ENOENT:
+                        raise
diff --git a/lib/pyzfs/common/groupspace.py b/lib/pyzfs/common/groupspace.py
new file mode 100644
index 0000000..9f380fd
--- /dev/null
+++ b/lib/pyzfs/common/groupspace.py
@@ -0,0 +1,28 @@
+#! /usr/bin/python2.6
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+import zfs.userspace
+
+# "zfs groupspace" is served by the same entry point as "zfs userspace".
+do_groupspace = zfs.userspace.do_userspace
+
diff --git a/lib/pyzfs/common/holds.py b/lib/pyzfs/common/holds.py
new file mode 100644
index 0000000..800e28f
--- /dev/null
+++ b/lib/pyzfs/common/holds.py
@@ -0,0 +1,75 @@
+#! /usr/bin/python2.6
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+"""This module implements the "zfs holds" subcommand.
+The only public interface is the zfs.holds.do_holds() function."""
+
+import optparse
+import sys
+import errno
+import time
+import zfs.util
+import zfs.dataset
+import zfs.table
+
+_ = zfs.util._
+
+def do_holds():
+    """Implements the "zfs holds" subcommand.
+
+    Parses sys.argv[2:], then prints a table with one row per hold
+    (name, tag, timestamp) for every snapshot named on the command line.
+    Raises ZFSError (ENOENT) if no snapshot was matched at all."""
+    def usage(msg=None):
+        # Print the option help, then exit; with a message, a blank line
+        # and the error text follow the help.
+        parser.print_help()
+        if msg:
+            print
+            # NOTE(review): the message is passed as the first positional
+            # argument of OptionParser.exit() -- confirm that this is the
+            # intended parameter.
+            parser.exit("zfs: error: " + msg)
+        else:
+            parser.exit()
+
+    u = _("""holds [-r] ...""")
+
+    parser = optparse.OptionParser(usage=u, prog="zfs")
+
+    parser.add_option("-r", action="store_true", dest="recursive",
+        help=_("list holds recursively"))
+
+    # argv[0] is the program and argv[1] the subcommand; parse the rest.
+    (options, args) = parser.parse_args(sys.argv[2:])
+
+    if len(args) < 1:
+        usage(_("missing snapshot argument"))
+
+    fields = ("name", "tag", "timestamp")
+    rjustfields = ()
+    # printing: at least one hold was found, so there is a table to show.
+    # gotone: at least one snapshot was matched, with or without holds.
+    printing = False
+    gotone = False
+    t = zfs.table.Table(fields, rjustfields)
+    for ds in zfs.dataset.snapshots_fromcmdline(args, options.recursive):
+        gotone = True
+        for tag, tm in ds.get_holds().iteritems():
+            val = {"name": ds.name, "tag": tag,
+                "timestamp": time.ctime(tm)}
+            # Rows are keyed, and later sorted, by snapshot name.
+            t.addline(ds.name, val)
+            printing = True
+    if printing:
+        t.printme()
+    elif not gotone:
+        # Snapshots that exist but carry no holds produce no output and
+        # no error; only a complete miss is reported.
+        raise zfs.util.ZFSError(errno.ENOENT, _("no matching datasets"))
diff --git a/lib/pyzfs/common/ioctl.c b/lib/pyzfs/common/ioctl.c
new file mode 100644
index 0000000..d8c0d18
--- /dev/null
+++ b/lib/pyzfs/common/ioctl.c
@@ -0,0 +1,543 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * NOTE(review): the header names of the eight angle-bracket includes were
+ * stripped from this copy of the patch.  The names below are reconstructed
+ * from the identifiers this file uses and must be confirmed against the
+ * upstream source.
+ */
+#include <Python.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <strings.h>
+#include <unistd.h>
+#include <libnvpair.h>
+#include <libintl.h>
+#include <libzfs.h>
+#include "zfs_prop.h"
+
+static PyObject *ZFSError;
+static int zfsdevfd;
+
+#ifdef __lint
+#define	dgettext(x, y) y
+#endif
+
+#define	_(s) dgettext(TEXT_DOMAIN, s)
+
+/*
+ * Raise ZFSError with the argument tuple (errno, formatted message).
+ * errno is sampled on entry, so call this directly after the failing call.
+ */
+/*PRINTFLIKE1*/
+static void
+seterr(char *fmt, ...)
+{
+	char errstr[1024];
+	va_list v;
+	int err = errno;	/* capture before formatting can disturb it */
+	PyObject *val;
+
+	va_start(v, fmt);
+	(void) vsnprintf(errstr, sizeof (errstr), fmt, v);
+	va_end(v);
+
+	val = Py_BuildValue("is", err, errstr);
+	if (val == NULL)
+		return;		/* Py_BuildValue already set an exception */
+	/* PyErr_SetObject takes its own reference; drop ours. */
+	PyErr_SetObject(ZFSError, val);
+	Py_DECREF(val);
+}
+
+/* Command line to record in the pool history; set by py_set_cmdstr(). */
+static char cmdstr[HIS_MAX_RECORD_LEN];
+
+/*
+ * Issue ioctl `ioc`, attaching the pending history string (if any) to the
+ * command.  The string is consumed: it is cleared whether or not the ioctl
+ * succeeds.  Returns the ioctl's return value.
+ */
+static int
+ioctl_with_cmdstr(int ioc, zfs_cmd_t *zc)
+{
+	int err;
+
+	if (cmdstr[0])
+		zc->zc_history = (uint64_t)(uintptr_t)cmdstr;
+	err = ioctl(zfsdevfd, ioc, zc);
+	cmdstr[0] = '\0';
+	return (err);
+}
+
+/*
+ * Convert an nvlist into a new Python dict.  Strings, uint64s, nested
+ * nvlists, valueless booleans (mapped to None) and boolean values are
+ * supported; any other pair type raises ValueError.
+ *
+ * Returns a new reference, or NULL with a Python exception set.
+ */
+static PyObject *
+nvl2py(nvlist_t *nvl)
+{
+	PyObject *pyo;
+	nvpair_t *nvp;
+
+	pyo = PyDict_New();
+	if (pyo == NULL)
+		return (NULL);
+
+	for (nvp = nvlist_next_nvpair(nvl, NULL); nvp;
+	    nvp = nvlist_next_nvpair(nvl, nvp)) {
+		PyObject *pyval;
+		char *sval;
+		uint64_t ival;
+		boolean_t bval;
+		nvlist_t *nval;
+
+		switch (nvpair_type(nvp)) {
+		case DATA_TYPE_STRING:
+			(void) nvpair_value_string(nvp, &sval);
+			pyval = Py_BuildValue("s", sval);
+			break;
+
+		case DATA_TYPE_UINT64:
+			(void) nvpair_value_uint64(nvp, &ival);
+			pyval = Py_BuildValue("K", ival);
+			break;
+
+		case DATA_TYPE_NVLIST:
+			(void) nvpair_value_nvlist(nvp, &nval);
+			pyval = nvl2py(nval);
+			break;
+
+		case DATA_TYPE_BOOLEAN:
+			Py_INCREF(Py_None);
+			pyval = Py_None;
+			break;
+
+		case DATA_TYPE_BOOLEAN_VALUE:
+			(void) nvpair_value_boolean_value(nvp, &bval);
+			pyval = Py_BuildValue("i", bval);
+			break;
+
+		default:
+			PyErr_SetNone(PyExc_ValueError);
+			Py_DECREF(pyo);
+			return (NULL);
+		}
+
+		/*
+		 * A failed conversion (including a nested nvlist that could
+		 * not be converted) must not be stored or dereferenced.
+		 */
+		if (pyval == NULL) {
+			Py_DECREF(pyo);
+			return (NULL);
+		}
+
+		if (PyDict_SetItemString(pyo, nvpair_name(nvp), pyval) != 0) {
+			Py_DECREF(pyval);
+			Py_DECREF(pyo);
+			return (NULL);
+		}
+		Py_DECREF(pyval);
+	}
+
+	return (pyo);
+}
+
+/*
+ * Convert a Python dict into a newly allocated nvlist.  Keys must be
+ * strings.  Values may be dicts (nested nvlist), None (valueless boolean),
+ * strings, bools (boolean value) or ints (uint64).
+ *
+ * Returns the nvlist, which the caller must nvlist_free(), or NULL with a
+ * Python exception set.
+ */
+static nvlist_t *
+dict2nvl(PyObject *d)
+{
+	nvlist_t *nvl;
+	int err;
+	PyObject *key, *value;
+	Py_ssize_t pos = 0;	/* PyDict_Next() requires a Py_ssize_t cursor */
+
+	if (!PyDict_Check(d)) {
+		PyErr_SetObject(PyExc_ValueError, d);
+		return (NULL);
+	}
+
+	err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
+	assert(err == 0);
+
+	while (PyDict_Next(d, &pos, &key, &value)) {
+		char *keystr = PyString_AsString(key);
+		if (keystr == NULL) {
+			PyErr_SetObject(PyExc_KeyError, key);
+			nvlist_free(nvl);
+			return (NULL);
+		}
+
+		if (PyDict_Check(value)) {
+			nvlist_t *valnvl = dict2nvl(value);
+			if (valnvl == NULL) {
+				/* exception already set by the nested call */
+				nvlist_free(nvl);
+				return (NULL);
+			}
+			err = nvlist_add_nvlist(nvl, keystr, valnvl);
+			nvlist_free(valnvl);
+		} else if (value == Py_None) {
+			err = nvlist_add_boolean(nvl, keystr);
+		} else if (PyString_Check(value)) {
+			char *valstr = PyString_AsString(value);
+			err = nvlist_add_string(nvl, keystr, valstr);
+		} else if (PyBool_Check(value)) {
+			/*
+			 * Must be tested before PyInt_Check(): bool is a
+			 * subclass of int, so the int test also accepts
+			 * True and False.
+			 */
+			boolean_t valbool = value == Py_True ? B_TRUE : B_FALSE;
+			err = nvlist_add_boolean_value(nvl, keystr, valbool);
+		} else if (PyInt_Check(value)) {
+			uint64_t valint = PyInt_AsUnsignedLongLongMask(value);
+			err = nvlist_add_uint64(nvl, keystr, valint);
+		} else {
+			PyErr_SetObject(PyExc_ValueError, value);
+			nvlist_free(nvl);
+			return (NULL);
+		}
+		assert(err == 0);
+	}
+
+	return (nvl);
+}
+
+/*
+ * Build the dict { "value": value }, the shape Dataset.getprop() reads.
+ * Returns a new reference, or NULL with a Python exception set.
+ */
+static PyObject *
+fakepropval(uint64_t value)
+{
+	PyObject *d = PyDict_New();
+	PyObject *v;
+
+	if (d == NULL)
+		return (NULL);
+
+	v = Py_BuildValue("K", value);
+	/* PyDict_SetItemString does not steal: release v on every path. */
+	if (v == NULL || PyDict_SetItemString(d, "value", v) != 0) {
+		Py_XDECREF(v);
+		Py_DECREF(d);
+		return (NULL);
+	}
+	Py_DECREF(v);
+	return (d);
+}
+
+/*
+ * Store { "value": value } under `name` in the property dict `nvl`.
+ * Returns 0 on success, -1 with a Python exception set on failure.
+ */
+static int
+add_fakeprop(PyObject *nvl, const char *name, uint64_t value)
+{
+	PyObject *pv = fakepropval(value);
+	int rc;
+
+	if (pv == NULL)
+		return (-1);
+	rc = PyDict_SetItemString(nvl, name, pv);
+	Py_DECREF(pv);
+	return (rc);
+}
+
+/*
+ * Add the objset statistics that are not real properties ("numclones",
+ * "issnap", "inconsistent") to the property dict `nvl`.
+ * Returns 0 on success, -1 with a Python exception set on failure.
+ */
+static int
+add_ds_props(zfs_cmd_t *zc, PyObject *nvl)
+{
+	dmu_objset_stats_t *s = &zc->zc_objset_stats;
+
+	if (add_fakeprop(nvl, "numclones", s->dds_num_clones) != 0 ||
+	    add_fakeprop(nvl, "issnap", s->dds_is_snapshot) != 0 ||
+	    add_fakeprop(nvl, "inconsistent", s->dds_inconsistent) != 0)
+		return (-1);
+	return (0);
+}
+
+/*
+ * Issue an ioctl whose result is a packed nvlist and return that nvlist as
+ * a Python dict.  The destination buffer starts at 2048 bytes; when the
+ * ioctl fails with ENOMEM the call is retried with the size the ioctl
+ * reported in zc_nvlist_dst_size.
+ *
+ * On error, returns NULL and leaves errno describing the failure.  A Python
+ * exception is not necessarily set, so callers are expected to raise their
+ * own.
+ */
+static PyObject *
+ioctl_with_dstnv(int ioc, zfs_cmd_t *zc)
+{
+	int nvsz = 2048;
+	void *nvbuf;
+	PyObject *pynv = NULL;
+	int saved_errno;
+
+again:
+	nvbuf = malloc(nvsz);
+	if (nvbuf == NULL) {
+		errno = ENOMEM;
+		return (NULL);
+	}
+	zc->zc_nvlist_dst_size = nvsz;
+	zc->zc_nvlist_dst = (uintptr_t)nvbuf;
+
+	if (ioctl(zfsdevfd, ioc, zc) == 0) {
+		nvlist_t *nvl;
+
+		errno = nvlist_unpack(nvbuf, zc->zc_nvlist_dst_size, &nvl, 0);
+		if (errno == 0) {
+			pynv = nvl2py(nvl);
+			nvlist_free(nvl);
+		}
+	} else if (errno == ENOMEM) {
+		free(nvbuf);
+		nvsz = zc->zc_nvlist_dst_size;
+		goto again;
+	}
+	/* Callers inspect errno afterwards; don't let free() disturb it. */
+	saved_errno = errno;
+	free(nvbuf);
+	errno = saved_errno;
+	return (pynv);
+}
+
+/*
+ * next_dataset(name, snaps, cookie) -> (name, cookie, props)
+ *
+ * Return the next child dataset (or next snapshot, if snaps is true) of
+ * `name` after position `cookie`.  Raises StopIteration when the ioctl
+ * reports ESRCH, ZFSError on any other failure.
+ */
+static PyObject *
+py_next_dataset(PyObject *self, PyObject *args)
+{
+	int ioc;
+	uint64_t cookie;
+	zfs_cmd_t zc = { 0 };
+	int snaps;
+	char *name;
+	PyObject *nvl;
+	PyObject *ret = NULL;
+
+	if (!PyArg_ParseTuple(args, "siK", &name, &snaps, &cookie))
+		return (NULL);
+
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+	zc.zc_cookie = cookie;
+
+	if (snaps)
+		ioc = ZFS_IOC_SNAPSHOT_LIST_NEXT;
+	else
+		ioc = ZFS_IOC_DATASET_LIST_NEXT;
+
+	nvl = ioctl_with_dstnv(ioc, &zc);
+	if (nvl) {
+		/* "O" takes its own reference, so ours is always dropped. */
+		if (add_ds_props(&zc, nvl) == 0) {
+			ret = Py_BuildValue("sKO", zc.zc_name, zc.zc_cookie,
+			    nvl);
+		}
+		Py_DECREF(nvl);
+	} else if (errno == ESRCH) {
+		PyErr_SetNone(PyExc_StopIteration);
+	} else {
+		if (snaps)
+			seterr(_("cannot get snapshots of %s"), name);
+		else
+			seterr(_("cannot get child datasets of %s"), name);
+	}
+	return (ret);
+}
+
+/*
+ * dataset_props(name) -> props dict
+ *
+ * Return the properties of dataset `name`.  Raises ZFSError on failure.
+ */
+static PyObject *
+py_dataset_props(PyObject *self, PyObject *args)
+{
+	zfs_cmd_t zc = { 0 };
+	char *name;
+	PyObject *nvl;
+
+	if (!PyArg_ParseTuple(args, "s", &name))
+		return (NULL);
+
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
+	nvl = ioctl_with_dstnv(ZFS_IOC_OBJSET_STATS, &zc);
+	if (nvl == NULL) {
+		seterr(_("cannot access dataset %s"), name);
+		return (NULL);
+	}
+	if (add_ds_props(&zc, nvl) != 0) {
+		Py_DECREF(nvl);
+		return (NULL);
+	}
+	return (nvl);
+}
+
+/*
+ * get_fsacl(name) -> dict
+ *
+ * Return the delegated ("zfs allow") permissions on dataset `name`.
+ * Raises ZFSError on failure.
+ */
+static PyObject *
+py_get_fsacl(PyObject *self, PyObject *args)
+{
+	zfs_cmd_t zc = { 0 };
+	char *name;
+	PyObject *nvl;
+
+	if (!PyArg_ParseTuple(args, "s", &name))
+		return (NULL);
+
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
+	nvl = ioctl_with_dstnv(ZFS_IOC_GET_FSACL, &zc);
+	if (nvl == NULL)
+		seterr(_("cannot get permissions on %s"), name);
+
+	return (nvl);
+}
+
+/*
+ * set_fsacl(name, un, dict) -> None
+ *
+ * Add the permissions described by `dict` to dataset `name`, or remove
+ * them when `un` is true.  Raises ZFSError on failure.
+ */
+static PyObject *
+py_set_fsacl(PyObject *self, PyObject *args)
+{
+	int un;
+	size_t nvsz;
+	zfs_cmd_t zc = { 0 };
+	char *name, *nvbuf;
+	PyObject *dict;
+	nvlist_t *nvl;
+	int err;
+	int saved_errno;
+
+	if (!PyArg_ParseTuple(args, "siO!", &name, &un,
+	    &PyDict_Type, &dict))
+		return (NULL);
+
+	nvl = dict2nvl(dict);
+	if (nvl == NULL)
+		return (NULL);
+
+	err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE);
+	assert(err == 0);
+	nvbuf = malloc(nvsz);
+	if (nvbuf == NULL) {
+		nvlist_free(nvl);
+		return (PyErr_NoMemory());
+	}
+	err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0);
+	assert(err == 0);
+	/* Only the packed copy is needed from here on. */
+	nvlist_free(nvl);
+
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+	zc.zc_nvlist_src_size = nvsz;
+	zc.zc_nvlist_src = (uintptr_t)nvbuf;
+	zc.zc_perm_action = un;
+
+	err = ioctl_with_cmdstr(ZFS_IOC_SET_FSACL, &zc);
+	saved_errno = errno;
+	free(nvbuf);
+	if (err) {
+		errno = saved_errno;	/* seterr() reports errno */
+		seterr(_("cannot set permissions on %s"), name);
+		return (NULL);
+	}
+
+	Py_RETURN_NONE;
+}
+
+/*
+ * get_holds(name) -> dict
+ *
+ * Return the user holds on snapshot `name`.  Raises ZFSError on failure.
+ */
+static PyObject *
+py_get_holds(PyObject *self, PyObject *args)
+{
+	zfs_cmd_t zc = { 0 };
+	char *name;
+	PyObject *nvl;
+
+	if (!PyArg_ParseTuple(args, "s", &name))
+		return (NULL);
+
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
+	nvl = ioctl_with_dstnv(ZFS_IOC_GET_HOLDS, &zc);
+	if (nvl == NULL)
+		seterr(_("cannot get holds for %s"), name);
+
+	return (nvl);
+}
+
+/*
+ * userspace_many(name, propname) -> { (domain, rid): space }
+ *
+ * Return all entries of a userspace-type property of dataset `name`.
+ * `propname` must equal one of zfs_userquota_prop_prefixes[], else KeyError
+ * is raised.  Raises ZFSError if the ioctl fails.
+ */
+static PyObject *
+py_userspace_many(PyObject *self, PyObject *args)
+{
+	zfs_cmd_t zc = { 0 };
+	zfs_userquota_prop_t type;
+	char *name, *propname;
+	int bufsz = 1<<20;
+	void *buf;
+	PyObject *dict;
+	int error;
+	int saved_errno = 0;
+
+	if (!PyArg_ParseTuple(args, "ss", &name, &propname))
+		return (NULL);
+
+	for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++)
+		if (strcmp(propname, zfs_userquota_prop_prefixes[type]) == 0)
+			break;
+	if (type == ZFS_NUM_USERQUOTA_PROPS) {
+		PyErr_SetString(PyExc_KeyError, propname);
+		return (NULL);
+	}
+
+	dict = PyDict_New();
+	if (dict == NULL)
+		return (NULL);
+	buf = malloc(bufsz);
+	if (buf == NULL) {
+		Py_DECREF(dict);
+		return (PyErr_NoMemory());
+	}
+
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+	zc.zc_objset_type = type;
+	zc.zc_cookie = 0;
+
+	/*
+	 * Each ioctl fills buf with as many records as fit and updates
+	 * zc_nvlist_dst_size to the number of bytes written; a size of zero
+	 * ends the loop.
+	 */
+	while (1) {
+		zfs_useracct_t *zua = buf;
+
+		zc.zc_nvlist_dst = (uintptr_t)buf;
+		zc.zc_nvlist_dst_size = bufsz;
+
+		error = ioctl(zfsdevfd, ZFS_IOC_USERSPACE_MANY, &zc);
+		if (error != 0) {
+			saved_errno = errno;
+			break;
+		}
+		if (zc.zc_nvlist_dst_size == 0)
+			break;
+
+		/*
+		 * Consume whole records only, so a short trailing fragment
+		 * cannot wrap the unsigned size counter.
+		 */
+		while (zc.zc_nvlist_dst_size >= sizeof (zfs_useracct_t)) {
+			PyObject *pykey, *pyval;
+			int rc = -1;
+
+			pykey = Py_BuildValue("sI",
+			    zua->zu_domain, zua->zu_rid);
+			pyval = Py_BuildValue("K", zua->zu_space);
+			if (pykey != NULL && pyval != NULL)
+				rc = PyDict_SetItem(dict, pykey, pyval);
+			Py_XDECREF(pykey);
+			Py_XDECREF(pyval);
+			if (rc != 0) {
+				/* Python exception already set */
+				free(buf);
+				Py_DECREF(dict);
+				return (NULL);
+			}
+
+			zua++;
+			zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t);
+		}
+	}
+
+	free(buf);
+
+	if (error != 0) {
+		Py_DECREF(dict);
+		errno = saved_errno;	/* seterr() reports errno */
+		seterr(_("cannot get %s property on %s"), propname, name);
+		return (NULL);
+	}
+
+	return (dict);
+}
+
+/*
+ * userspace_upgrade(name) -> None
+ *
+ * Issue ZFS_IOC_USERSPACE_UPGRADE on dataset `name`.  Raises ZFSError on
+ * failure.
+ */
+static PyObject *
+py_userspace_upgrade(PyObject *self, PyObject *args)
+{
+	zfs_cmd_t zc = { 0 };
+	char *name;
+	int error;
+
+	if (!PyArg_ParseTuple(args, "s", &name))
+		return (NULL);
+
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+	error = ioctl(zfsdevfd, ZFS_IOC_USERSPACE_UPGRADE, &zc);
+
+	if (error != 0) {
+		seterr(_("cannot initialize user accounting information on %s"),
+		    name);
+		return (NULL);
+	}
+
+	Py_RETURN_NONE;
+}
+
+/*
+ * set_cmdstr(str) -> None
+ *
+ * Remember `str` (truncated to the size of cmdstr) as the history string
+ * for the next ioctl issued through ioctl_with_cmdstr().
+ */
+static PyObject *
+py_set_cmdstr(PyObject *self, PyObject *args)
+{
+	char *str;
+
+	if (!PyArg_ParseTuple(args, "s", &str))
+		return (NULL);
+
+	(void) strlcpy(cmdstr, str, sizeof (cmdstr));
+
+	Py_RETURN_NONE;
+}
+
+/*
+ * get_proptable() -> { name: tuple }
+ *
+ * Export the native property table.  Each tuple is
+ * (name, number, type, string default, numeric default, attr, valid types,
+ * values, column name, rightalign, visible, indextable), where indextable
+ * is a { name: value } dict for index properties and None otherwise.
+ */
+static PyObject *
+py_get_proptable(PyObject *self, PyObject *args)
+{
+	zprop_desc_t *t = zfs_prop_get_table();
+	PyObject *d = PyDict_New();
+	zfs_prop_t i;
+
+	if (d == NULL)
+		return (NULL);
+
+	for (i = 0; i < ZFS_NUM_PROPS; i++) {
+		zprop_desc_t *p = &t[i];
+		PyObject *tuple;
+		static const char *typetable[] =
+		    {"number", "string", "index"};
+		static const char *attrtable[] =
+		    {"default", "readonly", "inherit", "onetime"};
+		PyObject *indextable;
+
+		if (p->pd_proptype == PROP_TYPE_INDEX) {
+			const zprop_index_t *it = p->pd_table;
+			int j;
+
+			indextable = PyDict_New();
+			if (indextable == NULL)
+				goto fail;
+			for (j = 0; it[j].pi_name; j++) {
+				PyObject *iv =
+				    Py_BuildValue("K", it[j].pi_value);
+
+				/* the dict does not steal iv; release it */
+				if (iv == NULL ||
+				    PyDict_SetItemString(indextable,
+				    it[j].pi_name, iv) != 0) {
+					Py_XDECREF(iv);
+					Py_DECREF(indextable);
+					goto fail;
+				}
+				Py_DECREF(iv);
+			}
+		} else {
+			Py_INCREF(Py_None);
+			indextable = Py_None;
+		}
+
+		tuple = Py_BuildValue("sissKsissiiO",
+		    p->pd_name, p->pd_propnum, typetable[p->pd_proptype],
+		    p->pd_strdefault, p->pd_numdefault,
+		    attrtable[p->pd_attr], p->pd_types,
+		    p->pd_values, p->pd_colname,
+		    p->pd_rightalign, p->pd_visible, indextable);
+		/* "O" took its own reference to indextable. */
+		Py_DECREF(indextable);
+		if (tuple == NULL)
+			goto fail;
+		if (PyDict_SetItemString(d, p->pd_name, tuple) != 0) {
+			Py_DECREF(tuple);
+			goto fail;
+		}
+		Py_DECREF(tuple);
+	}
+
+	return (d);
+
+fail:
+	Py_DECREF(d);
+	return (NULL);
+}
+
+static PyMethodDef zfsmethods[] = {
+	{"next_dataset", py_next_dataset, METH_VARARGS,
+	    "Get next child dataset or snapshot."},
+	{"get_fsacl", py_get_fsacl, METH_VARARGS, "Get allowed permissions."},
+	{"set_fsacl", py_set_fsacl, METH_VARARGS, "Set allowed permissions."},
+	{"userspace_many", py_userspace_many, METH_VARARGS,
+	    "Get user space accounting."},
+	{"userspace_upgrade", py_userspace_upgrade, METH_VARARGS,
+	    "Upgrade fs to enable user space accounting."},
+	{"set_cmdstr", py_set_cmdstr, METH_VARARGS,
+	    "Set command string for history logging."},
+	{"dataset_props", py_dataset_props, METH_VARARGS,
+	    "Get dataset properties."},
+	{"get_proptable", py_get_proptable, METH_NOARGS,
+	    "Get property table."},
+	{"get_holds", py_get_holds, METH_VARARGS, "Get user holds."},
+	{NULL, NULL, 0, NULL}
+};
+
+/*
+ * Module initialization for zfs.ioctl: registers the methods above, looks
+ * up zfs.util.ZFSError and the file descriptor of zfs.util.dev, and
+ * initializes the property table.  On failure it returns early with the
+ * Python exception left set.
+ */
+void
+initioctl(void)
+{
+	PyObject *zfs_util;
+	PyObject *devfile;
+
+	if (Py_InitModule("zfs.ioctl", zfsmethods) == NULL)
+		return;
+
+	zfs_util = PyImport_ImportModule("zfs.util");
+	if (zfs_util == NULL)
+		return;
+
+	ZFSError = PyObject_GetAttrString(zfs_util, "ZFSError");
+	devfile = PyObject_GetAttrString(zfs_util, "dev");
+	Py_DECREF(zfs_util);
+	if (ZFSError == NULL || devfile == NULL) {
+		Py_XDECREF(devfile);
+		return;
+	}
+
+	/*
+	 * The reference to devfile is deliberately kept for the life of the
+	 * process so the descriptor cached in zfsdevfd stays open.
+	 */
+	zfsdevfd = PyObject_AsFileDescriptor(devfile);
+	if (zfsdevfd < 0)
+		return;
+
+	zfs_prop_init();
+}
diff --git a/lib/pyzfs/common/table.py b/lib/pyzfs/common/table.py
new file mode 100644
index 0000000..d2a45a1
--- /dev/null
+++ b/lib/pyzfs/common/table.py
# @@ -0,0 +1,70 @@
#! /usr/bin/python2.6
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#

import zfs.util

class Table:
    """Collect rows of named fields and print them as a table.

    fields -- sequence of field names, in output column order
    rjustfields -- names of the fields that are right-justified when
        the table is printed with headers
    maxfieldlen -- dict from field name to the longest value added so far
    lines -- list of (sortkey, [value strings]) tuples, one per row
    """
    __slots__ = "fields", "rjustfields", "maxfieldlen", "lines"
    __repr__ = zfs.util.default_repr

    def __init__(self, fields, rjustfields=()):
        # XXX maybe have a defaults, too?
        self.fields = fields
        self.rjustfields = rjustfields
        self.maxfieldlen = dict.fromkeys(fields, 0)
        self.lines = list()

    def __updatemax(self, k, v):
        """Remember v as the width of field k if it is the widest so far."""
        # Fall back to 0 rather than None: max(None, v) only yields v
        # because Python 2 orders None below every integer.
        self.maxfieldlen[k] = max(self.maxfieldlen.get(k, 0), v)

    def addline(self, sortkey, values):
        """values is a dict from field name to value"""

        va = list()
        for f in self.fields:
            v = str(values[f])
            va.append(v)
            self.__updatemax(f, len(v))
        self.lines.append((sortkey, va))

    def printme(self, headers=True):
        """Sort the accumulated rows and print them, one row per line.

        With headers, a row of upper-cased field names is added first
        (with sortkey None) and every column is padded to the widest
        value seen for that field.  Without headers, each value is
        followed by a tab instead.

        The header row is appended to self.lines, so it is still there
        after this method returns.
        """
        if headers:
            d = dict([(f, f.upper()) for f in self.fields])
            self.addline(None, d)

        # Rows are ordered by (sortkey, values).  The header row (sortkey
        # None) has to come out first; state that explicitly instead of
        # depending on None comparing lower than any other sortkey.
        self.lines.sort(key=lambda entry: (entry[0] is not None, entry))
        for (k, va) in self.lines:
            line = str()
            for i in range(len(self.fields)):
                if not headers:
                    line += va[i]
                    line += "\t"
                else:
                    if self.fields[i] in self.rjustfields:
                        fmt = "%*s "
                    else:
                        fmt = "%-*s "
                    mfl = self.maxfieldlen[self.fields[i]]
                    line += fmt % (mfl, va[i])
            print(line)
# diff --git a/lib/pyzfs/common/unallow.py b/lib/pyzfs/common/unallow.py
# new file mode 100644
# index 0000000..cbdd4dd
# --- /dev/null
# +++ b/lib/pyzfs/common/unallow.py
# @@ -0,0 +1,27 @@
#! /usr/bin/python2.6
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#

import zfs.allow

do_unallow = zfs.allow.do_allow
# diff --git a/lib/pyzfs/common/userspace.py b/lib/pyzfs/common/userspace.py
# new file mode 100644
# index 0000000..8464f54
# --- /dev/null
# +++ b/lib/pyzfs/common/userspace.py
# @@ -0,0 +1,246 @@
#! /usr/bin/python2.6
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#

"""This module implements the "zfs userspace" and "zfs groupspace" subcommands.
The only public interface is the zfs.userspace.do_userspace() function."""

import optparse
import sys
import pwd
import grp
import errno
import solaris.misc
import zfs.util
import zfs.ioctl
import zfs.dataset
import zfs.table

_ = zfs.util._

# map from property name prefix -> (field name, isgroup)
props = {
    "userused@": ("used", False),
    "userquota@": ("quota", False),
    "groupused@": ("used", True),
    "groupquota@": ("quota", True),
}

def skiptype(options, prop):
    """Return True if this property (eg "userquota@") should be skipped."""
    (field, isgroup) = props[prop]
    # The field this property feeds was not requested with -o.
    if field not in options.fields:
        return True
    # Neither group type / neither user type was requested with -t.
    if isgroup and "posixgroup" not in options.types and \
        "smbgroup" not in options.types:
        return True
    if not isgroup and "posixuser" not in options.types and \
        "smbuser" not in options.types:
        return True
    return False

def new_entry(options, isgroup, domain, rid):
    """Return a dict("field": value) for this domain (string) + rid (int)

    Returns None when the entry's type (eg "posixuser") is not one of
    options.types.  The returned dict has "type", "name", "used" and
    "quota" values plus "used.sort" / "quota.sort" (and, for entries
    shown as a bare number, "name.sort") keys that are used for sorting.
    """

    if domain:
        idstr = "%s-%u" % (domain, rid)
    else:
        idstr = "%u" % rid

    # Keyed by (isgroup, has a domain); True/False index the 1/0 keys.
    (typename, mapfunc) = {
        (1, 1): ("SMB Group", lambda id: solaris.misc.sid_to_name(id, 0)),
        (1, 0): ("POSIX Group", lambda id: grp.getgrgid(int(id)).gr_name),
        (0, 1): ("SMB User", lambda id: solaris.misc.sid_to_name(id, 1)),
        (0, 0): ("POSIX User", lambda id: pwd.getpwuid(int(id)).pw_name)
    }[isgroup, bool(domain)]

    # "POSIX User" -> "posixuser", the spelling used by the -t option
    if typename.lower().replace(" ", "") not in options.types:
        return None

    v = dict()
    v["type"] = typename

    # python's getpwuid/getgrgid is confused by ephemeral uids
    if not options.noname and rid < 1<<31:
        try:
            v["name"] = mapfunc(idstr)
        except KeyError:
            # no name for this id; fall back to the number below
            pass

    if "name" not in v:
        v["name"] = idstr
        if not domain:
            # it's just a number, so pad it with spaces so
            # that it will sort numerically
            v["name.sort"] = "%20d" % rid
    # fill in default values
    v["used"] = "0"
    v["used.sort"] = 0
    v["quota"] = "none"
    v["quota.sort"] = 0
    return v

def process_one_raw(acct, options, prop, elem):
    """Update the acct dict to incorporate the
    information from this elem from Dataset.userspace(prop)."""

    (domain, rid, value) = elem
    (field, isgroup) = props[prop]

    if options.translate and domain:
        try:
            rid = solaris.misc.sid_to_id("%s-%u" % (domain, rid),
                not isgroup)
            domain = None
        except KeyError:
            # could not translate; keep the (domain, rid) pair
            pass
    key = (isgroup, domain, rid)

    try:
        v = acct[key]
    except KeyError:
        v = new_entry(options, isgroup, domain, rid)
        if not v:
            # this type of entry was not requested
            return
        acct[key] = v

    # Add our value to an existing value, which may be present if
    # options.translate is set.
    value = value + v[field + ".sort"]
    v[field + ".sort"] = value

    if options.parsable:
        v[field] = str(value)
    else:
        v[field] = zfs.util.nicenum(value)

def do_userspace():
    """Implements the "zfs userspace" and "zfs groupspace" subcommands.

    The subcommand name is taken from sys.argv[1] and its options and
    the dataset name from sys.argv[2:].  The resulting table is printed
    with zfs.table.Table.printme().
    """

    def usage(msg=None):
        """Print the option help, then exit (with msg, if given)."""
        parser.print_help()
        if msg:
            # blank line between the help text and the error
            print("")
            # NOTE(review): the message is handed to OptionParser.exit()
            # as its first positional argument -- confirm against the
            # optparse documentation that this is the intended call.
            parser.exit("zfs: error: " + msg)
        else:
            parser.exit()

    if sys.argv[1] == "userspace":
        defaulttypes = "posixuser,smbuser"
    else:
        defaulttypes = "posixgroup,smbgroup"

    fields = ("type", "name", "used", "quota")
    rjustfields = ("used", "quota")
    types = ("all", "posixuser", "smbuser", "posixgroup", "smbgroup")

    u = _("%s [-niHp] [-o field[,...]] [-sS field] ... \n") % sys.argv[1]
    u += _(" [-t type[,...]] ")
    parser = optparse.OptionParser(usage=u, prog="zfs")

    parser.add_option("-n", action="store_true", dest="noname",
        help=_("Print numeric ID instead of user/group name"))
    parser.add_option("-i", action="store_true", dest="translate",
        help=_("translate SID to posix (possibly ephemeral) ID"))
    parser.add_option("-H", action="store_true", dest="noheaders",
        help=_("no headers, tab delimited output"))
    parser.add_option("-p", action="store_true", dest="parsable",
        help=_("exact (parsable) numeric output"))
    parser.add_option("-o", dest="fields", metavar="field[,...]",
        default="type,name,used,quota",
        help=_("print only these fields (eg type,name,used,quota)"))
    parser.add_option("-s", dest="sortfields", metavar="field",
        type="choice", choices=fields, default=list(),
        action="callback", callback=zfs.util.append_with_opt,
        help=_("sort field"))
    parser.add_option("-S", dest="sortfields", metavar="field",
        type="choice", choices=fields, #-s sets the default
        action="callback", callback=zfs.util.append_with_opt,
        help=_("reverse sort field"))
    parser.add_option("-t", dest="types", metavar="type[,...]",
        default=defaulttypes,
        help=_("print only these types (eg posixuser,smbuser,posixgroup,smbgroup,all)"))

    (options, args) = parser.parse_args(sys.argv[2:])
    if len(args) != 1:
        usage(_("wrong number of arguments"))
    dsname = args[0]

    options.fields = options.fields.split(",")
    for f in options.fields:
        if f not in fields:
            usage(_("invalid field %s") % f)

    options.types = options.types.split(",")
    for t in options.types:
        if t not in types:
            usage(_("invalid type %s") % t)

    if not options.sortfields:
        options.sortfields = [("-s", "type"), ("-s", "name")]

    if "all" in options.types:
        options.types = types[1:]

    # A one-element tuple needs the trailing comma; ("filesystem") is
    # just the string "filesystem".
    ds = zfs.dataset.Dataset(dsname, types=("filesystem",))

    if ds.getprop("zoned") and solaris.misc.isglobalzone():
        options.noname = True

    if not ds.getprop("useraccounting"):
        print(_("Initializing accounting information on old filesystem, please wait..."))
        ds.userspace_upgrade()

    # gather and process accounting information
    # Due to -i, we need to keep a dict, so we can potentially add
    # together the posix ID and SID's usage. Grr.
    acct = dict()
    for prop in props.keys():
        if skiptype(options, prop):
            continue
        for elem in ds.userspace(prop):
            process_one_raw(acct, options, prop, elem)

    def cmpkey(val):
        """Build the sort key for one entry from options.sortfields."""
        l = list()
        for (opt, field) in options.sortfields:
            # prefer the dedicated sort value when the entry has one
            try:
                n = val[field + ".sort"]
            except KeyError:
                n = val[field]
            if opt == "-S":
                # reverse sorting
                try:
                    n = -n
                except TypeError:
                    # it's a string; decompose it
                    # into an array of integers,
                    # each one the negative of that
                    # character
                    n = [-ord(c) for c in n]
            l.append(n)
        return l

    t = zfs.table.Table(options.fields, rjustfields)
    for val in acct.values():
        t.addline(cmpkey(val), val)
    t.printme(not options.noheaders)
# diff --git a/lib/pyzfs/common/util.py b/lib/pyzfs/common/util.py
# new file mode 100644
# index 0000000..a33c669
# --- /dev/null
# +++ b/lib/pyzfs/common/util.py
# @@ -0,0 +1,141 @@
#! /usr/bin/python2.6
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
#

"""This module provides utility functions for ZFS.
zfs.util.dev -- a file object of /dev/zfs """

import gettext
import errno
import os
import solaris.misc
# Note: this module (zfs.util) should not import zfs.ioctl, because that
# would introduce a circular dependency

# NOTE(review): these assignments set the values on the errno module
# itself; presumably they are the platform's numbers for these two
# errors -- confirm.
errno.ECANCELED = 47
errno.ENOTSUP = 48

dev = open("/dev/zfs", "w")

# Use the gettext catalog if it can be set up, otherwise fall back to
# solaris.misc.gettext.
try:
    _ = gettext.translation("SUNW_OST_OSLIB", "/usr/lib/locale",
        fallback=True).gettext
except:
    _ = solaris.misc.gettext

def default_repr(self):
    """A simple __repr__ function.

    Lists every attribute named in __slots__ when the object has any,
    otherwise shows the instance __dict__.
    """
    if self.__slots__:
        # (named "s" so the builtin str is not shadowed)
        s = "<" + self.__class__.__name__
        for v in self.__slots__:
            s += " %s: %r" % (v, getattr(self, v))
        return s + ">"
    else:
        return "<%s %s>" % \
            (self.__class__.__name__, repr(self.__dict__))

class ZFSError(StandardError):
    """This exception class represents a potentially user-visible
    ZFS error. If uncaught, it will be printed and the process will
    exit with exit code 1.

    errno -- the error number (eg, from ioctl(2))."""

    __slots__ = "why", "task", "errno"
    __repr__ = default_repr

    def __init__(self, eno, task=None, why=None):
        """Create a ZFS exception.
        eno -- the error number (errno)
        task -- a string describing the task that failed
        why -- a string describing why it failed (defaults to
            strerror(eno))"""

        self.errno = eno
        self.task = task
        self.why = why

    def __str__(self):
        """Return "task: why", using strerror when why is not set."""
        s = ""
        if self.task:
            s += self.task + ": "
        if self.why:
            s += self.why
        else:
            s += self.strerror
        return s

    # ZFS-specific messages for some error numbers; strerror falls
    # back to os.strerror() for anything not listed here.
    __strs = {
        errno.EPERM: _("permission denied"),
        errno.ECANCELED:
            _("delegated administration is disabled on pool"),
        errno.EINTR: _("signal received"),
        errno.EIO: _("I/O error"),
        errno.ENOENT: _("dataset does not exist"),
        errno.ENOSPC: _("out of space"),
        errno.EEXIST: _("dataset already exists"),
        errno.EBUSY: _("dataset is busy"),
        errno.EROFS:
            _("snapshot permissions cannot be modified"),
        errno.ENAMETOOLONG: _("dataset name is too long"),
        errno.ENOTSUP: _("unsupported version"),
        errno.EAGAIN: _("pool I/O is currently suspended"),
    }

    # error numbers that share another one's message
    __strs[errno.EACCES] = __strs[errno.EPERM]
    __strs[errno.ENXIO] = __strs[errno.EIO]
    __strs[errno.ENODEV] = __strs[errno.EIO]
    __strs[errno.EDQUOT] = __strs[errno.ENOSPC]

    @property
    def strerror(self):
        """The message for self.errno."""
        return ZFSError.__strs.get(self.errno, os.strerror(self.errno))

def nicenum(num):
    """Return a nice string (eg "1.23M") for this integer.

    The number is scaled by powers of 1024 and given a one-letter unit
    from "KMGTPE"; values below 1024 are printed as they are.  Exact
    multiples of the unit, and scaled values of 100 or more, are
    printed as integers; anything else gets as many decimals (at most
    two) as fit in five characters.
    """
    index = 0
    n = num

    # floor division keeps n an integer on every Python version
    while n >= 1024:
        n //= 1024
        index += 1

    u = " KMGTPE"[index]
    if index == 0:
        return "%u" % n
    elif n >= 100 or num & ((1 << (10 * index)) - 1) == 0:
        # it's an exact multiple of its index, or it wouldn't
        # fit as floating point, so print as an integer.  The mask
        # covers all 10*index low bits; (1024*index)-1 is only the
        # right mask for index == 1.
        return "%u%c" % (n, u)
    else:
        # due to rounding, it's tricky to tell what precision to
        # use; try each precision and see which one fits
        for i in (2, 1, 0):
            s = "%.*f%c" % (i, float(num) / (1<<(10*index)), u)
            if len(s) <= 5:
                return s

def append_with_opt(option, opt, value, parser):
    """A function for OptionParser which appends a tuple (opt, value)."""
    getattr(parser.values, option.dest).append((opt, value))
# -- cgit v1.1
# From e22578e9bd14eaffe616f5cb12020da1aef5ed1c Mon Sep 17 00:00:00 2001
# From: mm
# Date: Wed, 18 Jul 2012 09:47:48 +0000
# Subject: Move dtrace manual page to new vendor location
# ---
# cmd/dtrace/dtrace.1 | 670 ----------------------------------------------------
# man/man1m/dtrace.1m | 670 ++++++++++++++++++++++++++++++++++++++++++++++++++++
# 2 files changed, 670 insertions(+), 670 deletions(-)
# delete mode 100644 cmd/dtrace/dtrace.1
# create mode 100644 man/man1m/dtrace.1m
# diff --git a/cmd/dtrace/dtrace.1 b/cmd/dtrace/dtrace.1
# deleted file mode 100644
# index e20ed9f..0000000
# --- a/cmd/dtrace/dtrace.1
# +++ /dev/null
# @@ -1,670 +0,0 @@
# -'\" te
# -.\" CDDL HEADER START
# -.\"
# -.\" The contents of this file are subject to the terms of the
# -.\" Common Development and Distribution License (the "License").
# -.\" You may not use this file except in compliance with the License.
# -.\"
# -.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# -.\" or http://www.opensolaris.org/os/licensing.
# -.\" See the License for the specific language governing permissions
# -.\" and limitations under the License.
# -.\"
# -.\" When distributing Covered Code, include this CDDL HEADER in each
# -.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# -.\" If applicable, add the following below this CDDL HEADER, with the
# -.\" fields enclosed by brackets "[]" replaced with your own identifying
# -.\" information: Portions Copyright [yyyy] [name of copyright owner]
# -.\"
# -.\" CDDL HEADER END
# -.\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved.
-.TH dtrace 1M "5 Sep 2006" "SunOS 5.11" "System Administration Commands" -.SH NAME -dtrace \- DTrace dynamic tracing compiler and tracing utility -.SH SYNOPSIS -.LP -.nf -\fBdtrace\fR [\fB-32\fR | \fB-64\fR] [\fB-aACeFGHhlqSvVwZ\fR] [\fB-b\fR \fIbufsz\fR] [\fB-c\fR \fIcmd\fR] - [\fB-D\fR \fIname\fR [\fI=value\fR]] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-o\fR \fIoutput\fR] - [\fB-s\fR \fIscript\fR] [\fB-U\fR \fIname\fR] [\fB-x\fR \fIarg\fR [\fI=val\fR]] - [\fB-X\fR a | c | s | t] [\fB-p\fR \fIpid\fR] - [\fB-P\fR \fIprovider\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-m\fR [\fIprovider:\fR] \fImodule\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-f\fR [[\fIprovider:\fR] \fImodule:\fR] \fIfunction\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-i\fR \fIprobe-id\fR [[\fIpredicate\fR] \fIaction\fR]] -.fi - -.SH DESCRIPTION -.sp -.LP -DTrace is a comprehensive dynamic tracing framework for the Solaris Operating System. DTrace provides a powerful infrastructure that permits administrators, developers, and service personnel to concisely answer arbitrary questions about the behavior of the operating system and user programs. -.sp -.LP -The \fISolaris Dynamic Tracing Guide\fR describes how to use DTrace to observe, debug, and tune system behavior. Refer to this book for a detailed description of DTrace features, including the bundled DTrace observability -tools, instrumentation providers, and the D programming language. 
-.sp -.LP -The \fBdtrace\fR command provides a generic interface to the essential services provided by the DTrace facility, including: -.RS +4 -.TP -.ie t \(bu -.el o -Options that list the set of probes and providers currently published by DTrace -.RE -.RS +4 -.TP -.ie t \(bu -.el o -Options that enable probes directly using any of the probe description specifiers (provider, module, function, name) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -Options that run the D compiler and compile one or more D program files or programs written directly on the command line -.RE -.RS +4 -.TP -.ie t \(bu -.el o -Options that generate anonymous tracing programs -.RE -.RS +4 -.TP -.ie t \(bu -.el o -Options that generate program stability reports -.RE -.RS +4 -.TP -.ie t \(bu -.el o -Options that modify DTrace tracing and buffering behavior and enable additional D compiler features -.RE -.sp -.LP -You can use \fBdtrace\fR to create D scripts by using it in a \fB#!\fR declaration to create an interpreter file. You can also use \fBdtrace\fR to attempt to compile D programs and determine their properties without actually enabling tracing using the \fB-e\fR option. See \fBOPTIONS\fR. See the \fISolaris Dynamic Tracing Guide\fR for detailed examples of how to use the \fBdtrace\fR utility to perform these tasks. -.SH OPTIONS -.sp -.LP -The arguments accepted by the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, and \fB-i\fR options can include an optional D language \fIpredicate\fR enclosed in slashes \fB//\fR and optional D language \fIaction\fR statement list enclosed in braces \fB{}\fR. D program code specified on the command line must be appropriately quoted to avoid intepretation of meta-characters by the shell. -.sp -.LP -The following options are supported: -.sp -.ne 2 -.mk -.na -\fB\fB-32\fR | \fB-64\fR\fR -.ad -.sp .6 -.RS 4n -The D compiler produces programs using the native data model of the operating system kernel. 
You can use the \fBisainfo\fR \fB-b\fR command to determine the current operating system data model. If the \fB-32\fR option is specified, \fBdtrace\fR forces -the D compiler to compile a D program using the 32-bit data model. If the \fB-64\fR option is specified, \fBdtrace\fR forces the D compiler to compile a D program using the 64-bit data model. These options are typically not required as \fBdtrace\fR selects the -native data model as the default. The data model affects the sizes of integer types and other language properties. D programs compiled for either data model can be executed on both 32-bit and 64-bit kernels. The \fB-32\fR and \fB-64\fR options also determine the ELF file format -(ELF32 or ELF64) produced by the \fB-G\fR option. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-a\fR\fR -.ad -.sp .6 -.RS 4n -Claim anonymous tracing state and display the traced data. You can combine the \fB-a\fR option with the \fB-e\fR option to force \fBdtrace\fR to exit immediately after consuming the anonymous tracing state rather than continuing to wait for new -data. See the \fISolaris Dynamic Tracing Guide\fR for more information about anonymous tracing. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-A\fR\fR -.ad -.sp .6 -.RS 4n -Generate \fBdriver.conf\fR(4) directives for anonymous tracing. This option constructs a set of \fBdtrace\fR(7D) configuration file directives to enable the specified probes for anonymous tracing and then exits. By default, \fBdtrace\fR attempts to store the directives to the file \fB/kernel/drv/dtrace.conf\fR. You can modify this behavior if you use the \fB-o\fR option to specify an alternate output file. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-b\fR \fIbufsz\fR\fR -.ad -.sp .6 -.RS 4n -Set principal trace buffer size (\fIbufsz\fR). The trace buffer size can include any of the size suffixes \fBk\fR, \fBm\fR, \fBg\fR, or \fBt\fR. 
If the buffer space cannot be allocated, \fBdtrace\fR attempts -to reduce the buffer size or exit depending on the setting of the \fBbufresize\fR property. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-c\fR \fIcmd\fR\fR -.ad -.sp .6 -.RS 4n -Run the specified command \fIcmd\fR and exit upon its completion. If more than one \fB-c\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status for each child process as it -terminates. The process-ID of the first command is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information -on macro variables. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-C\fR\fR -.ad -.sp .6 -.RS 4n -Run the C preprocessor \fBcpp\fR(1) over D programs before compiling them. You can pass options to the C preprocessor using the \fB-D\fR, \fB-U\fR, \fB-I\fR, and \fB-H\fR options. You can select the degree of C standard conformance if you use the \fB-X\fR option. For a description of the set of tokens defined by the D compiler when invoking the C preprocessor, see \fB-X\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-D\fR \fIname\fR \fB[=\fR\fIvalue\fR\fB]\fR\fR -.ad -.sp .6 -.RS 4n -Define \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). If you specify the equals sign (\fB=\fR) -and additional \fIvalue\fR, the name is assigned the corresponding value. This option passes the \fB-D\fR option to each \fBcpp\fR invocation. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-e\fR\fR -.ad -.sp .6 -.RS 4n -Exit after compiling any requests and consuming anonymous tracing state (\fB-a\fR option) but prior to enabling any probes. You can combine this option with the \fB-a\fR option to print anonymous tracing data and exit. You can also combine this option with D -compiler options. 
This combination verifies that the programs compile without actually executing them and enabling the corresponding instrumentation. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-f\fR\fB[[\fR\fIprovider\fR\fB:]\fR\fImodule\fR\fB:]\fR\fIfunction\fR\fB[[\fR\fIpredicate\fR\fB]\fR\fIaction\fR\fB]]\fR\fR -.ad -.sp .6 -.RS 4n -Specify function name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function\fR, \fImodule:function\fR, or \fIfunction\fR. -Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIfunction\fR are specified in the description, all probes with the corresponding \fIfunction\fR are matched. -The \fB-f\fR argument can be suffixed with an optional D probe clause. You can specify more than one \fB-f\fR option on the command line at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-F\fR\fR -.ad -.sp .6 -.RS 4n -Coalesce trace output by identifying function entry and return. Function entry probe reports are indented and their output is prefixed with \fB->\fR. Function return probe reports are unindented and their output is prefixed with \fB<-\fR\&. System call -entry probe reports are indented and their output is prefixed with \fB=>\fR. System call return probe reports are unindented and their output is prefixed with \fB<=\fR\&. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-G\fR\fR -.ad -.sp .6 -.RS 4n -Generate an ELF file containing an embedded DTrace program. The DTrace probes specified in the program are saved inside of a relocatable ELF object which can be linked into another program. If the \fB-o\fR option is present, the ELF file is saved using the pathname specified -as the argument for this operand. If the \fB-o\fR option is not present and the DTrace program is contained with a file whose name is \fB\fIfilename\fR.d\fR, then the ELF file is saved using the name \fB\fIfilename\fR.o\fR. 
-Otherwise the ELF file is saved using the name \fBd.out\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-H\fR\fR -.ad -.sp .6 -.RS 4n -Print the pathnames of included files when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-H\fR option -to each \fBcpp\fR invocation, causing it to display the list of pathnames, one for each line, to \fBstderr\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-h\fR\fR -.ad -.sp .6 -.RS 4n -Generate a header file containing macros that correspond to probes in the specified provider definitions. This option should be used to generate a header file that is included by other source files for later use with the \fB-G\fR option. If the \fB-o\fR option -is present, the header file is saved using the pathname specified as the argument for that option. If the \fB-o\fR option is not present and the DTrace program is contained with a file whose name is \fIfilename\fR\fB\&.d\fR, then the header file is saved -using the name \fIfilename\fR\fB\&.h\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-i\fR \fIprobe-id\fR\fB[[\fR\fIpredicate\fR] \fIaction\fR\fB]\fR\fR -.ad -.sp .6 -.RS 4n -Specify probe identifier (\fIprobe-id\fR) to trace or list (\fB-l\fR option). You can specify probe IDs using decimal integers as shown by \fBdtrace\fR \fB-l\fR. The \fB-i\fR argument can be suffixed with an optional -D probe clause. You can specify more than one \fB-i\fR option at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-I\fR \fIpath\fR\fR -.ad -.sp .6 -.RS 4n -Add the specified directory \fIpath\fR to the search path for \fB#include\fR files when invoking \fBcpp\fR(1) (enabled -using the \fB-C\fR option). This option passes the \fB-I\fR option to each \fBcpp\fR invocation. The specified \fIpath\fR is inserted into the search path ahead of the default directory list. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-L\fR \fIpath\fR\fR -.ad -.sp .6 -.RS 4n -Add the specified directory \fIpath\fR to the search path for DTrace libraries. 
DTrace libraries are used to contain common definitions that can be used when writing D programs. The specified \fIpath\fR is added after the default library -search path. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-l\fR\fR -.ad -.sp .6 -.RS 4n -List probes instead of enabling them. If the \fB-l\fR option is specified, \fBdtrace\fR produces a report of the probes matching the descriptions given using the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, \fB-i\fR, -and \fB-s\fR options. If none of these options are specified, this option lists all probes. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-m\fR [[\fIprovider:\fR] \fImodule:\fR [[\fIpredicate\fR] \fIaction\fR]]\fR -.ad -.sp .6 -.RS 4n -Specify module name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module\fR or \fImodule\fR. Unspecified probe description fields are left blank and match -any probes regardless of the values in those fields. If no qualifiers other than \fImodule\fR are specified in the description, all probes with a corresponding \fImodule\fR are matched. The \fB-m\fR argument can be suffixed with an optional D -probe clause. More than one \fB-m\fR option can be specified on the command line at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]\fR -.ad -.sp .6 -.RS 4n -Specify probe name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function:name\fR, \fImodule:function:name\fR, \fIfunction:name\fR, -or \fIname\fR. Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIname\fR are specified in the description, all probes with a corresponding \fIname\fR are -matched. The \fB-n\fR argument can be suffixed with an optional D probe clause. 
More than one \fB-n\fR option can be specified on the command line at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-o\fR \fIoutput\fR\fR -.ad -.sp .6 -.RS 4n -Specify the \fIoutput\fR file for the \fB-A\fR , \fB-G\fR, and \fB-l\fR options, or for the traced data itself. If the \fB-A\fR option is present and \fB-o\fR is not present, the default output file is \fB/kernel/drv/dtrace.conf\fR. If the \fB-G\fR option is present and the \fB-s\fR option's argument is of the form \fB\fIfilename\fR.d\fR and \fB-o\fR is not present, the default output file is \fB\fIfilename\fR.o\fR. -Otherwise the default output file is \fBd.out\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-p\fR \fIpid\fR\fR -.ad -.sp .6 -.RS 4n -Grab the specified process-ID \fIpid\fR, cache its symbol tables, and exit upon its completion. If more than one \fB-p\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status -for each process as it terminates. The first process-ID is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for -more information on macro variables. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-P\fR \fIprovider\fR \fB[[\fR\fIpredicate\fR\fB]\fR \fIaction\fR]\fR -.ad -.sp .6 -.RS 4n -Specify provider name to trace or list (\fB-l\fR option). The remaining probe description fields module, function, and name are left blank and match any probes regardless of the values in those fields. The \fB-P\fR argument can be suffixed with an optional D -probe clause. You can specify more than one \fB-P\fR option on the command line at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-q\fR\fR -.ad -.sp .6 -.RS 4n -Set quiet mode. \fBdtrace\fR suppresses messages such as the number of probes matched by the specified options and D programs and does not print column headers, the CPU ID, the probe ID, or insert newlines into the output. 
Only data traced and formatted by D program -statements such as \fBtrace()\fR and \fBprintf()\fR is displayed to \fBstdout\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-s\fR\fR -.ad -.sp .6 -.RS 4n -Compile the specified D program source file. If the \fB-e\fR option is present, the program is compiled but instrumentation is not enabled. If the \fB-l\fR option is present, the program is compiled and the set of probes matched by it is listed, but instrumentation -is not enabled. If none of \fB-e\fR, \fB-l\fR, \fB-G\fR, or \fB-A\fR are present, the instrumentation specified by the D program is enabled and tracing begins. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-S\fR\fR -.ad -.sp .6 -.RS 4n -Show D compiler intermediate code. The D compiler produces a report of the intermediate code generated for each D program to \fBstderr\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-U\fR \fIname\fR\fR -.ad -.sp .6 -.RS 4n -Undefine the specified \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-U\fR option to each \fBcpp\fR invocation. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-v\fR\fR -.ad -.sp .6 -.RS 4n -Set verbose mode. If the \fB-v\fR option is specified, \fBdtrace\fR produces a program stability report showing the minimum interface stability and dependency level for the specified D programs. DTrace stability levels are explained in further detail in the \fISolaris Dynamic Tracing Guide\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-V\fR\fR -.ad -.sp .6 -.RS 4n -Report the highest D programming interface version supported by \fBdtrace\fR. The version information is printed to \fBstdout\fR and the \fBdtrace\fR command exits. Refer to the \fISolaris Dynamic Tracing Guide\fR for -more information about DTrace versioning features. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-w\fR\fR -.ad -.sp .6 -.RS 4n -Permit destructive actions in D programs specified using the \fB-s\fR, \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options. 
If the \fB-w\fR option is not specified, \fBdtrace\fR does not -permit the compilation or enabling of a D program that contains destructive actions. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-x\fR \fIarg\fR [\fI=val\fR]\fR -.ad -.sp .6 -.RS 4n -Enable or modify a DTrace runtime option or D compiler option. The list of options is found in the \fISolaris Dynamic Tracing Guide\fR. Boolean options are enabled by specifying their name. Options with values are set by separating the option name and -value with an equals sign (\fB=\fR). -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-X\fR \fBa | c | s | t\fR\fR -.ad -.sp .6 -.RS 4n -Specify the degree of conformance to the ISO C standard that should be selected when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). -The \fB-X\fR option argument affects the value and presence of the \fB__STDC__\fR macro depending upon the value of the argument letter. -.sp -The \fB-X\fR option supports the following arguments: -.sp -.ne 2 -.mk -.na -\fB\fBa\fR\fR -.ad -.RS 5n -.rt -Default. ISO C plus K&R compatibility extensions, with semantic changes required by ISO C. This is the default mode if \fB-X\fR is not specified. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction -with the \fB-Xa\fR option. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fBc\fR\fR -.ad -.RS 5n -.rt -Conformance. Strictly conformant ISO C, without K&R C compatibility extensions. The predefined macro \fB__STDC__\fR has a value of 1 when \fBcpp\fR is invoked in conjunction with the \fB-Xc\fR option. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fBs\fR\fR -.ad -.RS 5n -.rt -K&R C only. The macro \fB__STDC__\fR is not defined when \fBcpp\fR is invoked in conjunction with the \fB-Xs\fR option. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fBt\fR\fR -.ad -.RS 5n -.rt -Transition. ISO C plus K&R C compatibility extensions, without semantic changes required by ISO C. 
The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction with the \fB-Xt\fR option. -.RE - -As the \fB-X\fR option only affects how the D compiler invokes the C preprocessor, the \fB-Xa\fR and \fB-Xt\fR options are equivalent from the perspective of D and both are provided only to ease re-use of settings from a C build environment. -.sp -Regardless of the \fB-X\fR mode, the following additional C preprocessor definitions are always specified and valid in all modes: -.RS +4 -.TP -.ie t \(bu -.el o -\fB__sun\fR -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__unix\fR -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__SVR4\fR -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__sparc\fR (on SPARC systems only) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__sparcv9\fR (on SPARC systems only when 64-bit programs are compiled) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__i386\fR (on x86 systems only when 32-bit programs are compiled) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__amd64\fR (on x86 systems only when 64-bit programs are compiled) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__\fI`uname -s`\fR_\fI`uname -r`\fR\fR (for example, \fB__SunOS_5_10\fR) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__SUNW_D=1\fR -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__SUNW_D_VERSION=0x\fIMMmmmuuu\fR\fR -.sp -Where \fIMM\fR is the major release value in hexadecimal, \fImmm\fR is the minor release value in hexadecimal, and \fIuuu\fR is the -micro release value in hexadecimal. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information about DTrace versioning. -.RE -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-Z\fR\fR -.ad -.sp .6 -.RS 4n -Permit probe descriptions that match zero probes. If the \fB-Z\fR option is not specified, \fBdtrace\fR reports an error and exits if any probe descriptions specified in D program files (\fB-s\fR option) or on the command line (\fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options) contain descriptions that do not match any known probes. 
-.RE - -.SH OPERANDS -.sp -.LP -You can specify zero or more additional arguments on the \fBdtrace\fR command line to define a set of macro variables (\fB$1\fR, \fB$2\fR, and so forth). The additional arguments can be used in D programs specified using the \fB-s\fR option -or on the command line. The use of macro variables is described further in the \fISolaris Dynamic Tracing Guide\fR. -.SH EXIT STATUS -.sp -.LP -The following exit values are returned: -.sp -.ne 2 -.mk -.na -\fB0\fR -.ad -.RS 5n -.rt -Successful completion. -.sp -For D program requests, an exit status of \fB0\fR indicates that programs were successfully compiled, probes were successfully enabled, or anonymous state was successfully retrieved. \fBdtrace\fR returns \fB0\fR even if the specified tracing requests -encountered errors or drops. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB1\fR\fR -.ad -.RS 5n -.rt -An error occurred. -.sp -For D program requests, an exit status of \fB1\fR indicates that program compilation failed or that the specified request could not be satisfied. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB2\fR\fR -.ad -.RS 5n -.rt -Invalid command line options or arguments were specified. -.RE - -.SH ATTRIBUTES -.sp -.LP -See \fBattributes\fR(5) for descriptions of the following attributes: -.sp - -.sp -.TS -tab() box; -cw(2.75i) |cw(2.75i) -lw(2.75i) |lw(2.75i) -. -ATTRIBUTE TYPEATTRIBUTE VALUE -_ -AvailabilitySUNWdtrc -_ -Interface StabilitySee below. -.TE - -.sp -.LP -The command-line syntax is Committed. The human-readable output is Uncommitted. 
-.SH SEE ALSO -.sp -.LP -\fBcpp\fR(1), \fBisainfo\fR(1), \fBlibdtrace\fR(3LIB), \fBdriver.conf\fR(4), \fBattributes\fR(5), \fBdtrace\fR(7D) -.sp -.LP -\fISolaris Dynamic Tracing Guide\fR diff --git a/man/man1m/dtrace.1m b/man/man1m/dtrace.1m new file mode 100644 index 0000000..e20ed9f --- /dev/null +++ b/man/man1m/dtrace.1m @@ -0,0 +1,670 @@ +'\" te +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved. 
+.TH dtrace 1M "5 Sep 2006" "SunOS 5.11" "System Administration Commands" +.SH NAME +dtrace \- DTrace dynamic tracing compiler and tracing utility +.SH SYNOPSIS +.LP +.nf +\fBdtrace\fR [\fB-32\fR | \fB-64\fR] [\fB-aACeFGHhlqSvVwZ\fR] [\fB-b\fR \fIbufsz\fR] [\fB-c\fR \fIcmd\fR] + [\fB-D\fR \fIname\fR [\fI=value\fR]] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-o\fR \fIoutput\fR] + [\fB-s\fR \fIscript\fR] [\fB-U\fR \fIname\fR] [\fB-x\fR \fIarg\fR [\fI=val\fR]] + [\fB-X\fR a | c | s | t] [\fB-p\fR \fIpid\fR] + [\fB-P\fR \fIprovider\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-m\fR [\fIprovider:\fR] \fImodule\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-f\fR [[\fIprovider:\fR] \fImodule:\fR] \fIfunction\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-i\fR \fIprobe-id\fR [[\fIpredicate\fR] \fIaction\fR]] +.fi + +.SH DESCRIPTION +.sp +.LP +DTrace is a comprehensive dynamic tracing framework for the Solaris Operating System. DTrace provides a powerful infrastructure that permits administrators, developers, and service personnel to concisely answer arbitrary questions about the behavior of the operating system and user programs. +.sp +.LP +The \fISolaris Dynamic Tracing Guide\fR describes how to use DTrace to observe, debug, and tune system behavior. Refer to this book for a detailed description of DTrace features, including the bundled DTrace observability +tools, instrumentation providers, and the D programming language. 
+.sp +.LP +The \fBdtrace\fR command provides a generic interface to the essential services provided by the DTrace facility, including: +.RS +4 +.TP +.ie t \(bu +.el o +Options that list the set of probes and providers currently published by DTrace +.RE +.RS +4 +.TP +.ie t \(bu +.el o +Options that enable probes directly using any of the probe description specifiers (provider, module, function, name) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +Options that run the D compiler and compile one or more D program files or programs written directly on the command line +.RE +.RS +4 +.TP +.ie t \(bu +.el o +Options that generate anonymous tracing programs +.RE +.RS +4 +.TP +.ie t \(bu +.el o +Options that generate program stability reports +.RE +.RS +4 +.TP +.ie t \(bu +.el o +Options that modify DTrace tracing and buffering behavior and enable additional D compiler features +.RE +.sp +.LP +You can use \fBdtrace\fR to create D scripts by using it in a \fB#!\fR declaration to create an interpreter file. You can also use \fBdtrace\fR to attempt to compile D programs and determine their properties without actually enabling tracing using the \fB-e\fR option. See \fBOPTIONS\fR. See the \fISolaris Dynamic Tracing Guide\fR for detailed examples of how to use the \fBdtrace\fR utility to perform these tasks. +.SH OPTIONS +.sp +.LP +The arguments accepted by the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, and \fB-i\fR options can include an optional D language \fIpredicate\fR enclosed in slashes \fB//\fR and optional D language \fIaction\fR statement list enclosed in braces \fB{}\fR. D program code specified on the command line must be appropriately quoted to avoid interpretation of meta-characters by the shell. +.sp +.LP +The following options are supported: +.sp +.ne 2 +.mk +.na +\fB\fB-32\fR | \fB-64\fR\fR +.ad +.sp .6 +.RS 4n +The D compiler produces programs using the native data model of the operating system kernel.
You can use the \fBisainfo\fR \fB-b\fR command to determine the current operating system data model. If the \fB-32\fR option is specified, \fBdtrace\fR forces +the D compiler to compile a D program using the 32-bit data model. If the \fB-64\fR option is specified, \fBdtrace\fR forces the D compiler to compile a D program using the 64-bit data model. These options are typically not required as \fBdtrace\fR selects the +native data model as the default. The data model affects the sizes of integer types and other language properties. D programs compiled for either data model can be executed on both 32-bit and 64-bit kernels. The \fB-32\fR and \fB-64\fR options also determine the ELF file format +(ELF32 or ELF64) produced by the \fB-G\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-a\fR\fR +.ad +.sp .6 +.RS 4n +Claim anonymous tracing state and display the traced data. You can combine the \fB-a\fR option with the \fB-e\fR option to force \fBdtrace\fR to exit immediately after consuming the anonymous tracing state rather than continuing to wait for new +data. See the \fISolaris Dynamic Tracing Guide\fR for more information about anonymous tracing. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-A\fR\fR +.ad +.sp .6 +.RS 4n +Generate \fBdriver.conf\fR(4) directives for anonymous tracing. This option constructs a set of \fBdtrace\fR(7D) configuration file directives to enable the specified probes for anonymous tracing and then exits. By default, \fBdtrace\fR attempts to store the directives to the file \fB/kernel/drv/dtrace.conf\fR. You can modify this behavior if you use the \fB-o\fR option to specify an alternate output file. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-b\fR \fIbufsz\fR\fR +.ad +.sp .6 +.RS 4n +Set principal trace buffer size (\fIbufsz\fR). The trace buffer size can include any of the size suffixes \fBk\fR, \fBm\fR, \fBg\fR, or \fBt\fR. 
If the buffer space cannot be allocated, \fBdtrace\fR attempts +to reduce the buffer size or exit depending on the setting of the \fBbufresize\fR property. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-c\fR \fIcmd\fR\fR +.ad +.sp .6 +.RS 4n +Run the specified command \fIcmd\fR and exit upon its completion. If more than one \fB-c\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status for each child process as it +terminates. The process-ID of the first command is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information +on macro variables. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-C\fR\fR +.ad +.sp .6 +.RS 4n +Run the C preprocessor \fBcpp\fR(1) over D programs before compiling them. You can pass options to the C preprocessor using the \fB-D\fR, \fB-U\fR, \fB-I\fR, and \fB-H\fR options. You can select the degree of C standard conformance if you use the \fB-X\fR option. For a description of the set of tokens defined by the D compiler when invoking the C preprocessor, see \fB-X\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-D\fR \fIname\fR \fB[=\fR\fIvalue\fR\fB]\fR\fR +.ad +.sp .6 +.RS 4n +Define \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). If you specify the equals sign (\fB=\fR) +and additional \fIvalue\fR, the name is assigned the corresponding value. This option passes the \fB-D\fR option to each \fBcpp\fR invocation. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-e\fR\fR +.ad +.sp .6 +.RS 4n +Exit after compiling any requests and consuming anonymous tracing state (\fB-a\fR option) but prior to enabling any probes. You can combine this option with the \fB-a\fR option to print anonymous tracing data and exit. You can also combine this option with D +compiler options. 
This combination verifies that the programs compile without actually executing them and enabling the corresponding instrumentation. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fB[[\fR\fIprovider\fR\fB:]\fR\fImodule\fR\fB:]\fR\fIfunction\fR\fB[[\fR\fIpredicate\fR\fB]\fR\fIaction\fR\fB]\fR\fR +.ad +.sp .6 +.RS 4n +Specify function name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function\fR, \fImodule:function\fR, or \fIfunction\fR. +Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIfunction\fR are specified in the description, all probes with the corresponding \fIfunction\fR are matched. +The \fB-f\fR argument can be suffixed with an optional D probe clause. You can specify more than one \fB-f\fR option on the command line at a time. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-F\fR\fR +.ad +.sp .6 +.RS 4n +Coalesce trace output by identifying function entry and return. Function entry probe reports are indented and their output is prefixed with \fB->\fR. Function return probe reports are unindented and their output is prefixed with \fB<-\fR\&. System call +entry probe reports are indented and their output is prefixed with \fB=>\fR. System call return probe reports are unindented and their output is prefixed with \fB<=\fR\&. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-G\fR\fR +.ad +.sp .6 +.RS 4n +Generate an ELF file containing an embedded DTrace program. The DTrace probes specified in the program are saved inside of a relocatable ELF object which can be linked into another program. If the \fB-o\fR option is present, the ELF file is saved using the pathname specified +as the argument for this operand. If the \fB-o\fR option is not present and the DTrace program is contained within a file whose name is \fB\fIfilename\fR.d\fR, then the ELF file is saved using the name \fB\fIfilename\fR.o\fR.
+Otherwise the ELF file is saved using the name \fBd.out\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-H\fR\fR +.ad +.sp .6 +.RS 4n +Print the pathnames of included files when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-H\fR option +to each \fBcpp\fR invocation, causing it to display the list of pathnames, one per line, to \fBstderr\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-h\fR\fR +.ad +.sp .6 +.RS 4n +Generate a header file containing macros that correspond to probes in the specified provider definitions. This option should be used to generate a header file that is included by other source files for later use with the \fB-G\fR option. If the \fB-o\fR option +is present, the header file is saved using the pathname specified as the argument for that option. If the \fB-o\fR option is not present and the DTrace program is contained within a file whose name is \fIfilename\fR\fB\&.d\fR, then the header file is saved +using the name \fIfilename\fR\fB\&.h\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-i\fR \fIprobe-id\fR\fB[[\fR\fIpredicate\fR] \fIaction\fR\fB]\fR\fR +.ad +.sp .6 +.RS 4n +Specify probe identifier (\fIprobe-id\fR) to trace or list (\fB-l\fR option). You can specify probe IDs using decimal integers as shown by \fBdtrace\fR \fB-l\fR. The \fB-i\fR argument can be suffixed with an optional +D probe clause. You can specify more than one \fB-i\fR option at a time. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-I\fR \fIpath\fR\fR +.ad +.sp .6 +.RS 4n +Add the specified directory \fIpath\fR to the search path for \fB#include\fR files when invoking \fBcpp\fR(1) (enabled +using the \fB-C\fR option). This option passes the \fB-I\fR option to each \fBcpp\fR invocation. The specified \fIpath\fR is inserted into the search path ahead of the default directory list. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-L\fR \fIpath\fR\fR +.ad +.sp .6 +.RS 4n +Add the specified directory \fIpath\fR to the search path for DTrace libraries.
DTrace libraries are used to contain common definitions that can be used when writing D programs. The specified \fIpath\fR is added after the default library +search path. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-l\fR\fR +.ad +.sp .6 +.RS 4n +List probes instead of enabling them. If the \fB-l\fR option is specified, \fBdtrace\fR produces a report of the probes matching the descriptions given using the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, \fB-i\fR, +and \fB-s\fR options. If none of these options are specified, this option lists all probes. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-m\fR [[\fIprovider:\fR] \fImodule:\fR [[\fIpredicate\fR] \fIaction\fR]]\fR +.ad +.sp .6 +.RS 4n +Specify module name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module\fR or \fImodule\fR. Unspecified probe description fields are left blank and match +any probes regardless of the values in those fields. If no qualifiers other than \fImodule\fR are specified in the description, all probes with a corresponding \fImodule\fR are matched. The \fB-m\fR argument can be suffixed with an optional D +probe clause. More than one \fB-m\fR option can be specified on the command line at a time. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]\fR +.ad +.sp .6 +.RS 4n +Specify probe name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function:name\fR, \fImodule:function:name\fR, \fIfunction:name\fR, +or \fIname\fR. Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIname\fR are specified in the description, all probes with a corresponding \fIname\fR are +matched. The \fB-n\fR argument can be suffixed with an optional D probe clause. 
More than one \fB-n\fR option can be specified on the command line at a time. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIoutput\fR\fR +.ad +.sp .6 +.RS 4n +Specify the \fIoutput\fR file for the \fB-A\fR, \fB-G\fR, and \fB-l\fR options, or for the traced data itself. If the \fB-A\fR option is present and \fB-o\fR is not present, the default output file is \fB/kernel/drv/dtrace.conf\fR. If the \fB-G\fR option is present and the \fB-s\fR option's argument is of the form \fB\fIfilename\fR.d\fR and \fB-o\fR is not present, the default output file is \fB\fIfilename\fR.o\fR. +Otherwise the default output file is \fBd.out\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR \fIpid\fR\fR +.ad +.sp .6 +.RS 4n +Grab the specified process-ID \fIpid\fR, cache its symbol tables, and exit upon its completion. If more than one \fB-p\fR option is present on the command line, \fBdtrace\fR exits when all processes have exited, reporting the exit status +for each process as it terminates. The first process-ID is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for +more information on macro variables. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-P\fR \fIprovider\fR \fB[[\fR\fIpredicate\fR\fB]\fR \fIaction\fR]\fR +.ad +.sp .6 +.RS 4n +Specify provider name to trace or list (\fB-l\fR option). The remaining probe description fields module, function, and name are left blank and match any probes regardless of the values in those fields. The \fB-P\fR argument can be suffixed with an optional D +probe clause. You can specify more than one \fB-P\fR option on the command line at a time. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-q\fR\fR +.ad +.sp .6 +.RS 4n +Set quiet mode. \fBdtrace\fR suppresses messages such as the number of probes matched by the specified options and D programs and does not print column headers, the CPU ID, the probe ID, or insert newlines into the output.
Only data traced and formatted by D program +statements such as \fBtrace()\fR and \fBprintf()\fR is displayed to \fBstdout\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-s\fR\fR +.ad +.sp .6 +.RS 4n +Compile the specified D program source file. If the \fB-e\fR option is present, the program is compiled but instrumentation is not enabled. If the \fB-l\fR option is present, the program is compiled and the set of probes matched by it is listed, but instrumentation +is not enabled. If none of \fB-e\fR, \fB-l\fR, \fB-G\fR, or \fB-A\fR are present, the instrumentation specified by the D program is enabled and tracing begins. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-S\fR\fR +.ad +.sp .6 +.RS 4n +Show D compiler intermediate code. The D compiler produces a report of the intermediate code generated for each D program to \fBstderr\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-U\fR \fIname\fR\fR +.ad +.sp .6 +.RS 4n +Undefine the specified \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-U\fR option to each \fBcpp\fR invocation. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-v\fR\fR +.ad +.sp .6 +.RS 4n +Set verbose mode. If the \fB-v\fR option is specified, \fBdtrace\fR produces a program stability report showing the minimum interface stability and dependency level for the specified D programs. DTrace stability levels are explained in further detail in the \fISolaris Dynamic Tracing Guide\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-V\fR\fR +.ad +.sp .6 +.RS 4n +Report the highest D programming interface version supported by \fBdtrace\fR. The version information is printed to \fBstdout\fR and the \fBdtrace\fR command exits. Refer to the \fISolaris Dynamic Tracing Guide\fR for +more information about DTrace versioning features. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-w\fR\fR +.ad +.sp .6 +.RS 4n +Permit destructive actions in D programs specified using the \fB-s\fR, \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options. 
If the \fB-w\fR option is not specified, \fBdtrace\fR does not +permit the compilation or enabling of a D program that contains destructive actions. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-x\fR \fIarg\fR [\fI=val\fR]\fR +.ad +.sp .6 +.RS 4n +Enable or modify a DTrace runtime option or D compiler option. The list of options is found in the \fISolaris Dynamic Tracing Guide\fR. Boolean options are enabled by specifying their name. Options with values are set by separating the option name and +value with an equals sign (\fB=\fR). +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-X\fR \fBa | c | s | t\fR\fR +.ad +.sp .6 +.RS 4n +Specify the degree of conformance to the ISO C standard that should be selected when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). +The \fB-X\fR option argument affects the value and presence of the \fB__STDC__\fR macro depending upon the value of the argument letter. +.sp +The \fB-X\fR option supports the following arguments: +.sp +.ne 2 +.mk +.na +\fB\fBa\fR\fR +.ad +.RS 5n +.rt +Default. ISO C plus K&R compatibility extensions, with semantic changes required by ISO C. This is the default mode if \fB-X\fR is not specified. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction +with the \fB-Xa\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBc\fR\fR +.ad +.RS 5n +.rt +Conformance. Strictly conformant ISO C, without K&R C compatibility extensions. The predefined macro \fB__STDC__\fR has a value of 1 when \fBcpp\fR is invoked in conjunction with the \fB-Xc\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBs\fR\fR +.ad +.RS 5n +.rt +K&R C only. The macro \fB__STDC__\fR is not defined when \fBcpp\fR is invoked in conjunction with the \fB-Xs\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBt\fR\fR +.ad +.RS 5n +.rt +Transition. ISO C plus K&R C compatibility extensions, without semantic changes required by ISO C. 
The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction with the \fB-Xt\fR option. +.RE + +As the \fB-X\fR option only affects how the D compiler invokes the C preprocessor, the \fB-Xa\fR and \fB-Xt\fR options are equivalent from the perspective of D and both are provided only to ease re-use of settings from a C build environment. +.sp +Regardless of the \fB-X\fR mode, the following additional C preprocessor definitions are always specified and valid in all modes: +.RS +4 +.TP +.ie t \(bu +.el o +\fB__sun\fR +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__unix\fR +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__SVR4\fR +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__sparc\fR (on SPARC systems only) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__sparcv9\fR (on SPARC systems only when 64-bit programs are compiled) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__i386\fR (on x86 systems only when 32-bit programs are compiled) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__amd64\fR (on x86 systems only when 64-bit programs are compiled) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__\fI`uname -s`\fR_\fI`uname -r`\fR\fR (for example, \fB__SunOS_5_10\fR) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__SUNW_D=1\fR +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fB__SUNW_D_VERSION=0x\fIMMmmmuuu\fR\fR +.sp +Where \fIMM\fR is the major release value in hexadecimal, \fImmm\fR is the minor release value in hexadecimal, and \fIuuu\fR is the +micro release value in hexadecimal. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information about DTrace versioning. +.RE +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-Z\fR\fR +.ad +.sp .6 +.RS 4n +Permit probe descriptions that match zero probes. If the \fB-Z\fR option is not specified, \fBdtrace\fR reports an error and exits if any probe descriptions specified in D program files (\fB-s\fR option) or on the command line (\fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options) contain descriptions that do not match any known probes. 
+.RE + +.SH OPERANDS +.sp +.LP +You can specify zero or more additional arguments on the \fBdtrace\fR command line to define a set of macro variables (\fB$1\fR, \fB$2\fR, and so forth). The additional arguments can be used in D programs specified using the \fB-s\fR option +or on the command line. The use of macro variables is described further in the \fISolaris Dynamic Tracing Guide\fR. +.SH EXIT STATUS +.sp +.LP +The following exit values are returned: +.sp +.ne 2 +.mk +.na +\fB0\fR +.ad +.RS 5n +.rt +Successful completion. +.sp +For D program requests, an exit status of \fB0\fR indicates that programs were successfully compiled, probes were successfully enabled, or anonymous state was successfully retrieved. \fBdtrace\fR returns \fB0\fR even if the specified tracing requests +encountered errors or drops. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB1\fR\fR +.ad +.RS 5n +.rt +An error occurred. +.sp +For D program requests, an exit status of \fB1\fR indicates that program compilation failed or that the specified request could not be satisfied. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB2\fR\fR +.ad +.RS 5n +.rt +Invalid command line options or arguments were specified. +.RE + +.SH ATTRIBUTES +.sp +.LP +See \fBattributes\fR(5) for descriptions of the following attributes: +.sp + +.sp +.TS +tab(:) box; +cw(2.75i) |cw(2.75i) +lw(2.75i) |lw(2.75i) +. +ATTRIBUTE TYPE:ATTRIBUTE VALUE +_ +Availability:SUNWdtrc +_ +Interface Stability:See below. +.TE + +.sp +.LP +The command-line syntax is Committed. The human-readable output is Uncommitted.
+.SH SEE ALSO +.sp +.LP +\fBcpp\fR(1), \fBisainfo\fR(1), \fBlibdtrace\fR(3LIB), \fBdriver.conf\fR(4), \fBattributes\fR(5), \fBdtrace\fR(7D) +.sp +.LP +\fISolaris Dynamic Tracing Guide\fR -- cgit v1.1 From 9a3a05c57f08e07890087a8cc0cdbd31a8cd9cf0 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 18 Jul 2012 09:53:20 +0000 Subject: Import relevant vendor manual pages from illumos-gate revision 13304:b54231762cfa Obtained from: ssh://anonhg@hg.illumos.org/illumos-gate --- man/man1m/dtrace.1m | 393 +++-- man/man1m/lockstat.1m | 909 +++++++++++ man/man1m/plockstat.1m | 273 ++++ man/man1m/zdb.1m | 87 ++ man/man1m/zfs.1m | 3781 ++++++++++++++++++++++++++++++++++++++++++++++ man/man1m/zpool.1m | 2146 ++++++++++++++++++++++++++ man/man1m/zstreamdump.1m | 67 + 7 files changed, 7532 insertions(+), 124 deletions(-) create mode 100644 man/man1m/lockstat.1m create mode 100644 man/man1m/plockstat.1m create mode 100644 man/man1m/zdb.1m create mode 100644 man/man1m/zfs.1m create mode 100644 man/man1m/zpool.1m create mode 100644 man/man1m/zstreamdump.1m diff --git a/man/man1m/dtrace.1m b/man/man1m/dtrace.1m index e20ed9f..1381044 100644 --- a/man/man1m/dtrace.1m +++ b/man/man1m/dtrace.1m @@ -1,51 +1,42 @@ '\" te -.\" CDDL HEADER START -.\" -.\" The contents of this file are subject to the terms of the -.\" Common Development and Distribution License (the "License"). -.\" You may not use this file except in compliance with the License. -.\" -.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -.\" or http://www.opensolaris.org/os/licensing. -.\" See the License for the specific language governing permissions -.\" and limitations under the License. -.\" -.\" When distributing Covered Code, include this CDDL HEADER in each -.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-.\" If applicable, add the following below this CDDL HEADER, with the -.\" fields enclosed by brackets "[]" replaced with your own identifying -.\" information: Portions Copyright [yyyy] [name of copyright owner] -.\" -.\" CDDL HEADER END -.\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved. -.TH dtrace 1M "5 Sep 2006" "SunOS 5.11" "System Administration Commands" +.\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved. +.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.TH dtrace 1M "4 Aug 2009" "SunOS 5.11" "System Administration Commands" .SH NAME dtrace \- DTrace dynamic tracing compiler and tracing utility .SH SYNOPSIS .LP .nf \fBdtrace\fR [\fB-32\fR | \fB-64\fR] [\fB-aACeFGHhlqSvVwZ\fR] [\fB-b\fR \fIbufsz\fR] [\fB-c\fR \fIcmd\fR] - [\fB-D\fR \fIname\fR [\fI=value\fR]] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-o\fR \fIoutput\fR] - [\fB-s\fR \fIscript\fR] [\fB-U\fR \fIname\fR] [\fB-x\fR \fIarg\fR [\fI=val\fR]] - [\fB-X\fR a | c | s | t] [\fB-p\fR \fIpid\fR] - [\fB-P\fR \fIprovider\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-m\fR [\fIprovider:\fR] \fImodule\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-f\fR [[\fIprovider:\fR] \fImodule:\fR] \fIfunction\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] 
\fIname\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-i\fR \fIprobe-id\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-D\fR \fIname\fR [\fI=value\fR]] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-o\fR \fIoutput\fR] + [\fB-s\fR \fIscript\fR] [\fB-U\fR \fIname\fR] [\fB-x\fR \fIarg\fR [\fI=val\fR]] + [\fB-X\fR a | c | s | t] [\fB-p\fR \fIpid\fR] + [\fB-P\fR \fIprovider\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-m\fR [\fIprovider:\fR] \fImodule\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-f\fR [[\fIprovider:\fR] \fImodule:\fR] \fIfunction\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-i\fR \fIprobe-id\fR [[\fIpredicate\fR] \fIaction\fR]] .fi .SH DESCRIPTION .sp .LP -DTrace is a comprehensive dynamic tracing framework for the Solaris Operating System. DTrace provides a powerful infrastructure that permits administrators, developers, and service personnel to concisely answer arbitrary questions about the behavior of the operating system and user programs. +DTrace is a comprehensive dynamic tracing framework for the Solaris Operating +System. DTrace provides a powerful infrastructure that permits administrators, +developers, and service personnel to concisely answer arbitrary questions about +the behavior of the operating system and user programs. .sp .LP -The \fISolaris Dynamic Tracing Guide\fR describes how to use DTrace to observe, debug, and tune system behavior. Refer to this book for a detailed description of DTrace features, including the bundled DTrace observability -tools, instrumentation providers, and the D programming language. +The \fISolaris Dynamic Tracing Guide\fR describes how to use DTrace to observe, +debug, and tune system behavior. Refer to this book for a detailed description +of DTrace features, including the bundled DTrace observability tools, +instrumentation providers, and the D programming language. 
.sp .LP -The \fBdtrace\fR command provides a generic interface to the essential services provided by the DTrace facility, including: +The \fBdtrace\fR command provides a generic interface to the essential services +provided by the DTrace facility, including: .RS +4 .TP .ie t \(bu @@ -56,13 +47,15 @@ Options that list the set of probes and providers currently published by DTrace .TP .ie t \(bu .el o -Options that enable probes directly using any of the probe description specifiers (provider, module, function, name) +Options that enable probes directly using any of the probe description +specifiers (provider, module, function, name) .RE .RS +4 .TP .ie t \(bu .el o -Options that run the D compiler and compile one or more D program files or programs written directly on the command line +Options that run the D compiler and compile one or more D program files or +programs written directly on the command line .RE .RS +4 .TP @@ -80,15 +73,25 @@ Options that generate program stability reports .TP .ie t \(bu .el o -Options that modify DTrace tracing and buffering behavior and enable additional D compiler features +Options that modify DTrace tracing and buffering behavior and enable additional +D compiler features .RE .sp .LP -You can use \fBdtrace\fR to create D scripts by using it in a \fB#!\fR declaration to create an interpreter file. You can also use \fBdtrace\fR to attempt to compile D programs and determine their properties without actually enabling tracing using the \fB-e\fR option. See \fBOPTIONS\fR. See the \fISolaris Dynamic Tracing Guide\fR for detailed examples of how to use the \fBdtrace\fR utility to perform these tasks. +You can use \fBdtrace\fR to create D scripts by using it in a \fB#!\fR +declaration to create an interpreter file. You can also use \fBdtrace\fR to +attempt to compile D programs and determine their properties without actually +enabling tracing using the \fB-e\fR option. See \fBOPTIONS\fR. 
See the +\fISolaris Dynamic Tracing Guide\fR for detailed examples of how to use the +\fBdtrace\fR utility to perform these tasks. .SH OPTIONS .sp .LP -The arguments accepted by the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, and \fB-i\fR options can include an optional D language \fIpredicate\fR enclosed in slashes \fB//\fR and optional D language \fIaction\fR statement list enclosed in braces \fB{}\fR. D program code specified on the command line must be appropriately quoted to avoid intepretation of meta-characters by the shell. +The arguments accepted by the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, and +\fB-i\fR options can include an optional D language \fIpredicate\fR enclosed in +slashes \fB//\fR and optional D language \fIaction\fR statement list enclosed +in braces \fB{}\fR. D program code specified on the command line must be +appropriately quoted to avoid interpretation of meta-characters by the shell. .sp .LP The following options are supported: @@ -100,10 +103,17 @@ The following options are supported: .ad .sp .6 .RS 4n -The D compiler produces programs using the native data model of the operating system kernel. You can use the \fBisainfo\fR \fB-b\fR command to determine the current operating system data model. If the \fB-32\fR option is specified, \fBdtrace\fR forces -the D compiler to compile a D program using the 32-bit data model. If the \fB-64\fR option is specified, \fBdtrace\fR forces the D compiler to compile a D program using the 64-bit data model. These options are typically not required as \fBdtrace\fR selects the -native data model as the default. The data model affects the sizes of integer types and other language properties. D programs compiled for either data model can be executed on both 32-bit and 64-bit kernels. The \fB-32\fR and \fB-64\fR options also determine the ELF file format -(ELF32 or ELF64) produced by the \fB-G\fR option. +The D compiler produces programs using the native data model of the operating +system kernel. 
You can use the \fBisainfo\fR \fB-b\fR command to determine the +current operating system data model. If the \fB-32\fR option is specified, +\fBdtrace\fR forces the D compiler to compile a D program using the 32-bit data +model. If the \fB-64\fR option is specified, \fBdtrace\fR forces the D compiler +to compile a D program using the 64-bit data model. These options are typically +not required as \fBdtrace\fR selects the native data model as the default. The +data model affects the sizes of integer types and other language properties. D +programs compiled for either data model can be executed on both 32-bit and +64-bit kernels. The \fB-32\fR and \fB-64\fR options also determine the ELF file +format (ELF32 or ELF64) produced by the \fB-G\fR option. .RE .sp @@ -114,8 +124,11 @@ native data model as the default. The data model affects the sizes of integer ty .ad .sp .6 .RS 4n -Claim anonymous tracing state and display the traced data. You can combine the \fB-a\fR option with the \fB-e\fR option to force \fBdtrace\fR to exit immediately after consuming the anonymous tracing state rather than continuing to wait for new -data. See the \fISolaris Dynamic Tracing Guide\fR for more information about anonymous tracing. +Claim anonymous tracing state and display the traced data. You can combine the +\fB-a\fR option with the \fB-e\fR option to force \fBdtrace\fR to exit +immediately after consuming the anonymous tracing state rather than continuing +to wait for new data. See the \fISolaris Dynamic Tracing Guide\fR for more +information about anonymous tracing. .RE .sp @@ -126,7 +139,12 @@ data. See the \fISolaris Dynamic Tracing Guide\fR for more information about ano .ad .sp .6 .RS 4n -Generate \fBdriver.conf\fR(4) directives for anonymous tracing. This option constructs a set of \fBdtrace\fR(7D) configuration file directives to enable the specified probes for anonymous tracing and then exits. 
By default, \fBdtrace\fR attempts to store the directives to the file \fB/kernel/drv/dtrace.conf\fR. You can modify this behavior if you use the \fB-o\fR option to specify an alternate output file. +Generate \fBdriver.conf\fR(4) directives for anonymous tracing. This option +constructs a set of \fBdtrace\fR(7D) configuration file directives to enable +the specified probes for anonymous tracing and then exits. By default, +\fBdtrace\fR attempts to store the directives to the file +\fB/kernel/drv/dtrace.conf\fR. You can modify this behavior if you use the +\fB-o\fR option to specify an alternate output file. .RE .sp @@ -137,8 +155,10 @@ Generate \fBdriver.conf\fR(4) directives for anonymous tracing. This option cons .ad .sp .6 .RS 4n -Set principal trace buffer size (\fIbufsz\fR). The trace buffer size can include any of the size suffixes \fBk\fR, \fBm\fR, \fBg\fR, or \fBt\fR. If the buffer space cannot be allocated, \fBdtrace\fR attempts -to reduce the buffer size or exit depending on the setting of the \fBbufresize\fR property. +Set principal trace buffer size (\fIbufsz\fR). The trace buffer size can +include any of the size suffixes \fBk\fR, \fBm\fR, \fBg\fR, or \fBt\fR. If the +buffer space cannot be allocated, \fBdtrace\fR attempts to reduce the buffer +size or exit depending on the setting of the \fBbufresize\fR property. .RE .sp @@ -149,9 +169,13 @@ to reduce the buffer size or exit depending on the setting of the \fBbufresize\f .ad .sp .6 .RS 4n -Run the specified command \fIcmd\fR and exit upon its completion. If more than one \fB-c\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status for each child process as it -terminates. The process-ID of the first command is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information -on macro variables. 
+Run the specified command \fIcmd\fR and exit upon its completion. If more than +one \fB-c\fR option is present on the command line, \fBdtrace\fR exits when all +commands have exited, reporting the exit status for each child process as it +terminates. The process-ID of the first command is made available to any D +programs specified on the command line or using the \fB-s\fR option through the +\fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR +for more information on macro variables. .RE .sp @@ -162,7 +186,11 @@ on macro variables. .ad .sp .6 .RS 4n -Run the C preprocessor \fBcpp\fR(1) over D programs before compiling them. You can pass options to the C preprocessor using the \fB-D\fR, \fB-U\fR, \fB-I\fR, and \fB-H\fR options. You can select the degree of C standard conformance if you use the \fB-X\fR option. For a description of the set of tokens defined by the D compiler when invoking the C preprocessor, see \fB-X\fR. +Run the C preprocessor \fBcpp\fR(1) over D programs before compiling them. You +can pass options to the C preprocessor using the \fB-D\fR, \fB-U\fR, \fB-I\fR, +and \fB-H\fR options. You can select the degree of C standard conformance if +you use the \fB-X\fR option. For a description of the set of tokens defined by +the D compiler when invoking the C preprocessor, see \fB-X\fR. .RE .sp @@ -173,8 +201,10 @@ Run the C preprocessor \fBcpp\fR(1) over D programs before compiling them. You c .ad .sp .6 .RS 4n -Define \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). If you specify the equals sign (\fB=\fR) -and additional \fIvalue\fR, the name is assigned the corresponding value. This option passes the \fB-D\fR option to each \fBcpp\fR invocation. +Define \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR +option). If you specify the equals sign (\fB=\fR) and additional \fIvalue\fR, +the name is assigned the corresponding value. 
This option passes the \fB-D\fR +option to each \fBcpp\fR invocation. .RE .sp @@ -185,21 +215,32 @@ and additional \fIvalue\fR, the name is assigned the corresponding value. This o .ad .sp .6 .RS 4n -Exit after compiling any requests and consuming anonymous tracing state (\fB-a\fR option) but prior to enabling any probes. You can combine this option with the \fB-a\fR option to print anonymous tracing data and exit. You can also combine this option with D -compiler options. This combination verifies that the programs compile without actually executing them and enabling the corresponding instrumentation. +Exit after compiling any requests and consuming anonymous tracing state +(\fB-a\fR option) but prior to enabling any probes. You can combine this option +with the \fB-a\fR option to print anonymous tracing data and exit. You can also +combine this option with D compiler options. This combination verifies that the +programs compile without actually executing them and enabling the corresponding +instrumentation. .RE .sp .ne 2 .mk .na -\fB\fB-f\fR\fB[[\fR\fIprovider\fR\fB:]\fR\fImodule\fR\fB:]\fR\fIfunction\fR\fB[[\fR\fIpredicate\fR\fB]\fR\fIaction\fR\fB]]\fR\fR +\fB\fB-f\fR\fB[[\fR\fIprovider\fR\fB:]\fR\fImodule\fR\fB:]\fR\fIfunction\fR\fB[ +[\fR\fIpredicate\fR\fB]\fR\fIaction\fR\fB]]\fR\fR .ad .sp .6 .RS 4n -Specify function name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function\fR, \fImodule:function\fR, or \fIfunction\fR. -Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIfunction\fR are specified in the description, all probes with the corresponding \fIfunction\fR are matched. -The \fB-f\fR argument can be suffixed with an optional D probe clause. You can specify more than one \fB-f\fR option on the command line at a time. +Specify function name to trace or list (\fB-l\fR option). 
The corresponding +argument can include any of the probe description forms +\fIprovider:module:function\fR, \fImodule:function\fR, or \fIfunction\fR. +Unspecified probe description fields are left blank and match any probes +regardless of the values in those fields. If no qualifiers other than +\fIfunction\fR are specified in the description, all probes with the +corresponding \fIfunction\fR are matched. The \fB-f\fR argument can be suffixed +with an optional D probe clause. You can specify more than one \fB-f\fR option +on the command line at a time. .RE .sp @@ -210,8 +251,12 @@ The \fB-f\fR argument can be suffixed with an optional D probe clause. You can s .ad .sp .6 .RS 4n -Coalesce trace output by identifying function entry and return. Function entry probe reports are indented and their output is prefixed with \fB->\fR. Function return probe reports are unindented and their output is prefixed with \fB<-\fR\&. System call -entry probe reports are indented and their output is prefixed with \fB=>\fR. System call return probe reports are unindented and their output is prefixed with \fB<=\fR\&. +Coalesce trace output by identifying function entry and return. Function entry +probe reports are indented and their output is prefixed with \fB->\fR. Function +return probe reports are unindented and their output is prefixed with +\fB<-\fR\&. System call entry probe reports are indented and their output is +prefixed with \fB=>\fR. System call return probe reports are unindented and +their output is prefixed with \fB<=\fR\&. .RE .sp @@ -222,9 +267,14 @@ entry probe reports are indented and their output is prefixed with \fB=>\fR. Sys .ad .sp .6 .RS 4n -Generate an ELF file containing an embedded DTrace program. The DTrace probes specified in the program are saved inside of a relocatable ELF object which can be linked into another program. If the \fB-o\fR option is present, the ELF file is saved using the pathname specified -as the argument for this operand. 
If the \fB-o\fR option is not present and the DTrace program is contained with a file whose name is \fB\fIfilename\fR.d\fR, then the ELF file is saved using the name \fB\fIfilename\fR.o\fR. -Otherwise the ELF file is saved using the name \fBd.out\fR. +Generate an ELF file containing an embedded DTrace program. The DTrace probes +specified in the program are saved inside of a relocatable ELF object which can +be linked into another program. If the \fB-o\fR option is present, the ELF file +is saved using the pathname specified as the argument for this operand. If the +\fB-o\fR option is not present and the DTrace program is contained within a file +whose name is \fB\fIfilename\fR.d\fR, then the ELF file is saved using the name +\fB\fIfilename\fR.o\fR. Otherwise the ELF file is saved using the name +\fBd.out\fR. .RE .sp @@ -235,8 +285,10 @@ Otherwise the ELF file is saved using the name \fBd.out\fR. .ad .sp .6 .RS 4n -Print the pathnames of included files when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-H\fR option -to each \fBcpp\fR invocation, causing it to display the list of pathnames, one for each line, to \fBstderr\fR. +Print the pathnames of included files when invoking \fBcpp\fR(1) (enabled using +the \fB-C\fR option). This option passes the \fB-H\fR option to each \fBcpp\fR +invocation, causing it to display the list of pathnames, one for each line, to +\fBstderr\fR. .RE .sp @@ -247,9 +299,14 @@ to each \fBcpp\fR invocation, causing it to display the list of pathnames, one f .ad .sp .6 .RS 4n -Generate a header file containing macros that correspond to probes in the specified provider definitions. This option should be used to generate a header file that is included by other source files for later use with the \fB-G\fR option. If the \fB-o\fR option -is present, the header file is saved using the pathname specified as the argument for that option.
If the \fB-o\fR option is not present and the DTrace program is contained with a file whose name is \fIfilename\fR\fB\&.d\fR, then the header file is saved -using the name \fIfilename\fR\fB\&.h\fR. +Generate a header file containing macros that correspond to probes in the +specified provider definitions. This option should be used to generate a header +file that is included by other source files for later use with the \fB-G\fR +option. If the \fB-o\fR option is present, the header file is saved using the +pathname specified as the argument for that option. If the \fB-o\fR option is +not present and the DTrace program is contained within a file whose name is +\fIfilename\fR\fB\&.d\fR, then the header file is saved using the name +\fIfilename\fR\fB\&.h\fR. .RE .sp @@ -260,8 +317,10 @@ using the name \fIfilename\fR\fB\&.h\fR. .ad .sp .6 .RS 4n -Specify probe identifier (\fIprobe-id\fR) to trace or list (\fB-l\fR option). You can specify probe IDs using decimal integers as shown by \fBdtrace\fR \fB-l\fR. The \fB-i\fR argument can be suffixed with an optional -D probe clause. You can specify more than one \fB-i\fR option at a time. +Specify probe identifier (\fIprobe-id\fR) to trace or list (\fB-l\fR option). +You can specify probe IDs using decimal integers as shown by \fBdtrace\fR +\fB-l\fR. The \fB-i\fR argument can be suffixed with an optional D probe +clause. You can specify more than one \fB-i\fR option at a time. .RE .sp @@ -272,8 +331,11 @@ D probe clause. You can specify more than one \fB-i\fR option at a time. .ad .sp .6 .RS 4n -Add the specified directory \fIpath\fR to the search path for \fB#include\fR files when invoking \fBcpp\fR(1) (enabled -using the \fB-C\fR option). This option passes the \fB-I\fR option to each \fBcpp\fR invocation. The specified \fIpath\fR is inserted into the search path ahead of the default directory list.
+Add the specified directory \fIpath\fR to the search path for \fB#include\fR +files when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This +option passes the \fB-I\fR option to each \fBcpp\fR invocation. The specified +\fIpath\fR is inserted into the search path ahead of the default directory +list. .RE .sp @@ -284,7 +346,9 @@ using the \fB-C\fR option). This option passes the \fB-I\fR option to each \fBcp .ad .sp .6 .RS 4n -Add the specified directory \fIpath\fR to the search path for DTrace libraries. DTrace libraries are used to contain common definitions that can be used when writing D programs. The specified \fIpath\fR is added after the default library +Add the specified directory \fIpath\fR to the search path for DTrace libraries. +DTrace libraries are used to contain common definitions that can be used when +writing D programs. The specified \fIpath\fR is added after the default library search path. .RE @@ -296,34 +360,49 @@ search path. .ad .sp .6 .RS 4n -List probes instead of enabling them. If the \fB-l\fR option is specified, \fBdtrace\fR produces a report of the probes matching the descriptions given using the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, \fB-i\fR, -and \fB-s\fR options. If none of these options are specified, this option lists all probes. +List probes instead of enabling them. If the \fB-l\fR option is specified, +\fBdtrace\fR produces a report of the probes matching the descriptions given +using the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, \fB-i\fR, and \fB-s\fR +options. If none of these options are specified, this option lists all probes. .RE .sp .ne 2 .mk .na -\fB\fB-m\fR [[\fIprovider:\fR] \fImodule:\fR [[\fIpredicate\fR] \fIaction\fR]]\fR +\fB\fB-m\fR [[\fIprovider:\fR] \fImodule:\fR [[\fIpredicate\fR] +\fIaction\fR]]\fR .ad .sp .6 .RS 4n -Specify module name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module\fR or \fImodule\fR. 
Unspecified probe description fields are left blank and match -any probes regardless of the values in those fields. If no qualifiers other than \fImodule\fR are specified in the description, all probes with a corresponding \fImodule\fR are matched. The \fB-m\fR argument can be suffixed with an optional D -probe clause. More than one \fB-m\fR option can be specified on the command line at a time. +Specify module name to trace or list (\fB-l\fR option). The corresponding +argument can include any of the probe description forms \fIprovider:module\fR +or \fImodule\fR. Unspecified probe description fields are left blank and match +any probes regardless of the values in those fields. If no qualifiers other +than \fImodule\fR are specified in the description, all probes with a +corresponding \fImodule\fR are matched. The \fB-m\fR argument can be suffixed +with an optional D probe clause. More than one \fB-m\fR option can be specified +on the command line at a time. .RE .sp .ne 2 .mk .na -\fB\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]\fR +\fB\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR +[[\fIpredicate\fR] \fIaction\fR]\fR .ad .sp .6 .RS 4n -Specify probe name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function:name\fR, \fImodule:function:name\fR, \fIfunction:name\fR, -or \fIname\fR. Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIname\fR are specified in the description, all probes with a corresponding \fIname\fR are -matched. The \fB-n\fR argument can be suffixed with an optional D probe clause. More than one \fB-n\fR option can be specified on the command line at a time. +Specify probe name to trace or list (\fB-l\fR option). 
The corresponding +argument can include any of the probe description forms +\fIprovider:module:function:name\fR, \fImodule:function:name\fR, +\fIfunction:name\fR, or \fIname\fR. Unspecified probe description fields are +left blank and match any probes regardless of the values in those fields. If no +qualifiers other than \fIname\fR are specified in the description, all probes +with a corresponding \fIname\fR are matched. The \fB-n\fR argument can be +suffixed with an optional D probe clause. More than one \fB-n\fR option can be +specified on the command line at a time. .RE .sp @@ -334,8 +413,13 @@ matched. The \fB-n\fR argument can be suffixed with an optional D probe clause. .ad .sp .6 .RS 4n -Specify the \fIoutput\fR file for the \fB-A\fR , \fB-G\fR, and \fB-l\fR options, or for the traced data itself. If the \fB-A\fR option is present and \fB-o\fR is not present, the default output file is \fB/kernel/drv/dtrace.conf\fR. If the \fB-G\fR option is present and the \fB-s\fR option's argument is of the form \fB\fIfilename\fR.d\fR and \fB-o\fR is not present, the default output file is \fB\fIfilename\fR.o\fR. -Otherwise the default output file is \fBd.out\fR. +Specify the \fIoutput\fR file for the \fB-A\fR, \fB-G\fR, \fB-h\fR, and +\fB-l\fR options, or for the traced data itself. If the \fB-A\fR option is +present and \fB-o\fR is not present, the default output file is +\fB/kernel/drv/dtrace.conf\fR. If the \fB-G\fR option is present and the +\fB-s\fR option's argument is of the form \fB\fIfilename\fR.d\fR and \fB-o\fR +is not present, the default output file is \fB\fIfilename\fR.o\fR. Otherwise +the default output file is \fBd.out\fR. .RE .sp @@ -346,9 +430,13 @@ Otherwise the default output file is \fBd.out\fR. .ad .sp .6 .RS 4n -Grab the specified process-ID \fIpid\fR, cache its symbol tables, and exit upon its completion.
If more than one \fB-p\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status -for each process as it terminates. The first process-ID is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for -more information on macro variables. +Grab the specified process-ID \fIpid\fR, cache its symbol tables, and exit upon +its completion. If more than one \fB-p\fR option is present on the command +line, \fBdtrace\fR exits when all commands have exited, reporting the exit +status for each process as it terminates. The first process-ID is made +available to any D programs specified on the command line or using the \fB-s\fR +option through the \fB$target\fR macro variable. Refer to the \fISolaris +Dynamic Tracing Guide\fR for more information on macro variables. .RE .sp @@ -359,8 +447,11 @@ more information on macro variables. .ad .sp .6 .RS 4n -Specify provider name to trace or list (\fB-l\fR option). The remaining probe description fields module, function, and name are left blank and match any probes regardless of the values in those fields. The \fB-P\fR argument can be suffixed with an optional D -probe clause. You can specify more than one \fB-P\fR option on the command line at a time. +Specify provider name to trace or list (\fB-l\fR option). The remaining probe +description fields module, function, and name are left blank and match any +probes regardless of the values in those fields. The \fB-P\fR argument can be +suffixed with an optional D probe clause. You can specify more than one +\fB-P\fR option on the command line at a time. .RE .sp @@ -371,8 +462,11 @@ probe clause. You can specify more than one \fB-P\fR option on the command line .ad .sp .6 .RS 4n -Set quiet mode. 
\fBdtrace\fR suppresses messages such as the number of probes matched by the specified options and D programs and does not print column headers, the CPU ID, the probe ID, or insert newlines into the output. Only data traced and formatted by D program -statements such as \fBtrace()\fR and \fBprintf()\fR is displayed to \fBstdout\fR. +Set quiet mode. \fBdtrace\fR suppresses messages such as the number of probes +matched by the specified options and D programs and does not print column +headers, the CPU ID, the probe ID, or insert newlines into the output. Only +data traced and formatted by D program statements such as \fBtrace()\fR and +\fBprintf()\fR is displayed to \fBstdout\fR. .RE .sp @@ -383,8 +477,12 @@ statements such as \fBtrace()\fR and \fBprintf()\fR is displayed to \fBstdout\fR .ad .sp .6 .RS 4n -Compile the specified D program source file. If the \fB-e\fR option is present, the program is compiled but instrumentation is not enabled. If the \fB-l\fR option is present, the program is compiled and the set of probes matched by it is listed, but instrumentation -is not enabled. If none of \fB-e\fR, \fB-l\fR, \fB-G\fR, or \fB-A\fR are present, the instrumentation specified by the D program is enabled and tracing begins. +Compile the specified D program source file. If the \fB-e\fR option is present, +the program is compiled but instrumentation is not enabled. If the \fB-l\fR +option is present, the program is compiled and the set of probes matched by it +is listed, but instrumentation is not enabled. If none of \fB-e\fR, \fB-l\fR, +\fB-G\fR, or \fB-A\fR are present, the instrumentation specified by the D +program is enabled and tracing begins. .RE .sp @@ -395,7 +493,8 @@ is not enabled. If none of \fB-e\fR, \fB-l\fR, \fB-G\fR, or \fB-A\fR are present .ad .sp .6 .RS 4n -Show D compiler intermediate code. The D compiler produces a report of the intermediate code generated for each D program to \fBstderr\fR. +Show D compiler intermediate code. 
The D compiler produces a report of the +intermediate code generated for each D program to \fBstderr\fR. .RE .sp @@ -406,7 +505,9 @@ Show D compiler intermediate code. The D compiler produces a report of the inter .ad .sp .6 .RS 4n -Undefine the specified \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-U\fR option to each \fBcpp\fR invocation. +Undefine the specified \fIname\fR when invoking \fBcpp\fR(1) (enabled using the +\fB-C\fR option). This option passes the \fB-U\fR option to each \fBcpp\fR +invocation. .RE .sp @@ -417,7 +518,10 @@ Undefine the specified \fIname\fR when invoking \fBcpp\fR(1) (enabled using the .ad .sp .6 .RS 4n -Set verbose mode. If the \fB-v\fR option is specified, \fBdtrace\fR produces a program stability report showing the minimum interface stability and dependency level for the specified D programs. DTrace stability levels are explained in further detail in the \fISolaris Dynamic Tracing Guide\fR. +Set verbose mode. If the \fB-v\fR option is specified, \fBdtrace\fR produces a +program stability report showing the minimum interface stability and dependency +level for the specified D programs. DTrace stability levels are explained in +further detail in the \fISolaris Dynamic Tracing Guide\fR. .RE .sp @@ -428,8 +532,10 @@ Set verbose mode. If the \fB-v\fR option is specified, \fBdtrace\fR produces a p .ad .sp .6 .RS 4n -Report the highest D programming interface version supported by \fBdtrace\fR. The version information is printed to \fBstdout\fR and the \fBdtrace\fR command exits. Refer to the \fISolaris Dynamic Tracing Guide\fR for -more information about DTrace versioning features. +Report the highest D programming interface version supported by \fBdtrace\fR. +The version information is printed to \fBstdout\fR and the \fBdtrace\fR command +exits. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information +about DTrace versioning features. 
.RE .sp @@ -440,8 +546,10 @@ more information about DTrace versioning features. .ad .sp .6 .RS 4n -Permit destructive actions in D programs specified using the \fB-s\fR, \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options. If the \fB-w\fR option is not specified, \fBdtrace\fR does not -permit the compilation or enabling of a D program that contains destructive actions. +Permit destructive actions in D programs specified using the \fB-s\fR, +\fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options. If the \fB-w\fR +option is not specified, \fBdtrace\fR does not permit the compilation or +enabling of a D program that contains destructive actions. .RE .sp @@ -452,8 +560,10 @@ permit the compilation or enabling of a D program that contains destructive acti .ad .sp .6 .RS 4n -Enable or modify a DTrace runtime option or D compiler option. The list of options is found in the \fISolaris Dynamic Tracing Guide\fR. Boolean options are enabled by specifying their name. Options with values are set by separating the option name and -value with an equals sign (\fB=\fR). +Enable or modify a DTrace runtime option or D compiler option. The list of +options is found in the \fISolaris Dynamic Tracing Guide\fR. Boolean options +are enabled by specifying their name. Options with values are set by separating +the option name and value with an equals sign (\fB=\fR). .RE .sp @@ -464,8 +574,10 @@ value with an equals sign (\fB=\fR). .ad .sp .6 .RS 4n -Specify the degree of conformance to the ISO C standard that should be selected when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). -The \fB-X\fR option argument affects the value and presence of the \fB__STDC__\fR macro depending upon the value of the argument letter. +Specify the degree of conformance to the ISO C standard that should be selected +when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). 
The \fB-X\fR +option argument affects the value and presence of the \fB__STDC__\fR macro +depending upon the value of the argument letter. .sp The \fB-X\fR option supports the following arguments: .sp @@ -476,8 +588,10 @@ The \fB-X\fR option supports the following arguments: .ad .RS 5n .rt -Default. ISO C plus K&R compatibility extensions, with semantic changes required by ISO C. This is the default mode if \fB-X\fR is not specified. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction -with the \fB-Xa\fR option. +Default. ISO C plus K&R compatibility extensions, with semantic changes +required by ISO C. This is the default mode if \fB-X\fR is not specified. The +predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in +conjunction with the \fB-Xa\fR option. .RE .sp @@ -488,7 +602,9 @@ with the \fB-Xa\fR option. .ad .RS 5n .rt -Conformance. Strictly conformant ISO C, without K&R C compatibility extensions. The predefined macro \fB__STDC__\fR has a value of 1 when \fBcpp\fR is invoked in conjunction with the \fB-Xc\fR option. +Conformance. Strictly conformant ISO C, without K&R C compatibility extensions. +The predefined macro \fB__STDC__\fR has a value of 1 when \fBcpp\fR is invoked +in conjunction with the \fB-Xc\fR option. .RE .sp @@ -499,7 +615,8 @@ Conformance. Strictly conformant ISO C, without K&R C compatibility extensions. .ad .RS 5n .rt -K&R C only. The macro \fB__STDC__\fR is not defined when \fBcpp\fR is invoked in conjunction with the \fB-Xs\fR option. +K&R C only. The macro \fB__STDC__\fR is not defined when \fBcpp\fR is invoked +in conjunction with the \fB-Xs\fR option. .RE .sp @@ -510,12 +627,18 @@ K&R C only. The macro \fB__STDC__\fR is not defined when \fBcpp\fR is invoked in .ad .RS 5n .rt -Transition. ISO C plus K&R C compatibility extensions, without semantic changes required by ISO C. 
The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction with the \fB-Xt\fR option. +Transition. ISO C plus K&R C compatibility extensions, without semantic changes +required by ISO C. The predefined macro \fB__STDC__\fR has a value of 0 when +\fBcpp\fR is invoked in conjunction with the \fB-Xt\fR option. .RE -As the \fB-X\fR option only affects how the D compiler invokes the C preprocessor, the \fB-Xa\fR and \fB-Xt\fR options are equivalent from the perspective of D and both are provided only to ease re-use of settings from a C build environment. +As the \fB-X\fR option only affects how the D compiler invokes the C +preprocessor, the \fB-Xa\fR and \fB-Xt\fR options are equivalent from the +perspective of D and both are provided only to ease re-use of settings from a C +build environment. .sp -Regardless of the \fB-X\fR mode, the following additional C preprocessor definitions are always specified and valid in all modes: +Regardless of the \fB-X\fR mode, the following additional C preprocessor +definitions are always specified and valid in all modes: .RS +4 .TP .ie t \(bu @@ -576,8 +699,10 @@ Regardless of the \fB-X\fR mode, the following additional C preprocessor definit .el o \fB__SUNW_D_VERSION=0x\fIMMmmmuuu\fR\fR .sp -Where \fIMM\fR is the major release value in hexadecimal, \fImmm\fR is the minor release value in hexadecimal, and \fIuuu\fR is the -micro release value in hexadecimal. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information about DTrace versioning. +Where \fIMM\fR is the major release value in hexadecimal, \fImmm\fR is the +minor release value in hexadecimal, and \fIuuu\fR is the micro release value in +hexadecimal. Refer to the \fISolaris Dynamic Tracing Guide\fR for more +information about DTrace versioning. .RE .RE @@ -589,14 +714,21 @@ micro release value in hexadecimal. Refer to the \fISolaris Dynamic Tracing Guid .ad .sp .6 .RS 4n -Permit probe descriptions that match zero probes. 
If the \fB-Z\fR option is not specified, \fBdtrace\fR reports an error and exits if any probe descriptions specified in D program files (\fB-s\fR option) or on the command line (\fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options) contain descriptions that do not match any known probes. +Permit probe descriptions that match zero probes. If the \fB-Z\fR option is not +specified, \fBdtrace\fR reports an error and exits if any probe descriptions +specified in D program files (\fB-s\fR option) or on the command line +(\fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options) contain +descriptions that do not match any known probes. .RE .SH OPERANDS .sp .LP -You can specify zero or more additional arguments on the \fBdtrace\fR command line to define a set of macro variables (\fB$1\fR, \fB$2\fR, and so forth). The additional arguments can be used in D programs specified using the \fB-s\fR option -or on the command line. The use of macro variables is described further in the \fISolaris Dynamic Tracing Guide\fR. +You can specify zero or more additional arguments on the \fBdtrace\fR command +line to define a set of macro variables (\fB$1\fR, \fB$2\fR, and so forth). The +additional arguments can be used in D programs specified using the \fB-s\fR +option or on the command line. The use of macro variables is described further +in the \fISolaris Dynamic Tracing Guide\fR. .SH EXIT STATUS .sp .LP @@ -609,10 +741,12 @@ The following exit values are returned: .ad .RS 5n .rt -Successful completion. +Successful completion. .sp -For D program requests, an exit status of \fB0\fR indicates that programs were successfully compiled, probes were successfully enabled, or anonymous state was successfully retrieved. \fBdtrace\fR returns \fB0\fR even if the specified tracing requests -encountered errors or drops. 
+For D program requests, an exit status of \fB0\fR indicates that programs were +successfully compiled, probes were successfully enabled, or anonymous state was +successfully retrieved. \fBdtrace\fR returns \fB0\fR even if the specified +tracing requests encountered errors or drops. .RE .sp @@ -625,7 +759,8 @@ encountered errors or drops. .rt An error occurred. .sp -For D program requests, an exit status of \fB1\fR indicates that program compilation failed or that the specified request could not be satisfied. +For D program requests, an exit status of \fB1\fR indicates that program +compilation failed or that the specified request could not be satisfied. .RE .sp @@ -653,8 +788,6 @@ lw(2.75i) |lw(2.75i) . ATTRIBUTE TYPEATTRIBUTE VALUE _ -AvailabilitySUNWdtrc -_ Interface StabilitySee below. .TE @@ -664,7 +797,19 @@ The command-line syntax is Committed. The human-readable output is Uncommitted. .SH SEE ALSO .sp .LP -\fBcpp\fR(1), \fBisainfo\fR(1), \fBlibdtrace\fR(3LIB), \fBdriver.conf\fR(4), \fBattributes\fR(5), \fBdtrace\fR(7D) +\fBcpp\fR(1), \fBisainfo\fR(1), \fBssh\fR(1), \fBlibdtrace\fR(3LIB), +\fBdriver.conf\fR(4), \fBattributes\fR(5), \fBdtrace\fR(7D) .sp .LP \fISolaris Dynamic Tracing Guide\fR +.SH USAGE +.sp +.LP +When using the \fB-p\fR flag, \fBdtrace\fR stops the target processes while it +is inspecting them and reporting results. A process can do nothing while it is +stopped. This means that, if , for example, the X server is inspected by +\fBdtrace\fR running in a window under the X server's control, the whole window +system can become deadlocked, because the \fBproc\fR tool would be attempting +to display its results to a window that cannot be refreshed. In such a case, +logging in from another system using \fBssh\fR(1) and killing the offending +\fBproc\fR tool clears the deadlock. 
diff --git a/man/man1m/lockstat.1m b/man/man1m/lockstat.1m new file mode 100644 index 0000000..495b294 --- /dev/null +++ b/man/man1m/lockstat.1m @@ -0,0 +1,909 @@ +'\" te +.\" Copyright (c) 2008, Sun Microsystems, Inc. All Rights Reserved. +.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. +.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.TH lockstat 1M "28 Feb 2008" "SunOS 5.11" "System Administration Commands" +.SH NAME +lockstat \- report kernel lock and profiling statistics +.SH SYNOPSIS +.LP +.nf +\fBlockstat\fR [\fB-ACEHI\fR] [\fB-e\fR \fIevent_list\fR] [\fB-i\fR \fIrate\fR] + [\fB-b\fR | \fB-t\fR | \fB-h\fR | \fB-s\fR \fIdepth\fR] [\fB-n\fR \fInrecords\fR] + [\fB-l\fR \fIlock\fR [, \fIsize\fR]] [\fB-d\fR \fIduration\fR] + [\fB-f\fR \fIfunction\fR [, \fIsize\fR]] [\fB-T\fR] [\fB-ckgwWRpP\fR] [\fB-D\fR \fIcount\fR] + [\fB-o\fR \fIfilename\fR] [\fB-x\fR \fIopt\fR [=val]] \fIcommand\fR [\fIargs\fR] +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBlockstat\fR utility gathers and displays kernel locking and profiling +statistics. \fBlockstat\fR allows you to specify which events to watch (for +example, spin on adaptive mutex, block on read access to rwlock due to waiting +writers, and so forth) how much data to gather for each event, and how to +display the data. 
By default, \fBlockstat\fR monitors all lock contention +events, gathers frequency and timing data about those events, and displays the +data in decreasing frequency order, so that the most common events appear +first. +.sp +.LP +\fBlockstat\fR gathers data until the specified command completes. For example, +to gather statistics for a fixed-time interval, use \fBsleep\fR(1) as the +command, as follows: +.sp +.LP +\fBexample#\fR \fBlockstat\fR \fBsleep\fR \fB5\fR +.sp +.LP +When the \fB-I\fR option is specified, \fBlockstat\fR establishes a +per-processor high-level periodic interrupt source to gather profiling data. +The interrupt handler simply generates a \fBlockstat\fR event whose caller is +the interrupted PC (program counter). The profiling event is just like any +other \fBlockstat\fR event, so all of the normal \fBlockstat\fR options are +applicable. +.sp +.LP +\fBlockstat\fR relies on DTrace to modify the running kernel's text to +intercept events of interest. This imposes a small but measurable overhead on +all system activity, so access to \fBlockstat\fR is restricted to super-user by +default. The system administrator can permit other users to use \fBlockstat\fR +by granting them additional DTrace privileges. Refer to the \fISolaris Dynamic +Tracing Guide\fR for more information about DTrace security features. +.SH OPTIONS +.sp +.LP +The following options are supported: +.SS "Event Selection" +.sp +.LP +If no event selection options are specified, the default is \fB-C\fR. +.sp +.ne 2 +.mk +.na +\fB\fB-A\fR\fR +.ad +.sp .6 +.RS 4n +Watch all lock events. \fB-A\fR is equivalent to \fB-CH\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-C\fR\fR +.ad +.sp .6 +.RS 4n +Watch contention events. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-E\fR\fR +.ad +.sp .6 +.RS 4n +Watch error events. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-e\fR \fIevent_list\fR\fR +.ad +.sp .6 +.RS 4n +Only watch the specified events. 
\fIevent_list\fR is a comma-separated +list of events or ranges of events such as 1,4-7,35. Run \fBlockstat\fR with no +arguments to get a brief description of all events. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-H\fR\fR +.ad +.sp .6 +.RS 4n +Watch hold events. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-I\fR\fR +.ad +.sp .6 +.RS 4n +Watch profiling interrupt events. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-i\fR \fIrate\fR\fR +.ad +.sp .6 +.RS 4n +Interrupt rate (per second) for \fB-I\fR. The default is 97 Hz, so that +profiling doesn't run in lockstep with the clock interrupt (which runs at 100 +Hz). +.RE + +.SS "Data Gathering" +.sp +.ne 2 +.mk +.na +\fB\fB-x\fR \fIarg\fR[=\fIval\fR]\fR +.ad +.sp .6 +.RS 4n +Enable or modify a DTrace runtime option or D compiler option. The list of +options is found in the \fISolaris Dynamic Tracing Guide\fR. Boolean options are enabled by specifying their +name. Options with values are set by separating the option name and value with +an equals sign (=). +.RE + +.SS "Data Gathering (Mutually Exclusive)" +.sp +.ne 2 +.mk +.na +\fB\fB-b\fR\fR +.ad +.sp .6 +.RS 4n +Basic statistics: lock, caller, number of events. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-h\fR\fR +.ad +.sp .6 +.RS 4n +Histogram: Timing plus time-distribution histograms. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-s\fR \fIdepth\fR\fR +.ad +.sp .6 +.RS 4n +Stack trace: Histogram plus stack traces up to \fIdepth\fR frames deep. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-t\fR\fR +.ad +.sp .6 +.RS 4n +Timing: Basic plus timing for all events [default]. +.RE + +.SS "Data Filtering" +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-d\fR \fIduration\fR\fR +.ad +.sp .6 +.RS 4n +Only watch events longer than \fIduration\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-f\fR \fIfunc[,size]\fR\fR +.ad +.sp .6 +.RS 4n +Only watch events generated by \fIfunc\fR, which can be specified as a symbolic +name or hex address. \fIsize\fR defaults to the \fBELF\fR symbol size if +available, or \fB1\fR if not.
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-l\fR \fIlock[,size]\fR\fR +.ad +.sp .6 +.RS 4n +Only watch \fIlock\fR, which can be specified as a symbolic name or hex +address. \fBsize\fR defaults to the \fBELF\fR symbol size or \fB1\fR if the +symbol size is not available. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-n\fR \fInrecords\fR\fR +.ad +.sp .6 +.RS 4n +Maximum number of data records. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-T\fR\fR +.ad +.sp .6 +.RS 4n +Trace (rather than sample) events [off by default]. +.RE + +.SS "Data Reporting" +.sp +.ne 2 +.mk +.na +\fB\fB-c\fR\fR +.ad +.sp .6 +.RS 4n +Coalesce lock data for lock arrays (for example, \fBpse_mutex[]\fR). +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-D\fR \fIcount\fR\fR +.ad +.sp .6 +.RS 4n +Only display the top \fIcount\fR events of each type. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-g\fR\fR +.ad +.sp .6 +.RS 4n +Show total events generated by function. For example, if \fBfoo()\fR calls +\fBbar()\fR in a loop, the work done by \fBbar()\fR counts as work generated by +\fBfoo()\fR (along with any work done by \fBfoo()\fR itself). The \fB-g\fR +option works by counting the total number of stack frames in which each +function appears. This implies two things: (1) the data reported by \fB-g\fR +can be misleading if the stack traces are not deep enough, and (2) functions +that are called recursively might show greater than 100% activity. In light of +issue (1), the default data gathering mode when using \fB-g\fR is \fB-s\fR +\fB50\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-k\fR\fR +.ad +.sp .6 +.RS 4n +Coalesce PCs within functions. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB\fR\fB-o\fR \fIfilename\fR\fR +.ad +.sp .6 +.RS 4n +Direct output to \fIfilename\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-P\fR\fR +.ad +.sp .6 +.RS 4n +Sort data by (\fIcount * time\fR) product. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Parsable output format. 
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-R\fR\fR +.ad +.sp .6 +.RS 4n +Display rates (events per second) rather than counts. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-W\fR\fR +.ad +.sp .6 +.RS 4n +Whichever: distinguish events only by caller, not by lock. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-w\fR\fR +.ad +.sp .6 +.RS 4n +Wherever: distinguish events only by lock, not by caller. +.RE + +.SH DISPLAY FORMATS +.sp +.LP +The following headers appear over various columns of data. +.sp +.ne 2 +.mk +.na +\fB\fBCount\fR or \fBops/s\fR\fR +.ad +.sp .6 +.RS 4n +Number of times this event occurred, or the rate (times per second) if \fB-R\fR +was specified. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBindv\fR\fR +.ad +.sp .6 +.RS 4n +Percentage of all events represented by this individual event. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBgenr\fR\fR +.ad +.sp .6 +.RS 4n +Percentage of all events generated by this function. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcuml\fR\fR +.ad +.sp .6 +.RS 4n +Cumulative percentage; a running total of the individuals. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBrcnt\fR\fR +.ad +.sp .6 +.RS 4n +Average reference count. This will always be \fB1\fR for exclusive locks +(mutexes, spin locks, rwlocks held as writer) but can be greater than \fB1\fR +for shared locks (rwlocks held as reader). +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBnsec\fR\fR +.ad +.sp .6 +.RS 4n +Average duration of the events in nanoseconds, as appropriate for the event. +For the profiling event, duration means interrupt latency. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBLock\fR\fR +.ad +.sp .6 +.RS 4n +Address of the lock; displayed symbolically if possible. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBCPU+PIL\fR\fR +.ad +.sp .6 +.RS 4n +\fBCPU\fR plus processor interrupt level (\fBPIL\fR). For example, if \fBCPU\fR +4 is interrupted while at \fBPIL\fR 6, this will be reported as \fBcpu[4]+6\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBCaller\fR\fR +.ad +.sp .6 +.RS 4n +Address of the caller; displayed symbolically if possible. 
+.RE + +.SH EXAMPLES +.LP +\fBExample 1 \fRMeasuring Kernel Lock Contention +.sp +.in +2 +.nf +example# \fBlockstat sleep 5\fR +Adaptive mutex spin: 2210 events in 5.055 seconds (437 events/sec) +.fi +.in -2 +.sp + +.sp +.in +2 +.nf +Count indv cuml rcnt nsec Lock Caller +------------------------------------------------------------------------ + 269 12% 12% 1.00 2160 service_queue background+0xdc + 249 11% 23% 1.00 86 service_queue qenable_locked+0x64 + 228 10% 34% 1.00 131 service_queue background+0x15c + 68 3% 37% 1.00 79 0x30000024070 untimeout+0x1c + 59 3% 40% 1.00 384 0x300066fa8e0 background+0xb0 + 43 2% 41% 1.00 30 rqcred_lock svc_getreq+0x3c + 42 2% 43% 1.00 341 0x30006834eb8 background+0xb0 + 41 2% 45% 1.00 135 0x30000021058 untimeout+0x1c + 40 2% 47% 1.00 39 rqcred_lock svc_getreq+0x260 + 37 2% 49% 1.00 2372 0x300068e83d0 hmestart+0x1c4 + 36 2% 50% 1.00 77 0x30000021058 timeout_common+0x4 + 36 2% 52% 1.00 354 0x300066fa120 background+0xb0 + 32 1% 53% 1.00 97 0x30000024070 timeout_common+0x4 + 31 1% 55% 1.00 2923 0x300069883d0 hmestart+0x1c4 + 29 1% 56% 1.00 366 0x300066fb290 background+0xb0 + 28 1% 57% 1.00 117 0x3000001e040 untimeout+0x1c + 25 1% 59% 1.00 93 0x3000001e040 timeout_common+0x4 + 22 1% 60% 1.00 25 0x30005161110 sync_stream_buf+0xdc + 21 1% 60% 1.00 291 0x30006834eb8 putq+0xa4 + 19 1% 61% 1.00 43 0x3000515dcb0 mdf_alloc+0xc + 18 1% 62% 1.00 456 0x30006834eb8 qenable+0x8 + 18 1% 63% 1.00 61 service_queue queuerun+0x168 + 17 1% 64% 1.00 268 0x30005418ee8 vmem_free+0x3c +[...] 
+ +R/W reader blocked by writer: 76 events in 5.055 seconds (15 events/sec) + +Count indv cuml rcnt nsec Lock Caller +------------------------------------------------------------------------ + 23 30% 30% 1.00 22590137 0x300098ba358 ufs_dirlook+0xd0 + 17 22% 53% 1.00 5820995 0x3000ad815e8 find_bp+0x10 + 13 17% 70% 1.00 2639918 0x300098ba360 ufs_iget+0x198 + 4 5% 75% 1.00 3193015 0x300098ba360 ufs_getattr+0x54 + 3 4% 79% 1.00 7953418 0x3000ad817c0 find_bp+0x10 + 3 4% 83% 1.00 935211 0x3000ad815e8 find_read_lof+0x14 + 2 3% 86% 1.00 16357310 0x300073a4720 find_bp+0x10 + 2 3% 88% 1.00 2072433 0x300073a4720 find_read_lof+0x14 + 2 3% 91% 1.00 1606153 0x300073a4370 find_bp+0x10 + 1 1% 92% 1.00 2656909 0x300107e7400 ufs_iget+0x198 +[...] +.fi +.in -2 +.sp + +.LP +\fBExample 2 \fRMeasuring Hold Times +.sp +.in +2 +.nf +example# \fBlockstat -H -D 10 sleep 1\fR +Adaptive mutex spin: 513 events +.fi +.in -2 +.sp + +.sp +.in +2 +.nf +Count indv cuml rcnt nsec Lock Caller +------------------------------------------------------------------------- + 480 5% 5% 1.00 1136 0x300007718e8 putnext+0x40 + 286 3% 9% 1.00 666 0x3000077b430 getf+0xd8 + 271 3% 12% 1.00 537 0x3000077b430 msgio32+0x2fc + 270 3% 15% 1.00 3670 0x300007718e8 strgetmsg+0x3d4 + 270 3% 18% 1.00 1016 0x300007c38b0 getq_noenab+0x200 + 264 3% 20% 1.00 1649 0x300007718e8 strgetmsg+0xa70 + 216 2% 23% 1.00 6251 tcp_mi_lock tcp_snmp_get+0xfc + 206 2% 25% 1.00 602 thread_free_lock clock+0x250 + 138 2% 27% 1.00 485 0x300007c3998 putnext+0xb8 + 138 2% 28% 1.00 3706 0x300007718e8 strrput+0x5b8 +------------------------------------------------------------------------- +[...] 
+.fi +.in -2 +.sp + +.LP +\fBExample 3 \fRMeasuring Hold Times for Stack Traces Containing a Specific +Function +.sp +.in +2 +.nf +example# \fBlockstat -H -f tcp_rput_data -s 50 -D 10 sleep 1\fR +Adaptive mutex spin: 11 events in 1.023 seconds (11 +events/sec) +.fi +.in -2 +.sp + +.sp +.in +2 +.nf +------------------------------------------------------------------------- +Count indv cuml rcnt nsec Lock Caller + 9 82% 82% 1.00 2540 0x30000031380 tcp_rput_data+0x2b90 + + nsec ------ Time Distribution ------ count Stack + 256 |@@@@@@@@@@@@@@@@ 5 tcp_rput_data+0x2b90 + 512 |@@@@@@ 2 putnext+0x78 + 1024 |@@@ 1 ip_rput+0xec4 + 2048 | 0 _c_putnext+0x148 + 4096 | 0 hmeread+0x31c + 8192 | 0 hmeintr+0x36c + 16384 |@@@ 1 +sbus_intr_wrapper+0x30 +[...] + +Count indv cuml rcnt nsec Lock Caller + 1 9% 91% 1.00 1036 0x30000055380 freemsg+0x44 + + nsec ------ Time Distribution ------ count Stack + 1024 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1 freemsg+0x44 + tcp_rput_data+0x2fd0 + putnext+0x78 + ip_rput+0xec4 + _c_putnext+0x148 + hmeread+0x31c + hmeintr+0x36c + +sbus_intr_wrapper+0x30 +------------------------------------------------------------------------- +[...] +.fi +.in -2 +.sp + +.LP +\fBExample 4 \fRBasic Kernel Profiling +.sp +.LP +For basic profiling, we don't care whether the profiling interrupt sampled +\fBfoo()\fR\fB+0x4c\fR or \fBfoo()\fR\fB+0x78\fR; we care only that it sampled +somewhere in \fBfoo()\fR, so we use \fB-k\fR. The \fBCPU\fR and \fBPIL\fR +aren't relevant to basic profiling because we are measuring the system as a +whole, not a particular \fBCPU\fR or interrupt level, so we use \fB-W\fR. 
+ +.sp +.in +2 +.nf +example# \fBlockstat -kIW -D 20 ./polltest\fR +Profiling interrupt: 82 events in 0.424 seconds (194 +events/sec) +.fi +.in -2 +.sp + +.sp +.in +2 +.nf +Count indv cuml rcnt nsec Hottest CPU+PIL Caller +----------------------------------------------------------------------- + 8 10% 10% 1.00 698 cpu[1] utl0 + 6 7% 17% 1.00 299 cpu[0] read + 5 6% 23% 1.00 124 cpu[1] getf + 4 5% 28% 1.00 327 cpu[0] fifo_read + 4 5% 33% 1.00 112 cpu[1] poll + 4 5% 38% 1.00 212 cpu[1] uiomove + 4 5% 43% 1.00 361 cpu[1] mutex_tryenter + 3 4% 46% 1.00 682 cpu[0] write + 3 4% 50% 1.00 89 cpu[0] pcache_poll + 3 4% 54% 1.00 118 cpu[1] set_active_fd + 3 4% 57% 1.00 105 cpu[0] syscall_trap32 + 3 4% 61% 1.00 640 cpu[1] (usermode) + 2 2% 63% 1.00 127 cpu[1] fifo_poll + 2 2% 66% 1.00 300 cpu[1] fifo_write + 2 2% 68% 1.00 669 cpu[0] releasef + 2 2% 71% 1.00 112 cpu[1] bt_getlowbit + 2 2% 73% 1.00 247 cpu[1] splx + 2 2% 76% 1.00 503 cpu[0] mutex_enter + 2 2% 78% 1.00 467 cpu[0]+10 disp_lock_enter + 2 2% 80% 1.00 139 cpu[1] default_copyin +----------------------------------------------------------------------- +[...] +.fi +.in -2 +.sp + +.LP +\fBExample 5 \fRGenerated-load Profiling +.sp +.LP +In the example above, 5% of the samples were in \fBpoll()\fR. This tells us how +much time was spent inside \fBpoll()\fR itself, but tells us nothing about how +much work was \fBgenerated\fR by \fBpoll()\fR; that is, how much time we spent +in functions called by \fBpoll()\fR. To determine that, we use the \fB-g\fR +option. The example below shows that although \fBpolltest\fR spends only 5% of +its time in \fBpoll()\fR itself, \fBpoll()\fR-induced work accounts for 34% of +the load. + +.sp +.LP +Note that the functions that generate the profiling interrupt +(\fBlockstat_intr()\fR, \fBcyclic_fire()\fR, and so forth) appear in every +stack trace, and therefore are considered to have generated 100% of the load. 
+This illustrates an important point: the generated load percentages do +\fBnot\fR add up to 100% because they are not independent. If 72% of all stack +traces contain both \fBfoo()\fR and \fBbar()\fR, then both \fBfoo()\fR and +\fBbar()\fR are 72% load generators. + +.sp +.in +2 +.nf +example# \fBlockstat -kgIW -D 20 ./polltest\fR +Profiling interrupt: 80 events in 0.412 seconds (194 events/sec) +.fi +.in -2 +.sp + +.sp +.in +2 +.nf +Count genr cuml rcnt nsec Hottest CPU+PIL Caller +------------------------------------------------------------------------- + 80 100% ---- 1.00 310 cpu[1] lockstat_intr + 80 100% ---- 1.00 310 cpu[1] cyclic_fire + 80 100% ---- 1.00 310 cpu[1] cbe_level14 + 80 100% ---- 1.00 310 cpu[1] current_thread + 27 34% ---- 1.00 176 cpu[1] poll + 20 25% ---- 1.00 221 cpu[0] write + 19 24% ---- 1.00 249 cpu[1] read + 17 21% ---- 1.00 232 cpu[0] write32 + 17 21% ---- 1.00 207 cpu[1] pcache_poll + 14 18% ---- 1.00 319 cpu[0] fifo_write + 13 16% ---- 1.00 214 cpu[1] read32 + 10 12% ---- 1.00 208 cpu[1] fifo_read + 10 12% ---- 1.00 787 cpu[1] utl0 + 9 11% ---- 1.00 178 cpu[0] pcacheset_resolve + 9 11% ---- 1.00 262 cpu[0] uiomove + 7 9% ---- 1.00 506 cpu[1] (usermode) + 5 6% ---- 1.00 195 cpu[1] fifo_poll + 5 6% ---- 1.00 136 cpu[1] syscall_trap32 + 4 5% ---- 1.00 139 cpu[0] releasef + 3 4% ---- 1.00 277 cpu[1] polllock +------------------------------------------------------------------------- +[...] +.fi +.in -2 +.sp + +.LP +\fBExample 6 \fRGathering Lock Contention and Profiling Data for a Specific +Module +.sp +.LP +In this example we use the \fB-f\fR option not to specify a single function, +but rather to specify the entire text space of the \fBsbus\fR module. We gather +both lock contention and profiling statistics so that contention can be +correlated with overall load on the module. 
+ +.sp +.in +2 +.nf +example# \fBmodinfo | grep sbus\fR + 24 102a8b6f b8b4 59 1 sbus (SBus (sysio) nexus driver) +.fi +.in -2 +.sp + +.sp +.in +2 +.nf +example# \fBlockstat -kICE -f 0x102a8b6f,0xb8b4 sleep 10\fR +Adaptive mutex spin: 39 events in 10.042 seconds (4 events/sec) +.fi +.in -2 +.sp + +.sp +.in +2 +.nf +Count indv cuml rcnt nsec Lock Caller +------------------------------------------------------------------------- + 15 38% 38% 1.00 206 0x30005160528 sync_stream_buf + 7 18% 56% 1.00 14 0x30005160d18 sync_stream_buf + 6 15% 72% 1.00 27 0x300060c3118 sync_stream_buf + 5 13% 85% 1.00 24 0x300060c3510 sync_stream_buf + 2 5% 90% 1.00 29 0x300060c2d20 sync_stream_buf + 2 5% 95% 1.00 24 0x30005161cf8 sync_stream_buf + 1 3% 97% 1.00 21 0x30005161110 sync_stream_buf + 1 3% 100% 1.00 23 0x30005160130 sync_stream_buf +[...] + +Adaptive mutex block: 9 events in 10.042 seconds (1 events/sec) + +Count indv cuml rcnt nsec Lock Caller +------------------------------------------------------------------------- + 4 44% 44% 1.00 156539 0x30005160528 sync_stream_buf + 2 22% 67% 1.00 763516 0x30005160d18 sync_stream_buf + 1 11% 78% 1.00 462130 0x300060c3510 sync_stream_buf + 1 11% 89% 1.00 288749 0x30005161110 sync_stream_buf + 1 11% 100% 1.00 1015374 0x30005160130 sync_stream_buf +[...] + +Profiling interrupt: 229 events in 10.042 seconds (23 events/sec) + +Count indv cuml rcnt nsec Hottest CPU+PIL Caller + +------------------------------------------------------------------------- + 89 39% 39% 1.00 426 cpu[0]+6 sync_stream_buf + 64 28% 67% 1.00 398 cpu[0]+6 sbus_intr_wrapper + 23 10% 77% 1.00 324 cpu[0]+6 iommu_dvma_kaddr_load + 21 9% 86% 1.00 512 cpu[0]+6 iommu_tlb_flush + 14 6% 92% 1.00 342 cpu[0]+6 iommu_dvma_unload + 13 6% 98% 1.00 306 cpu[1] iommu_dvma_sync + 5 2% 100% 1.00 389 cpu[1] iommu_dma_bindhdl +------------------------------------------------------------------------- +[...] 
+.fi +.in -2 +.sp + +.LP +\fBExample 7 \fRDetermining the Average PIL (processor interrupt level) for a +CPU +.sp +.in +2 +.nf +example# \fBlockstat -Iw -l cpu[3] ./testprog\fR + +Profiling interrupt: 14791 events in 152.463 seconds (97 events/sec) + +Count indv cuml rcnt nsec CPU+PIL Hottest Caller + +----------------------------------------------------------------------- +13641 92% 92% 1.00 253 cpu[3] (usermode) + 579 4% 96% 1.00 325 cpu[3]+6 ip_ocsum+0xe8 + 375 3% 99% 1.00 411 cpu[3]+10 splx + 154 1% 100% 1.00 527 cpu[3]+4 fas_intr_svc+0x80 + 41 0% 100% 1.00 293 cpu[3]+13 send_mondo+0x18 + 1 0% 100% 1.00 266 cpu[3]+12 zsa_rxint+0x400 +----------------------------------------------------------------------- +[...] +.fi +.in -2 +.sp + +.LP +\fBExample 8 \fRDetermining which Subsystem is Causing the System to be Busy +.sp +.in +2 +.nf +example# \fBlockstat -s 10 -I sleep 20\fR + +Profiling interrupt: 4863 events in 47.375 seconds (103 events/sec) + +Count indv cuml rcnt nsec CPU+PIL Caller + +----------------------------------------------------------------------- +1929 40% 40% 0.00 3215 cpu[0] usec_delay+0x78 + nsec ------ Time Distribution ------ count Stack + 4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1872 ata_wait+0x90 + 8192 | 27 acersb_get_intr_status+0x34 + 16384 | 29 ata_set_feature+0x124 + 32768 | 1 ata_disk_start+0x15c + ata_hba_start+0xbc + ghd_waitq_process_and \e + _mutex_hold+0x70 + ghd_waitq_process_and \e + _mutex_exit+0x4 + ghd_transport+0x12c + ata_disk_tran_start+0x108 +----------------------------------------------------------------------- +[...] +.fi +.in -2 +.sp + +.SH SEE ALSO +.sp +.LP +\fBdtrace\fR(1M), \fBplockstat\fR(1M), \fBattributes\fR(5), \fBlockstat\fR(7D), +\fBmutex\fR(9F), \fBrwlock\fR(9F) +.sp +.LP +\fISolaris Dynamic Tracing Guide\fR +.SH NOTES +.sp +.LP +The profiling support provided by \fBlockstat\fR \fB-I\fR replaces the old (and +undocumented) \fB/usr/bin/kgmon\fR and \fB/dev/profile\fR. 
+.sp +.LP +Tail-call elimination can affect call sites. For example, if +\fBfoo()\fR\fB+0x50\fR calls \fBbar()\fR and the last thing \fBbar()\fR does is +call \fBmutex_exit()\fR, the compiler can arrange for \fBbar()\fR to branch to +\fBmutex_exit()\fR with a return address of \fBfoo()\fR\fB+0x58\fR. Thus, the +\fBmutex_exit()\fR in \fBbar()\fR will appear as though it occurred at +\fBfoo()\fR\fB+0x58\fR. +.sp +.LP +The \fBPC\fR in the stack frame in which an interrupt occurs can be bogus +because, between function calls, the compiler is free to use the return address +register for local storage. +.sp +.LP +When using the \fB-I\fR and \fB-s\fR options together, the interrupted PC will +usually not appear anywhere in the stack since the interrupt handler is entered +asynchronously, not by a function call from that \fBPC\fR. +.sp +.LP +The \fBlockstat\fR technology is provided on an as-is basis. The format and +content of \fBlockstat\fR output reflect the current Solaris kernel +implementation and are therefore subject to change in future releases. diff --git a/man/man1m/plockstat.1m b/man/man1m/plockstat.1m new file mode 100644 index 0000000..18a9d2e --- /dev/null +++ b/man/man1m/plockstat.1m @@ -0,0 +1,273 @@ +'\" te +.\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved. +.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. +.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE.
If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.TH plockstat 1M "26 Jan 2009" "SunOS 5.11" "System Administration Commands" +.SH NAME +plockstat \- report user-level lock statistics +.SH SYNOPSIS +.LP +.nf +\fBplockstat\fR [\fB-vACHV\fR] [\fB-n\fR \fIcount\fR] [\fB-s\fR \fIdepth\fR] [\fB-e\fR \fIsecs\fR] + [\fB-x\fR \fIarg\fR [=val]] \fIcommand\fR [\fIarg\fR]... +.fi + +.LP +.nf +\fBplockstat\fR [\fB-vACHV\fR] [\fB-n\fR \fIcount\fR] [\fB-s\fR \fIdepth\fR] [\fB-e\fR \fIsecs\fR] + [\fB-x\fR \fIarg\fR [=val]] \fB-p\fR \fIpid\fR +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBplockstat\fR utility gathers and displays user-level locking statistics. +By default, \fBplockstat\fR monitors all lock contention events, gathers +frequency and timing data about those events, and displays the data in +decreasing frequency order, so that the most common events appear first. +.sp +.LP +\fBplockstat\fR gathers data until the specified command completes or the +process specified with the \fB-p\fR option completes. +.sp +.LP +\fBplockstat\fR relies on DTrace to instrument a running process or a command +it invokes to trace events of interest. This imposes a small but measurable +performance overhead on the processes being observed. Users must have the +\fBdtrace_proc\fR privilege and have permission to observe a particular process +with \fBplockstat\fR. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information about DTrace +security features. +.SH OPTIONS +.sp +.LP +The following options are supported: +.sp +.ne 2 +.mk +.na +\fB\fB-A\fR\fR +.ad +.RS 16n +.rt +Watch all lock events. This option is equivalent to \fB-CH\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-C\fR\fR +.ad +.RS 16n +.rt +Watch contention events. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-H\fR\fR +.ad +.RS 16n +.rt +Watch hold events.
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-e\fR \fIsecs\fR\fR +.ad +.RS 16n +.rt +Exit after the number of seconds specified have elapsed. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-n\fR \fIcount\fR\fR +.ad +.RS 16n +.rt +Display only the specified number of entries for each output category. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-s\fR \fIdepth\fR\fR +.ad +.RS 16n +.rt +Record a stack trace rather than just the calling function. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR \fIpid\fR\fR +.ad +.RS 16n +.rt +Specify a process ID from which \fBplockstat\fR is to gather data. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-v\fR\fR +.ad +.RS 16n +.rt +Print out a message to indicate that tracing has started. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-x\fR \fIarg\fR[=\fIval\fR]\fR +.ad +.RS 16n +.rt +Enable or modify a DTrace runtime option or D compiler option. The list of +options is found in the \fISolaris Dynamic Tracing Guide\fR. Boolean options +are enabled by specifying their name. Options with values are set by separating +the option name and value with an equals sign (\fB=\fR). +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-V\fR\fR +.ad +.RS 16n +.rt +Print the Dtrace commands used to gather the data. The output can then be used +directly with the \fBdtrace\fR(1M) command. +.RE + +.SH OPERANDS +.sp +.LP +The following operands are supported: +.sp +.ne 2 +.mk +.na +\fB\fIarg\fR\fR +.ad +.RS 11n +.rt +A string to be passed as an argument to \fIcommand\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIcommand\fR\fR +.ad +.RS 11n +.rt +The name of a utility to be invoked. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIcount\fR\fR +.ad +.RS 11n +.rt +A positive integer value. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIpid\fR\fR +.ad +.RS 11n +.rt +A process identifier for a process to be monitored. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIsecs\fR\fR +.ad +.RS 11n +.rt +Duration specified as a positive integer number of seconds. 
+.RE + +.SH EXIT STATUS +.sp +.LP +The following exit values are returned: +.sp +.ne 2 +.mk +.na +\fB\fB0\fR\fR +.ad +.RS 6n +.rt +Successful completion. +.RE + +.sp +.ne 2 +.mk +.na +\fB>\fB0\fR\fR +.ad +.RS 6n +.rt +An error occurred. +.RE + +.SH ATTRIBUTES +.sp +.LP +See \fBattributes\fR(5) for descriptions of the following attributes: +.sp + +.sp +.TS +tab(@) box; +cw(2.75i) |cw(2.75i) +lw(2.75i) |lw(2.75i) +. +ATTRIBUTE TYPE@ATTRIBUTE VALUE +_ +Interface Stability@See below. +.TE + +.sp +.LP +The command-line syntax is Evolving. The human-readable output is Unstable. +.SH SEE ALSO +.sp +.LP +\fBdtrace\fR(1M), \fBlockstat\fR(1M), \fBmutex_init\fR(3C), +\fBpthread_mutex_lock\fR(3C), \fBpthread_rwlock_rdlock\fR(3C), +\fBpthread_rwlock_wrlock\fR(3C), \fBpthread_rwlock_unlock\fR(3C), +\fBrwlock\fR(3C), \fBattributes\fR(5), \fBfasttrap\fR(7D) +.sp +.LP +\fISolaris Dynamic Tracing Guide\fR diff --git a/man/man1m/zdb.1m b/man/man1m/zdb.1m new file mode 100644 index 0000000..661165d --- /dev/null +++ b/man/man1m/zdb.1m @@ -0,0 +1,87 @@ +'\" te +.\" Copyright (c) 2004, Sun Microsystems, Inc. All Rights Reserved. +.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. +.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE.
If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.TH zdb 1M "31 Oct 2005" "SunOS 5.11" "System Administration Commands" +.SH NAME +zdb \- ZFS debugger +.SH SYNOPSIS +.LP +.nf +\fBzdb\fR \fIpool\fR +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBzdb\fR command is used by support engineers to diagnose failures and +gather statistics. Since the \fBZFS\fR file system is always consistent on disk +and is self-repairing, \fBzdb\fR should only be run under the direction of a +support engineer. +.sp +.LP +If no arguments are specified, \fBzdb\fR performs basic consistency checks on +the pool and associated datasets, and reports any problems detected. +.sp +.LP +Any options supported by this command are internal to Sun and subject to change +at any time. +.SH EXIT STATUS +.sp +.LP +The following exit values are returned: +.sp +.ne 2 +.mk +.na +\fB\fB0\fR\fR +.ad +.RS 5n +.rt +The pool is consistent. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB1\fR\fR +.ad +.RS 5n +.rt +An error was detected. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB2\fR\fR +.ad +.RS 5n +.rt +Invalid command line options were specified. +.RE + +.SH ATTRIBUTES +.sp +.LP +See \fBattributes\fR(5) for descriptions of the following attributes: +.sp + +.sp +.TS +tab(@) box; +cw(2.75i) |cw(2.75i) +lw(2.75i) |lw(2.75i) +. +ATTRIBUTE TYPE@ATTRIBUTE VALUE +_ +Interface Stability@Unstable +.TE + +.SH SEE ALSO +.sp +.LP +\fBzfs\fR(1M), \fBzpool\fR(1M), \fBattributes\fR(5) diff --git a/man/man1m/zfs.1m b/man/man1m/zfs.1m new file mode 100644 index 0000000..68244c7 --- /dev/null +++ b/man/man1m/zfs.1m @@ -0,0 +1,3781 @@ +'\" te +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with +.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with +.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.TH zfs 1M "24 Sep 2009" "SunOS 5.11" "System Administration Commands" +.SH NAME +zfs \- configures ZFS file systems +.SH SYNOPSIS +.LP +.nf +\fBzfs\fR [\fB-?\fR] +.fi + +.LP +.nf +\fBzfs\fR \fBcreate\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem\fR +.fi + +.LP +.nf +\fBzfs\fR \fBcreate\fR [\fB-ps\fR] [\fB-b\fR \fIblocksize\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fB-V\fR \fIsize\fR \fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBdestroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBdestroy\fR [\fB-rRd\fR] \fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBsnapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... 
+ \fIfilesystem@snapname\fR|\fIvolume@snapname\fR +.fi + +.LP +.nf +\fBzfs\fR \fBrollback\fR [\fB-rRf\fR] \fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBclone\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBpromote\fR \fIclone-filesystem\fR +.fi + +.LP +.nf +\fBzfs\fR \fBrename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR + \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBrename\fR [\fB-p\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBrename\fR \fB-r\fR \fIsnapshot\fR \fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-H\fR][\fB-o\fR \fIproperty\fR[,...]] [\fB-t\fR \fItype\fR[,...]] + [\fB-s\fR \fIproperty\fR] ... [\fB-S\fR \fIproperty\fR] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ... +.fi + +.LP +.nf +\fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ... +.fi + +.LP +.nf +\fBzfs\fR \fBget\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIfield\fR[,...]] [\fB-s\fR \fIsource\fR[,...]] + "\fIall\fR" | \fIproperty\fR[,...] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ... +.fi + +.LP +.nf +\fBzfs\fR \fBinherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume|snapshot\fR ... +.fi + +.LP +.nf +\fBzfs\fR \fBupgrade\fR [\fB-v\fR] +.fi + +.LP +.nf +\fBzfs\fR \fBupgrade\fR [\fB-r\fR] [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIfilesystem\fR +.fi + +.LP +.nf +\fBzfs\fR \fBuserspace\fR [\fB-niHp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-sS\fR \fIfield\fR] ... + [\fB-t\fR \fItype\fR [,...]] \fIfilesystem\fR|\fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBgroupspace\fR [\fB-niHp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-sS\fR \fIfield\fR] ... 
+ [\fB-t\fR \fItype\fR [,...]] \fIfilesystem\fR|\fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBmount\fR +.fi + +.LP +.nf +\fBzfs\fR \fBmount\fR [\fB-vO\fR] [\fB-o \fIoptions\fR\fR] \fB-a\fR | \fIfilesystem\fR +.fi + +.LP +.nf +\fBzfs\fR \fBunmount\fR [\fB-f\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR +.fi + +.LP +.nf +\fBzfs\fR \fBshare\fR \fB-a\fR | \fIfilesystem\fR +.fi + +.LP +.nf +\fBzfs\fR \fBunshare\fR \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR +.fi + +.LP +.nf +\fBzfs\fR \fBsend\fR [\fB-vR\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR +.fi + +.LP +.nf +\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] \fB-d\fR \fIfilesystem\fR +.fi + +.LP +.nf +\fBzfs\fR \fBallow\fR \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBallow\fR [\fB-ldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] \fIperm\fR|\fI@setname\fR[,...] + \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBallow\fR [\fB-ld\fR] \fB-e\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBallow\fR \fB-c\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBallow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBunallow\fR [\fB-rldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] [\fIperm\fR|@\fIsetname\fR[,... ]] + \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBunallow\fR [\fB-rld\fR] \fB-e\fR [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-c\fR [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR +.fi + +.LP +.nf +\fBzfs\fR \fBhold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...
+.fi + +.LP +.nf +\fBzfs\fR \fBholds\fR [\fB-r\fR] \fIsnapshot\fR... +.fi + +.LP +.nf +\fBzfs\fR \fBrelease\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR... +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBzfs\fR command configures \fBZFS\fR datasets within a \fBZFS\fR storage +pool, as described in \fBzpool\fR(1M). A dataset is identified by a unique path +within the \fBZFS\fR namespace. For example: +.sp +.in +2 +.nf +pool/{filesystem,volume,snapshot} +.fi +.in -2 +.sp + +.sp +.LP +where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes). +.sp +.LP +A dataset can be one of the following: +.sp +.ne 2 +.mk +.na +\fB\fIfile system\fR\fR +.ad +.sp .6 +.RS 4n +A \fBZFS\fR dataset of type \fBfilesystem\fR can be mounted within the standard +system namespace and behaves like other file systems. While \fBZFS\fR file +systems are designed to be \fBPOSIX\fR compliant, known issues exist that +prevent compliance in some cases. Applications that depend on standards +conformance might fail due to nonstandard behavior when checking file system +free space. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +A logical volume exported as a raw or block device. This type of dataset should +only be used under special circumstances. File systems are typically used in +most environments. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +A read-only version of a file system or volume at a given point in time. It is +specified as \fIfilesystem@name\fR or \fIvolume@name\fR. +.RE + +.SS "ZFS File System Hierarchy" +.sp +.LP +A \fBZFS\fR storage pool is a logical collection of devices that provide space +for datasets. A storage pool is also the root of the \fBZFS\fR file system +hierarchy. +.sp +.LP +The root of the pool can be accessed as a file system, such as mounting and +unmounting, taking snapshots, and setting properties. The physical storage +characteristics, however, are managed by the \fBzpool\fR(1M) command. 
+.sp +.LP +See \fBzpool\fR(1M) for more information on creating and administering pools. +.SS "Snapshots" +.sp +.LP +A snapshot is a read-only copy of a file system or volume. Snapshots can be +created extremely quickly, and initially consume no additional space within the +pool. As data within the active dataset changes, the snapshot consumes more +data than would otherwise be shared with the active dataset. +.sp +.LP +Snapshots can have arbitrary names. Snapshots of volumes can be cloned or +rolled back, but cannot be accessed independently. +.sp +.LP +File system snapshots can be accessed under the \fB\&.zfs/snapshot\fR directory +in the root of the file system. Snapshots are automatically mounted on demand +and may be unmounted at regular intervals. The visibility of the \fB\&.zfs\fR +directory can be controlled by the \fBsnapdir\fR property. +.SS "Clones" +.sp +.LP +A clone is a writable volume or file system whose initial contents are the same +as another dataset. As with snapshots, creating a clone is nearly +instantaneous, and initially consumes no additional space. +.sp +.LP +Clones can only be created from a snapshot. When a snapshot is cloned, it +creates an implicit dependency between the parent and child. Even though the +clone is created somewhere else in the dataset hierarchy, the original snapshot +cannot be destroyed as long as a clone exists. The \fBorigin\fR property +exposes this dependency, and the \fBdestroy\fR command lists any such +dependencies, if they exist. +.sp +.LP +The clone parent-child dependency relationship can be reversed by using the +\fBpromote\fR subcommand. This causes the "origin" file system to become a +clone of the specified file system, which makes it possible to destroy the file +system that the clone was created from. +.SS "Mount Points" +.sp +.LP +Creating a \fBZFS\fR file system is a simple operation, so the number of file +systems per system is likely to be numerous. 
To cope with this, \fBZFS\fR +automatically manages mounting and unmounting file systems without the need to +edit the \fB/etc/vfstab\fR file. All automatically managed file systems are +mounted by \fBZFS\fR at boot time. +.sp +.LP +By default, file systems are mounted under \fB/\fIpath\fR\fR, where \fIpath\fR +is the name of the file system in the \fBZFS\fR namespace. Directories are +created and destroyed as needed. +.sp +.LP +A file system can also have a mount point set in the \fBmountpoint\fR property. +This directory is created as needed, and \fBZFS\fR automatically mounts the +file system when the \fBzfs mount -a\fR command is invoked (without editing +\fB/etc/vfstab\fR). The \fBmountpoint\fR property can be inherited, so if +\fBpool/home\fR has a mount point of \fB/export/stuff\fR, then +\fBpool/home/user\fR automatically inherits a mount point of +\fB/export/stuff/user\fR. +.sp +.LP +A file system \fBmountpoint\fR property of \fBnone\fR prevents the file system +from being mounted. +.sp +.LP +If needed, \fBZFS\fR file systems can also be managed with traditional tools +(\fBmount\fR, \fBumount\fR, \fB/etc/vfstab\fR). If a file system's mount point +is set to \fBlegacy\fR, \fBZFS\fR makes no attempt to manage the file system, +and the administrator is responsible for mounting and unmounting the file +system. +.SS "Zones" +.sp +.LP +A \fBZFS\fR file system can be added to a non-global zone by using the +\fBzonecfg\fR \fBadd fs\fR subcommand. A \fBZFS\fR file system that is added to +a non-global zone must have its \fBmountpoint\fR property set to \fBlegacy\fR. +.sp +.LP +The physical properties of an added file system are controlled by the global +administrator. However, the zone administrator can create, modify, or destroy +files within the added file system, depending on how the file system is +mounted. +.sp +.LP +A dataset can also be delegated to a non-global zone by using the \fBzonecfg\fR +\fBadd dataset\fR subcommand. 
You cannot delegate a dataset to one zone and the +children of the same dataset to another zone. The zone administrator can change +properties of the dataset or any of its children. However, the \fBquota\fR +property is controlled by the global administrator. +.sp +.LP +A \fBZFS\fR volume can be added as a device to a non-global zone by using the +\fBzonecfg\fR \fBadd device\fR subcommand. However, its physical properties can +be modified only by the global administrator. +.sp +.LP +For more information about \fBzonecfg\fR syntax, see \fBzonecfg\fR(1M). +.sp +.LP +After a dataset is delegated to a non-global zone, the \fBzoned\fR property is +automatically set. A zoned file system cannot be mounted in the global zone, +since the zone administrator might have to set the mount point to an +unacceptable value. +.sp +.LP +The global administrator can forcibly clear the \fBzoned\fR property, though +this should be done with extreme care. The global administrator should verify +that all the mount points are acceptable before clearing the property. +.SS "Native Properties" +.sp +.LP +Properties are divided into two types, native properties and user-defined (or +"user") properties. Native properties either export internal statistics or +control \fBZFS\fR behavior. In addition, native properties are either editable +or read-only. User properties have no effect on \fBZFS\fR behavior, but you can +use them to annotate datasets in a way that is meaningful in your environment. +For more information about user properties, see the "User Properties" section, +below. +.sp +.LP +Every dataset has a set of properties that export statistics about the dataset +as well as control various behaviors. Properties are inherited from the parent +unless overridden by the child. Some properties apply only to certain types of +datasets (file systems, volumes, or snapshots). 
+.sp +.LP +The values of numeric properties can be specified using human-readable suffixes +(for example, \fBk\fR, \fBKB\fR, \fBM\fR, \fBGb\fR, and so forth, up to \fBZ\fR +for zettabyte). The following are all valid (and equal) specifications: +.sp +.in +2 +.nf +1536M, 1.5g, 1.50GB +.fi +.in -2 +.sp + +.sp +.LP +The values of non-numeric properties are case sensitive and must be lowercase, +except for \fBmountpoint\fR, \fBsharenfs\fR, and \fBsharesmb\fR. +.sp +.LP +The following native properties consist of read-only statistics about the +dataset. These properties can be neither set, nor inherited. Native properties +apply to all dataset types unless otherwise noted. +.sp +.ne 2 +.mk +.na +\fB\fBavailable\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space available to the dataset and all its children, assuming +that there is no other activity in the pool. Because space is shared within a +pool, availability can be limited by any number of factors, including physical +pool size, quotas, reservations, or other datasets within the pool. +.sp +This property can also be referred to by its shortened column name, +\fBavail\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcompressratio\fR\fR +.ad +.sp .6 +.RS 4n +The compression ratio achieved for this dataset, expressed as a multiplier. +Compression can be turned on by running: \fBzfs set compression=on +\fIdataset\fR\fR. The default value is \fBoff\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcreation\fR\fR +.ad +.sp .6 +.RS 4n +The time this dataset was created. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBdefer_destroy\fR\fR +.ad +.sp .6 +.RS 4n +This property is \fBon\fR if the snapshot has been marked for deferred destroy +by using the \fBzfs destroy\fR \fB-d\fR command. Otherwise, the property is +\fBoff\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBmounted\fR\fR +.ad +.sp .6 +.RS 4n +For file systems, indicates whether the file system is currently mounted. This +property can be either \fByes\fR or \fBno\fR. 
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fBorigin\fR\fR +.ad +.sp .6 +.RS 4n +For cloned file systems or volumes, the snapshot from which the clone was +created. The origin cannot be destroyed (even with the \fB-r\fR or \fB-f\fR +options) so long as a clone exists. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBreferenced\fR\fR +.ad +.sp .6 +.RS 4n +The amount of data that is accessible by this dataset, which may or may not be +shared with other datasets in the pool. When a snapshot or clone is created, it +initially references the same amount of space as the file system or snapshot it +was created from, since its contents are identical. +.sp +This property can also be referred to by its shortened column name, +\fBrefer\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBtype\fR\fR +.ad +.sp .6 +.RS 4n +The type of dataset: \fBfilesystem\fR, \fBvolume\fR, or \fBsnapshot\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBused\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space consumed by this dataset and all its descendents. This is +the value that is checked against this dataset's quota and reservation. The +space used does not include this dataset's reservation, but does take into +account the reservations of any descendent datasets. The amount of space that a +dataset consumes from its parent, as well as the amount of space that is freed +if this dataset is recursively destroyed, is the greater of its space used and +its reservation. +.sp +When snapshots (see the "Snapshots" section) are created, their space is +initially shared between the snapshot and the file system, and possibly with +previous snapshots. As the file system changes, space that was previously +shared becomes unique to the snapshot, and counted in the snapshot's space +used. Additionally, deleting snapshots can increase the amount of space unique +to (and used by) other snapshots. +.sp +The amount of space used, available, or referenced does not take into account +pending changes.
Pending changes are generally accounted for within a few +seconds. Committing a change to a disk using \fBfsync\fR(3C) or \fBO_SYNC\fR +does not necessarily guarantee that the space usage information is updated +immediately. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBusedby*\fR\fR +.ad +.sp .6 +.RS 4n +The \fBusedby*\fR properties decompose the \fBused\fR property into the +various reasons that space is used. Specifically, \fBused\fR = +\fBusedbychildren\fR + \fBusedbydataset\fR + \fBusedbyrefreservation\fR + +\fBusedbysnapshots\fR. These properties are only available for datasets created +on \fBzpool\fR "version 13" pools. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBusedbychildren\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space used by children of this dataset, which would be freed if +all the dataset's children were destroyed. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBusedbydataset\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space used by this dataset itself, which would be freed if the +dataset were destroyed (after first removing any \fBrefreservation\fR and +destroying any necessary snapshots or descendents). +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBusedbyrefreservation\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space used by a \fBrefreservation\fR set on this dataset, which +would be freed if the \fBrefreservation\fR was removed. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBusedbysnapshots\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space consumed by snapshots of this dataset. In particular, it is +the amount of space that would be freed if all of this dataset's snapshots were +destroyed. Note that this is not simply the sum of the snapshots' \fBused\fR +properties because space can be shared by multiple snapshots. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBuserused@\fR\fIuser\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space consumed by the specified user in this dataset. Space is +charged to the owner of each file, as displayed by \fBls\fR \fB-l\fR.
The +amount of space charged is displayed by \fBdu\fR and \fBls\fR \fB-s\fR. See the +\fBzfs userspace\fR subcommand for more information. +.sp +Unprivileged users can access only their own space usage. The root user, or a +user who has been granted the \fBuserused\fR privilege with \fBzfs allow\fR, +can access everyone's usage. +.sp +The \fBuserused@\fR... properties are not displayed by \fBzfs get all\fR. The +user's name must be appended after the \fB@\fR symbol, using one of the +following forms: +.RS +4 +.TP +.ie t \(bu +.el o +\fIPOSIX name\fR (for example, \fBjoe\fR) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fIPOSIX numeric ID\fR (for example, \fB789\fR) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fISID name\fR (for example, \fBjoe.smith@mydomain\fR) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR) +.RE +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBuserrefs\fR\fR +.ad +.sp .6 +.RS 4n +This property is set to the number of user holds on this snapshot. User holds +are set by using the \fBzfs hold\fR command. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBgroupused@\fR\fIgroup\fR\fR +.ad +.sp .6 +.RS 4n +The amount of space consumed by the specified group in this dataset. Space is +charged to the group of each file, as displayed by \fBls\fR \fB-l\fR. See the +\fBuserused@\fR\fIuser\fR property for more information. +.sp +Unprivileged users can only access their own groups' space usage. The root +user, or a user who has been granted the \fBgroupused\fR privilege with \fBzfs +allow\fR, can access all groups' usage. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBvolblocksize\fR=\fIblocksize\fR\fR +.ad +.sp .6 +.RS 4n +For volumes, specifies the block size of the volume. The \fBblocksize\fR cannot +be changed once the volume has been written, so it should be set at volume +creation time. The default \fBblocksize\fR for volumes is 8 Kbytes. Any power +of 2 from 512 bytes to 128 Kbytes is valid. 
+.sp +This property can also be referred to by its shortened column name, +\fBvolblock\fR. +.RE + +.sp +.LP +The following native properties can be used to change the behavior of a +\fBZFS\fR dataset. +.sp +.ne 2 +.mk +.na +\fB\fBaclinherit\fR=\fBdiscard\fR | \fBnoallow\fR | \fBrestricted\fR | +\fBpassthrough\fR | \fBpassthrough-x\fR\fR +.ad +.sp .6 +.RS 4n +Controls how \fBACL\fR entries are inherited when files and directories are +created. A file system with an \fBaclinherit\fR property of \fBdiscard\fR does +not inherit any \fBACL\fR entries. A file system with an \fBaclinherit\fR +property value of \fBnoallow\fR only inherits inheritable \fBACL\fR entries +that specify "deny" permissions. The property value \fBrestricted\fR (the +default) removes the \fBwrite_acl\fR and \fBwrite_owner\fR permissions when the +\fBACL\fR entry is inherited. A file system with an \fBaclinherit\fR property +value of \fBpassthrough\fR inherits all inheritable \fBACL\fR entries without +any modifications made to the \fBACL\fR entries when they are inherited. A file +system with an \fBaclinherit\fR property value of \fBpassthrough-x\fR has the +same meaning as \fBpassthrough\fR, except that the \fBowner@\fR, \fBgroup@\fR, +and \fBeveryone@\fR \fBACE\fRs inherit the execute permission only if the file +creation mode also requests the execute bit. +.sp +When the property value is set to \fBpassthrough\fR, files are created with a +mode determined by the inheritable \fBACE\fRs. If no inheritable \fBACE\fRs +exist that affect the mode, then the mode is set in accordance with the requested +mode from the application. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBaclmode\fR=\fBdiscard\fR | \fBgroupmask\fR | \fBpassthrough\fR\fR +.ad +.sp .6 +.RS 4n +Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with +an \fBaclmode\fR property of \fBdiscard\fR deletes all \fBACL\fR entries that +do not represent the mode of the file.
An \fBaclmode\fR property of +\fBgroupmask\fR (the default) reduces user or group permissions. The +permissions are reduced, such that they are no greater than the group +permission bits, unless it is a user entry that has the same \fBUID\fR as the +owner of the file or directory. In this case, the \fBACL\fR permissions are +reduced so that they are no greater than owner permission bits. A file system +with an \fBaclmode\fR property of \fBpassthrough\fR indicates that no changes +are made to the \fBACL\fR other than generating the necessary \fBACL\fR entries +to represent the new mode of the file or directory. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBatime\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether the access time for files is updated when they are read. +Turning this property off avoids producing write traffic when reading files and +can result in significant performance gains, though it might confuse mailers +and other similar utilities. The default value is \fBon\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcanmount\fR=\fBon\fR | \fBoff\fR | \fBnoauto\fR\fR +.ad +.sp .6 +.RS 4n +If this property is set to \fBoff\fR, the file system cannot be mounted, and is +ignored by \fBzfs mount -a\fR. Setting this property to \fBoff\fR is similar to +setting the \fBmountpoint\fR property to \fBnone\fR, except that the dataset +still has a normal \fBmountpoint\fR property, which can be inherited. Setting +this property to \fBoff\fR allows datasets to be used solely as a mechanism to +inherit properties. One example of setting \fBcanmount=\fR\fBoff\fR is to have +two datasets with the same \fBmountpoint\fR, so that the children of both +datasets appear in the same directory, but might have different inherited +characteristics. +.sp +When the \fBnoauto\fR option is set, a dataset can only be mounted and +unmounted explicitly. 
The dataset is not mounted automatically when the dataset +is created or imported, nor is it mounted by the \fBzfs mount -a\fR command or +unmounted by the \fBzfs unmount -a\fR command. +.sp +This property is not inherited. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBchecksum\fR=\fBon\fR | \fBoff\fR | \fBfletcher2\fR | \fBfletcher4\fR | +\fBsha256\fR\fR +.ad +.sp .6 +.RS 4n +Controls the checksum used to verify data integrity. The default value is +\fBon\fR, which automatically selects an appropriate algorithm (currently, +\fBfletcher2\fR, but this may change in future releases). The value \fBoff\fR +disables integrity checking on user data. Disabling checksums is \fBNOT\fR a +recommended practice. +.sp +Changing this property affects only newly-written data. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcompression\fR=\fBon\fR | \fBoff\fR | \fBlzjb\fR | \fBgzip\fR | +\fBgzip-\fR\fIN\fR\fR +.ad +.sp .6 +.RS 4n +Controls the compression algorithm used for this dataset. The \fBlzjb\fR +compression algorithm is optimized for performance while providing decent data +compression. Setting compression to \fBon\fR uses the \fBlzjb\fR compression +algorithm. The \fBgzip\fR compression algorithm uses the same compression as +the \fBgzip\fR(1) command. You can specify the \fBgzip\fR level by using the +value \fBgzip-\fR\fIN\fR where \fIN\fR is an integer from 1 (fastest) to 9 +(best compression ratio). Currently, \fBgzip\fR is equivalent to \fBgzip-6\fR +(which is also the default for \fBgzip\fR(1)). +.sp +This property can also be referred to by its shortened column name +\fBcompress\fR. Changing this property affects only newly-written data. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcopies\fR=\fB1\fR | \fB2\fR | \fB3\fR\fR +.ad +.sp .6 +.RS 4n +Controls the number of copies of data stored for this dataset. These copies are +in addition to any redundancy provided by the pool, for example, mirroring or +RAID-Z. The copies are stored on different disks, if possible.
The space used +by multiple copies is charged to the associated file and dataset, changing the +\fBused\fR property and counting against quotas and reservations. +.sp +Changing this property only affects newly-written data. Therefore, set this +property at file system creation time by using the \fB-o\fR +\fBcopies=\fR\fIN\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBdevices\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether device nodes can be opened on this file system. The default +value is \fBon\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBexec\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether processes can be executed from within this file system. The +default value is \fBon\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBmountpoint\fR=\fIpath\fR | \fBnone\fR | \fBlegacy\fR\fR +.ad +.sp .6 +.RS 4n +Controls the mount point used for this file system. See the "Mount Points" +section for more information on how this property is used. +.sp +When the \fBmountpoint\fR property is changed for a file system, the file +system and any children that inherit the mount point are unmounted. If the new +value is \fBlegacy\fR, then they remain unmounted. Otherwise, they are +automatically remounted in the new location if the property was previously +\fBlegacy\fR or \fBnone\fR, or if they were mounted before the property was +changed. In addition, any shared file systems are unshared and shared in the +new location. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBnbmand\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether the file system should be mounted with \fBnbmand\fR (Non +Blocking mandatory locks). This is used for \fBCIFS\fR clients. Changes to this +property only take effect when the file system is unmounted and remounted. See +\fBmount\fR(1M) for more information on \fBnbmand\fR mounts.
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fBprimarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR +.ad +.sp .6 +.RS 4n +Controls what is cached in the primary cache (ARC). If this property is set to +\fBall\fR, then both user data and metadata are cached. If this property is set +to \fBnone\fR, then neither user data nor metadata is cached. If this property +is set to \fBmetadata\fR, then only metadata is cached. The default value is +\fBall\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBquota\fR=\fIsize\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +Limits the amount of space a dataset and its descendents can consume. This +property enforces a hard limit on the amount of space used. This includes all +space consumed by descendents, including file systems and snapshots. Setting a +quota on a descendent of a dataset that already has a quota does not override +the ancestor's quota, but rather imposes an additional limit. +.sp +Quotas cannot be set on volumes, as the \fBvolsize\fR property acts as an +implicit quota. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBuserquota@\fR\fIuser\fR=\fIsize\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +Limits the amount of space consumed by the specified user. User space +consumption is identified by the \fBuserspace@\fR\fIuser\fR property. +.sp +Enforcement of user quotas may be delayed by several seconds. This delay means +that a user might exceed their quota before the system notices that they are +over quota and begins to refuse additional writes with the \fBEDQUOT\fR error +message. See the \fBzfs userspace\fR subcommand for more information. +.sp +Unprivileged users can only access their own groups' space usage. The root +user, or a user who has been granted the \fBuserquota\fR privilege with \fBzfs +allow\fR, can get and set everyone's quota. +.sp +This property is not available on volumes, on file systems before version 4, or +on pools before version 15. The \fBuserquota@\fR... properties are not +displayed by \fBzfs get all\fR.
The user's name must be appended after the +\fB@\fR symbol, using one of the following forms: +.RS +4 +.TP +.ie t \(bu +.el o +\fIPOSIX name\fR (for example, \fBjoe\fR) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fIPOSIX numeric ID\fR (for example, \fB789\fR) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fISID name\fR (for example, \fBjoe.smith@mydomain\fR) +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR) +.RE +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBgroupquota@\fR\fIgroup\fR=\fIsize\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +Limits the amount of space consumed by the specified group. Group space +consumption is identified by the \fBgroupused@\fR\fIgroup\fR property. +.sp +Unprivileged users can access only their own groups' space usage. The root +user, or a user who has been granted the \fBgroupquota\fR privilege with \fBzfs +allow\fR, can get and set all groups' quotas. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBreadonly\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether this dataset can be modified. The default value is \fBoff\fR. +.sp +This property can also be referred to by its shortened column name, +\fBrdonly\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBrecordsize\fR=\fIsize\fR\fR +.ad +.sp .6 +.RS 4n +Specifies a suggested block size for files in the file system. This property is +designed solely for use with database workloads that access files in fixed-size +records. \fBZFS\fR automatically tunes block sizes according to internal +algorithms optimized for typical access patterns. +.sp +For databases that create very large files but access them in small random +chunks, these algorithms may be suboptimal. Specifying a \fBrecordsize\fR +greater than or equal to the record size of the database can result in +significant performance gains. Use of this property for general purpose file +systems is strongly discouraged, and may adversely affect performance.
+.sp +The size specified must be a power of two greater than or equal to 512 and less +than or equal to 128 Kbytes. +.sp +Changing the file system's \fBrecordsize\fR affects only files created +afterward; existing files are unaffected. +.sp +This property can also be referred to by its shortened column name, +\fBrecsize\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBrefquota\fR=\fIsize\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +Limits the amount of space a dataset can consume. This property enforces a hard +limit on the amount of space used. This hard limit does not include space used +by descendents, including file systems and snapshots. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBrefreservation\fR=\fIsize\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +The minimum amount of space guaranteed to a dataset, not including its +descendents. When the amount of space used is below this value, the dataset is +treated as if it were taking up the amount of space specified by +\fBrefreservation\fR. The \fBrefreservation\fR reservation is accounted for in +the parent datasets' space used, and counts against the parent datasets' quotas +and reservations. +.sp +If \fBrefreservation\fR is set, a snapshot is only allowed if there is enough +free pool space outside of this reservation to accommodate the current number +of "referenced" bytes in the dataset. +.sp +This property can also be referred to by its shortened column name, +\fBrefreserv\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBreservation\fR=\fIsize\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +The minimum amount of space guaranteed to a dataset and its descendents. When +the amount of space used is below this value, the dataset is treated as if it +were taking up the amount of space specified by its reservation. Reservations +are accounted for in the parent datasets' space used, and count against the +parent datasets' quotas and reservations. +.sp +This property can also be referred to by its shortened column name, +\fBreserv\fR. 
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fBsecondarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR +.ad +.sp .6 +.RS 4n +Controls what is cached in the secondary cache (L2ARC). If this property is set +to \fBall\fR, then both user data and metadata is cached. If this property is +set to \fBnone\fR, then neither user data nor metadata is cached. If this +property is set to \fBmetadata\fR, then only metadata is cached. The default +value is \fBall\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBsetuid\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether the set-\fBUID\fR bit is respected for the file system. The +default value is \fBon\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBshareiscsi\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Like the \fBsharenfs\fR property, \fBshareiscsi\fR indicates whether a +\fBZFS\fR volume is exported as an \fBiSCSI\fR target. The acceptable values +for this property are \fBon\fR, \fBoff\fR, and \fBtype=disk\fR. The default +value is \fBoff\fR. In the future, other target types might be supported. For +example, \fBtape\fR. +.sp +You might want to set \fBshareiscsi=on\fR for a file system so that all +\fBZFS\fR volumes within the file system are shared by default. However, +setting this property on a file system has no direct effect. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBsharesmb\fR=\fBon\fR | \fBoff\fR | \fIopts\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether the file system is shared by using the Solaris \fBCIFS\fR +service, and what options are to be used. A file system with the \fBsharesmb\fR +property set to \fBoff\fR is managed through traditional tools such as +\fBsharemgr\fR(1M). Otherwise, the file system is automatically shared and +unshared with the \fBzfs share\fR and \fBzfs unshare\fR commands. If the +property is set to \fBon\fR, the \fBsharemgr\fR(1M) command is invoked with no +options. Otherwise, the \fBsharemgr\fR(1M) command is invoked with options +equivalent to the contents of this property. 
+.sp +Because \fBSMB\fR shares require a resource name, a unique resource name is +constructed from the dataset name. The constructed name is a copy of the +dataset name except that the characters in the dataset name, which would be +illegal in the resource name, are replaced with underscore (\fB_\fR) +characters. A pseudo property "name" is also supported that allows you to +replace the data set name with a specified name. The specified name is then +used to replace the prefix dataset in the case of inheritance. For example, if +the dataset \fBdata/home/john\fR is set to \fBname=john\fR, then +\fBdata/home/john\fR has a resource name of \fBjohn\fR. If a child dataset +\fBdata/home/john/backups\fR exists, it has a resource name of \fBjohn_backups\fR. +.sp +When SMB shares are created, the SMB share name appears as an entry in the +\fB\&.zfs/shares\fR directory. You can use the \fBls\fR or \fBchmod\fR command +to display the share-level ACLs on the entries in this directory. +.sp +When the \fBsharesmb\fR property is changed for a dataset, the dataset and any +children inheriting the property are re-shared with the new options, only if +the property was previously set to \fBoff\fR, or if they were shared before the +property was changed. If the new property is set to \fBoff\fR, the file systems +are unshared. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBsharenfs\fR=\fBon\fR | \fBoff\fR | \fIopts\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether the file system is shared via \fBNFS\fR, and what options are +used. A file system with a \fBsharenfs\fR property of \fBoff\fR is managed +through traditional tools such as \fBshare\fR(1M), \fBunshare\fR(1M), and +\fBdfstab\fR(4). Otherwise, the file system is automatically shared and +unshared with the \fBzfs share\fR and \fBzfs unshare\fR commands. If the +property is set to \fBon\fR, the \fBshare\fR(1M) command is invoked with no +options.
Otherwise, the \fBshare\fR(1M) command is invoked with options +equivalent to the contents of this property. +.sp +When the \fBsharenfs\fR property is changed for a dataset, the dataset and any +children inheriting the property are re-shared with the new options, only if +the property was previously \fBoff\fR, or if they were shared before the +property was changed. If the new property is \fBoff\fR, the file systems are +unshared. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBlogbias\fR = \fBlatency\fR | \fBthroughput\fR\fR +.ad +.sp .6 +.RS 4n +Provide a hint to ZFS about handling of synchronous requests in this dataset. +If \fBlogbias\fR is set to \fBlatency\fR (the default), ZFS will use pool log +devices (if configured) to handle the requests at low latency. If \fBlogbias\fR +is set to \fBthroughput\fR, ZFS will not use configured pool log devices. ZFS +will instead optimize synchronous operations for global pool throughput and +efficient use of resources. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBsnapdir\fR=\fBhidden\fR | \fBvisible\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether the \fB\&.zfs\fR directory is hidden or visible in the root of +the file system as discussed in the "Snapshots" section. The default value is +\fBhidden\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBversion\fR=\fB1\fR | \fB2\fR | \fBcurrent\fR\fR +.ad +.sp .6 +.RS 4n +The on-disk version of this file system, which is independent of the pool +version. This property can only be set to later supported versions. See the +\fBzfs upgrade\fR command. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBvolsize\fR=\fIsize\fR\fR +.ad +.sp .6 +.RS 4n +For volumes, specifies the logical size of the volume. By default, creating a +volume establishes a reservation of equal size. For storage pools with a +version number of 9 or higher, a \fBrefreservation\fR is set instead. Any +changes to \fBvolsize\fR are reflected in an equivalent change to the +reservation (or \fBrefreservation\fR). 
The \fBvolsize\fR can only be set to a +multiple of \fBvolblocksize\fR, and cannot be zero. +.sp +The reservation is kept equal to the volume's logical size to prevent +unexpected behavior for consumers. Without the reservation, the volume could +run out of space, resulting in undefined behavior or data corruption, depending +on how the volume is used. These effects can also occur when the volume size is +changed while it is in use (particularly when shrinking the size). Extreme care +should be used when adjusting the volume size. +.sp +Though not recommended, a "sparse volume" (also known as "thin provisioning") +can be created by specifying the \fB-s\fR option to the \fBzfs create -V\fR +command, or by changing the reservation after the volume has been created. A +"sparse volume" is a volume where the reservation is less than the volume size. +Consequently, writes to a sparse volume can fail with \fBENOSPC\fR when the +pool is low on space. For a sparse volume, changes to \fBvolsize\fR are not +reflected in the reservation. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBvscan\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether regular files should be scanned for viruses when a file is +opened and closed. In addition to enabling this property, the virus scan +service must also be enabled for virus scanning to occur. The default value is +\fBoff\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBxattr\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether extended attributes are enabled for this file system. The +default value is \fBon\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzoned\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether the dataset is managed from a non-global zone. See the "Zones" +section for more information. The default value is \fBoff\fR. +.RE + +.sp +.LP +The following three properties cannot be changed after the file system is +created, and therefore, should be set when the file system is created.
If the +properties are not set with the \fBzfs create\fR or \fBzpool create\fR +commands, these properties are inherited from the parent dataset. If the parent +dataset lacks these properties due to having been created prior to these +features being supported, the new file system will have the default values for +these properties. +.sp +.ne 2 +.mk +.na +\fB\fBcasesensitivity\fR=\fBsensitive\fR | \fBinsensitive\fR | \fBmixed\fR\fR +.ad +.sp .6 +.RS 4n +Indicates whether the file name matching algorithm used by the file system +should be case-sensitive, case-insensitive, or allow a combination of both +styles of matching. The default value for the \fBcasesensitivity\fR property is +\fBsensitive\fR. Traditionally, UNIX and POSIX file systems have case-sensitive +file names. +.sp +The \fBmixed\fR value for the \fBcasesensitivity\fR property indicates that the +file system can support requests for both case-sensitive and case-insensitive +matching behavior. Currently, case-insensitive matching behavior on a file +system that supports mixed behavior is limited to the Solaris CIFS server +product. For more information about the \fBmixed\fR value behavior, see the +\fISolaris ZFS Administration Guide\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBnormalization\fR = \fBnone\fR | \fBformC\fR | \fBformD\fR | \fBformKC\fR +| \fBformKD\fR\fR +.ad +.sp .6 +.RS 4n +Indicates whether the file system should perform a \fBunicode\fR normalization +of file names whenever two file names are compared, and which normalization +algorithm should be used. File names are always stored unmodified, names are +normalized as part of any comparison process. If this property is set to a +legal value other than \fBnone\fR, and the \fButf8only\fR property was left +unspecified, the \fButf8only\fR property is automatically set to \fBon\fR. The +default value of the \fBnormalization\fR property is \fBnone\fR. This property +cannot be changed after the file system is created. 
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fButf8only\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Indicates whether the file system should reject file names that include +characters that are not present in the \fBUTF-8\fR character code set. If this +property is explicitly set to \fBoff\fR, the normalization property must either +not be explicitly set or be set to \fBnone\fR. The default value for the +\fButf8only\fR property is \fBoff\fR. This property cannot be changed after the +file system is created. +.RE + +.sp +.LP +The \fBcasesensitivity\fR, \fBnormalization\fR, and \fButf8only\fR properties +are also new permissions that can be assigned to non-privileged users by using +the \fBZFS\fR delegated administration feature. +.SS "Temporary Mount Point Properties" +.sp +.LP +When a file system is mounted, either through \fBmount\fR(1M) for legacy mounts +or the \fBzfs mount\fR command for normal file systems, its mount options are +set according to its properties. The correlation between properties and mount +options is as follows: +.sp +.in +2 +.nf + PROPERTY MOUNT OPTION + devices devices/nodevices + exec exec/noexec + readonly ro/rw + setuid setuid/nosetuid + xattr xattr/noxattr +.fi +.in -2 +.sp + +.sp +.LP +In addition, these options can be set on a per-mount basis using the \fB-o\fR +option, without affecting the property that is stored on disk. The values +specified on the command line override the values stored in the dataset. The +\fB-nosuid\fR option is an alias for \fBnodevices,nosetuid\fR. These properties +are reported as "temporary" by the \fBzfs get\fR command. If the properties are +changed while the dataset is mounted, the new setting overrides any temporary +settings. +.SS "User Properties" +.sp +.LP +In addition to the standard native properties, \fBZFS\fR supports arbitrary +user properties. 
User properties have no effect on \fBZFS\fR behavior, but +applications or administrators can use them to annotate datasets (file systems, +volumes, and snapshots). +.sp +.LP +User property names must contain a colon (\fB:\fR) character to distinguish +them from native properties. They may contain lowercase letters, numbers, and +the following punctuation characters: colon (\fB:\fR), dash (\fB-\fR), period +(\fB\&.\fR), and underscore (\fB_\fR). The expected convention is that the +property name is divided into two portions such as +\fImodule\fR\fB:\fR\fIproperty\fR, but this namespace is not enforced by +\fBZFS\fR. User property names can be at most 256 characters, and cannot begin +with a dash (\fB-\fR). +.sp +.LP +When making programmatic use of user properties, it is strongly suggested to +use a reversed \fBDNS\fR domain name for the \fImodule\fR component of property +names to reduce the chance that two independently-developed packages use the +same property name for different purposes. Property names beginning with +\fBcom.sun.\fR are reserved for use by Sun Microsystems. +.sp +.LP +The values of user properties are arbitrary strings, are always inherited, and +are never validated. All of the commands that operate on properties (\fBzfs +list\fR, \fBzfs get\fR, \fBzfs set\fR, and so forth) can be used to manipulate +both native properties and user properties. Use the \fBzfs inherit\fR command +to clear a user property. If the property is not defined in any parent +dataset, it is removed entirely. Property values are limited to 1024 +characters. +.SS "ZFS Volumes as Swap or Dump Devices" +.sp +.LP +During an initial installation a swap device and dump device are created on +\fBZFS\fR volumes in the \fBZFS\fR root pool. By default, the swap area size is +based on 1/2 the size of physical memory up to 2 Gbytes. The size of the dump +device depends on the kernel's requirements at installation time.
Separate +\fBZFS\fR volumes must be used for the swap area and dump devices. Do not swap +to a file on a \fBZFS\fR file system. A \fBZFS\fR swap file configuration is +not supported. +.sp +.LP +If you need to change your swap area or dump device after the system is +installed or upgraded, use the \fBswap\fR(1M) and \fBdumpadm\fR(1M) commands. +If you need to change the size of your swap area or dump device, see the +\fISolaris ZFS Administration Guide\fR. +.SH SUBCOMMANDS +.sp +.LP +All subcommands that modify state are logged persistently to the pool in their +original form. +.sp +.ne 2 +.mk +.na +\fB\fBzfs ?\fR\fR +.ad +.sp .6 +.RS 4n +Displays a help message. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs create\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... +\fIfilesystem\fR\fR +.ad +.sp .6 +.RS 4n +Creates a new \fBZFS\fR file system. The file system is automatically mounted +according to the \fBmountpoint\fR property inherited from the parent. +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Creates all the non-existing parent datasets. Datasets created in this manner +are automatically mounted according to the \fBmountpoint\fR property inherited +from their parent. Any property specified on the command line using the +\fB-o\fR option is ignored. If the target filesystem already exists, the +operation completes successfully. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR +.ad +.sp .6 +.RS 4n +Sets the specified property as if the command \fBzfs set\fR +\fIproperty\fR=\fIvalue\fR was invoked at the same time the dataset was +created. Any editable \fBZFS\fR property can also be set at creation time. +Multiple \fB-o\fR options can be specified. An error results if the same +property is specified in multiple \fB-o\fR options. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs create\fR [\fB-ps\fR] [\fB-b\fR \fIblocksize\fR] [\fB-o\fR +\fIproperty\fR=\fIvalue\fR] ... 
\fB-V\fR \fIsize\fR \fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Creates a volume of the given size. The volume is exported as a block device in +\fB/dev/zvol/{dsk,rdsk}/\fR\fIpath\fR, where \fIpath\fR is the name of the +volume in the \fBZFS\fR namespace. The size represents the logical size as +exported by the device. By default, a reservation of equal size is created. +.sp +\fIsize\fR is automatically rounded up to the nearest 128 Kbytes to ensure that +the volume has an integral number of blocks regardless of \fIblocksize\fR. +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Creates all the non-existing parent datasets. Datasets created in this manner +are automatically mounted according to the \fBmountpoint\fR property inherited +from their parent. Any property specified on the command line using the +\fB-o\fR option is ignored. If the target filesystem already exists, the +operation completes successfully. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-s\fR\fR +.ad +.sp .6 +.RS 4n +Creates a sparse volume with no reservation. See \fBvolsize\fR in the Native +Properties section for more information about sparse volumes. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR +.ad +.sp .6 +.RS 4n +Sets the specified property as if the \fBzfs set\fR \fIproperty\fR=\fIvalue\fR +command was invoked at the same time the dataset was created. Any editable +\fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options +can be specified. An error results if the same property is specified in +multiple \fB-o\fR options. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-b\fR \fIblocksize\fR\fR +.ad +.sp .6 +.RS 4n +Equivalent to \fB-o\fR \fBvolblocksize\fR=\fIblocksize\fR. If this option is +specified in conjunction with \fB-o\fR \fBvolblocksize\fR, the resulting +behavior is undefined. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs destroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Destroys the given dataset. 
By default, the command unshares any file systems +that are currently shared, unmounts any file systems that are currently +mounted, and refuses to destroy a dataset that has active dependents (children +or clones). +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively destroy all children. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-R\fR\fR +.ad +.sp .6 +.RS 4n +Recursively destroy all dependents, including cloned file systems outside the +target hierarchy. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.sp .6 +.RS 4n +Force an unmount of any file systems using the \fBunmount -f\fR command. This +option has no effect on non-file systems or unmounted file systems. +.RE + +Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR +options, as they can destroy large portions of a pool and cause unexpected +behavior for mounted file systems in use. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs destroy\fR [\fB-rRd\fR] \fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +The given snapshot is destroyed immediately if and only if the \fBzfs +destroy\fR command without the \fB-d\fR option would have destroyed it. Such +immediate destruction would occur, for example, if the snapshot had no clones +and the user-initiated reference count were zero. +.sp +If the snapshot does not qualify for immediate destruction, it is marked for +deferred deletion. In this state, it exists as a usable, visible snapshot until +both of the preconditions listed above are met, at which point it is destroyed. +.sp +.ne 2 +.mk +.na +\fB\fB-d\fR\fR +.ad +.sp .6 +.RS 4n +Defer snapshot deletion. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Destroy (or mark for deferred deletion) all snapshots with this name in +descendent file systems. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-R\fR\fR +.ad +.sp .6 +.RS 4n +Recursively destroy all dependents. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... 
+\fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR +.ad +.sp .6 +.RS 4n +Creates a snapshot with the given name. All previous modifications by +successful system calls to the file system are part of the snapshot. See the +"Snapshots" section for details. +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively create snapshots of all descendent datasets. Snapshots are taken +atomically, so that all recursive snapshots correspond to the same moment in +time. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR +.ad +.sp .6 +.RS 4n +Sets the specified property; see \fBzfs create\fR for details. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs rollback\fR [\fB-rRf\fR] \fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +Roll back the given dataset to a previous snapshot. When a dataset is rolled +back, all data that has changed since the snapshot is discarded, and the +dataset reverts to the state at the time of the snapshot. By default, the +command refuses to roll back to a snapshot other than the most recent one. In +order to do so, all intermediate snapshots must be destroyed by specifying the +\fB-r\fR option. +.sp +The \fB-rR\fR options do not recursively destroy the child snapshots of a +recursive snapshot. Only the top-level recursive snapshot is destroyed by +either of these options. To completely roll back a recursive snapshot, you must +rollback the individual child snapshots. +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively destroy any snapshots more recent than the one specified. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-R\fR\fR +.ad +.sp .6 +.RS 4n +Recursively destroy any more recent snapshots, as well as any clones of those +snapshots. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.sp .6 +.RS 4n +Used with the \fB-R\fR option to force an unmount of any clone file systems +that are to be destroyed. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs clone\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... 
+\fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Creates a clone of the given snapshot. See the "Clones" section for details. +The target dataset can be located anywhere in the \fBZFS\fR hierarchy, and is +created as the same type as the original. +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Creates all the non-existing parent datasets. Datasets created in this manner +are automatically mounted according to the \fBmountpoint\fR property inherited +from their parent. If the target filesystem or volume already exists, the +operation completes successfully. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR +.ad +.sp .6 +.RS 4n +Sets the specified property; see \fBzfs create\fR for details. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs promote\fR \fIclone-filesystem\fR\fR +.ad +.sp .6 +.RS 4n +Promotes a clone file system to no longer be dependent on its "origin" +snapshot. This makes it possible to destroy the file system that the clone was +created from. The clone parent-child dependency relationship is reversed, so +that the origin file system becomes a clone of the specified file system. +.sp +The snapshot that was cloned, and any snapshots previous to this snapshot, are +now owned by the promoted clone. The space they use moves from the origin file +system to the promoted clone, so enough space must be available to accommodate +these snapshots. No new space is consumed by this operation, but the space +accounting is adjusted. The promoted clone must not have any conflicting +snapshot names of its own. The \fBrename\fR subcommand can be used to rename +any conflicting snapshots. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs rename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR +.ad +.br +.na +\fB\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR +.ad +.br +.na +\fB\fBzfs rename\fR [\fB-p\fR] \fIfilesystem\fR|\fIvolume\fR +\fIfilesystem\fR|\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Renames the given dataset. 
The new target can be located anywhere in the +\fBZFS\fR hierarchy, with the exception of snapshots. Snapshots can only be +renamed within the parent file system or volume. When renaming a snapshot, the +parent file system of the snapshot does not need to be specified as part of the +second argument. Renamed file systems can inherit new mount points, in which +case they are unmounted and remounted at the new mount point. +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Creates all the nonexistent parent datasets. Datasets created in this manner +are automatically mounted according to the \fBmountpoint\fR property inherited +from their parent. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs rename\fR \fB-r\fR \fIsnapshot\fR \fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +Recursively rename the snapshots of all descendent datasets. Snapshots are the +only dataset that can be renamed recursively. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-H\fR] [\fB-o\fR +\fIproperty\fR[,\fI\&...\fR]] [ \fB-t\fR \fItype\fR[,\fI\&...\fR]] [ \fB-s\fR +\fIproperty\fR ] ... [ \fB-S\fR \fIproperty\fR ] ... +[\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ...\fR +.ad +.sp .6 +.RS 4n +Lists the property information for the given datasets in tabular form. If +specified, you can list property information by the absolute pathname or the +relative pathname. By default, all file systems and volumes are displayed. +Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the +default is \fBoff\fR). The following fields are displayed: +\fBname,used,available,referenced,mountpoint\fR. +.sp +.ne 2 +.mk +.na +\fB\fB-H\fR\fR +.ad +.sp .6 +.RS 4n +Used for scripting mode. Do not print headers and separate fields by a single +tab instead of arbitrary white space. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively display any children of the dataset on the command line.
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-d\fR \fIdepth\fR\fR +.ad +.sp .6 +.RS 4n +Recursively display any children of the dataset, limiting the recursion to +\fIdepth\fR. A depth of \fB1\fR will display only the dataset and its direct +children. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty\fR\fR +.ad +.sp .6 +.RS 4n +A comma-separated list of properties to display. The property must be: +.RS +4 +.TP +.ie t \(bu +.el o +One of the properties described in the "Native Properties" section +.RE +.RS +4 +.TP +.ie t \(bu +.el o +A user property +.RE +.RS +4 +.TP +.ie t \(bu +.el o +The value \fBname\fR to display the dataset name +.RE +.RS +4 +.TP +.ie t \(bu +.el o +The value \fBspace\fR to display space usage properties on file systems and +volumes. This is a shortcut for specifying \fB-o +name,avail,used,usedsnap,usedds,usedrefreserv,usedchild\fR \fB-t +filesystem,volume\fR syntax. +.RE +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-s\fR \fIproperty\fR\fR +.ad +.sp .6 +.RS 4n +A property for sorting the output by column in ascending order based on the +value of the property. The property must be one of the properties described in +the "Properties" section, or the special value \fBname\fR to sort by the +dataset name. Multiple properties can be specified at one time using multiple +\fB-s\fR property options. Multiple \fB-s\fR options are evaluated from left to +right in decreasing order of importance. +.sp +The following is a list of sorting criteria: +.RS +4 +.TP +.ie t \(bu +.el o +Numeric types sort in numeric order. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +String types sort in alphabetical order. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +Types inappropriate for a row sort that row to the literal bottom, regardless +of the specified ordering. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +If no sorting options are specified the existing behavior of \fBzfs list\fR is +preserved. 
+.RE +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-S\fR \fIproperty\fR\fR +.ad +.sp .6 +.RS 4n +Same as the \fB-s\fR option, but sorts by property in descending order. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-t\fR \fItype\fR\fR +.ad +.sp .6 +.RS 4n +A comma-separated list of types to display, where \fItype\fR is one of +\fBfilesystem\fR, \fBsnapshot\fR, \fBvolume\fR, or \fBall\fR. For example, +specifying \fB-t snapshot\fR displays only snapshots. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs set\fR \fIproperty\fR=\fIvalue\fR +\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR +.ad +.sp .6 +.RS 4n +Sets the property to the given value for each dataset. Only some properties can +be edited. See the "Properties" section for more information on what properties +can be set and acceptable values. Numeric values can be specified as exact +values, or in a human-readable form with a suffix of \fBB\fR, \fBK\fR, \fBM\fR, +\fBG\fR, \fBT\fR, \fBP\fR, \fBE\fR, \fBZ\fR (for bytes, kilobytes, megabytes, +gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). User +properties can be set on snapshots. For more information, see the "User +Properties" section. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs get\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR +\fIfield\fR[,...]] [\fB-s\fR \fIsource\fR[,...]] "\fIall\fR" | +\fIproperty\fR[,...] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR +.ad +.sp .6 +.RS 4n +Displays properties for the given datasets. If no datasets are specified, then +the command displays properties for all datasets on the system. For each +property, the following columns are displayed: +.sp +.in +2 +.nf + name Dataset name + property Property name + value Property value + source Property source. Can either be local, default, + temporary, inherited, or none (-). +.fi +.in -2 +.sp + +All columns are displayed by default, though this can be controlled by using +the \fB-o\fR option.
This command takes a comma-separated list of properties as +described in the "Native Properties" and "User Properties" sections. +.sp +The special value \fBall\fR can be used to display all properties that apply to +the given dataset's type (filesystem, volume, or snapshot). +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively display properties for any children. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-d\fR \fIdepth\fR\fR +.ad +.sp .6 +.RS 4n +Recursively display any children of the dataset, limiting the recursion to +\fIdepth\fR. A depth of \fB1\fR will display only the dataset and its direct +children. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-H\fR\fR +.ad +.sp .6 +.RS 4n +Display output in a form more easily parsed by scripts. Any headers are +omitted, and fields are explicitly separated by a single tab instead of an +arbitrary amount of space. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIfield\fR\fR +.ad +.sp .6 +.RS 4n +A comma-separated list of columns to display. \fBname,property,value,source\fR +is the default value. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-s\fR \fIsource\fR\fR +.ad +.sp .6 +.RS 4n +A comma-separated list of sources to display. Those properties coming from a +source other than those in this list are ignored. Each source must be one of +the following: \fBlocal,default,inherited,temporary,none\fR. The default value +is all sources. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Display numbers in parseable (exact) values. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs inherit\fR [\fB-r\fR] \fIproperty\fR +\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR +.ad +.sp .6 +.RS 4n +Clears the specified property, causing it to be inherited from an ancestor. If +no ancestor has the property set, then the default value is used. See the +"Properties" section for a listing of default values, and details on which +properties can be inherited. 
+.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively inherit the given property for all children. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs upgrade\fR [\fB-v\fR]\fR +.ad +.sp .6 +.RS 4n +Displays a list of file systems that are not the most recent version. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs upgrade\fR [\fB-r\fR] [\fB-V\fR \fIversion\fR] [\fB-a\fR | +\fIfilesystem\fR]\fR +.ad +.sp .6 +.RS 4n +Upgrades file systems to a new on-disk version. Once this is done, the file +systems will no longer be accessible on systems running older versions of the +software. \fBzfs send\fR streams generated from new snapshots of these file +systems cannot be accessed on systems running older versions of the software. +.sp +In general, the file system version is independent of the pool version. See +\fBzpool\fR(1M) for information on the \fBzpool upgrade\fR command. +.sp +In some cases, the file system version and the pool version are interrelated +and the pool version must be upgraded before the file system version can be +upgraded. +.sp +.ne 2 +.mk +.na +\fB\fB-a\fR\fR +.ad +.sp .6 +.RS 4n +Upgrade all file systems on all imported pools. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIfilesystem\fR\fR +.ad +.sp .6 +.RS 4n +Upgrade the specified file system. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Upgrade the specified file system and all descendent file systems +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-V\fR \fIversion\fR\fR +.ad +.sp .6 +.RS 4n +Upgrade to the specified \fIversion\fR. If the \fB-V\fR flag is not specified, +this command upgrades to the most recent version. This option can only be used +to increase the version number, and only up to the most recent version +supported by this software. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs userspace\fR [\fB-niHp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-sS\fR +\fIfield\fR]... 
[\fB-t\fR \fItype\fR [,...]] \fIfilesystem\fR | +\fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +Displays space consumed by, and quotas on, each user in the specified +filesystem or snapshot. This corresponds to the \fBuserused@\fR\fIuser\fR and +\fBuserquota@\fR\fIuser\fR properties. +.sp +.ne 2 +.mk +.na +\fB\fB-n\fR\fR +.ad +.sp .6 +.RS 4n +Print numeric ID instead of user/group name. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-H\fR\fR +.ad +.sp .6 +.RS 4n +Do not print headers, use tab-delimited output. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Use exact (parseable) numeric output. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIfield\fR[,...]\fR +.ad +.sp .6 +.RS 4n +Display only the specified fields from the following set, +\fBtype,name,used,quota\fR. The default is to display all fields. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-s\fR \fIfield\fR\fR +.ad +.sp .6 +.RS 4n +Sort output by this field. The \fB-s\fR and \fB-S\fR flags may be specified +multiple times to sort first by one field, then by another. The default is +\fB-s type\fR \fB-s name\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-S\fR \fIfield\fR\fR +.ad +.sp .6 +.RS 4n +Sort by this field in reverse order. See \fB-s\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-t\fR \fItype\fR[,...]\fR +.ad +.sp .6 +.RS 4n +Print only the specified types from the following set, +\fBall,posixuser,smbuser,posixgroup,smbgroup\fR. +.sp +The default is \fB-t posixuser,smbuser\fR. +.sp +The default can be changed to include group types. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-i\fR\fR +.ad +.sp .6 +.RS 4n +Translate SID to POSIX ID. The POSIX ID may be ephemeral if no mapping exists. +Normal POSIX interfaces (for example, \fBstat\fR(2), \fBls\fR \fB-l\fR) perform +this translation, so the \fB-i\fR option allows the output from \fBzfs +userspace\fR to be compared directly with those utilities. However, \fB-i\fR +may lead to confusion if some files were created by an SMB user before an +SMB-to-POSIX name mapping was established.
In such a case, some files are owned +by the SMB entity and some by the POSIX entity. However, the \fB-i\fR option +will report that the POSIX entity has the total usage and quota for both. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs groupspace\fR [\fB-niHp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-sS\fR +\fIfield\fR]... [\fB-t\fR \fItype\fR [,...]] \fIfilesystem\fR | +\fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +Displays space consumed by, and quotas on, each group in the specified +filesystem or snapshot. This subcommand is identical to \fBzfs userspace\fR, +except that the default types to display are \fB-t posixgroup,smbgroup\fR. +.sp +.in +2 +.nf +- +.fi +.in -2 +.sp + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs mount\fR\fR +.ad +.sp .6 +.RS 4n +Displays all \fBZFS\fR file systems currently mounted. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs mount\fR [\fB-vO\fR] [\fB-o\fR \fIoptions\fR] \fB-a\fR | +\fIfilesystem\fR\fR +.ad +.sp .6 +.RS 4n +Mounts \fBZFS\fR file systems. Invoked automatically as part of the boot +process. +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIoptions\fR\fR +.ad +.sp .6 +.RS 4n +An optional, comma-separated list of mount options to use temporarily for the +duration of the mount. See the "Temporary Mount Point Properties" section for +details. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-O\fR\fR +.ad +.sp .6 +.RS 4n +Perform an overlay mount. See \fBmount\fR(1M) for more information. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-v\fR\fR +.ad +.sp .6 +.RS 4n +Report mount progress. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-a\fR\fR +.ad +.sp .6 +.RS 4n +Mount all available \fBZFS\fR file systems. Invoked automatically as part of +the boot process. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIfilesystem\fR\fR +.ad +.sp .6 +.RS 4n +Mount the specified filesystem. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs unmount\fR [\fB-f\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR +.ad +.sp .6 +.RS 4n +Unmounts currently mounted \fBZFS\fR file systems. 
Invoked automatically as +part of the shutdown process. +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.sp .6 +.RS 4n +Forcefully unmount the file system, even if it is currently in use. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-a\fR\fR +.ad +.sp .6 +.RS 4n +Unmount all available \fBZFS\fR file systems. Invoked automatically as part of +the shutdown process. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIfilesystem\fR|\fImountpoint\fR\fR +.ad +.sp .6 +.RS 4n +Unmount the specified filesystem. The command can also be given a path to a +\fBZFS\fR file system mount point on the system. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs share\fR \fB-a\fR | \fIfilesystem\fR\fR +.ad +.sp .6 +.RS 4n +Shares available \fBZFS\fR file systems. +.sp +.ne 2 +.mk +.na +\fB\fB-a\fR\fR +.ad +.sp .6 +.RS 4n +Share all available \fBZFS\fR file systems. Invoked automatically as part of +the boot process. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIfilesystem\fR\fR +.ad +.sp .6 +.RS 4n +Share the specified filesystem according to the \fBsharenfs\fR and +\fBsharesmb\fR properties. File systems are shared when the \fBsharenfs\fR or +\fBsharesmb\fR property is set. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs unshare\fR \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR +.ad +.sp .6 +.RS 4n +Unshares currently shared \fBZFS\fR file systems. This is invoked automatically +as part of the shutdown process. +.sp +.ne 2 +.mk +.na +\fB\fB-a\fR\fR +.ad +.sp .6 +.RS 4n +Unshare all available \fBZFS\fR file systems. Invoked automatically as part of +the shutdown process. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fIfilesystem\fR|\fImountpoint\fR\fR +.ad +.sp .6 +.RS 4n +Unshare the specified filesystem. The command can also be given a path to a +\fBZFS\fR file system shared on the system. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs send\fR [\fB-vR\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] +\fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +Creates a stream representation of the second \fIsnapshot\fR, which is written +to standard output.
The output can be redirected to a file or to a different +system (for example, using \fBssh\fR(1)). By default, a full stream is +generated. +.sp +.ne 2 +.mk +.na +\fB\fB-i\fR \fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +Generate an incremental stream from the first \fIsnapshot\fR to the second +\fIsnapshot\fR. The incremental source (the first \fIsnapshot\fR) can be +specified as the last component of the snapshot name (for example, the part +after the \fB@\fR), and it is assumed to be from the same file system as the +second \fIsnapshot\fR. +.sp +If the destination is a clone, the source may be the origin snapshot, which +must be fully specified (for example, \fBpool/fs@origin\fR, not just +\fB@origin\fR). +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-I\fR \fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +Generate a stream package that sends all intermediary snapshots from the first +snapshot to the second snapshot. For example, \fB-I @a fs@d\fR is similar to +\fB-i @a fs@b; -i @b fs@c; -i @c fs@d\fR. The incremental source snapshot may +be specified as with the \fB-i\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-R\fR\fR +.ad +.sp .6 +.RS 4n +Generate a replication stream package, which will replicate the specified +filesystem, and all descendent file systems, up to the named snapshot. When +received, all properties, snapshots, descendent file systems, and clones are +preserved. +.sp +If the \fB-i\fR or \fB-I\fR flags are used in conjunction with the \fB-R\fR +flag, an incremental replication stream is generated. The current values of +properties, and current snapshot and file system names are set when the stream +is received. If the \fB-F\fR flag is specified when this stream is received, +snapshots and file systems that do not exist on the sending side are destroyed. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-v\fR\fR +.ad +.sp .6 +.RS 4n +Print verbose information about the stream package generated. +.RE + +The format of the stream is committed.
You will be able to receive your streams +on future versions of \fBZFS\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs receive\fR [\fB-vnFu\fR] +\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR +.ad +.br +.na +\fB\fBzfs receive\fR [\fB-vnFu\fR] \fB-d\fR \fIfilesystem\fR\fR +.ad +.sp .6 +.RS 4n +Creates a snapshot whose contents are as specified in the stream provided on +standard input. If a full stream is received, then a new file system is created +as well. Streams are created using the \fBzfs send\fR subcommand, which by +default creates a full stream. \fBzfs recv\fR can be used as an alias for +\fBzfs receive\fR. +.sp +If an incremental stream is received, then the destination file system must +already exist, and its most recent snapshot must match the incremental stream's +source. For \fBzvols\fR, the destination device link is destroyed and +recreated, which means the \fBzvol\fR cannot be accessed during the +\fBreceive\fR operation. +.sp +When a snapshot replication package stream that is generated by using the +\fBzfs send\fR \fB-R\fR command is received, any snapshots that do not exist +on the sending location are destroyed by using the \fBzfs destroy\fR \fB-d\fR +command. +.sp +The name of the snapshot (and file system, if a full stream is received) that +this subcommand creates depends on the argument type and the \fB-d\fR option. +.sp +If the argument is a snapshot name, the specified \fIsnapshot\fR is created. If +the argument is a file system or volume name, a snapshot with the same name as +the sent snapshot is created within the specified \fIfilesystem\fR or +\fIvolume\fR. If the \fB-d\fR option is specified, the snapshot name is +determined by appending the sent snapshot's name to the specified +\fIfilesystem\fR. If the \fB-d\fR option is specified, any required file +systems within the specified one are created. 
+.sp +.ne 2 +.mk +.na +\fB\fB-d\fR\fR +.ad +.sp .6 +.RS 4n +Use the name of the sent snapshot to determine the name of the new snapshot as +described in the paragraph above. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-u\fR\fR +.ad +.sp .6 +.RS 4n +File system that is associated with the received stream is not mounted. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-v\fR\fR +.ad +.sp .6 +.RS 4n +Print verbose information about the stream and the time required to perform the +receive operation. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-n\fR\fR +.ad +.sp .6 +.RS 4n +Do not actually receive the stream. This can be useful in conjunction with the +\fB-v\fR option to verify the name the receive operation would use. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-F\fR\fR +.ad +.sp .6 +.RS 4n +Force a rollback of the file system to the most recent snapshot before +performing the receive operation. If receiving an incremental replication +stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy +snapshots and file systems that do not exist on the sending side. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs allow\fR \fIfilesystem\fR | \fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Displays permissions that have been delegated on the specified filesystem or +volume. See the other forms of \fBzfs allow\fR for more information. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs allow\fR [\fB-ldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] +\fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR| \fIvolume\fR\fR +.ad +.br +.na +\fB\fBzfs allow\fR [\fB-ld\fR] \fB-e\fR \fIperm\fR|@\fIsetname\fR[,...] +\fIfilesystem\fR | \fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Delegates \fBZFS\fR administration permission for the file systems to +non-privileged users. +.sp +.ne 2 +.mk +.na +\fB[\fB-ug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...]\fR +.ad +.sp .6 +.RS 4n +Specifies to whom the permissions are delegated. Multiple entities can be +specified as a comma-separated list. 
If neither of the \fB-ug\fR options are +specified, then the argument is interpreted preferentially as the keyword +"everyone", then as a user name, and lastly as a group name. To specify a user +or group named "everyone", use the \fB-u\fR or \fB-g\fR options. To specify a +group with the same name as a user, use the \fB-g\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB[\fB-e\fR] \fIperm\fR|@\fIsetname\fR[,...]\fR +.ad +.sp .6 +.RS 4n +Specifies that the permissions be delegated to "everyone." Multiple permissions +may be specified as a comma-separated list. Permission names are the same as +\fBZFS\fR subcommand and property names. See the property list below. Permission +set names, which begin with an at sign (\fB@\fR), may be specified. See the +\fB-s\fR form below for details. +.RE + +.sp +.ne 2 +.mk +.na +\fB[\fB-ld\fR] \fIfilesystem\fR|\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Specifies where the permissions are delegated. If neither of the \fB-ld\fR +options are specified, or both are, then the permissions are allowed for the +file system or volume, and all of its descendents. If only the \fB-l\fR option +is used, then the permissions are allowed "locally" only for the specified file system. If only +the \fB-d\fR option is used, then the permissions are allowed only for the descendent file +systems. +.RE + +.RE + +.sp +.LP +Permissions are generally the ability to use a \fBZFS\fR subcommand or change a +\fBZFS\fR property.
The following permissions are available: +.sp +.in +2 +.nf +NAME TYPE NOTES +allow subcommand Must also have the permission that is being + allowed +clone subcommand Must also have the 'create' ability and 'mount' + ability in the origin file system +create subcommand Must also have the 'mount' ability +destroy subcommand Must also have the 'mount' ability +mount subcommand Allows mount/umount of ZFS datasets +promote subcommand Must also have the 'mount' + and 'promote' ability in the origin file system +receive subcommand Must also have the 'mount' and 'create' ability +rename subcommand Must also have the 'mount' and 'create' + ability in the new parent +rollback subcommand Must also have the 'mount' ability +send subcommand +share subcommand Allows sharing file systems over NFS or SMB + protocols +snapshot subcommand Must also have the 'mount' ability +groupquota other Allows accessing any groupquota@... property +groupused other Allows reading any groupused@... property +userprop other Allows changing any user property +userquota other Allows accessing any userquota@... property +userused other Allows reading any userused@... property + +aclinherit property +aclmode property +atime property +canmount property +casesensitivity property +checksum property +compression property +copies property +devices property +exec property +mountpoint property +nbmand property +normalization property +primarycache property +quota property +readonly property +recordsize property +refquota property +refreservation property +reservation property +secondarycache property +setuid property +shareiscsi property +sharenfs property +sharesmb property +snapdir property +utf8only property +version property +volblocksize property +volsize property +vscan property +xattr property +zoned property +.fi +.in -2 +.sp + +.sp +.ne 2 +.mk +.na +\fB\fBzfs allow\fR \fB-c\fR \fIperm\fR|@\fIsetname\fR[,...] +\fIfilesystem\fR|\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Sets "create time" permissions. 
These permissions are granted (locally) to the +creator of any newly-created descendent file system. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs allow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,...] +\fIfilesystem\fR|\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Defines or adds permissions to a permission set. The set can be used by other +\fBzfs allow\fR commands for the specified file system and its descendents. +Sets are evaluated dynamically, so changes to a set are immediately reflected. +Permission sets follow the same naming restrictions as ZFS file systems, but +the name must begin with an "at sign" (\fB@\fR), and can be no more than 64 +characters long. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs unallow\fR [\fB-rldug\fR] +"\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] +[\fIperm\fR|@\fIsetname\fR[, ...]] \fIfilesystem\fR|\fIvolume\fR\fR +.ad +.br +.na +\fB\fBzfs unallow\fR [\fB-rld\fR] \fB-e\fR [\fIperm\fR|@\fIsetname\fR [,...]] +\fIfilesystem\fR|\fIvolume\fR\fR +.ad +.br +.na +\fB\fBzfs unallow\fR [\fB-r\fR] \fB-c\fR [\fIperm\fR|@\fIsetname\fR[,...]]\fR +.ad +.br +.na +\fB\fIfilesystem\fR|\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Removes permissions that were granted with the \fBzfs allow\fR command. No +permissions are explicitly denied, so other permissions granted are still in +effect. For example, if the permission is granted by an ancestor. If no +permissions are specified, then all permissions for the specified \fIuser\fR, +\fIgroup\fR, or \fIeveryone\fR are removed. Specifying "everyone" (or using the +\fB-e\fR option) only removes the permissions that were granted to "everyone", +not all permissions for every user and group. See the \fBzfs allow\fR command +for a description of the \fB-ldugec\fR options. +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively remove the permissions from this file system and all descendents. 
+.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs unallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR +[\fIperm\fR|@\fIsetname\fR[,...]]\fR +.ad +.br +.na +\fB\fIfilesystem\fR|\fIvolume\fR\fR +.ad +.sp .6 +.RS 4n +Removes permissions from a permission set. If no permissions are specified, +then all permissions are removed, thus removing the set entirely. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs hold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR +.ad +.sp .6 +.RS 4n +Adds a single reference, named with the \fItag\fR argument, to the specified +snapshot or snapshots. Each snapshot has its own tag namespace, and tags must +be unique within that space. +.sp +If a hold exists on a snapshot, attempts to destroy that snapshot by using the +\fBzfs destroy\fR command return \fBEBUSY\fR. +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Specifies that a hold with the given tag is applied recursively to the +snapshots of all descendent file systems. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs holds\fR [\fB-r\fR] \fIsnapshot\fR...\fR +.ad +.sp .6 +.RS 4n +Lists all existing user references for the given snapshot or snapshots. +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Lists the holds that are set on the named descendent snapshots, in addition to +listing the holds on the named snapshot. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzfs release\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR +.ad +.sp .6 +.RS 4n +Removes a single reference, named with the \fItag\fR argument, from the +specified snapshot or snapshots. The tag must already exist for each snapshot. +.sp +If a hold exists on a snapshot, attempts to destroy that snapshot by using the +\fBzfs destroy\fR command return \fBEBUSY\fR. +.sp +.ne 2 +.mk +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively releases a hold with the given tag on the snapshots of all +descendent file systems. 
+.RE + +.RE + +.SH EXAMPLES +.LP +\fBExample 1 \fRCreating a ZFS File System Hierarchy +.sp +.LP +The following commands create a file system named \fBpool/home\fR and a file +system named \fBpool/home/bob\fR. The mount point \fB/export/home\fR is set for +the parent file system, and is automatically inherited by the child file +system. + +.sp +.in +2 +.nf +# \fBzfs create pool/home\fR +# \fBzfs set mountpoint=/export/home pool/home\fR +# \fBzfs create pool/home/bob\fR +.fi +.in -2 +.sp + +.LP +\fBExample 2 \fRCreating a ZFS Snapshot +.sp +.LP +The following command creates a snapshot named \fByesterday\fR. This snapshot +is mounted on demand in the \fB\&.zfs/snapshot\fR directory at the root of the +\fBpool/home/bob\fR file system. + +.sp +.in +2 +.nf +# \fBzfs snapshot pool/home/bob@yesterday\fR +.fi +.in -2 +.sp + +.LP +\fBExample 3 \fRCreating and Destroying Multiple Snapshots +.sp +.LP +The following command creates snapshots named \fByesterday\fR of +\fBpool/home\fR and all of its descendent file systems. Each snapshot is +mounted on demand in the \fB\&.zfs/snapshot\fR directory at the root of its +file system. The second command destroys the newly created snapshots. + +.sp +.in +2 +.nf +# \fBzfs snapshot -r pool/home@yesterday\fR +# \fBzfs destroy -r pool/home@yesterday\fR +.fi +.in -2 +.sp + +.LP +\fBExample 4 \fRDisabling and Enabling File System Compression +.sp +.LP +The following command disables the \fBcompression\fR property for all file +systems under \fBpool/home\fR. The next command explicitly enables +\fBcompression\fR for \fBpool/home/anne\fR. + +.sp +.in +2 +.nf +# \fBzfs set compression=off pool/home\fR +# \fBzfs set compression=on pool/home/anne\fR +.fi +.in -2 +.sp + +.LP +\fBExample 5 \fRListing ZFS Datasets +.sp +.LP +The following command lists all active file systems and volumes in the system. +Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR. The +default is \fBoff\fR. 
See \fBzpool\fR(1M) for more information on pool +properties. + +.sp +.in +2 +.nf +# \fBzfs list\fR + NAME USED AVAIL REFER MOUNTPOINT + pool 450K 457G 18K /pool + pool/home 315K 457G 21K /export/home + pool/home/anne 18K 457G 18K /export/home/anne + pool/home/bob 276K 457G 276K /export/home/bob +.fi +.in -2 +.sp + +.LP +\fBExample 6 \fRSetting a Quota on a ZFS File System +.sp +.LP +The following command sets a quota of 50 Gbytes for \fBpool/home/bob\fR. + +.sp +.in +2 +.nf +# \fBzfs set quota=50G pool/home/bob\fR +.fi +.in -2 +.sp + +.LP +\fBExample 7 \fRListing ZFS Properties +.sp +.LP +The following command lists all properties for \fBpool/home/bob\fR. + +.sp +.in +2 +.nf +# \fBzfs get all pool/home/bob\fR +NAME PROPERTY VALUE SOURCE +pool/home/bob type filesystem - +pool/home/bob creation Tue Jul 21 15:53 2009 - +pool/home/bob used 21K - +pool/home/bob available 20.0G - +pool/home/bob referenced 21K - +pool/home/bob compressratio 1.00x - +pool/home/bob mounted yes - +pool/home/bob quota 20G local +pool/home/bob reservation none default +pool/home/bob recordsize 128K default +pool/home/bob mountpoint /pool/home/bob default +pool/home/bob sharenfs off default +pool/home/bob checksum on default +pool/home/bob compression on local +pool/home/bob atime on default +pool/home/bob devices on default +pool/home/bob exec on default +pool/home/bob setuid on default +pool/home/bob readonly off default +pool/home/bob zoned off default +pool/home/bob snapdir hidden default +pool/home/bob aclmode groupmask default +pool/home/bob aclinherit restricted default +pool/home/bob canmount on default +pool/home/bob shareiscsi off default +pool/home/bob xattr on default +pool/home/bob copies 1 default +pool/home/bob version 4 - +pool/home/bob utf8only off - +pool/home/bob normalization none - +pool/home/bob casesensitivity sensitive - +pool/home/bob vscan off default +pool/home/bob nbmand off default +pool/home/bob sharesmb off default +pool/home/bob refquota none default 
+pool/home/bob refreservation none default +pool/home/bob primarycache all default +pool/home/bob secondarycache all default +pool/home/bob usedbysnapshots 0 - +pool/home/bob usedbydataset 21K - +pool/home/bob usedbychildren 0 - +pool/home/bob usedbyrefreservation 0 - +.fi +.in -2 +.sp + +.sp +.LP +The following command gets a single property value. + +.sp +.in +2 +.nf +# \fBzfs get -H -o value compression pool/home/bob\fR +on +.fi +.in -2 +.sp + +.sp +.LP +The following command lists all properties with local settings for +\fBpool/home/bob\fR. + +.sp +.in +2 +.nf +# \fBzfs get -r -s local -o name,property,value all pool/home/bob\fR +NAME PROPERTY VALUE +pool/home/bob quota 20G +pool/home/bob compression on +.fi +.in -2 +.sp + +.LP +\fBExample 8 \fRRolling Back a ZFS File System +.sp +.LP +The following command reverts the contents of \fBpool/home/anne\fR to the +snapshot named \fByesterday\fR, deleting all intermediate snapshots. + +.sp +.in +2 +.nf +# \fBzfs rollback -r pool/home/anne@yesterday\fR +.fi +.in -2 +.sp + +.LP +\fBExample 9 \fRCreating a ZFS Clone +.sp +.LP +The following command creates a writable file system whose initial contents are +the same as \fBpool/home/bob@yesterday\fR. 
+ +.sp +.in +2 +.nf +# \fBzfs clone pool/home/bob@yesterday pool/clone\fR +.fi +.in -2 +.sp + +.LP +\fBExample 10 \fRPromoting a ZFS Clone +.sp +.LP +The following commands illustrate how to test out changes to a file system, and +then replace the original file system with the changed one, using clones, clone +promotion, and renaming: + +.sp +.in +2 +.nf +# \fBzfs create pool/project/production\fR + populate /pool/project/production with data +# \fBzfs snapshot pool/project/production@today\fR +# \fBzfs clone pool/project/production@today pool/project/beta\fR +make changes to /pool/project/beta and test them +# \fBzfs promote pool/project/beta\fR +# \fBzfs rename pool/project/production pool/project/legacy\fR +# \fBzfs rename pool/project/beta pool/project/production\fR +once the legacy version is no longer needed, it can be destroyed +# \fBzfs destroy pool/project/legacy\fR +.fi +.in -2 +.sp + +.LP +\fBExample 11 \fRInheriting ZFS Properties +.sp +.LP +The following command causes \fBpool/home/bob\fR and \fBpool/home/anne\fR to +inherit the \fBchecksum\fR property from their parent. + +.sp +.in +2 +.nf +# \fBzfs inherit checksum pool/home/bob pool/home/anne\fR +.fi +.in -2 +.sp + +.LP +\fBExample 12 \fRRemotely Replicating ZFS Data +.sp +.LP +The following commands send a full stream and then an incremental stream to a +remote machine, restoring them into \fBpoolB/received/fs@a\fR and +\fBpoolB/received/fs@b\fR, respectively. \fBpoolB\fR must contain the file +system \fBpoolB/received\fR, and must not initially contain +\fBpoolB/received/fs\fR.
+ +.sp +.in +2 +.nf +# \fBzfs send pool/fs@a | \e\fR + \fBssh host zfs receive poolB/received/fs@a\fR +# \fBzfs send -i a pool/fs@b | ssh host \e\fR + \fBzfs receive poolB/received/fs\fR +.fi +.in -2 +.sp + +.LP +\fBExample 13 \fRUsing the \fBzfs receive\fR \fB-d\fR Option +.sp +.LP +The following command sends a full stream of \fBpoolA/fsA/fsB@snap\fR to a +remote machine, receiving it into \fBpoolB/received/fsA/fsB@snap\fR. The +\fBfsA/fsB@snap\fR portion of the received snapshot's name is determined from +the name of the sent snapshot. \fBpoolB\fR must contain the file system +\fBpoolB/received\fR. If \fBpoolB/received/fsA\fR does not exist, it is created +as an empty file system. + +.sp +.in +2 +.nf +# \fBzfs send poolA/fsA/fsB@snap | \e + ssh host zfs receive -d poolB/received\fR +.fi +.in -2 +.sp + +.LP +\fBExample 14 \fRSetting User Properties +.sp +.LP +The following example sets the user-defined \fBcom.example:department\fR +property for a dataset. + +.sp +.in +2 +.nf +# \fBzfs set com.example:department=12345 tank/accounting\fR +.fi +.in -2 +.sp + +.LP +\fBExample 15 \fRCreating a ZFS Volume as an iSCSI Target Device +.sp +.LP +The following example shows how to create a \fBZFS\fR volume as an \fBiSCSI\fR +target. + +.sp +.in +2 +.nf +# \fBzfs create -V 2g pool/volumes/vol1\fR +# \fBzfs set shareiscsi=on pool/volumes/vol1\fR +# \fBiscsitadm list target\fR +Target: pool/volumes/vol1 + iSCSI Name: + iqn.1986-03.com.sun:02:7b4b02a6-3277-eb1b-e686-a24762c52a8c + Connections: 0 +.fi +.in -2 +.sp + +.sp +.LP +After the \fBiSCSI\fR target is created, set up the \fBiSCSI\fR initiator. For +more information about the Solaris \fBiSCSI\fR initiator, see +\fBiscsitadm\fR(1M). +.LP +\fBExample 16 \fRPerforming a Rolling Snapshot +.sp +.LP +The following example shows how to maintain a history of snapshots with a +consistent naming scheme. 
To keep a week's worth of snapshots, the user +destroys the oldest snapshot, renames the remaining snapshots, and then creates +a new snapshot, as follows: + +.sp +.in +2 +.nf +# \fBzfs destroy -r pool/users@7daysago\fR +# \fBzfs rename -r pool/users@6daysago @7daysago\fR +# \fBzfs rename -r pool/users@5daysago @6daysago\fR +# \fBzfs rename -r pool/users@4daysago @5daysago\fR +# \fBzfs rename -r pool/users@3daysago @4daysago\fR +# \fBzfs rename -r pool/users@2daysago @3daysago\fR +# \fBzfs rename -r pool/users@yesterday @2daysago\fR +# \fBzfs rename -r pool/users@today @yesterday\fR +# \fBzfs snapshot -r pool/users@today\fR +.fi +.in -2 +.sp + +.LP +\fBExample 17 \fRSetting \fBsharenfs\fR Property Options on a ZFS File System +.sp +.LP +The following commands show how to set \fBsharenfs\fR property options to +enable \fBrw\fR access for a set of \fBIP\fR addresses and to enable root +access for system \fBneo\fR on the \fBtank/home\fR file system. + +.sp +.in +2 +.nf +# \fBzfs set sharenfs='rw=@123.123.0.0/16,root=neo' tank/home\fR +.fi +.in -2 +.sp + +.sp +.LP +If you are using \fBDNS\fR for host name resolution, specify the fully +qualified hostname. + +.LP +\fBExample 18 \fRDelegating ZFS Administration Permissions on a ZFS Dataset +.sp +.LP +The following example shows how to set permissions so that user \fBcindys\fR +can create, destroy, mount, and take snapshots on \fBtank/cindys\fR. The +permissions on \fBtank/cindys\fR are also displayed.
+ +.sp +.in +2 +.nf +# \fBzfs allow cindys create,destroy,mount,snapshot tank/cindys\fR +# \fBzfs allow tank/cindys\fR +------------------------------------------------------------- +Local+Descendent permissions on (tank/cindys) + user cindys create,destroy,mount,snapshot +------------------------------------------------------------- +.fi +.in -2 +.sp + +.sp +.LP +Because the \fBtank/cindys\fR mount point permission is set to 755 by default, +user \fBcindys\fR will be unable to mount file systems under \fBtank/cindys\fR. +Set an \fBACL\fR similar to the following syntax to provide mount point access: +.sp +.in +2 +.nf +# \fBchmod A+user:cindys:add_subdirectory:allow /tank/cindys\fR +.fi +.in -2 +.sp + +.LP +\fBExample 19 \fRDelegating Create Time Permissions on a ZFS Dataset +.sp +.LP +The following example shows how to allow anyone in the group \fBstaff\fR to +create file systems in \fBtank/users\fR. This syntax also allows staff members +to destroy their own file systems, but not destroy anyone else's file system. +The permissions on \fBtank/users\fR are also displayed. + +.sp +.in +2 +.nf +# \fBzfs allow staff create,mount tank/users\fR +# \fBzfs allow -c destroy tank/users\fR +# \fBzfs allow tank/users\fR +------------------------------------------------------------- +Create time permissions on (tank/users) + create,destroy +Local+Descendent permissions on (tank/users) + group staff create,mount +------------------------------------------------------------- +.fi +.in -2 +.sp + +.LP +\fBExample 20 \fRDefining and Granting a Permission Set on a ZFS Dataset +.sp +.LP +The following example shows how to define and grant a permission set on the +\fBtank/users\fR file system. The permissions on \fBtank/users\fR are also +displayed.
+ +.sp +.in +2 +.nf +# \fBzfs allow -s @pset create,destroy,snapshot,mount tank/users\fR +# \fBzfs allow staff @pset tank/users\fR +# \fBzfs allow tank/users\fR +------------------------------------------------------------- +Permission sets on (tank/users) + @pset create,destroy,mount,snapshot +Create time permissions on (tank/users) + create,destroy +Local+Descendent permissions on (tank/users) + group staff @pset,create,mount +------------------------------------------------------------- +.fi +.in -2 +.sp + +.LP +\fBExample 21 \fRDelegating Property Permissions on a ZFS Dataset +.sp +.LP +The following example shows how to grant the ability to set quotas and reservations +on the \fBusers/home\fR file system. The permissions on \fBusers/home\fR are +also displayed. + +.sp +.in +2 +.nf +# \fBzfs allow cindys quota,reservation users/home\fR +# \fBzfs allow users/home\fR +------------------------------------------------------------- +Local+Descendent permissions on (users/home) + user cindys quota,reservation +------------------------------------------------------------- +cindys% \fBzfs set quota=10G users/home/marks\fR +cindys% \fBzfs get quota users/home/marks\fR +NAME PROPERTY VALUE SOURCE +users/home/marks quota 10G local +.fi +.in -2 +.sp + +.LP +\fBExample 22 \fRRemoving ZFS Delegated Permissions on a ZFS Dataset +.sp +.LP +The following example shows how to remove the snapshot permission from the +\fBstaff\fR group on the \fBtank/users\fR file system. The permissions on +\fBtank/users\fR are also displayed.
+ +.sp +.in +2 +.nf +# \fBzfs unallow staff snapshot tank/users\fR +# \fBzfs allow tank/users\fR +------------------------------------------------------------- +Permission sets on (tank/users) + @pset create,destroy,mount,snapshot +Create time permissions on (tank/users) + create,destroy +Local+Descendent permissions on (tank/users) + group staff @pset,create,mount +------------------------------------------------------------- +.fi +.in -2 +.sp + +.SH EXIT STATUS +.sp +.LP +The following exit values are returned: +.sp +.ne 2 +.mk +.na +\fB\fB0\fR\fR +.ad +.sp .6 +.RS 4n +Successful completion. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB1\fR\fR +.ad +.sp .6 +.RS 4n +An error occurred. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB2\fR\fR +.ad +.sp .6 +.RS 4n +Invalid command line options were specified. +.RE + +.SH ATTRIBUTES +.sp +.LP +See \fBattributes\fR(5) for descriptions of the following attributes: +.sp + +.sp +.TS +tab() box; +cw(2.75i) |cw(2.75i) +lw(2.75i) |lw(2.75i) +. +ATTRIBUTE TYPEATTRIBUTE VALUE +_ +Interface StabilityCommitted +.TE + +.SH SEE ALSO +.sp +.LP +\fBssh\fR(1), \fBiscsitadm\fR(1M), \fBmount\fR(1M), \fBshare\fR(1M), +\fBsharemgr\fR(1M), \fBunshare\fR(1M), \fBzonecfg\fR(1M), \fBzpool\fR(1M), +\fBchmod\fR(2), \fBstat\fR(2), \fBwrite\fR(2), \fBfsync\fR(3C), +\fBdfstab\fR(4), \fBattributes\fR(5) +.sp +.LP +See the \fBgzip\fR(1) man page, which is not part of the SunOS man page +collection. +.sp +.LP +For information about using the \fBZFS\fR web-based management tool and other +\fBZFS\fR features, see the \fISolaris ZFS Administration Guide\fR. diff --git a/man/man1m/zpool.1m b/man/man1m/zpool.1m new file mode 100644 index 0000000..7a67781 --- /dev/null +++ b/man/man1m/zpool.1m @@ -0,0 +1,2146 @@ +'\" te +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. 
You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.TH zpool 1M "21 Sep 2009" "SunOS 5.11" "System Administration Commands" +.SH NAME +zpool \- configures ZFS storage pools +.SH SYNOPSIS +.LP +.nf +\fBzpool\fR [\fB-?\fR] +.fi + +.LP +.nf +\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ... +.fi + +.LP +.nf +\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR +.fi + +.LP +.nf +\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] +.fi + +.LP +.nf +\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] + ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ... +.fi + +.LP +.nf +\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR +.fi + +.LP +.nf +\fBzpool detach\fR \fIpool\fR \fIdevice\fR +.fi + +.LP +.nf +\fBzpool export\fR [\fB-f\fR] \fIpool\fR ... +.fi + +.LP +.nf +\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ... +.fi + +.LP +.nf +\fBzpool history\fR [\fB-il\fR] [\fIpool\fR] ... +.fi + +.LP +.nf +\fBzpool import\fR [\fB-d\fR \fIdir\fR] [\fB-D\fR] +.fi + +.LP +.nf +\fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] + [\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fB-a\fR +.fi + +.LP +.nf +\fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... 
[\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] + [\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR |\fIid\fR [\fInewpool\fR] +.fi + +.LP +.nf +\fBzpool iostat\fR [\fB-T\fR u | d ] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]] +.fi + +.LP +.nf +\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ... +.fi + +.LP +.nf +\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ... +.fi + +.LP +.nf +\fBzpool online\fR \fIpool\fR \fIdevice\fR ... +.fi + +.LP +.nf +\fBzpool remove\fR \fIpool\fR \fIdevice\fR ... +.fi + +.LP +.nf +\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR [\fInew_device\fR] +.fi + +.LP +.nf +\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ... +.fi + +.LP +.nf +\fBzpool set\fR \fIproperty\fR=\fIvalue\fR \fIpool\fR +.fi + +.LP +.nf +\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ... +.fi + +.LP +.nf +\fBzpool upgrade\fR +.fi + +.LP +.nf +\fBzpool upgrade\fR \fB-v\fR +.fi + +.LP +.nf +\fBzpool upgrade\fR [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIpool\fR ... +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBzpool\fR command configures \fBZFS\fR storage pools. A storage pool is a +collection of devices that provides physical storage and data replication for +\fBZFS\fR datasets. +.sp +.LP +All datasets within a storage pool share the same space. See \fBzfs\fR(1M) for +information on managing datasets. +.SS "Virtual Devices (\fBvdev\fRs)" +.sp +.LP +A "virtual device" describes a single device or a collection of devices +organized according to certain performance and fault characteristics. The +following virtual devices are supported: +.sp +.ne 2 +.mk +.na +\fB\fBdisk\fR\fR +.ad +.RS 10n +.rt +A block device, typically located under \fB/dev/dsk\fR. \fBZFS\fR can use +individual slices or partitions, though the recommended mode of operation is to +use whole disks. A disk can be specified by a full path, or it can be a +shorthand name (the relative portion of the path under "/dev/dsk"). 
A whole +disk can be specified by omitting the slice or partition designation. For +example, "c0t0d0" is equivalent to "/dev/dsk/c0t0d0s2". When given a whole +disk, \fBZFS\fR automatically labels the disk, if necessary. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBfile\fR\fR +.ad +.RS 10n +.rt +A regular file. The use of files as a backing store is strongly discouraged. It +is designed primarily for experimental purposes, as the fault tolerance of a +file is only as good as the file system of which it is a part. A file must be +specified by a full path. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBmirror\fR\fR +.ad +.RS 10n +.rt +A mirror of two or more devices. Data is replicated in an identical fashion +across all components of a mirror. A mirror with \fIN\fR disks of size \fIX\fR +can hold \fIX\fR bytes and can withstand (\fIN-1\fR) devices failing before +data integrity is compromised. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBraidz\fR\fR +.ad +.br +.na +\fB\fBraidz1\fR\fR +.ad +.br +.na +\fB\fBraidz2\fR\fR +.ad +.br +.na +\fB\fBraidz3\fR\fR +.ad +.RS 10n +.rt +A variation on \fBRAID-5\fR that allows for better distribution of parity and +eliminates the "\fBRAID-5\fR write hole" (in which data and parity become +inconsistent after a power loss). Data and parity is striped across all disks +within a \fBraidz\fR group. +.sp +A \fBraidz\fR group can have single-, double- , or triple parity, meaning that +the \fBraidz\fR group can sustain one, two, or three failures, respectively, +without losing any data. The \fBraidz1\fR \fBvdev\fR type specifies a +single-parity \fBraidz\fR group; the \fBraidz2\fR \fBvdev\fR type specifies a +double-parity \fBraidz\fR group; and the \fBraidz3\fR \fBvdev\fR type specifies +a triple-parity \fBraidz\fR group. The \fBraidz\fR \fBvdev\fR type is an alias +for \fBraidz1\fR. 
+.sp +A \fBraidz\fR group with \fIN\fR disks of size \fIX\fR with \fIP\fR parity +disks can hold approximately (\fIN-P\fR)*\fIX\fR bytes and can withstand +\fIP\fR device(s) failing before data integrity is compromised. The minimum +number of devices in a \fBraidz\fR group is one more than the number of parity +disks. The recommended number is between 3 and 9 to help increase performance. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBspare\fR\fR +.ad +.RS 10n +.rt +A special pseudo-\fBvdev\fR which keeps track of available hot spares for a +pool. For more information, see the "Hot Spares" section. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBlog\fR\fR +.ad +.RS 10n +.rt +A separate-intent log device. If more than one log device is specified, then +writes are load-balanced between devices. Log devices can be mirrored. However, +\fBraidz\fR \fBvdev\fR types are not supported for the intent log. For more +information, see the "Intent Log" section. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcache\fR\fR +.ad +.RS 10n +.rt +A device used to cache storage pool data. A cache device cannot be +configured as a mirror or \fBraidz\fR group. For more information, see the +"Cache Devices" section. +.RE + +.sp +.LP +Virtual devices cannot be nested, so a mirror or \fBraidz\fR virtual device can +only contain files or disks. Mirrors of mirrors (or other combinations) are not +allowed. +.sp +.LP +A pool can have any number of virtual devices at the top of the configuration +(known as "root vdevs"). Data is dynamically distributed across all top-level +devices to balance data among devices. As new virtual devices are added, +\fBZFS\fR automatically places data on the newly available devices. +.sp +.LP +Virtual devices are specified one at a time on the command line, separated by +whitespace. The keywords "mirror" and "raidz" are used to distinguish where a +group ends and another begins.
For example, the following creates two root +vdevs, each a mirror of two disks: +.sp +.in +2 +.nf +# \fBzpool create mypool mirror c0t0d0 c0t1d0 mirror c1t0d0 c1t1d0\fR +.fi +.in -2 +.sp + +.SS "Device Failure and Recovery" +.sp +.LP +\fBZFS\fR supports a rich set of mechanisms for handling device failure and +data corruption. All metadata and data is checksummed, and \fBZFS\fR +automatically repairs bad data from a good copy when corruption is detected. +.sp +.LP +In order to take advantage of these features, a pool must make use of some form +of redundancy, using either mirrored or \fBraidz\fR groups. While \fBZFS\fR +supports running in a non-redundant configuration, where each root vdev is +simply a disk or file, this is strongly discouraged. A single case of bit +corruption can render some or all of your data unavailable. +.sp +.LP +A pool's health status is described by one of three states: online, degraded, +or faulted. An online pool has all devices operating normally. A degraded pool +is one in which one or more devices have failed, but the data is still +available due to a redundant configuration. A faulted pool has corrupted +metadata, or one or more faulted devices, and insufficient replicas to continue +functioning. +.sp +.LP +The health of the top-level vdev, such as mirror or \fBraidz\fR device, is +potentially impacted by the state of its associated vdevs, or component +devices. A top-level vdev or component device is in one of the following +states: +.sp +.ne 2 +.mk +.na +\fB\fBDEGRADED\fR\fR +.ad +.RS 12n +.rt +One or more top-level vdevs is in the degraded state because one or more +component devices are offline. Sufficient replicas exist to continue +functioning. +.sp +One or more component devices is in the degraded or faulted state, but +sufficient replicas exist to continue functioning. 
The underlying conditions +are as follows: +.RS +4 +.TP +.ie t \(bu +.el o +The number of checksum errors exceeds acceptable levels and the device is +degraded as an indication that something may be wrong. \fBZFS\fR continues to +use the device as necessary. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +The number of I/O errors exceeds acceptable levels. The device could not be +marked as faulted because there are insufficient replicas to continue +functioning. +.RE +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBFAULTED\fR\fR +.ad +.RS 12n +.rt +One or more top-level vdevs is in the faulted state because one or more +component devices are offline. Insufficient replicas exist to continue +functioning. +.sp +One or more component devices is in the faulted state, and insufficient +replicas exist to continue functioning. The underlying conditions are as +follows: +.RS +4 +.TP +.ie t \(bu +.el o +The device could be opened, but the contents did not match expected values. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +The number of I/O errors exceeds acceptable levels and the device is faulted to +prevent further use of the device. +.RE +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBOFFLINE\fR\fR +.ad +.RS 12n +.rt +The device was explicitly taken offline by the "\fBzpool offline\fR" command. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBONLINE\fR\fR +.ad +.RS 12n +.rt +The device is online and functioning. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBREMOVED\fR\fR +.ad +.RS 12n +.rt +The device was physically removed while the system was running. Device removal +detection is hardware-dependent and may not be supported on all platforms. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBUNAVAIL\fR\fR +.ad +.RS 12n +.rt +The device could not be opened. If a pool is imported when a device was +unavailable, then the device will be identified by a unique identifier instead +of its path since the path was never correct in the first place. 
+.RE + +.sp +.LP +If a device is removed and later re-attached to the system, \fBZFS\fR attempts +to put the device online automatically. Device attach detection is +hardware-dependent and might not be supported on all platforms. +.SS "Hot Spares" +.sp +.LP +\fBZFS\fR allows devices to be associated with pools as "hot spares". These +devices are not actively used in the pool, but when an active device fails, it +is automatically replaced by a hot spare. To create a pool with hot spares, +specify a "spare" \fBvdev\fR with any number of devices. For example, +.sp +.in +2 +.nf +# zpool create pool mirror c0d0 c1d0 spare c2d0 c3d0 +.fi +.in -2 +.sp + +.sp +.LP +Spares can be shared across multiple pools, and can be added with the "\fBzpool +add\fR" command and removed with the "\fBzpool remove\fR" command. Once a spare +replacement is initiated, a new "spare" \fBvdev\fR is created within the +configuration that will remain there until the original device is replaced. At +this point, the hot spare becomes available again if another device fails. +.sp +.LP +If a pool has a shared spare that is currently being used, the pool can not be +exported since other pools may use this shared spare, which may lead to +potential data corruption. +.sp +.LP +An in-progress spare replacement can be cancelled by detaching the hot spare. +If the original faulted device is detached, then the hot spare assumes its +place in the configuration, and is removed from the spare list of all active +pools. +.sp +.LP +Spares cannot replace log devices. +.SS "Intent Log" +.sp +.LP +The \fBZFS\fR Intent Log (\fBZIL\fR) satisfies \fBPOSIX\fR requirements for +synchronous transactions. For instance, databases often require their +transactions to be on stable storage devices when returning from a system call. +\fBNFS\fR and other applications can also use \fBfsync\fR() to ensure data +stability. By default, the intent log is allocated from blocks within the main +pool. 
However, it might be possible to get better performance using separate +intent log devices such as \fBNVRAM\fR or a dedicated disk. For example: +.sp +.in +2 +.nf +\fB# zpool create pool c0d0 c1d0 log c2d0\fR +.fi +.in -2 +.sp + +.sp +.LP +Multiple log devices can also be specified, and they can be mirrored. See the +EXAMPLES section for an example of mirroring multiple log devices. +.sp +.LP +Log devices can be added, replaced, attached, detached, and imported and +exported as part of the larger pool. Mirrored log devices can be removed by +specifying the top-level mirror for the log. +.SS "Cache Devices" +.sp +.LP +Devices can be added to a storage pool as "cache devices." These devices +provide an additional layer of caching between main memory and disk. For +read-heavy workloads, where the working set size is much larger than what can +be cached in main memory, using cache devices allows much more of this working +set to be served from low latency media. Using cache devices provides the +greatest performance improvement for random-read workloads of mostly static +content. +.sp +.LP +To create a pool with cache devices, specify a "cache" \fBvdev\fR with any +number of devices. For example: +.sp +.in +2 +.nf +\fB# zpool create pool c0d0 c1d0 cache c2d0 c3d0\fR +.fi +.in -2 +.sp + +.sp +.LP +Cache devices cannot be mirrored or part of a \fBraidz\fR configuration. If a +read error is encountered on a cache device, that read \fBI/O\fR is reissued to +the original storage pool device, which might be part of a mirrored or +\fBraidz\fR configuration. +.sp +.LP +The content of the cache devices is considered volatile, as is the case with +other system caches. +.SS "Properties" +.sp +.LP +Each pool has several properties associated with it. Some properties are +read-only statistics while others are configurable and change the behavior of +the pool.
The following are read-only properties: +.sp +.ne 2 +.mk +.na +\fB\fBavailable\fR\fR +.ad +.RS 20n +.rt +Amount of storage available within the pool. This property can also be referred +to by its shortened column name, "avail". +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcapacity\fR\fR +.ad +.RS 20n +.rt +Percentage of pool space used. This property can also be referred to by its +shortened column name, "cap". +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBhealth\fR\fR +.ad +.RS 20n +.rt +The current health of the pool. Health can be "\fBONLINE\fR", "\fBDEGRADED\fR", +"\fBFAULTED\fR", " \fBOFFLINE\fR", "\fBREMOVED\fR", or "\fBUNAVAIL\fR". +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBguid\fR\fR +.ad +.RS 20n +.rt +A unique identifier for the pool. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBsize\fR\fR +.ad +.RS 20n +.rt +Total size of the storage pool. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBused\fR\fR +.ad +.RS 20n +.rt +Amount of storage space used within the pool. +.RE + +.sp +.LP +These space usage properties report actual physical space available to the +storage pool. The physical space can be different from the total amount of +space that any contained datasets can actually use. The amount of space used in +a \fBraidz\fR configuration depends on the characteristics of the data being +written. In addition, \fBZFS\fR reserves some space for internal accounting +that the \fBzfs\fR(1M) command takes into account, but the \fBzpool\fR command +does not. For non-full pools of a reasonable size, these effects should be +invisible. For small pools, or pools that are close to being completely full, +these discrepancies may become more noticeable. +.sp +.LP +The following property can be set at creation time and import time: +.sp +.ne 2 +.mk +.na +\fB\fBaltroot\fR\fR +.ad +.sp .6 +.RS 4n +Alternate root directory. If set, this directory is prepended to any mount +points within the pool. 
This can be used when examining an unknown pool where +the mount points cannot be trusted, or in an alternate boot environment, where +the typical paths are not valid. \fBaltroot\fR is not a persistent property. It +is valid only while the system is up. Setting \fBaltroot\fR defaults to using +\fBcachefile\fR=none, though this may be overridden using an explicit setting. +.RE + +.sp +.LP +The following properties can be set at creation time and import time, and later +changed with the \fBzpool set\fR command: +.sp +.ne 2 +.mk +.na +\fB\fBautoexpand\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls automatic pool expansion when the underlying LUN is grown. If set to +\fBon\fR, the pool will be resized according to the size of the expanded +device. If the device is part of a mirror or \fBraidz\fR then all devices +within that mirror/\fBraidz\fR group must be expanded before the new space is +made available to the pool. The default behavior is \fBoff\fR. This property +can also be referred to by its shortened column name, \fBexpand\fR. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBautoreplace\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls automatic device replacement. If set to "\fBoff\fR", device +replacement must be initiated by the administrator by using the "\fBzpool +replace\fR" command. If set to "\fBon\fR", any new device, found in the same +physical location as a device that previously belonged to the pool, is +automatically formatted and replaced. The default behavior is "\fBoff\fR". This +property can also be referred to by its shortened column name, "replace". +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBbootfs\fR=\fIpool\fR/\fIdataset\fR\fR +.ad +.sp .6 +.RS 4n +Identifies the default bootable dataset for the root pool. This property is +expected to be set mainly by the installation and upgrade programs. 
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcachefile\fR=\fIpath\fR | \fBnone\fR\fR +.ad +.sp .6 +.RS 4n +Controls the location of where the pool configuration is cached. Discovering +all pools on system startup requires a cached copy of the configuration data +that is stored on the root file system. All pools in this cache are +automatically imported when the system boots. Some environments, such as +install and clustering, need to cache this information in a different location +so that pools are not automatically imported. Setting this property caches the +pool configuration in a different location that can later be imported with +"\fBzpool import -c\fR". Setting it to the special value "\fBnone\fR" creates a +temporary pool that is never cached, and the special value \fB\&''\fR (empty +string) uses the default location. +.sp +Multiple pools can share the same cache file. Because the kernel destroys and +recreates this file when pools are added and removed, care should be taken when +attempting to access this file. When the last pool using a \fBcachefile\fR is +exported or destroyed, the file is removed. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBdelegation\fR=\fBon\fR | \fBoff\fR\fR +.ad +.sp .6 +.RS 4n +Controls whether a non-privileged user is granted access based on the dataset +permissions defined on the dataset. See \fBzfs\fR(1M) for more information on +\fBZFS\fR delegated administration. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBfailmode\fR=\fBwait\fR | \fBcontinue\fR | \fBpanic\fR\fR +.ad +.sp .6 +.RS 4n +Controls the system behavior in the event of catastrophic pool failure. This +condition is typically a result of a loss of connectivity to the underlying +storage device(s) or a failure of all devices within the pool. The behavior of +such an event is determined as follows: +.sp +.ne 2 +.mk +.na +\fB\fBwait\fR\fR +.ad +.RS 12n +.rt +Blocks all \fBI/O\fR access until the device connectivity is recovered and the +errors are cleared. This is the default behavior. 
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fBcontinue\fR\fR +.ad +.RS 12n +.rt +Returns \fBEIO\fR to any new write \fBI/O\fR requests but allows reads to any +of the remaining healthy devices. Any write requests that have yet to be +committed to disk would be blocked. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBpanic\fR\fR +.ad +.RS 12n +.rt +Prints out a message to the console and generates a system crash dump. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBlistsnaps\fR=on | off\fR +.ad +.sp .6 +.RS 4n +Controls whether information about snapshots associated with this pool is +output when "\fBzfs list\fR" is run without the \fB-t\fR option. The default +value is "off". +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBversion\fR=\fIversion\fR\fR +.ad +.sp .6 +.RS 4n +The current on-disk version of the pool. This can be increased, but never +decreased. The preferred method of updating pools is with the "\fBzpool +upgrade\fR" command, though this property can be used when a specific version +is needed for backwards compatibility. This property can be any number between +1 and the current version reported by "\fBzpool upgrade -v\fR". +.RE + +.SS "Subcommands" +.sp +.LP +All subcommands that modify state are logged persistently to the pool in their +original form. +.sp +.LP +The \fBzpool\fR command provides subcommands to create and destroy storage +pools, add capacity to storage pools, and provide information about the storage +pools. The following subcommands are supported: +.sp +.ne 2 +.mk +.na +\fB\fBzpool\fR \fB-?\fR\fR +.ad +.sp .6 +.RS 4n +Displays a help message. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...\fR +.ad +.sp .6 +.RS 4n +Adds the specified virtual devices to the given pool. The \fIvdev\fR +specification is described in the "Virtual Devices" section. The behavior of +the \fB-f\fR option, and the device checks performed are described in the +"zpool create" subcommand. 
+.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.RS 6n +.rt +Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting +replication level. Not all devices can be overridden in this manner. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-n\fR\fR +.ad +.RS 6n +.rt +Displays the configuration that would be used without actually adding the +\fBvdev\fRs. The actual pool creation can still fail due to insufficient +privileges or device sharing. +.RE + +Do not add a disk that is currently configured as a quorum device to a zpool. +After a disk is in the pool, that disk can then be configured as a quorum +device. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR +.ad +.sp .6 +.RS 4n +Attaches \fInew_device\fR to an existing \fBzpool\fR device. The existing +device cannot be part of a \fBraidz\fR configuration. If \fIdevice\fR is not +currently part of a mirrored configuration, \fIdevice\fR automatically +transforms into a two-way mirror of \fIdevice\fR and \fInew_device\fR. If +\fIdevice\fR is part of a two-way mirror, attaching \fInew_device\fR creates a +three-way mirror, and so on. In either case, \fInew_device\fR begins to +resilver immediately. +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.RS 6n +.rt +Forces use of \fInew_device\fR, even if it appears to be in use. Not all +devices can be overridden in this manner. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...\fR +.ad +.sp .6 +.RS 4n +Clears device errors in a pool. If no arguments are specified, all device +errors within the pool are cleared. If one or more devices is specified, only +those errors associated with the specified device or devices are cleared. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR +\fIfile-system-property=value\fR] ...
[\fB-m\fR \fImountpoint\fR] [\fB-R\fR +\fIroot\fR] \fIpool\fR \fIvdev\fR ...\fR +.ad +.sp .6 +.RS 4n +Creates a new storage pool containing the virtual devices specified on the +command line. The pool name must begin with a letter, and can only contain +alphanumeric characters as well as underscore ("_"), dash ("-"), and period +("."). The pool names "mirror", "raidz", "spare" and "log" are reserved, as are +names beginning with the pattern "c[0-9]". The \fBvdev\fR specification is +described in the "Virtual Devices" section. +.sp +The command verifies that each device specified is accessible and not currently +in use by another subsystem. There are some uses, such as being currently +mounted, or specified as the dedicated dump device, that prevent a device from +ever being used by \fBZFS\fR. Other uses, such as having a preexisting +\fBUFS\fR file system, can be overridden with the \fB-f\fR option. +.sp +The command also checks that the replication strategy for the pool is +consistent. An attempt to combine redundant and non-redundant storage in a +single pool, or to mix disks and files, results in an error unless \fB-f\fR is +specified. The use of differently sized devices within a single \fBraidz\fR or +mirror group is also flagged as an error unless \fB-f\fR is specified. +.sp +Unless the \fB-R\fR option is specified, the default mount point is +"/\fIpool\fR". The mount point must not exist or must be empty, or else the +root dataset cannot be mounted. This can be overridden with the \fB-m\fR +option. +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.sp .6 +.RS 4n +Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting +replication level. Not all devices can be overridden in this manner. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-n\fR\fR +.ad +.sp .6 +.RS 4n +Displays the configuration that would be used without actually creating the +pool. The actual pool creation can still fail due to insufficient privileges or +device sharing.
+.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR [\fB-o\fR \fIproperty=value\fR] ...\fR +.ad +.sp .6 +.RS 4n +Sets the given pool properties. See the "Properties" section for a list of +valid properties that can be set. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-O\fR \fIfile-system-property=value\fR\fR +.ad +.br +.na +\fB[\fB-O\fR \fIfile-system-property=value\fR] ...\fR +.ad +.sp .6 +.RS 4n +Sets the given file system properties in the root file system of the pool. See +the "Properties" section of \fBzfs\fR(1M) for a list of valid properties that +can be set. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-R\fR \fIroot\fR\fR +.ad +.sp .6 +.RS 4n +Equivalent to "-o cachefile=none,altroot=\fIroot\fR" +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-m\fR \fImountpoint\fR\fR +.ad +.sp .6 +.RS 4n +Sets the mount point for the root dataset. The default mount point is +"/\fIpool\fR" or "\fBaltroot\fR/\fIpool\fR" if \fBaltroot\fR is specified. The +mount point must be an absolute path, "\fBlegacy\fR", or "\fBnone\fR". For more +information on dataset mount points, see \fBzfs\fR(1M). +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR\fR +.ad +.sp .6 +.RS 4n +Destroys the given pool, freeing up any devices for other use. This command +tries to unmount any active datasets before destroying the pool. +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.RS 6n +.rt +Forces any active datasets contained within the pool to be unmounted. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool detach\fR \fIpool\fR \fIdevice\fR\fR +.ad +.sp .6 +.RS 4n +Detaches \fIdevice\fR from a mirror. The operation is refused if there are no +other valid replicas of the data. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...\fR +.ad +.sp .6 +.RS 4n +Exports the given pools from the system. All devices are marked as exported, +but are still considered in use by other subsystems. 
The devices can be moved +between systems (even those of different endianness) and imported as long as a +sufficient number of devices are present. +.sp +Before exporting the pool, all datasets within the pool are unmounted. A pool +cannot be exported if it has a shared spare that is currently being used. +.sp +For pools to be portable, you must give the \fBzpool\fR command whole disks, +not just slices, so that \fBZFS\fR can label the disks with portable \fBEFI\fR +labels. Otherwise, disk drivers on platforms of different endianness will not +recognize the disks. +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.RS 6n +.rt +Forcefully unmount all datasets, using the "\fBunmount -f\fR" command. +.sp +This command will forcefully export the pool even if it has a shared spare that +is currently being used. This may lead to potential data corruption. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...\fR +.ad +.sp .6 +.RS 4n +Retrieves the given list of properties (or all properties if "\fBall\fR" is +used) for the specified storage pool(s). These properties are displayed with +the following fields: +.sp +.in +2 +.nf + name Name of storage pool + property Property name + value Property value + source Property source, either 'default' or 'local'. +.fi +.in -2 +.sp + +See the "Properties" section for more information on the available pool +properties. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool history\fR [\fB-il\fR] [\fIpool\fR] ...\fR +.ad +.sp .6 +.RS 4n +Displays the command history of the specified pools or all pools if no pool is +specified. +.sp +.ne 2 +.mk +.na +\fB\fB-i\fR\fR +.ad +.RS 6n +.rt +Displays internally logged \fBZFS\fR events in addition to user initiated +events. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-l\fR\fR +.ad +.RS 6n +.rt +Displays log records in long format, which, in addition to the standard format, +includes the user name, the hostname, and the zone in which the operation was +performed.
+.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool import\fR [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] +[\fB-D\fR]\fR +.ad +.sp .6 +.RS 4n +Lists pools available to import. If the \fB-d\fR option is not specified, this +command searches for devices in "/dev/dsk". The \fB-d\fR option can be +specified multiple times, and all directories are searched. If the device +appears to be part of an exported pool, this command displays a summary of the +pool with the name of the pool, a numeric identifier, as well as the \fIvdev\fR +layout and current health of the device for each device or file. Destroyed +pools, pools that were previously destroyed with the "\fBzpool destroy\fR" +command, are not listed unless the \fB-D\fR option is specified. +.sp +The numeric identifier is unique, and can be used instead of the pool name when +multiple exported pools of the same name are available. +.sp +.ne 2 +.mk +.na +\fB\fB-c\fR \fIcachefile\fR\fR +.ad +.RS 16n +.rt +Reads configuration from the given \fBcachefile\fR that was created with the +"\fBcachefile\fR" pool property. This \fBcachefile\fR is used instead of +searching for devices. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-d\fR \fIdir\fR\fR +.ad +.RS 16n +.rt +Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be +specified multiple times. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-D\fR\fR +.ad +.RS 16n +.rt +Lists destroyed pools only. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool import\fR [\fB-o\fR \fImntopts\fR] [ \fB-o\fR +\fIproperty\fR=\fIvalue\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] +[\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fB-a\fR\fR +.ad +.sp .6 +.RS 4n +Imports all pools found in the search directories. Identical to the previous +command, except that all pools with a sufficient number of devices available +are imported. Destroyed pools, pools that were previously destroyed with the +"\fBzpool destroy\fR" command, will not be imported unless the \fB-D\fR option +is specified. 
+.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fImntopts\fR\fR +.ad +.RS 21n +.rt +Comma-separated list of mount options to use when mounting datasets within the +pool. See \fBzfs\fR(1M) for a description of dataset properties and mount +options. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR\fR +.ad +.RS 21n +.rt +Sets the specified property on the imported pool. See the "Properties" section +for more information on the available pool properties. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-c\fR \fIcachefile\fR\fR +.ad +.RS 21n +.rt +Reads configuration from the given \fBcachefile\fR that was created with the +"\fBcachefile\fR" pool property. This \fBcachefile\fR is used instead of +searching for devices. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-d\fR \fIdir\fR\fR +.ad +.RS 21n +.rt +Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be +specified multiple times. This option is incompatible with the \fB-c\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-D\fR\fR +.ad +.RS 21n +.rt +Imports destroyed pools only. The \fB-f\fR option is also required. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.RS 21n +.rt +Forces import, even if the pool appears to be potentially active. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-a\fR\fR +.ad +.RS 21n +.rt +Searches for and imports all pools found. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-R\fR \fIroot\fR\fR +.ad +.RS 21n +.rt +Sets the "\fBcachefile\fR" property to "\fBnone\fR" and the "\fIaltroot\fR" +property to "\fIroot\fR". +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool import\fR [\fB-o\fR \fImntopts\fR] [ \fB-o\fR +\fIproperty\fR=\fIvalue\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] +[\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR | \fIid\fR +[\fInewpool\fR]\fR +.ad +.sp .6 +.RS 4n +Imports a specific pool. A pool can be identified by its name or the numeric +identifier. If \fInewpool\fR is specified, the pool is imported using the name +\fInewpool\fR. 
Otherwise, it is imported with the same name as its exported +name. +.sp +If a device is removed from a system without running "\fBzpool export\fR" +first, the device appears as potentially active. It cannot be determined if +this was a failed export, or whether the device is really in use from another +host. To import a pool in this state, the \fB-f\fR option is required. +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fImntopts\fR\fR +.ad +.sp .6 +.RS 4n +Comma-separated list of mount options to use when mounting datasets within the +pool. See \fBzfs\fR(1M) for a description of dataset properties and mount +options. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIproperty=value\fR\fR +.ad +.sp .6 +.RS 4n +Sets the specified property on the imported pool. See the "Properties" section +for more information on the available pool properties. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-c\fR \fIcachefile\fR\fR +.ad +.sp .6 +.RS 4n +Reads configuration from the given \fBcachefile\fR that was created with the +"\fBcachefile\fR" pool property. This \fBcachefile\fR is used instead of +searching for devices. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-d\fR \fIdir\fR\fR +.ad +.sp .6 +.RS 4n +Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be +specified multiple times. This option is incompatible with the \fB-c\fR option. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-D\fR\fR +.ad +.sp .6 +.RS 4n +Imports destroyed pool. The \fB-f\fR option is also required. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.sp .6 +.RS 4n +Forces import, even if the pool appears to be potentially active. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-R\fR \fIroot\fR\fR +.ad +.sp .6 +.RS 4n +Sets the "\fBcachefile\fR" property to "\fBnone\fR" and the "\fIaltroot\fR" +property to "\fIroot\fR". +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool iostat\fR [\fB-T\fR \fBu\fR | \fBd\fR] [\fB-v\fR] [\fIpool\fR] ... +[\fIinterval\fR[\fIcount\fR]]\fR +.ad +.sp .6 +.RS 4n +Displays \fBI/O\fR statistics for the given pools. 
When given an interval, the +statistics are printed every \fIinterval\fR seconds until \fBCtrl-C\fR is +pressed. If no \fIpools\fR are specified, statistics for every pool in the +system are shown. If \fIcount\fR is specified, the command exits after +\fIcount\fR reports are printed. +.sp +.ne 2 +.mk +.na +\fB\fB-T\fR \fBu\fR | \fBd\fR\fR +.ad +.RS 12n +.rt +Display a time stamp. +.sp +Specify \fBu\fR for a printed representation of the internal representation of +time. See \fBtime\fR(2). Specify \fBd\fR for standard date format. See +\fBdate\fR(1). +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-v\fR\fR +.ad +.RS 12n +.rt +Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within +the pool, in addition to the pool-wide statistics. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ...\fR +.ad +.sp .6 +.RS 4n +Lists the given pools along with a health status and space usage. When given no +arguments, all pools in the system are listed. +.sp +.ne 2 +.mk +.na +\fB\fB-H\fR\fR +.ad +.RS 12n +.rt +Scripted mode. Do not display headers, and separate fields by a single tab +instead of arbitrary space. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-o\fR \fIprops\fR\fR +.ad +.RS 12n +.rt +Comma-separated list of properties to display. See the "Properties" section for +a list of valid properties. The default list is "name, size, used, available, +capacity, health, altroot". +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...\fR +.ad +.sp .6 +.RS 4n +Takes the specified physical device offline. While the \fIdevice\fR is offline, +no attempt is made to read or write to the device. +.sp +This command is not applicable to spares or cache devices. +.sp +.ne 2 +.mk +.na +\fB\fB-t\fR\fR +.ad +.RS 6n +.rt +Temporary. Upon reboot, the specified physical device reverts to its previous +state.
+.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool online\fR [\fB-e\fR] \fIpool\fR \fIdevice\fR ...\fR +.ad +.sp .6 +.RS 4n +Brings the specified physical device online. +.sp +This command is not applicable to spares or cache devices. +.sp +.ne 2 +.mk +.na +\fB\fB-e\fR\fR +.ad +.RS 6n +.rt +Expand the device to use all available space. If the device is part of a mirror +or \fBraidz\fR, then all devices must be expanded before the new space will +become available to the pool. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool remove\fR \fIpool\fR \fIdevice\fR ...\fR +.ad +.sp .6 +.RS 4n +Removes the specified device from the pool. This command currently only +supports removing hot spares, cache, and log devices. A mirrored log device can +be removed by specifying the top-level mirror for the log. Non-log devices that +are part of a mirrored configuration can be removed using the \fBzpool +detach\fR command. Non-redundant and \fBraidz\fR devices cannot be removed from +a pool. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIold_device\fR +[\fInew_device\fR]\fR +.ad +.sp .6 +.RS 4n +Replaces \fIold_device\fR with \fInew_device\fR. This is equivalent to +attaching \fInew_device\fR, waiting for it to resilver, and then detaching +\fIold_device\fR. +.sp +The size of \fInew_device\fR must be greater than or equal to the minimum size +of all the devices in a mirror or \fBraidz\fR configuration. +.sp +\fInew_device\fR is required if the pool is not redundant. If \fInew_device\fR +is not specified, it defaults to \fIold_device\fR. This form of replacement is +useful after an existing disk has failed and has been physically replaced. In +this case, the new disk may have the same \fB/dev/dsk\fR path as the old +device, even though it is actually a different disk. \fBZFS\fR recognizes this. +.sp +.ne 2 +.mk +.na +\fB\fB-f\fR\fR +.ad +.RS 6n +.rt +Forces use of \fInew_device\fR, even if it appears to be in use.
Not all +devices can be overridden in this manner. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...\fR +.ad +.sp .6 +.RS 4n +Begins a scrub. The scrub examines all data in the specified pools to verify +that it checksums correctly. For replicated (mirror or \fBraidz\fR) devices, +\fBZFS\fR automatically repairs any damage discovered during the scrub. The +"\fBzpool status\fR" command reports the progress of the scrub and summarizes +the results of the scrub upon completion. +.sp +Scrubbing and resilvering are very similar operations. The difference is that +resilvering only examines data that \fBZFS\fR knows to be out of date (for +example, when attaching a new device to a mirror or replacing an existing +device), whereas scrubbing examines all data to discover silent errors due to +hardware faults or disk failure. +.sp +Because scrubbing and resilvering are \fBI/O\fR-intensive operations, \fBZFS\fR +only allows one at a time. If a scrub is already in progress, the "\fBzpool +scrub\fR" command terminates it and starts a new scrub. If a resilver is in +progress, \fBZFS\fR does not allow a scrub to be started until the resilver +completes. +.sp +.ne 2 +.mk +.na +\fB\fB-s\fR\fR +.ad +.RS 6n +.rt +Stop scrubbing. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool set\fR \fIproperty\fR=\fIvalue\fR \fIpool\fR\fR +.ad +.sp .6 +.RS 4n +Sets the given property on the specified pool. See the "Properties" section for +more information on what properties can be set and acceptable values. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR +.ad +.sp .6 +.RS 4n +Displays the detailed health status for the given pools. If no \fIpool\fR is +specified, then the status of each pool in the system is displayed. For more +information on pool and device health, see the "Device Failure and Recovery" +section. 
+.sp +If a scrub or resilver is in progress, this command reports the percentage done +and the estimated time to completion. Both of these are only approximate, +because the amount of data in the pool and the other workloads on the system +can change. +.sp +.ne 2 +.mk +.na +\fB\fB-x\fR\fR +.ad +.RS 6n +.rt +Only display status for pools that are exhibiting errors or are otherwise +unavailable. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-v\fR\fR +.ad +.RS 6n +.rt +Displays verbose data error information, printing out a complete list of all +data errors since the last complete pool scrub. +.RE + +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool upgrade\fR\fR +.ad +.sp .6 +.RS 4n +Displays all pools formatted using a different \fBZFS\fR on-disk version. Older +versions can continue to be used, but some features may not be available. These +pools can be upgraded using "\fBzpool upgrade -a\fR". Pools that are formatted +with a more recent version are also displayed, although these pools will be +inaccessible on the system. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool upgrade\fR \fB-v\fR\fR +.ad +.sp .6 +.RS 4n +Displays \fBZFS\fR versions supported by the current software. The current +\fBZFS\fR versions and all previous supported versions are displayed, along +with an explanation of the features provided with each version. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fBzpool upgrade\fR [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIpool\fR ...\fR +.ad +.sp .6 +.RS 4n +Upgrades the given pool to the latest on-disk version. Once this is done, the +pool will no longer be accessible on systems running older versions of the +software. +.sp +.ne 2 +.mk +.na +\fB\fB-a\fR\fR +.ad +.RS 14n +.rt +Upgrades all pools. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-V\fR \fIversion\fR\fR +.ad +.RS 14n +.rt +Upgrade to the specified version. If the \fB-V\fR flag is not specified, the +pool is upgraded to the most recent version. 
This option can only be used to +increase the version number, and only up to the most recent version supported +by this software. +.RE + +.RE + +.SH EXAMPLES +.LP +\fBExample 1 \fRCreating a RAID-Z Storage Pool +.sp +.LP +The following command creates a pool with a single \fBraidz\fR root \fIvdev\fR +that consists of six disks. + +.sp +.in +2 +.nf +# \fBzpool create tank raidz c0t0d0 c0t1d0 c0t2d0 c0t3d0 c0t4d0 c0t5d0\fR +.fi +.in -2 +.sp + +.LP +\fBExample 2 \fRCreating a Mirrored Storage Pool +.sp +.LP +The following command creates a pool with two mirrors, where each mirror +contains two disks. + +.sp +.in +2 +.nf +# \fBzpool create tank mirror c0t0d0 c0t1d0 mirror c0t2d0 c0t3d0\fR +.fi +.in -2 +.sp + +.LP +\fBExample 3 \fRCreating a ZFS Storage Pool by Using Slices +.sp +.LP +The following command creates an unmirrored pool using two disk slices. + +.sp +.in +2 +.nf +# \fBzpool create tank /dev/dsk/c0t0d0s1 c0t1d0s4\fR +.fi +.in -2 +.sp + +.LP +\fBExample 4 \fRCreating a ZFS Storage Pool by Using Files +.sp +.LP +The following command creates an unmirrored pool using files. While not +recommended, a pool based on files can be useful for experimental purposes. + +.sp +.in +2 +.nf +# \fBzpool create tank /path/to/file/a /path/to/file/b\fR +.fi +.in -2 +.sp + +.LP +\fBExample 5 \fRAdding a Mirror to a ZFS Storage Pool +.sp +.LP +The following command adds two mirrored disks to the pool "\fItank\fR", +assuming the pool is already made up of two-way mirrors. The additional space +is immediately available to any datasets within the pool. + +.sp +.in +2 +.nf +# \fBzpool add tank mirror c1t0d0 c1t1d0\fR +.fi +.in -2 +.sp + +.LP +\fBExample 6 \fRListing Available ZFS Storage Pools +.sp +.LP +The following command lists all available pools on the system. In this case, +the pool \fIzion\fR is faulted due to a missing device. 
+ +.sp +.LP +The results from this command are similar to the following: + +.sp +.in +2 +.nf +# \fBzpool list\fR + NAME SIZE USED AVAIL CAP HEALTH ALTROOT + pool 67.5G 2.92M 67.5G 0% ONLINE - + tank 67.5G 2.92M 67.5G 0% ONLINE - + zion - - - 0% FAULTED - +.fi +.in -2 +.sp + +.LP +\fBExample 7 \fRDestroying a ZFS Storage Pool +.sp +.LP +The following command destroys the pool "\fItank\fR" and any datasets contained +within. + +.sp +.in +2 +.nf +# \fBzpool destroy -f tank\fR +.fi +.in -2 +.sp + +.LP +\fBExample 8 \fRExporting a ZFS Storage Pool +.sp +.LP +The following command exports the devices in pool \fItank\fR so that they can +be relocated or later imported. + +.sp +.in +2 +.nf +# \fBzpool export tank\fR +.fi +.in -2 +.sp + +.LP +\fBExample 9 \fRImporting a ZFS Storage Pool +.sp +.LP +The following command displays available pools, and then imports the pool +"tank" for use on the system. + +.sp +.LP +The results from this command are similar to the following: + +.sp +.in +2 +.nf +# \fBzpool import\fR + pool: tank + id: 15451357997522795478 + state: ONLINE +action: The pool can be imported using its name or numeric identifier. +config: + + tank ONLINE + mirror ONLINE + c1t2d0 ONLINE + c1t3d0 ONLINE + +# \fBzpool import tank\fR +.fi +.in -2 +.sp + +.LP +\fBExample 10 \fRUpgrading All ZFS Storage Pools to the Current Version +.sp +.LP +The following command upgrades all ZFS Storage pools to the current version of +the software. + +.sp +.in +2 +.nf +# \fBzpool upgrade -a\fR +This system is currently running ZFS version 2. +.fi +.in -2 +.sp + +.LP +\fBExample 11 \fRManaging Hot Spares +.sp +.LP +The following command creates a new pool with an available hot spare: + +.sp +.in +2 +.nf +# \fBzpool create tank mirror c0t0d0 c0t1d0 spare c0t2d0\fR +.fi +.in -2 +.sp + +.sp +.LP +If one of the disks were to fail, the pool would be reduced to the degraded +state. 
The failed device can be replaced using the following command: + +.sp +.in +2 +.nf +# \fBzpool replace tank c0t0d0 c0t3d0\fR +.fi +.in -2 +.sp + +.sp +.LP +Once the data has been resilvered, the spare is automatically removed and is +made available should another device fail. The hot spare can be permanently +removed from the pool using the following command: + +.sp +.in +2 +.nf +# \fBzpool remove tank c0t2d0\fR +.fi +.in -2 +.sp + +.LP +\fBExample 12 \fRCreating a ZFS Pool with Mirrored Separate Intent Logs +.sp +.LP +The following command creates a ZFS storage pool consisting of two two-way +mirrors and mirrored log devices: + +.sp +.in +2 +.nf +# \fBzpool create pool mirror c0d0 c1d0 mirror c2d0 c3d0 log mirror \e + c4d0 c5d0\fR +.fi +.in -2 +.sp + +.LP +\fBExample 13 \fRAdding Cache Devices to a ZFS Pool +.sp +.LP +The following command adds two disks for use as cache devices to a ZFS storage +pool: + +.sp +.in +2 +.nf +# \fBzpool add pool cache c2d0 c3d0\fR +.fi +.in -2 +.sp + +.sp +.LP +Once added, the cache devices gradually fill with content from main memory. +Depending on the size of your cache devices, it could take over an hour for +them to fill. Capacity and reads can be monitored using the \fBiostat\fR subcommand +as follows: + +.sp +.in +2 +.nf +# \fBzpool iostat -v pool 5\fR +.fi +.in -2 +.sp + +.LP +\fBExample 14 \fRRemoving a Mirrored Log Device +.sp +.LP +The following command removes the mirrored log device \fBmirror-2\fR.
+ +.sp +.LP +Given this configuration: + +.sp +.in +2 +.nf + pool: tank + state: ONLINE + scrub: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + c6t0d0 ONLINE 0 0 0 + c6t1d0 ONLINE 0 0 0 + mirror-1 ONLINE 0 0 0 + c6t2d0 ONLINE 0 0 0 + c6t3d0 ONLINE 0 0 0 + logs + mirror-2 ONLINE 0 0 0 + c4t0d0 ONLINE 0 0 0 + c4t1d0 ONLINE 0 0 0 +.fi +.in -2 +.sp + +.sp +.LP +The command to remove the mirrored log \fBmirror-2\fR is: + +.sp +.in +2 +.nf +# \fBzpool remove tank mirror-2\fR +.fi +.in -2 +.sp + +.SH EXIT STATUS +.sp +.LP +The following exit values are returned: +.sp +.ne 2 +.mk +.na +\fB\fB0\fR\fR +.ad +.RS 5n +.rt +Successful completion. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB1\fR\fR +.ad +.RS 5n +.rt +An error occurred. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB2\fR\fR +.ad +.RS 5n +.rt +Invalid command line options were specified. +.RE + +.SH ATTRIBUTES +.sp +.LP +See \fBattributes\fR(5) for descriptions of the following attributes: +.sp + +.sp +.TS +tab() box; +cw(2.75i) |cw(2.75i) +lw(2.75i) |lw(2.75i) +. +ATTRIBUTE TYPEATTRIBUTE VALUE +_ +Interface StabilityEvolving +.TE + +.SH SEE ALSO +.sp +.LP +\fBzfs\fR(1M), \fBattributes\fR(5) diff --git a/man/man1m/zstreamdump.1m b/man/man1m/zstreamdump.1m new file mode 100644 index 0000000..d8b5e94 --- /dev/null +++ b/man/man1m/zstreamdump.1m @@ -0,0 +1,67 @@ +'\" te +.\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved +.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this CDDL HEADER, with +.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.TH zstreamdump 1M "21 Sep 2009" "SunOS 5.11" "System Administration Commands" +.SH NAME +zstreamdump \- filter data in zfs send stream +.SH SYNOPSIS +.LP +.nf +\fBzstreamdump\fR [\fB-C\fR] [\fB-v\fR] +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBzstreamdump\fR utility reads from the output of the \fBzfs send\fR +command, then displays headers and some statistics from that output. See +\fBzfs\fR(1M). +.SH OPTIONS +.sp +.LP +The following options are supported: +.sp +.ne 2 +.mk +.na +\fB\fB-C\fR\fR +.ad +.sp .6 +.RS 4n +Suppress the validation of checksums. +.RE + +.sp +.ne 2 +.mk +.na +\fB\fB-v\fR\fR +.ad +.sp .6 +.RS 4n +Verbose. Dump all headers, not only begin and end headers. +.RE + +.SH ATTRIBUTES +.sp +.LP +See \fBattributes\fR(5) for descriptions of the following attributes: +.sp + +.sp +.TS +tab() box; +cw(2.75i) |cw(2.75i) +lw(2.75i) |lw(2.75i) +. 
+ATTRIBUTE TYPEATTRIBUTE VALUE +_ +Interface StabilityUncommitted +.TE + +.SH SEE ALSO +.sp +.LP +\fBzfs\fR(1M), \fBattributes\fR(5) -- cgit v1.1 From b1ed36a61856de570e63fd32baf8a4d8354ee686 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 18 Jul 2012 10:19:06 +0000 Subject: Update vendor/illumos/dist to pre libzfs_core state (dtrace and ctf part) illumos-gate revision 13742:b6bbdd77139c Obtained from: ssh://anonhg@hg.illumos.org/illumos-gate --- cmd/dtrace/test/cmd/jdtrace/JDTrace.java | 11 +- cmd/dtrace/test/cmd/jdtrace/exception.lst | 6 +- cmd/dtrace/test/cmd/jdtrace/jdtrace.c | 18 +- cmd/dtrace/test/cmd/scripts/dtest.pl | 45 +- cmd/dtrace/test/cmd/scripts/dtfailures.ksh | 42 + cmd/dtrace/test/tst/common/aggs/tst.subr.d | 4 +- cmd/dtrace/test/tst/common/buffering/tst.resize1.d | 8 - cmd/dtrace/test/tst/common/buffering/tst.resize2.d | 8 - .../tst/common/funcs/err.D_PROTO_ARG.tolower.d | 30 + .../tst/common/funcs/err.D_PROTO_ARG.toupper.d | 30 + .../tst/common/funcs/err.D_PROTO_LEN.tolower.d | 30 + .../common/funcs/err.D_PROTO_LEN.tolowertoomany.d | 30 + .../tst/common/funcs/err.D_PROTO_LEN.toupper.d | 30 + .../common/funcs/err.D_PROTO_LEN.touppertoomany.d | 30 + cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d | 80 + .../test/tst/common/funcs/tst.lltostrbase.d.out | 302 +++ cmd/dtrace/test/tst/common/funcs/tst.tolower.d | 66 + cmd/dtrace/test/tst/common/funcs/tst.toupper.d | 66 + .../test/tst/common/include/tst.includefirst.ksh | 76 + cmd/dtrace/test/tst/common/ip/get.ipv4remote.pl | 3 +- cmd/dtrace/test/tst/common/ip/get.ipv6remote.pl | 3 +- cmd/dtrace/test/tst/common/ip/tst.ipv4localtcp.ksh | 2 +- .../test/tst/common/ip/tst.ipv4remotetcp.ksh | 2 +- .../test/tst/common/ip/tst.localtcpstate.ksh | 2 +- .../test/tst/common/ip/tst.remotetcpstate.ksh | 2 +- .../test/tst/common/java_api/src/TestBean.java | 47 +- .../test/tst/common/java_api/tst.Bean.ksh.out | 6 + .../llquantize/err.D_LLQUANT_FACTOREVEN.nodivide.d | 29 + .../err.D_LLQUANT_FACTOREVEN.notfactor.d | 29 + 
.../common/llquantize/err.D_LLQUANT_FACTORMATCH.d | 30 + .../common/llquantize/err.D_LLQUANT_FACTORNSTEPS.d | 29 + .../common/llquantize/err.D_LLQUANT_FACTORSMALL.d | 29 + .../common/llquantize/err.D_LLQUANT_FACTORTYPE.d | 30 + .../common/llquantize/err.D_LLQUANT_FACTORVAL.d | 29 + .../common/llquantize/err.D_LLQUANT_HIGHMATCH.d | 30 + .../tst/common/llquantize/err.D_LLQUANT_HIGHTYPE.d | 30 + .../tst/common/llquantize/err.D_LLQUANT_HIGHVAL.d | 29 + .../tst/common/llquantize/err.D_LLQUANT_LOWMATCH.d | 30 + .../tst/common/llquantize/err.D_LLQUANT_LOWTYPE.d | 30 + .../tst/common/llquantize/err.D_LLQUANT_LOWVAL.d | 29 + .../tst/common/llquantize/err.D_LLQUANT_MAGRANGE.d | 29 + .../common/llquantize/err.D_LLQUANT_MAGTOOBIG.d | 29 + .../common/llquantize/err.D_LLQUANT_NSTEPMATCH.d | 30 + .../common/llquantize/err.D_LLQUANT_NSTEPTYPE.d | 30 + .../tst/common/llquantize/err.D_LLQUANT_NSTEPVAL.d | 29 + cmd/dtrace/test/tst/common/llquantize/tst.bases.d | 46 + .../test/tst/common/llquantize/tst.bases.d.out | 177 ++ cmd/dtrace/test/tst/common/llquantize/tst.basic.d | 38 + .../test/tst/common/llquantize/tst.basic.d.out | 25 + .../test/tst/common/llquantize/tst.negorder.d | 62 + .../test/tst/common/llquantize/tst.negorder.d.out | 148 ++ .../test/tst/common/llquantize/tst.negvalue.d | 38 + .../test/tst/common/llquantize/tst.negvalue.d.out | 25 + cmd/dtrace/test/tst/common/llquantize/tst.normal.d | 40 + .../test/tst/common/llquantize/tst.normal.d.out | 26 + cmd/dtrace/test/tst/common/llquantize/tst.range.d | 38 + .../test/tst/common/llquantize/tst.range.d.out | 29 + cmd/dtrace/test/tst/common/llquantize/tst.steps.d | 52 + .../test/tst/common/llquantize/tst.steps.d.out | 2033 ++++++++++++++++++++ cmd/dtrace/test/tst/common/llquantize/tst.trunc.d | 49 + .../test/tst/common/llquantize/tst.trunc.d.out | 34 + cmd/dtrace/test/tst/common/mdb/tst.dtracedcmd.ksh | 4 +- cmd/dtrace/test/tst/common/misc/tst.include.ksh | 35 +- .../test/tst/common/misc/tst.macroglob.ksh.out | 3 + 
cmd/dtrace/test/tst/common/misc/tst.schrock.ksh | 12 +- cmd/dtrace/test/tst/common/pid/tst.provregex1.ksh | 5 +- cmd/dtrace/test/tst/common/pid/tst.provregex2.ksh | 9 +- cmd/dtrace/test/tst/common/pid/tst.provregex3.ksh | 7 +- cmd/dtrace/test/tst/common/pid/tst.provregex4.ksh | 9 +- .../test/tst/common/pragma/tst.libdepsepdir.ksh | 76 + .../test/tst/common/print/err.D_PRINT_DYN.bad.d | 29 + .../test/tst/common/print/err.D_PRINT_VOID.bad.d | 29 + .../test/tst/common/print/err.D_PROTO_LEN.bad.d | 29 + cmd/dtrace/test/tst/common/print/tst.array.d | 62 + cmd/dtrace/test/tst/common/print/tst.array.d.out | 23 + cmd/dtrace/test/tst/common/print/tst.bitfield.d | 49 + .../test/tst/common/print/tst.bitfield.d.out | 6 + cmd/dtrace/test/tst/common/print/tst.primitive.d | 45 + .../test/tst/common/print/tst.primitive.d.out | 11 + cmd/dtrace/test/tst/common/print/tst.struct.d | 59 + cmd/dtrace/test/tst/common/print/tst.struct.d.out | 12 + .../test/tst/common/printa/tst.largeusersym.ksh | 2 +- .../test/tst/common/privs/tst.noprivdrop.ksh | 72 + .../test/tst/common/privs/tst.noprivrestrict.ksh | 61 + cmd/dtrace/test/tst/common/privs/tst.tick.ksh | 55 + cmd/dtrace/test/tst/common/profile-n/tst.ufunc.ksh | 6 +- cmd/dtrace/test/tst/common/profile-n/tst.umod.ksh | 3 +- cmd/dtrace/test/tst/common/profile-n/tst.usym.ksh | 3 +- .../test/tst/common/safety/tst.violentdeath.ksh | 3 +- .../common/sizeof/err.D_SIZEOF_TYPE.badstruct.d | 30 + .../test/tst/common/trace/err.D_TRACE_DYN.bad.d | 29 + .../test/tst/common/tracemem/err.D_TRACEMEM_ARGS.d | 29 + .../tst/common/tracemem/err.D_TRACEMEM_DYNSIZE.d | 30 + cmd/dtrace/test/tst/common/tracemem/tst.dynsize.d | 45 + .../test/tst/common/tracemem/tst.dynsize.d.out | 1313 +++++++++++++ .../test/tst/common/tracemem/tst.smallsize.d | 32 + .../test/tst/common/tracemem/tst.smallsize.d.out | 4 + cmd/dtrace/test/tst/common/usdt/tst.badguess.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.corruptenv.ksh | 4 +- cmd/dtrace/test/tst/common/usdt/tst.dlclose1.ksh 
| 15 +- cmd/dtrace/test/tst/common/usdt/tst.dlclose2.ksh | 15 +- cmd/dtrace/test/tst/common/usdt/tst.dlclose3.ksh | 15 +- cmd/dtrace/test/tst/common/usdt/tst.eliminate.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.enabled.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.enabled2.ksh | 5 +- .../test/tst/common/usdt/tst.entryreturn.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.fork.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.guess32.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.guess64.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.header.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.include.ksh | 3 +- cmd/dtrace/test/tst/common/usdt/tst.linkpriv.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.linkunpriv.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.multiple.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.nodtrace.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.noreap.ksh | 128 ++ cmd/dtrace/test/tst/common/usdt/tst.noreapring.ksh | 124 ++ .../test/tst/common/usdt/tst.onlyenabled.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.reap.ksh | 115 ++ cmd/dtrace/test/tst/common/usdt/tst.reeval.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.static.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.static2.ksh | 5 +- cmd/dtrace/test/tst/common/usdt/tst.user.ksh | 5 +- cmd/dtrace/test/tst/common/ustack/tst.spin.ksh | 3 +- cmd/dtrace/test/tst/sparc/usdt/tst.tailcall.ksh | 3 +- lib/libctf/common/ctf_lib.c | 6 +- lib/libdtrace/common/dt_aggregate.c | 89 +- lib/libdtrace/common/dt_cc.c | 302 ++- lib/libdtrace/common/dt_consume.c | 186 +- lib/libdtrace/common/dt_dof.c | 14 +- lib/libdtrace/common/dt_errtags.h | 30 +- lib/libdtrace/common/dt_impl.h | 13 + lib/libdtrace/common/dt_map.c | 172 +- lib/libdtrace/common/dt_open.c | 32 +- lib/libdtrace/common/dt_options.c | 27 - lib/libdtrace/common/dt_parser.c | 8 + lib/libdtrace/common/dt_pragma.c | 41 +- lib/libdtrace/common/dt_print.c | 648 +++++++ lib/libdtrace/common/dt_printf.c | 12 + lib/libdtrace/common/dt_program.c | 2 + 
lib/libdtrace/common/dtrace.h | 19 +- lib/libdtrace/i386/regs.d.in | 151 +- man/man1m/dtrace.1m | 75 +- man/man1m/lockstat.1m | 50 +- man/man1m/plockstat.1m | 51 +- tools/ctf/cvt/dwarf.c | 9 +- 146 files changed, 8583 insertions(+), 469 deletions(-) create mode 100644 cmd/dtrace/test/cmd/scripts/dtfailures.ksh create mode 100644 cmd/dtrace/test/tst/common/funcs/err.D_PROTO_ARG.tolower.d create mode 100644 cmd/dtrace/test/tst/common/funcs/err.D_PROTO_ARG.toupper.d create mode 100644 cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.tolower.d create mode 100644 cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.tolowertoomany.d create mode 100644 cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.toupper.d create mode 100644 cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.touppertoomany.d create mode 100644 cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d create mode 100644 cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d.out create mode 100644 cmd/dtrace/test/tst/common/funcs/tst.tolower.d create mode 100644 cmd/dtrace/test/tst/common/funcs/tst.toupper.d create mode 100644 cmd/dtrace/test/tst/common/include/tst.includefirst.ksh create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTOREVEN.nodivide.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTOREVEN.notfactor.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORMATCH.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORNSTEPS.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORSMALL.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORTYPE.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORVAL.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHMATCH.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHTYPE.d create mode 100644 
cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHVAL.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWMATCH.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWTYPE.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWVAL.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_MAGRANGE.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_MAGTOOBIG.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPMATCH.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPTYPE.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPVAL.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.bases.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.bases.d.out create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.basic.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.basic.d.out create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.negorder.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.negorder.d.out create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.negvalue.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.negvalue.d.out create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.normal.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.normal.d.out create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.range.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.range.d.out create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.steps.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.steps.d.out create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.trunc.d create mode 100644 cmd/dtrace/test/tst/common/llquantize/tst.trunc.d.out create mode 100644 cmd/dtrace/test/tst/common/pragma/tst.libdepsepdir.ksh create mode 100644 
cmd/dtrace/test/tst/common/print/err.D_PRINT_DYN.bad.d create mode 100644 cmd/dtrace/test/tst/common/print/err.D_PRINT_VOID.bad.d create mode 100644 cmd/dtrace/test/tst/common/print/err.D_PROTO_LEN.bad.d create mode 100644 cmd/dtrace/test/tst/common/print/tst.array.d create mode 100644 cmd/dtrace/test/tst/common/print/tst.array.d.out create mode 100644 cmd/dtrace/test/tst/common/print/tst.bitfield.d create mode 100644 cmd/dtrace/test/tst/common/print/tst.bitfield.d.out create mode 100644 cmd/dtrace/test/tst/common/print/tst.primitive.d create mode 100644 cmd/dtrace/test/tst/common/print/tst.primitive.d.out create mode 100644 cmd/dtrace/test/tst/common/print/tst.struct.d create mode 100644 cmd/dtrace/test/tst/common/print/tst.struct.d.out create mode 100644 cmd/dtrace/test/tst/common/privs/tst.noprivdrop.ksh create mode 100644 cmd/dtrace/test/tst/common/privs/tst.noprivrestrict.ksh create mode 100644 cmd/dtrace/test/tst/common/privs/tst.tick.ksh create mode 100644 cmd/dtrace/test/tst/common/sizeof/err.D_SIZEOF_TYPE.badstruct.d create mode 100644 cmd/dtrace/test/tst/common/trace/err.D_TRACE_DYN.bad.d create mode 100644 cmd/dtrace/test/tst/common/tracemem/err.D_TRACEMEM_ARGS.d create mode 100644 cmd/dtrace/test/tst/common/tracemem/err.D_TRACEMEM_DYNSIZE.d create mode 100644 cmd/dtrace/test/tst/common/tracemem/tst.dynsize.d create mode 100644 cmd/dtrace/test/tst/common/tracemem/tst.dynsize.d.out create mode 100644 cmd/dtrace/test/tst/common/tracemem/tst.smallsize.d create mode 100644 cmd/dtrace/test/tst/common/tracemem/tst.smallsize.d.out create mode 100644 cmd/dtrace/test/tst/common/usdt/tst.noreap.ksh create mode 100644 cmd/dtrace/test/tst/common/usdt/tst.noreapring.ksh create mode 100644 cmd/dtrace/test/tst/common/usdt/tst.reap.ksh create mode 100644 lib/libdtrace/common/dt_print.c diff --git a/cmd/dtrace/test/cmd/jdtrace/JDTrace.java b/cmd/dtrace/test/cmd/jdtrace/JDTrace.java index 3c5654d..f8c9ab7 100644 --- a/cmd/dtrace/test/cmd/jdtrace/JDTrace.java +++ 
b/cmd/dtrace/test/cmd/jdtrace/JDTrace.java @@ -23,7 +23,6 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * ident "%Z%%M% %I% %E% SMI" */ import org.opensolaris.os.dtrace.*; import java.io.*; @@ -303,6 +302,7 @@ public class JDTrace { Distribution.Bucket bucket; int b1 = 0; // first displayed bucket int b2 = d.size() - 1; // last displayed bucket + for (; (b1 <= b2) && (d.get(b1).getFrequency() == 0); ++b1); // If possible, get one bucket before the first non-zero // bucket and one bucket after the last. @@ -337,9 +337,14 @@ public class JDTrace { v = bucket.getFrequency(); b = bucket.getMin(); - if (d instanceof LinearDistribution) { + if ((d instanceof LinearDistribution) || + (d instanceof LogLinearDistribution)) { if (b == Long.MIN_VALUE) { - String lt = "< " + ((LinearDistribution)d).getBase(); + String lt; + if (d instanceof LinearDistribution) + lt = "< " + ((LinearDistribution)d).getBase(); + else + lt = "< " + ((LogLinearDistribution)d).getBase(); out.printf("%16s ", lt); } else if (bucket.getMax() == Long.MAX_VALUE) { String ge = ">= " + b; diff --git a/cmd/dtrace/test/cmd/jdtrace/exception.lst b/cmd/dtrace/test/cmd/jdtrace/exception.lst index 261f870..19fc3ac 100644 --- a/cmd/dtrace/test/cmd/jdtrace/exception.lst +++ b/cmd/dtrace/test/cmd/jdtrace/exception.lst @@ -23,7 +23,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" # Exception list: names tests that are bypassed when running in Java # mode (relative to /opt/SUNWdtrt/tst) @@ -52,14 +51,17 @@ common/usdt/tst.enabled.ksh common/usdt/tst.enabled2.ksh common/usdt/tst.entryreturn.ksh common/usdt/tst.fork.ksh -common/usdt/tst.header.ksh common/usdt/tst.guess32.ksh common/usdt/tst.guess64.ksh +common/usdt/tst.header.ksh common/usdt/tst.linkpriv.ksh common/usdt/tst.linkunpriv.ksh common/usdt/tst.multiple.ksh common/usdt/tst.nodtrace.ksh +common/usdt/tst.noreap.ksh +common/usdt/tst.noreapring.ksh common/usdt/tst.onlyenabled.ksh +common/usdt/tst.reap.ksh common/usdt/tst.reeval.ksh common/usdt/tst.static.ksh common/usdt/tst.static2.ksh diff --git a/cmd/dtrace/test/cmd/jdtrace/jdtrace.c b/cmd/dtrace/test/cmd/jdtrace/jdtrace.c index 0951265..81a2d9b 100644 --- a/cmd/dtrace/test/cmd/jdtrace/jdtrace.c +++ b/cmd/dtrace/test/cmd/jdtrace/jdtrace.c @@ -22,25 +22,27 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2011, Richard Lowe */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include +#include #include int main(int argc, char **argv) { - int i, ac, has64; + int i, ac; char **av, **p; + char isaname[16]; ac = argc + 3; av = p = alloca(sizeof (char *) * ac); - *p++ = "java"; + *p++ = "/usr/java/bin/java"; *p++ = "-jar"; *p++ = "/opt/SUNWdtrt/lib/java/jdtrace.jar"; @@ -52,9 +54,9 @@ main(int argc, char **argv) } p[i] = NULL; - (void) execvp(av[0], av); - - perror("exec failed"); + if (sysinfo(SI_ARCHITECTURE_64, isaname, sizeof (isaname)) != -1) + asprintf(av, "/usr/java/bin/%s/java", isaname); - return (0); + (void) execv(av[0], av); + err(1, "exec failed"); } diff --git a/cmd/dtrace/test/cmd/scripts/dtest.pl b/cmd/dtrace/test/cmd/scripts/dtest.pl index 3f24429..f11cf69 100644 --- a/cmd/dtrace/test/cmd/scripts/dtest.pl +++ b/cmd/dtrace/test/cmd/scripts/dtest.pl @@ -25,6 +25,9 @@ # Use is subject to license terms. 
# +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# require 5.8.4; use File::Find; @@ -35,8 +38,8 @@ use Cwd 'abs_path'; $PNAME = $0; $PNAME =~ s:.*/::; -$OPTSTR = 'abd:fghi:jlnqsx:'; -$USAGE = "Usage: $PNAME [-abfghjlnqs] [-d dir] [-i isa] " +$OPTSTR = 'abd:fFghi:jlnqsx:'; +$USAGE = "Usage: $PNAME [-abfFghjlnqs] [-d dir] [-i isa] " . "[-x opt[=arg]] [file | dir ...]\n"; ($MACH = `uname -p`) =~ s/\W*\n//; ($PLATFORM = `uname -i`) =~ s/\W*\n//; @@ -69,6 +72,20 @@ sub dirname { return $i == -1 ? '.' : $i == 0 ? '/' : $s; } +sub inpath +{ + my ($exec) = (@_); + my @path = File::Spec->path(); + + for my $dir (@path) { + if (-x $dir . "/" . $exec) { + return 1; + } + } + + return 0; +} + sub usage { print $USAGE; @@ -77,6 +94,7 @@ sub usage print "\t -d specify directory for test results files and cores\n"; print "\t -g enable libumem debugging when running tests\n"; print "\t -f force bypassed tests to run\n"; + print "\t -F force tests to be run, even if missing dependencies\n"; print "\t -h display verbose usage message\n"; print "\t -i specify ISA to test instead of isaexec(3C) default\n"; print "\t -j execute test suite using jdtrace (Java API) only\n"; @@ -240,8 +258,8 @@ sub run_tests { my($failed) = $errs; my($total) = 0; - die "$PNAME: $dtrace not found\n" unless (-x "$dtrace"); - logmsg($dtrace . "\n"); + die "$PNAME: $dtrace not found; aborting\n" unless (-x "$dtrace"); + logmsg("executing tests using $dtrace ...\n"); load_exceptions($exceptions_path); @@ -546,9 +564,20 @@ $dt_bin = '/opt/SUNWdtrt/bin'; $defdir = -d $dt_tst ? $dt_tst : '.'; $bindir = -d $dt_bin ? 
$dt_bin : '.'; +if (!$opt_F) { + my @dependencies = ("gcc", "make", "java", "perl"); + + for my $dep (@dependencies) { + if (!inpath($dep)) { + die "$PNAME: '$dep' not found (use -F to force run)\n"; + } + } +} + find(\&wanted, "$defdir/common") if (scalar(@ARGV) == 0); find(\&wanted, "$defdir/$MACH") if (scalar(@ARGV) == 0); find(\&wanted, "$defdir/$PLATFORM") if (scalar(@ARGV) == 0); + die $USAGE if (scalar(@files) == 0); $dtrace_path = '/usr/sbin/dtrace'; @@ -562,7 +591,7 @@ if ($opt_j || $opt_n || $opt_i) { push(@dtrace_cmds, $jdtrace_path) if ($opt_j); push(@dtrace_cmds, "/usr/sbin/$opt_i/dtrace") if ($opt_i); } else { - @dtrace_cmds = ($dtrace_path, $jdtrace_path); + @dtrace_cmds = ($dtrace_path); } if ($opt_d) { @@ -589,12 +618,6 @@ if ($opt_g) { $ENV{'LD_PRELOAD'} = 'libumem.so'; } -# -# Ensure that $PATH contains a cc(1) so that we can execute the -# test programs that require compilation of C code. -# -$ENV{'PATH'} = $ENV{'PATH'} . ':/ws/onnv-tools/SUNWspro/SS11/bin'; - if ($opt_b) { logmsg("badioctl'ing ... "); diff --git a/cmd/dtrace/test/cmd/scripts/dtfailures.ksh b/cmd/dtrace/test/cmd/scripts/dtfailures.ksh new file mode 100644 index 0000000..89ba955 --- /dev/null +++ b/cmd/dtrace/test/cmd/scripts/dtfailures.ksh @@ -0,0 +1,42 @@ +#!/usr/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +let failure=0 + +printf "%-3s %-10s %-31s %s\n" "#" "KIND" "TEST" "DETAILS" + +while [[ -d failure.$failure ]]; do + dir=failure.$failure + tst=`cat $dir/README | head -1 | nawk '{ print $2 }'` + kind=`basename $(dirname $tst)` + name=`basename $tst` + cols=$(expr `tput cols` - 47) + details=`tail -1 $dir/*.err | cut -c1-$cols` + printf "%-3d %-10s %-31s " $failure $kind $name + echo $details + let failure=failure+1 +done + diff --git a/cmd/dtrace/test/tst/common/aggs/tst.subr.d b/cmd/dtrace/test/tst/common/aggs/tst.subr.d index 5ca1f1e..bb0739f 100644 --- a/cmd/dtrace/test/tst/common/aggs/tst.subr.d +++ b/cmd/dtrace/test/tst/common/aggs/tst.subr.d @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #define INTFUNC(x) \ @@ -98,6 +96,8 @@ INTFUNC(ntohll(0x1234567890abcdefL)) STRFUNC(inet_ntoa((ipaddr_t *)alloca(sizeof (ipaddr_t)))) STRFUNC(inet_ntoa6((in6_addr_t *)alloca(sizeof (in6_addr_t)))) STRFUNC(inet_ntop(AF_INET, (void *)alloca(sizeof (ipaddr_t)))) +STRFUNC(toupper("foo")) +STRFUNC(tolower("BAR")) BEGIN /subr == DIF_SUBR_MAX + 1/ diff --git a/cmd/dtrace/test/tst/common/buffering/tst.resize1.d b/cmd/dtrace/test/tst/common/buffering/tst.resize1.d index 396a808..ca8ad44 100644 --- a/cmd/dtrace/test/tst/common/buffering/tst.resize1.d +++ b/cmd/dtrace/test/tst/common/buffering/tst.resize1.d @@ -24,8 +24,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ASSERTION: * Checks that setting "bufresize" to "auto" will cause buffer @@ -34,14 +32,8 @@ * SECTION: Buffers and Buffering/Buffer Resizing Policy; * Options and Tunables/bufsize; * Options and Tunables/bufresize - * - * NOTES: - * We use the undocumented "preallocate" option to make sure dtrace(1M) - * has enough space in its heap to allocate a buffer as large as the - * kernel's trace buffer. */ -#pragma D option preallocate=100t #pragma D option bufresize=auto #pragma D option bufsize=100t diff --git a/cmd/dtrace/test/tst/common/buffering/tst.resize2.d b/cmd/dtrace/test/tst/common/buffering/tst.resize2.d index 50b814b..ddb97c8 100644 --- a/cmd/dtrace/test/tst/common/buffering/tst.resize2.d +++ b/cmd/dtrace/test/tst/common/buffering/tst.resize2.d @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ASSERTION: * Checks that setting "bufresize" to "auto" will cause buffer @@ -34,14 +32,8 @@ * SECTION: Buffers and Buffering/Buffer Resizing Policy; * Options and Tunables/aggsize; * Options and Tunables/bufresize - * - * NOTES: - * We use the undocumented "preallocate" option to make sure dtrace(1M) - * has enough space in its heap to allocate a buffer as large as the - * kernel's trace buffer. */ -#pragma D option preallocate=100t #pragma D option bufresize=auto #pragma D option aggsize=100t diff --git a/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_ARG.tolower.d b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_ARG.tolower.d new file mode 100644 index 0000000..9d4e40b --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_ARG.tolower.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + trace(tolower(2152006)); + exit(1); +} diff --git a/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_ARG.toupper.d b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_ARG.toupper.d new file mode 100644 index 0000000..2c1389b --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_ARG.toupper.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +BEGIN +{ + trace(toupper(timestamp)); + exit(1); +} diff --git a/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.tolower.d b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.tolower.d new file mode 100644 index 0000000..7d9c27f --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.tolower.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + trace(tolower()); + exit(1); +} diff --git a/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.tolowertoomany.d b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.tolowertoomany.d new file mode 100644 index 0000000..afaa7f9 --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.tolowertoomany.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + trace(tolower("dory", "eel", "roughy")); + exit(1); +} diff --git a/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.toupper.d b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.toupper.d new file mode 100644 index 0000000..9658f6a --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.toupper.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +BEGIN +{ + trace(toupper()); + exit(1); +} diff --git a/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.touppertoomany.d b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.touppertoomany.d new file mode 100644 index 0000000..bee8697 --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/err.D_PROTO_LEN.touppertoomany.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + trace(tolower("haino", "tylo")); + exit(1); +} diff --git a/cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d b/cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d new file mode 100644 index 0000000..1afe37d --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#pragma D option quiet + +int64_t val[int]; + +BEGIN +{ + base = -2; + i = 0; + val[i++] = -10; + val[i++] = -1; + val[i++] = 0; + val[i++] = 10; + val[i++] = 100; + val[i++] = 1000; + val[i++] = (1LL << 62); + maxval = i; + i = 0; +} + +tick-1ms +/i < maxval/ +{ + printf("base %2d of %20d: ", base, val[i]); +} + +tick-1ms +/i < maxval/ +{ + printf(" %s\n", lltostr(val[i], base)); +} + +ERROR +{ + printf(" \n"); +} + +tick-1ms +/i < maxval/ +{ + i++; +} + +tick-1ms +/i == maxval/ +{ + i = 0; + base++; +} + +tick-1ms +/base > 40/ +{ + exit(0); +} + diff --git a/cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d.out b/cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d.out new file mode 100644 index 0000000..94e2257 --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/tst.lltostrbase.d.out @@ -0,0 +1,302 @@ +base -2 of -10: +base -2 of -1: +base -2 of 0: +base -2 of 10: +base -2 of 100: +base -2 of 1000: +base -2 of 4611686018427387904: +base -1 of -10: +base -1 of -1: +base -1 of 0: +base -1 of 10: +base -1 of 100: +base -1 of 1000: +base -1 of 4611686018427387904: +base 0 of -10: +base 0 of -1: +base 0 of 0: +base 0 of 10: +base 0 of 100: +base 0 of 1000: +base 0 of 4611686018427387904: +base 1 of -10: +base 1 of -1: +base 1 of 0: +base 1 of 10: +base 1 of 100: +base 1 of 1000: +base 1 of 4611686018427387904: +base 2 of -10: 1111111111111111111111111111111111111111111111111111111111110110 +base 2 of -1: 
1111111111111111111111111111111111111111111111111111111111111111 +base 2 of 0: 0 +base 2 of 10: 1010 +base 2 of 100: 1100100 +base 2 of 1000: 1111101000 +base 2 of 4611686018427387904: 100000000000000000000000000000000000000000000000000000000000000 +base 3 of -10: 11112220022122120101211020120210210211120 +base 3 of -1: 11112220022122120101211020120210210211220 +base 3 of 0: 0 +base 3 of 10: 101 +base 3 of 100: 10201 +base 3 of 1000: 1101001 +base 3 of 4611686018427387904: 1010201120122220002201001122110012110111 +base 4 of -10: 33333333333333333333333333333312 +base 4 of -1: 33333333333333333333333333333333 +base 4 of 0: 0 +base 4 of 10: 22 +base 4 of 100: 1210 +base 4 of 1000: 33220 +base 4 of 4611686018427387904: 10000000000000000000000000000000 +base 5 of -10: 2214220303114400424121122411 +base 5 of -1: 2214220303114400424121122430 +base 5 of 0: 0 +base 5 of 10: 20 +base 5 of 100: 400 +base 5 of 1000: 13000 +base 5 of 4611686018427387904: 302141200402211214402403104 +base 6 of -10: 3520522010102100444244410 +base 6 of -1: 3520522010102100444244423 +base 6 of 0: 0 +base 6 of 10: 14 +base 6 of 100: 244 +base 6 of 1000: 4344 +base 6 of 4611686018427387904: 550120301313313111041104 +base 7 of -10: 45012021522523134134556 +base 7 of -1: 45012021522523134134601 +base 7 of 0: 0 +base 7 of 10: 13 +base 7 of 100: 202 +base 7 of 1000: 2626 +base 7 of 4611686018427387904: 11154003640456024361134 +base 8 of -10: 01777777777777777777766 +base 8 of -1: 01777777777777777777777 +base 8 of 0: 0 +base 8 of 10: 012 +base 8 of 100: 0144 +base 8 of 1000: 01750 +base 8 of 4611686018427387904: 0400000000000000000000 +base 9 of -10: 145808576354216723746 +base 9 of -1: 145808576354216723756 +base 9 of 0: 0 +base 9 of 10: 11 +base 9 of 100: 121 +base 9 of 1000: 1331 +base 9 of 4611686018427387904: 33646586081048405414 +base 10 of -10: -10 +base 10 of -1: -1 +base 10 of 0: 0 +base 10 of 10: 10 +base 10 of 100: 100 +base 10 of 1000: 1000 +base 10 of 4611686018427387904: 
4611686018427387904 +base 11 of -10: 335500516a429071276 +base 11 of -1: 335500516a429071284 +base 11 of 0: 0 +base 11 of 10: a +base 11 of 100: 91 +base 11 of 1000: 82a +base 11 of 4611686018427387904: 9140013181078458a4 +base 12 of -10: 839365134a2a240706 +base 12 of -1: 839365134a2a240713 +base 12 of 0: 0 +base 12 of 10: a +base 12 of 100: 84 +base 12 of 1000: 6b4 +base 12 of 4611686018427387904: 20b3a733a268670194 +base 13 of -10: 219505a9511a867b66 +base 13 of -1: 219505a9511a867b72 +base 13 of 0: 0 +base 13 of 10: a +base 13 of 100: 79 +base 13 of 1000: 5bc +base 13 of 4611686018427387904: 6c1349246a2881c84 +base 14 of -10: 8681049adb03db166 +base 14 of -1: 8681049adb03db171 +base 14 of 0: 0 +base 14 of 10: a +base 14 of 100: 72 +base 14 of 1000: 516 +base 14 of 4611686018427387904: 219038263637dd3c4 +base 15 of -10: 2c1d56b648c6cd106 +base 15 of -1: 2c1d56b648c6cd110 +base 15 of 0: 0 +base 15 of 10: a +base 15 of 100: 6a +base 15 of 1000: 46a +base 15 of 4611686018427387904: a7e8ce189a933404 +base 16 of -10: 0xfffffffffffffff6 +base 16 of -1: 0xffffffffffffffff +base 16 of 0: 0x0 +base 16 of 10: 0xa +base 16 of 100: 0x64 +base 16 of 1000: 0x3e8 +base 16 of 4611686018427387904: 0x4000000000000000 +base 17 of -10: 67979g60f5428008 +base 17 of -1: 67979g60f5428010 +base 17 of 0: 0 +base 17 of 10: a +base 17 of 100: 5f +base 17 of 1000: 37e +base 17 of 4611686018427387904: 1a6a6ca03e10a88d +base 18 of -10: 2d3fgb0b9cg4bd26 +base 18 of -1: 2d3fgb0b9cg4bd2f +base 18 of 0: 0 +base 18 of 10: a +base 18 of 100: 5a +base 18 of 1000: 31a +base 18 of 4611686018427387904: c588bdbfgd12ge4 +base 19 of -10: 141c8786h1ccaag7 +base 19 of -1: 141c8786h1ccaagg +base 19 of 0: 0 +base 19 of 10: a +base 19 of 100: 55 +base 19 of 1000: 2ec +base 19 of 4611686018427387904: 5ecbb6fi9h7ggi9 +base 20 of -10: b53bjh07be4dj06 +base 20 of -1: b53bjh07be4dj0f +base 20 of 0: 0 +base 20 of 10: a +base 20 of 100: 50 +base 20 of 1000: 2a0 +base 20 of 4611686018427387904: 2g5hjj51hib39f4 +base 
21 of -10: 5e8g4ggg7g56di6 +base 21 of -1: 5e8g4ggg7g56dif +base 21 of 0: 0 +base 21 of 10: a +base 21 of 100: 4g +base 21 of 1000: 25d +base 21 of 4611686018427387904: 18hjgjjjhebh8f4 +base 22 of -10: 2l4lf104353j8k6 +base 22 of -1: 2l4lf104353j8kf +base 22 of 0: 0 +base 22 of 10: a +base 22 of 100: 4c +base 22 of 1000: 21a +base 22 of 4611686018427387904: g6g95gc0hha7g4 +base 23 of -10: 1ddh88h2782i50j +base 23 of -1: 1ddh88h2782i515 +base 23 of 0: 0 +base 23 of 10: a +base 23 of 100: 48 +base 23 of 1000: 1kb +base 23 of 4611686018427387904: 93a22467dc4chd +base 24 of -10: l12ee5fn0ji1i6 +base 24 of -1: l12ee5fn0ji1if +base 24 of 0: 0 +base 24 of 10: a +base 24 of 100: 44 +base 24 of 1000: 1hg +base 24 of 4611686018427387904: 566ffd9ni4mcag +base 25 of -10: c9c336o0mlb7e6 +base 25 of -1: c9c336o0mlb7ef +base 25 of 0: 0 +base 25 of 10: a +base 25 of 100: 40 +base 25 of 1000: 1f0 +base 25 of 4611686018427387904: 32970kc6bo2kg4 +base 26 of -10: 7b7n2pcniokcg6 +base 26 of -1: 7b7n2pcniokcgf +base 26 of 0: 0 +base 26 of 10: a +base 26 of 100: 3m +base 26 of 1000: 1cc +base 26 of 4611686018427387904: 1m8c769io65344 +base 27 of -10: 4eo8hfam6fllmf +base 27 of -1: 4eo8hfam6fllmo +base 27 of 0: 0 +base 27 of 10: a +base 27 of 100: 3j +base 27 of 1000: 1a1 +base 27 of 4611686018427387904: 13jfho2j1hc5cd +base 28 of -10: 2nc6j26l66rho6 +base 28 of -1: 2nc6j26l66rhof +base 28 of 0: 0 +base 28 of 10: a +base 28 of 100: 3g +base 28 of 1000: 17k +base 28 of 4611686018427387904: jo1ilfj8fkpd4 +base 29 of -10: 1n3rsh11f098re +base 29 of -1: 1n3rsh11f098rn +base 29 of 0: 0 +base 29 of 10: a +base 29 of 100: 3d +base 29 of 1000: 15e +base 29 of 4611686018427387904: d0slim0b029e6 +base 30 of -10: 14l9lkmo30o406 +base 30 of -1: 14l9lkmo30o40f +base 30 of 0: 0 +base 30 of 10: a +base 30 of 100: 3a +base 30 of 1000: 13a +base 30 of 4611686018427387904: 8k9rrkl0ml104 +base 31 of -10: nd075ib45k866 +base 31 of -1: nd075ib45k86f +base 31 of 0: 0 +base 31 of 10: a +base 31 of 100: 37 +base 
31 of 1000: 118 +base 31 of 4611686018427387904: 5qfh94i8okhh4 +base 32 of -10: fvvvvvvvvvvvm +base 32 of -1: fvvvvvvvvvvvv +base 32 of 0: 0 +base 32 of 10: a +base 32 of 100: 34 +base 32 of 1000: v8 +base 32 of 4611686018427387904: 4000000000000 +base 33 of -10: b1w8p7j5q9r66 +base 33 of -1: b1w8p7j5q9r6f +base 33 of 0: 0 +base 33 of 10: a +base 33 of 100: 31 +base 33 of 1000: ua +base 33 of 4611686018427387904: 2p826a4q6ivi4 +base 34 of -10: 7orp63sh4dph8 +base 34 of -1: 7orp63sh4dphh +base 34 of 0: 0 +base 34 of 10: a +base 34 of 100: 2w +base 34 of 1000: te +base 34 of 4611686018427387904: 1vnvr0wl9ketu +base 35 of -10: 5g24a25twkwf6 +base 35 of -1: 5g24a25twkwff +base 35 of 0: 0 +base 35 of 10: a +base 35 of 100: 2u +base 35 of 1000: sk +base 35 of 4611686018427387904: 1cqrb9a7gvgu4 +base 36 of -10: 3w5e11264sgs6 +base 36 of -1: 3w5e11264sgsf +base 36 of 0: 0 +base 36 of 10: a +base 36 of 100: 2s +base 36 of 1000: rs +base 36 of 4611686018427387904: z1ci99jj7474 +base 37 of -10: +base 37 of -1: +base 37 of 0: +base 37 of 10: +base 37 of 100: +base 37 of 1000: +base 37 of 4611686018427387904: +base 38 of -10: +base 38 of -1: +base 38 of 0: +base 38 of 10: +base 38 of 100: +base 38 of 1000: +base 38 of 4611686018427387904: +base 39 of -10: +base 39 of -1: +base 39 of 0: +base 39 of 10: +base 39 of 100: +base 39 of 1000: +base 39 of 4611686018427387904: +base 40 of -10: +base 40 of -1: +base 40 of 0: +base 40 of 10: +base 40 of 100: +base 40 of 1000: +base 40 of 4611686018427387904: + diff --git a/cmd/dtrace/test/tst/common/funcs/tst.tolower.d b/cmd/dtrace/test/tst/common/funcs/tst.tolower.d new file mode 100644 index 0000000..2539630 --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/tst.tolower.d @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#pragma D option quiet + +BEGIN +{ + i = 0; + + input[i] = "ahi"; + expected[i++] = "ahi"; + + input[i] = "MaHi!"; + expected[i++] = "mahi!"; + + input[i] = " Nase-5"; + expected[i++] = " nase-5"; + + input[i] = "!@#$%"; + expected[i++] = "!@#$%"; + + i = 0; +} + +tick-1ms +/input[i] != NULL && (this->out = tolower(input[i])) != expected[i]/ +{ + printf("expected tolower(\"%s\") to be \"%s\"; found \"%s\"\n", + input[i], expected[i], this->out); + exit(1); +} + +tick-1ms +/input[i] != NULL/ +{ + printf("tolower(\"%s\") is \"%s\", as expected\n", + input[i], expected[i]); +} + +tick-1ms +/input[i++] == NULL/ +{ + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/funcs/tst.toupper.d b/cmd/dtrace/test/tst/common/funcs/tst.toupper.d new file mode 100644 index 0000000..fd803f2 --- /dev/null +++ b/cmd/dtrace/test/tst/common/funcs/tst.toupper.d @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#pragma D option quiet + +BEGIN +{ + i = 0; + + input[i] = "ahi"; + expected[i++] = "AHI"; + + input[i] = "MaHi!"; + expected[i++] = "MAHI!"; + + input[i] = " dace-9"; + expected[i++] = " DACE-9"; + + input[i] = "!@#$%"; + expected[i++] = "!@#$%"; + + i = 0; +} + +tick-1ms +/input[i] != NULL && (this->out = toupper(input[i])) != expected[i]/ +{ + printf("expected toupper(\"%s\") to be \"%s\"; found \"%s\"\n", + input[i], expected[i], this->out); + exit(1); +} + +tick-1ms +/input[i] != NULL/ +{ + printf("toupper(\"%s\") is \"%s\", as expected\n", + input[i], expected[i]); +} + +tick-1ms +/input[i++] == NULL/ +{ + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/include/tst.includefirst.ksh b/cmd/dtrace/test/tst/common/include/tst.includefirst.ksh new file mode 100644 index 0000000..b8240d6 --- /dev/null +++ b/cmd/dtrace/test/tst/common/include/tst.includefirst.ksh @@ -0,0 +1,76 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2011, Joyent Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# This test verifies that we only use the first entry of a file with a given +# name in the library path +# + +if [ $# != 1 ]; then + echo expected one argument: '<'dtrace-path'>' + exit 2 +fi + +firstinc=${TMPDIR:-/tmp}/firstinc.$$ +secondinc=${TMPDIR:-/tmp}/secondinc.$$ +expexit=23 + +setup_include() +{ + mkdir $firstinc + mkdir $secondinc + cat > $firstinc/lib.d < $secondinc/lib.d < test.pl <<-EOPERL close \$s; EOPERL -$dtrace -c '/usr/bin/perl test.pl' -qs /dev/stdin < test.pl <<-EOPERL close \$s; EOPERL -$dtrace -c '/usr/bin/perl test.pl' -qs /dev/stdin < test.pl <<-EOPERL close \$s; EOPERL -$dtrace -c '/usr/bin/perl test.pl' -qs /dev/stdin < test.pl <<-EOPERL close \$s; EOPERL -$dtrace -c '/usr/bin/perl test.pl' -qs /dev/stdin < buckets = + new ArrayList < Distribution.Bucket > (); + + for (order = 0; order < low; order++) + value *= factor; + + bucket = new Distribution.Bucket(Long.MIN_VALUE, (value - 1), 0); + buckets.add(bucket); + + next = value * factor; + step = (next > nsteps) ? (next / nsteps) : 1; + + while (order <= high) { + bucket = new Distribution.Bucket(value, value + step - 1, 5); + buckets.add(bucket); + + if ((value += step) != next) + continue; + + next = value * factor; + step = (next > nsteps) ? 
(next / nsteps) : 1; + order++; + } + + bucket = new Distribution.Bucket(value, Long.MAX_VALUE, 0); + buckets.add(bucket); + + LogLinearDistribution d = new LogLinearDistribution(factor, low, high, + nsteps, 0, buckets); + return d; + } + public static Option getOption() { diff --git a/cmd/dtrace/test/tst/common/java_api/tst.Bean.ksh.out b/cmd/dtrace/test/tst/common/java_api/tst.Bean.ksh.out index cf24c67..141dccc 100644 --- a/cmd/dtrace/test/tst/common/java_api/tst.Bean.ksh.out +++ b/cmd/dtrace/test/tst/common/java_api/tst.Bean.ksh.out @@ -158,6 +158,12 @@ LinearDistribution: LinearDistribution: encoded: class org.opensolaris.os.dtrace.LinearDistribution[base = 1, step = 10, buckets = [org.opensolaris.os.dtrace.Distribution$Bucket[min = 1, max = 10, frequency = 0], org.opensolaris.os.dtrace.Distribution$Bucket[min = 11, max = 20, frequency = 1], org.opensolaris.os.dtrace.Distribution$Bucket[min = 21, max = 30, frequency = 2], org.opensolaris.os.dtrace.Distribution$Bucket[min = 31, max = 40, frequency = 3], org.opensolaris.os.dtrace.Distribution$Bucket[min = 41, max = 50, frequency = 4], org.opensolaris.os.dtrace.Distribution$Bucket[min = 51, max = 60, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 61, max = 70, frequency = 6], org.opensolaris.os.dtrace.Distribution$Bucket[min = 71, max = 80, frequency = 7], org.opensolaris.os.dtrace.Distribution$Bucket[min = 81, max = 90, frequency = 8], org.opensolaris.os.dtrace.Distribution$Bucket[min = 91, max = 100, frequency = 9], org.opensolaris.os.dtrace.Distribution$Bucket[min = 101, max = 9223372036854775807, frequency = 0]], total = 45.0] decoded: class org.opensolaris.os.dtrace.LinearDistribution[base = 1, step = 10, buckets = [org.opensolaris.os.dtrace.Distribution$Bucket[min = 1, max = 10, frequency = 0], org.opensolaris.os.dtrace.Distribution$Bucket[min = 11, max = 20, frequency = 1], org.opensolaris.os.dtrace.Distribution$Bucket[min = 21, max = 30, frequency = 2], 
org.opensolaris.os.dtrace.Distribution$Bucket[min = 31, max = 40, frequency = 3], org.opensolaris.os.dtrace.Distribution$Bucket[min = 41, max = 50, frequency = 4], org.opensolaris.os.dtrace.Distribution$Bucket[min = 51, max = 60, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 61, max = 70, frequency = 6], org.opensolaris.os.dtrace.Distribution$Bucket[min = 71, max = 80, frequency = 7], org.opensolaris.os.dtrace.Distribution$Bucket[min = 81, max = 90, frequency = 8], org.opensolaris.os.dtrace.Distribution$Bucket[min = 91, max = 100, frequency = 9], org.opensolaris.os.dtrace.Distribution$Bucket[min = 101, max = 9223372036854775807, frequency = 0]], total = 45.0] +LogLinearDistribution: + serialized: org.opensolaris.os.dtrace.Distribution[buckets = [org.opensolaris.os.dtrace.Distribution$Bucket[min = -9223372036854775808, max = 0, frequency = 0], org.opensolaris.os.dtrace.Distribution$Bucket[min = 1, max = 1, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 2, max = 3, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 4, max = 7, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 8, max = 15, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 16, max = 31, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 32, max = 63, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 64, max = 127, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 128, max = 9223372036854775807, frequency = 0]], total = 35.0] + deserialized: org.opensolaris.os.dtrace.Distribution[buckets = [org.opensolaris.os.dtrace.Distribution$Bucket[min = -9223372036854775808, max = 0, frequency = 0], org.opensolaris.os.dtrace.Distribution$Bucket[min = 1, max = 1, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 2, max = 3, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 4, max = 7, frequency = 5], 
org.opensolaris.os.dtrace.Distribution$Bucket[min = 8, max = 15, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 16, max = 31, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 32, max = 63, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 64, max = 127, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 128, max = 9223372036854775807, frequency = 0]], total = 35.0] +LogLinearDistribution: + encoded: org.opensolaris.os.dtrace.Distribution[buckets = [org.opensolaris.os.dtrace.Distribution$Bucket[min = -9223372036854775808, max = 0, frequency = 0], org.opensolaris.os.dtrace.Distribution$Bucket[min = 1, max = 1, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 2, max = 3, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 4, max = 7, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 8, max = 15, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 16, max = 31, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 32, max = 63, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 64, max = 127, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 128, max = 9223372036854775807, frequency = 0]], total = 35.0] + decoded: org.opensolaris.os.dtrace.Distribution[buckets = [org.opensolaris.os.dtrace.Distribution$Bucket[min = -9223372036854775808, max = 0, frequency = 0], org.opensolaris.os.dtrace.Distribution$Bucket[min = 1, max = 1, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 2, max = 3, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 4, max = 7, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 8, max = 15, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 16, max = 31, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 32, max = 63, frequency = 5], 
org.opensolaris.os.dtrace.Distribution$Bucket[min = 64, max = 127, frequency = 5], org.opensolaris.os.dtrace.Distribution$Bucket[min = 128, max = 9223372036854775807, frequency = 0]], total = 35.0] Option: serialized: org.opensolaris.os.dtrace.Option[name = aggrate, value = 1s] deserialized: org.opensolaris.os.dtrace.Option[name = aggrate, value = 1s] diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTOREVEN.nodivide.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTOREVEN.nodivide.d new file mode 100644 index 0000000..b11d282 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTOREVEN.nodivide.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +BEGIN +{ + @ = llquantize(0, 10, 0, 10, 25); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTOREVEN.notfactor.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTOREVEN.notfactor.d new file mode 100644 index 0000000..c8af7d9 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTOREVEN.notfactor.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + @ = llquantize(0, 10, 0, 10, 30); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORMATCH.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORMATCH.d new file mode 100644 index 0000000..0404b4ff --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORMATCH.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + @ = llquantize(0, 10, 0, 10, 10); + @ = llquantize(0, 3, 0, 10, 81); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORNSTEPS.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORNSTEPS.d new file mode 100644 index 0000000..fd6b0e6 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORNSTEPS.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +BEGIN +{ + @ = llquantize(0, 10, 0, 10, 7); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORSMALL.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORSMALL.d new file mode 100644 index 0000000..7074f5f --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORSMALL.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + @ = llquantize(0, 1, 0, 10, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORTYPE.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORTYPE.d new file mode 100644 index 0000000..ea39c7e --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORTYPE.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + this->doogle = 10; + @ = llquantize(0, this->doogle, 0, 10, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORVAL.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORVAL.d new file mode 100644 index 0000000..a1ad20f --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_FACTORVAL.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +BEGIN +{ + @ = llquantize(0, 65537, 0, 10, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHMATCH.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHMATCH.d new file mode 100644 index 0000000..46bf0e6 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHMATCH.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + @ = llquantize(0, 10, 0, 10, 10); + @ = llquantize(0, 10, 0, 11, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHTYPE.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHTYPE.d new file mode 100644 index 0000000..fee786d --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHTYPE.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + this->doogle = 10; + @ = llquantize(0, 10, 0, this->doogle, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHVAL.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHVAL.d new file mode 100644 index 0000000..531ab0b --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_HIGHVAL.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +BEGIN +{ + @ = llquantize(0, 10, 0, -1, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWMATCH.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWMATCH.d new file mode 100644 index 0000000..126429a --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWMATCH.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + @ = llquantize(0, 10, 0, 10, 10); + @ = llquantize(0, 10, 1, 10, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWTYPE.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWTYPE.d new file mode 100644 index 0000000..2a9b2ef --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWTYPE.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + this->doogle = 0; + @ = llquantize(0, 10, this->doogle, 10, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWVAL.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWVAL.d new file mode 100644 index 0000000..e1045d8 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_LOWVAL.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +BEGIN +{ + @ = llquantize(0, 10, -1, 10, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_MAGRANGE.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_MAGRANGE.d new file mode 100644 index 0000000..9852c1a --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_MAGRANGE.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + @ = llquantize(0, 10, 10, 0, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_MAGTOOBIG.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_MAGTOOBIG.d new file mode 100644 index 0000000..c707630 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_MAGTOOBIG.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + @ = llquantize(0, 10, 0, 100, 10); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPMATCH.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPMATCH.d new file mode 100644 index 0000000..77b4d8a --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPMATCH.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +BEGIN +{ + @ = llquantize(0, 10, 0, 10, 10); + @ = llquantize(0, 10, 0, 10, 100); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPTYPE.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPTYPE.d new file mode 100644 index 0000000..4eb9b2f --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPTYPE.d @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + this->doogle = 10; + @ = llquantize(0, 10, 0, 10, this->doogle); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPVAL.d b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPVAL.d new file mode 100644 index 0000000..3855beb --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/err.D_LLQUANT_NSTEPVAL.d @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +BEGIN +{ + @ = llquantize(123, 10, 0, 10, 123456); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.bases.d b/cmd/dtrace/test/tst/common/llquantize/tst.bases.d new file mode 100644 index 0000000..e3a6ff1 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.bases.d @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +#pragma D option quiet + +tick-1ms +/i++ <= 100/ +{ + @two = llquantize(i, 2, 0, 6, 2); + @three = llquantize(i, 3, 0, 1, 9); + @four = llquantize(i, 4, 0, 1, 4); + @five = llquantize(i, 5, 0, 1, 25); + @six = llquantize(i, 6, 0, 3, 12); + @seven = llquantize(i, 7, 0, 1, 7); + @eight = llquantize(i, 8, 0, 1, 16); + @nine = llquantize(i, 9, 0, 1, 9); + @ten = llquantize(i, 10, 0, 1, 10); +} + +tick-1ms +/i > 100/ +{ + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.bases.d.out b/cmd/dtrace/test/tst/common/llquantize/tst.bases.d.out new file mode 100644 index 0000000..1b207bf --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.bases.d.out @@ -0,0 +1,177 @@ + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 |@ 2 + 4 |@@ 4 + 8 |@@@ 8 + 16 |@@@@@@ 16 + 32 |@@@@@@@@@@@@@ 32 + 64 |@@@@@@@@@@@@@@@ 38 + >= 128 | 0 + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + >= 9 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 93 + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 |@@ 4 + 8 |@@ 4 + 12 |@@ 4 + >= 16 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 86 + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + 11 | 1 + 12 | 1 + 13 | 1 + 14 | 1 + 15 | 1 + 16 | 1 + 17 | 1 + 18 | 1 + 19 | 1 + 20 | 1 + 21 | 1 + 22 | 1 + 23 | 1 + 24 | 1 + >= 25 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 77 + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 |@ 3 + 9 |@ 3 + 12 |@ 3 + 15 |@ 3 + 18 |@ 3 + 21 |@ 3 + 24 |@ 3 + 27 |@ 3 + 30 |@ 3 + 33 |@ 3 + 36 |@@@@@@@ 18 + 54 |@@@@@@@ 18 + 72 |@@@@@@@ 18 + 90 |@@@@@ 12 + 108 | 0 + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 |@@@ 7 + 14 |@@@ 7 + 21 |@@@ 7 + 28 
|@@@ 7 + 35 |@@@ 7 + 42 |@@@ 7 + >= 49 |@@@@@@@@@@@@@@@@@@@@@ 53 + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 |@@ 4 + 12 |@@ 4 + 16 |@@ 4 + 20 |@@ 4 + 24 |@@ 4 + 28 |@@ 4 + 32 |@@ 4 + 36 |@@ 4 + 40 |@@ 4 + 44 |@@ 4 + 48 |@@ 4 + 52 |@@ 4 + 56 |@@ 4 + 60 |@@ 4 + >= 64 |@@@@@@@@@@@@@@@ 38 + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 |@@@@ 9 + 18 |@@@@ 9 + 27 |@@@@ 9 + 36 |@@@@ 9 + 45 |@@@@ 9 + 54 |@@@@ 9 + 63 |@@@@ 9 + 72 |@@@@ 9 + >= 81 |@@@@@@@@ 21 + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 |@@@@ 10 + 20 |@@@@ 10 + 30 |@@@@ 10 + 40 |@@@@ 10 + 50 |@@@@ 10 + 60 |@@@@ 10 + 70 |@@@@ 10 + 80 |@@@@ 10 + 90 |@@@@ 10 + >= 100 |@ 2 + diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.basic.d b/cmd/dtrace/test/tst/common/llquantize/tst.basic.d new file mode 100644 index 0000000..57b6ed8 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.basic.d @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#pragma D option quiet + +tick-1ms +/i++ <= 100/ +{ + @ = llquantize(i, 10, 0, 10, 10); +} + +tick-1ms +/i > 100/ +{ + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.basic.d.out b/cmd/dtrace/test/tst/common/llquantize/tst.basic.d.out new file mode 100644 index 0000000..9a7b288 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.basic.d.out @@ -0,0 +1,25 @@ + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 |@@@@ 10 + 20 |@@@@ 10 + 30 |@@@@ 10 + 40 |@@@@ 10 + 50 |@@@@ 10 + 60 |@@@@ 10 + 70 |@@@@ 10 + 80 |@@@@ 10 + 90 |@@@@ 10 + 100 |@ 2 + 200 | 0 + diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.negorder.d b/cmd/dtrace/test/tst/common/llquantize/tst.negorder.d new file mode 100644 index 0000000..b18c688 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.negorder.d @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#pragma D option quiet + +BEGIN +{ + a = 7; + b = 13; + val = (-a * b) + a; +} + +tick-1ms +{ + incr = val % b; + val += a; +} + +tick-1ms +/val == 0/ +{ + val += a; +} + +tick-1ms +/incr != 0/ +{ + i++; + @llquanty[i] = llquantize(1, 10, 0, 10, 10, incr); +} + +tick-1ms +/incr == 0/ +{ + printf("Ordering of llquantize() with some negative weights:\n"); + printa(@llquanty); + printf("\n"); + + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.negorder.d.out b/cmd/dtrace/test/tst/common/llquantize/tst.negorder.d.out new file mode 100644 index 0000000..ac0f3cb --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.negorder.d.out @@ -0,0 +1,148 @@ +Ordering of llquantize() with some negative weights: + + 2 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -12 + 2 | 0 + + 4 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -11 + 2 | 0 + + 6 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -10 + 2 | 0 + + 8 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -9 + 2 | 0 + + 10 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -8 + 2 | 0 + + 12 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -7 + 2 | 0 + + 1 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -6 + 2 | 0 + + 3 + value ------------- Distribution ------------- 
count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -5 + 2 | 0 + + 5 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -4 + 2 | 0 + + 7 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -3 + 2 | 0 + + 9 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -2 + 2 | 0 + + 11 + value ------------- Distribution ------------- count + < 1 | 0 + 1 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -1 + 2 | 0 + + 14 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1 + 2 | 0 + + 16 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 2 + 2 | 0 + + 18 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 3 + 2 | 0 + + 20 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 4 + 2 | 0 + + 22 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 5 + 2 | 0 + + 24 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 6 + 2 | 0 + + 13 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 7 + 2 | 0 + + 15 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 8 + 2 | 0 + + 17 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 9 + 2 | 0 + + 19 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 10 + 2 | 0 + + 21 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 11 + 2 | 0 + + 
23 + value ------------- Distribution ------------- count + < 1 | 0 + 1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 12 + 2 | 0 + + + diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.negvalue.d b/cmd/dtrace/test/tst/common/llquantize/tst.negvalue.d new file mode 100644 index 0000000..c74d019 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.negvalue.d @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +#pragma D option quiet + +tick-1ms +/i++ <= 100/ +{ + @ = llquantize(i, 10, 0, 10, 10, 50 - i); +} + +tick-1ms +/i > 100/ +{ + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.negvalue.d.out b/cmd/dtrace/test/tst/common/llquantize/tst.negvalue.d.out new file mode 100644 index 0000000..04b0d5e --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.negvalue.d.out @@ -0,0 +1,25 @@ + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 49 + 2 | 48 + 3 | 47 + 4 | 46 + 5 | 45 + 6 | 44 + 7 | 43 + 8 | 42 + 9 | 41 + 10 |@@@ 355 + 20 |@@ 255 + 30 |@ 155 + 40 | 55 + 50 | -45 + 60 @| -145 + 70 @@| -245 + 80 @@@| -345 + 90 @@@| -445 + 100 @| -101 + 200 | 0 + diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.normal.d b/cmd/dtrace/test/tst/common/llquantize/tst.normal.d new file mode 100644 index 0000000..7097ba7 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.normal.d @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +#pragma D option quiet + +tick-1ms +/i++ <= 100/ +{ + @ = llquantize(i, 10, 0, 10, 10); +} + +tick-1ms +/i > 100/ +{ + normalize(@, 10); + printa(@); + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.normal.d.out b/cmd/dtrace/test/tst/common/llquantize/tst.normal.d.out new file mode 100644 index 0000000..3b1f41b --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.normal.d.out @@ -0,0 +1,26 @@ + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 0 + 2 | 0 + 3 | 0 + 4 | 0 + 5 | 0 + 6 | 0 + 7 | 0 + 8 | 0 + 9 | 0 + 10 |@@@@ 1 + 20 |@@@@ 1 + 30 |@@@@ 1 + 40 |@@@@ 1 + 50 |@@@@ 1 + 60 |@@@@ 1 + 70 |@@@@ 1 + 80 |@@@@ 1 + 90 |@@@@ 1 + 100 |@ 0 + 200 | 0 + + diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.range.d b/cmd/dtrace/test/tst/common/llquantize/tst.range.d new file mode 100644 index 0000000..e2882b3 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.range.d @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +#pragma D option quiet + +BEGIN +{ + @["Screven"] = llquantize(0, 10, 1, 2, 20, 25); + @["Katz"] = llquantize(1, 10, 1, 2, 20, -100); + @["Kurian"] = llquantize(7, 10, 1, 2, 20, 15); + @["Rozwat"] = llquantize(49, 10, 1, 2, 20, 15); + @["Fowler"] = llquantize(343, 10, 1, 2, 20, 150); + + printa(@); + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.range.d.out b/cmd/dtrace/test/tst/common/llquantize/tst.range.d.out new file mode 100644 index 0000000..c6736c6 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.range.d.out @@ -0,0 +1,29 @@ + + Katz + value ------------- Distribution ------------- count + < 10 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -100 + 10 | 0 + + Kurian + value ------------- Distribution ------------- count + < 10 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 15 + 10 | 0 + + Screven + value ------------- Distribution ------------- count + < 10 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 25 + 10 | 0 + + Rozwat + value ------------- Distribution ------------- count + 40 | 0 + 45 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 15 + 50 | 0 + + Fowler + value ------------- Distribution ------------- count + 250 | 0 + 300 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 150 + 350 | 0 + + diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.steps.d b/cmd/dtrace/test/tst/common/llquantize/tst.steps.d new file mode 100644 index 0000000..f00659e --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.steps.d @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#pragma D option quiet + +tick-1ms +/i++ <= 100/ +{ + @ = llquantize(i, 10, 0, 10, 20); + @hunid = llquantize(i * 10, 10, 0, 10, 100); + @large = llquantize(i * 100, 10, 0, 10, 1000); +} + +tick-1ms +/i > 100/ +{ + exit(0); +} + +END +{ + printf("20 steps:\n"); + printa(@); + + printf("100 steps:\n"); + printa(@hunid); + + printf("1000 steps:\n"); + printa(@large); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.steps.d.out b/cmd/dtrace/test/tst/common/llquantize/tst.steps.d.out new file mode 100644 index 0000000..0888551 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.steps.d.out @@ -0,0 +1,2033 @@ +20 steps: + + + value ------------- Distribution ------------- count + < 1 | 0 + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 |@@ 5 + 15 |@@ 5 + 20 |@@ 5 + 25 |@@ 5 + 30 |@@ 5 + 35 |@@ 5 + 40 |@@ 5 + 45 |@@ 5 + 50 |@@ 5 + 55 |@@ 5 + 60 |@@ 5 + 65 |@@ 5 + 70 |@@ 5 + 75 |@@ 5 + 80 |@@ 5 + 85 |@@ 5 + 90 |@@ 5 + 95 |@@ 5 + 100 |@ 2 + 150 | 0 + +100 steps: + + + value ------------- Distribution ------------- count + 9 | 0 + 10 | 1 + 11 | 0 + 12 | 0 + 13 | 0 + 14 | 0 + 15 | 0 + 16 | 0 + 17 | 0 + 18 | 0 + 19 | 0 + 20 | 1 + 21 | 0 + 22 | 0 + 23 | 0 + 24 | 0 + 25 | 0 + 26 | 0 + 27 | 0 + 28 | 0 + 29 | 0 + 30 | 1 + 31 | 0 + 32 | 0 + 33 | 0 + 34 | 0 + 35 | 0 + 36 | 0 + 37 | 0 + 38 | 0 + 39 | 0 + 40 | 1 + 41 | 0 + 42 | 0 + 43 | 0 + 44 | 0 + 45 | 0 + 46 | 0 + 47 | 0 + 48 | 0 + 49 | 0 + 50 | 1 + 51 | 0 + 52 | 0 + 53 | 0 + 54 | 0 + 55 | 0 + 56 | 0 + 57 | 0 + 58 | 0 + 59 | 0 + 60 | 1 + 
61 | 0 + 62 | 0 + 63 | 0 + 64 | 0 + 65 | 0 + 66 | 0 + 67 | 0 + 68 | 0 + 69 | 0 + 70 | 1 + 71 | 0 + 72 | 0 + 73 | 0 + 74 | 0 + 75 | 0 + 76 | 0 + 77 | 0 + 78 | 0 + 79 | 0 + 80 | 1 + 81 | 0 + 82 | 0 + 83 | 0 + 84 | 0 + 85 | 0 + 86 | 0 + 87 | 0 + 88 | 0 + 89 | 0 + 90 | 1 + 91 | 0 + 92 | 0 + 93 | 0 + 94 | 0 + 95 | 0 + 96 | 0 + 97 | 0 + 98 | 0 + 99 | 0 + 100 | 1 + 110 | 1 + 120 | 1 + 130 | 1 + 140 | 1 + 150 | 1 + 160 | 1 + 170 | 1 + 180 | 1 + 190 | 1 + 200 | 1 + 210 | 1 + 220 | 1 + 230 | 1 + 240 | 1 + 250 | 1 + 260 | 1 + 270 | 1 + 280 | 1 + 290 | 1 + 300 | 1 + 310 | 1 + 320 | 1 + 330 | 1 + 340 | 1 + 350 | 1 + 360 | 1 + 370 | 1 + 380 | 1 + 390 | 1 + 400 | 1 + 410 | 1 + 420 | 1 + 430 | 1 + 440 | 1 + 450 | 1 + 460 | 1 + 470 | 1 + 480 | 1 + 490 | 1 + 500 | 1 + 510 | 1 + 520 | 1 + 530 | 1 + 540 | 1 + 550 | 1 + 560 | 1 + 570 | 1 + 580 | 1 + 590 | 1 + 600 | 1 + 610 | 1 + 620 | 1 + 630 | 1 + 640 | 1 + 650 | 1 + 660 | 1 + 670 | 1 + 680 | 1 + 690 | 1 + 700 | 1 + 710 | 1 + 720 | 1 + 730 | 1 + 740 | 1 + 750 | 1 + 760 | 1 + 770 | 1 + 780 | 1 + 790 | 1 + 800 | 1 + 810 | 1 + 820 | 1 + 830 | 1 + 840 | 1 + 850 | 1 + 860 | 1 + 870 | 1 + 880 | 1 + 890 | 1 + 900 | 1 + 910 | 1 + 920 | 1 + 930 | 1 + 940 | 1 + 950 | 1 + 960 | 1 + 970 | 1 + 980 | 1 + 990 | 1 + 1000 |@ 2 + 1100 | 0 + +1000 steps: + + + value ------------- Distribution ------------- count + 99 | 0 + 100 | 1 + 101 | 0 + 102 | 0 + 103 | 0 + 104 | 0 + 105 | 0 + 106 | 0 + 107 | 0 + 108 | 0 + 109 | 0 + 110 | 0 + 111 | 0 + 112 | 0 + 113 | 0 + 114 | 0 + 115 | 0 + 116 | 0 + 117 | 0 + 118 | 0 + 119 | 0 + 120 | 0 + 121 | 0 + 122 | 0 + 123 | 0 + 124 | 0 + 125 | 0 + 126 | 0 + 127 | 0 + 128 | 0 + 129 | 0 + 130 | 0 + 131 | 0 + 132 | 0 + 133 | 0 + 134 | 0 + 135 | 0 + 136 | 0 + 137 | 0 + 138 | 0 + 139 | 0 + 140 | 0 + 141 | 0 + 142 | 0 + 143 | 0 + 144 | 0 + 145 | 0 + 146 | 0 + 147 | 0 + 148 | 0 + 149 | 0 + 150 | 0 + 151 | 0 + 152 | 0 + 153 | 0 + 154 | 0 + 155 | 0 + 156 | 0 + 157 | 0 + 158 | 0 + 159 | 0 + 160 | 0 + 161 | 0 + 162 | 0 + 163 | 0 + 
164 | 0 + 165 | 0 + 166 | 0 + 167 | 0 + 168 | 0 + 169 | 0 + 170 | 0 + 171 | 0 + 172 | 0 + 173 | 0 + 174 | 0 + 175 | 0 + 176 | 0 + 177 | 0 + 178 | 0 + 179 | 0 + 180 | 0 + 181 | 0 + 182 | 0 + 183 | 0 + 184 | 0 + 185 | 0 + 186 | 0 + 187 | 0 + 188 | 0 + 189 | 0 + 190 | 0 + 191 | 0 + 192 | 0 + 193 | 0 + 194 | 0 + 195 | 0 + 196 | 0 + 197 | 0 + 198 | 0 + 199 | 0 + 200 | 1 + 201 | 0 + 202 | 0 + 203 | 0 + 204 | 0 + 205 | 0 + 206 | 0 + 207 | 0 + 208 | 0 + 209 | 0 + 210 | 0 + 211 | 0 + 212 | 0 + 213 | 0 + 214 | 0 + 215 | 0 + 216 | 0 + 217 | 0 + 218 | 0 + 219 | 0 + 220 | 0 + 221 | 0 + 222 | 0 + 223 | 0 + 224 | 0 + 225 | 0 + 226 | 0 + 227 | 0 + 228 | 0 + 229 | 0 + 230 | 0 + 231 | 0 + 232 | 0 + 233 | 0 + 234 | 0 + 235 | 0 + 236 | 0 + 237 | 0 + 238 | 0 + 239 | 0 + 240 | 0 + 241 | 0 + 242 | 0 + 243 | 0 + 244 | 0 + 245 | 0 + 246 | 0 + 247 | 0 + 248 | 0 + 249 | 0 + 250 | 0 + 251 | 0 + 252 | 0 + 253 | 0 + 254 | 0 + 255 | 0 + 256 | 0 + 257 | 0 + 258 | 0 + 259 | 0 + 260 | 0 + 261 | 0 + 262 | 0 + 263 | 0 + 264 | 0 + 265 | 0 + 266 | 0 + 267 | 0 + 268 | 0 + 269 | 0 + 270 | 0 + 271 | 0 + 272 | 0 + 273 | 0 + 274 | 0 + 275 | 0 + 276 | 0 + 277 | 0 + 278 | 0 + 279 | 0 + 280 | 0 + 281 | 0 + 282 | 0 + 283 | 0 + 284 | 0 + 285 | 0 + 286 | 0 + 287 | 0 + 288 | 0 + 289 | 0 + 290 | 0 + 291 | 0 + 292 | 0 + 293 | 0 + 294 | 0 + 295 | 0 + 296 | 0 + 297 | 0 + 298 | 0 + 299 | 0 + 300 | 1 + 301 | 0 + 302 | 0 + 303 | 0 + 304 | 0 + 305 | 0 + 306 | 0 + 307 | 0 + 308 | 0 + 309 | 0 + 310 | 0 + 311 | 0 + 312 | 0 + 313 | 0 + 314 | 0 + 315 | 0 + 316 | 0 + 317 | 0 + 318 | 0 + 319 | 0 + 320 | 0 + 321 | 0 + 322 | 0 + 323 | 0 + 324 | 0 + 325 | 0 + 326 | 0 + 327 | 0 + 328 | 0 + 329 | 0 + 330 | 0 + 331 | 0 + 332 | 0 + 333 | 0 + 334 | 0 + 335 | 0 + 336 | 0 + 337 | 0 + 338 | 0 + 339 | 0 + 340 | 0 + 341 | 0 + 342 | 0 + 343 | 0 + 344 | 0 + 345 | 0 + 346 | 0 + 347 | 0 + 348 | 0 + 349 | 0 + 350 | 0 + 351 | 0 + 352 | 0 + 353 | 0 + 354 | 0 + 355 | 0 + 356 | 0 + 357 | 0 + 358 | 0 + 359 | 0 + 360 | 0 + 361 | 0 + 362 | 0 + 363 | 0 + 
364 | 0 + 365 | 0 + 366 | 0 + 367 | 0 + 368 | 0 + 369 | 0 + 370 | 0 + 371 | 0 + 372 | 0 + 373 | 0 + 374 | 0 + 375 | 0 + 376 | 0 + 377 | 0 + 378 | 0 + 379 | 0 + 380 | 0 + 381 | 0 + 382 | 0 + 383 | 0 + 384 | 0 + 385 | 0 + 386 | 0 + 387 | 0 + 388 | 0 + 389 | 0 + 390 | 0 + 391 | 0 + 392 | 0 + 393 | 0 + 394 | 0 + 395 | 0 + 396 | 0 + 397 | 0 + 398 | 0 + 399 | 0 + 400 | 1 + 401 | 0 + 402 | 0 + 403 | 0 + 404 | 0 + 405 | 0 + 406 | 0 + 407 | 0 + 408 | 0 + 409 | 0 + 410 | 0 + 411 | 0 + 412 | 0 + 413 | 0 + 414 | 0 + 415 | 0 + 416 | 0 + 417 | 0 + 418 | 0 + 419 | 0 + 420 | 0 + 421 | 0 + 422 | 0 + 423 | 0 + 424 | 0 + 425 | 0 + 426 | 0 + 427 | 0 + 428 | 0 + 429 | 0 + 430 | 0 + 431 | 0 + 432 | 0 + 433 | 0 + 434 | 0 + 435 | 0 + 436 | 0 + 437 | 0 + 438 | 0 + 439 | 0 + 440 | 0 + 441 | 0 + 442 | 0 + 443 | 0 + 444 | 0 + 445 | 0 + 446 | 0 + 447 | 0 + 448 | 0 + 449 | 0 + 450 | 0 + 451 | 0 + 452 | 0 + 453 | 0 + 454 | 0 + 455 | 0 + 456 | 0 + 457 | 0 + 458 | 0 + 459 | 0 + 460 | 0 + 461 | 0 + 462 | 0 + 463 | 0 + 464 | 0 + 465 | 0 + 466 | 0 + 467 | 0 + 468 | 0 + 469 | 0 + 470 | 0 + 471 | 0 + 472 | 0 + 473 | 0 + 474 | 0 + 475 | 0 + 476 | 0 + 477 | 0 + 478 | 0 + 479 | 0 + 480 | 0 + 481 | 0 + 482 | 0 + 483 | 0 + 484 | 0 + 485 | 0 + 486 | 0 + 487 | 0 + 488 | 0 + 489 | 0 + 490 | 0 + 491 | 0 + 492 | 0 + 493 | 0 + 494 | 0 + 495 | 0 + 496 | 0 + 497 | 0 + 498 | 0 + 499 | 0 + 500 | 1 + 501 | 0 + 502 | 0 + 503 | 0 + 504 | 0 + 505 | 0 + 506 | 0 + 507 | 0 + 508 | 0 + 509 | 0 + 510 | 0 + 511 | 0 + 512 | 0 + 513 | 0 + 514 | 0 + 515 | 0 + 516 | 0 + 517 | 0 + 518 | 0 + 519 | 0 + 520 | 0 + 521 | 0 + 522 | 0 + 523 | 0 + 524 | 0 + 525 | 0 + 526 | 0 + 527 | 0 + 528 | 0 + 529 | 0 + 530 | 0 + 531 | 0 + 532 | 0 + 533 | 0 + 534 | 0 + 535 | 0 + 536 | 0 + 537 | 0 + 538 | 0 + 539 | 0 + 540 | 0 + 541 | 0 + 542 | 0 + 543 | 0 + 544 | 0 + 545 | 0 + 546 | 0 + 547 | 0 + 548 | 0 + 549 | 0 + 550 | 0 + 551 | 0 + 552 | 0 + 553 | 0 + 554 | 0 + 555 | 0 + 556 | 0 + 557 | 0 + 558 | 0 + 559 | 0 + 560 | 0 + 561 | 0 + 562 | 0 + 563 | 0 + 
564 | 0 + 565 | 0 + 566 | 0 + 567 | 0 + 568 | 0 + 569 | 0 + 570 | 0 + 571 | 0 + 572 | 0 + 573 | 0 + 574 | 0 + 575 | 0 + 576 | 0 + 577 | 0 + 578 | 0 + 579 | 0 + 580 | 0 + 581 | 0 + 582 | 0 + 583 | 0 + 584 | 0 + 585 | 0 + 586 | 0 + 587 | 0 + 588 | 0 + 589 | 0 + 590 | 0 + 591 | 0 + 592 | 0 + 593 | 0 + 594 | 0 + 595 | 0 + 596 | 0 + 597 | 0 + 598 | 0 + 599 | 0 + 600 | 1 + 601 | 0 + 602 | 0 + 603 | 0 + 604 | 0 + 605 | 0 + 606 | 0 + 607 | 0 + 608 | 0 + 609 | 0 + 610 | 0 + 611 | 0 + 612 | 0 + 613 | 0 + 614 | 0 + 615 | 0 + 616 | 0 + 617 | 0 + 618 | 0 + 619 | 0 + 620 | 0 + 621 | 0 + 622 | 0 + 623 | 0 + 624 | 0 + 625 | 0 + 626 | 0 + 627 | 0 + 628 | 0 + 629 | 0 + 630 | 0 + 631 | 0 + 632 | 0 + 633 | 0 + 634 | 0 + 635 | 0 + 636 | 0 + 637 | 0 + 638 | 0 + 639 | 0 + 640 | 0 + 641 | 0 + 642 | 0 + 643 | 0 + 644 | 0 + 645 | 0 + 646 | 0 + 647 | 0 + 648 | 0 + 649 | 0 + 650 | 0 + 651 | 0 + 652 | 0 + 653 | 0 + 654 | 0 + 655 | 0 + 656 | 0 + 657 | 0 + 658 | 0 + 659 | 0 + 660 | 0 + 661 | 0 + 662 | 0 + 663 | 0 + 664 | 0 + 665 | 0 + 666 | 0 + 667 | 0 + 668 | 0 + 669 | 0 + 670 | 0 + 671 | 0 + 672 | 0 + 673 | 0 + 674 | 0 + 675 | 0 + 676 | 0 + 677 | 0 + 678 | 0 + 679 | 0 + 680 | 0 + 681 | 0 + 682 | 0 + 683 | 0 + 684 | 0 + 685 | 0 + 686 | 0 + 687 | 0 + 688 | 0 + 689 | 0 + 690 | 0 + 691 | 0 + 692 | 0 + 693 | 0 + 694 | 0 + 695 | 0 + 696 | 0 + 697 | 0 + 698 | 0 + 699 | 0 + 700 | 1 + 701 | 0 + 702 | 0 + 703 | 0 + 704 | 0 + 705 | 0 + 706 | 0 + 707 | 0 + 708 | 0 + 709 | 0 + 710 | 0 + 711 | 0 + 712 | 0 + 713 | 0 + 714 | 0 + 715 | 0 + 716 | 0 + 717 | 0 + 718 | 0 + 719 | 0 + 720 | 0 + 721 | 0 + 722 | 0 + 723 | 0 + 724 | 0 + 725 | 0 + 726 | 0 + 727 | 0 + 728 | 0 + 729 | 0 + 730 | 0 + 731 | 0 + 732 | 0 + 733 | 0 + 734 | 0 + 735 | 0 + 736 | 0 + 737 | 0 + 738 | 0 + 739 | 0 + 740 | 0 + 741 | 0 + 742 | 0 + 743 | 0 + 744 | 0 + 745 | 0 + 746 | 0 + 747 | 0 + 748 | 0 + 749 | 0 + 750 | 0 + 751 | 0 + 752 | 0 + 753 | 0 + 754 | 0 + 755 | 0 + 756 | 0 + 757 | 0 + 758 | 0 + 759 | 0 + 760 | 0 + 761 | 0 + 762 | 0 + 763 | 0 + 
764 | 0 + 765 | 0 + 766 | 0 + 767 | 0 + 768 | 0 + 769 | 0 + 770 | 0 + 771 | 0 + 772 | 0 + 773 | 0 + 774 | 0 + 775 | 0 + 776 | 0 + 777 | 0 + 778 | 0 + 779 | 0 + 780 | 0 + 781 | 0 + 782 | 0 + 783 | 0 + 784 | 0 + 785 | 0 + 786 | 0 + 787 | 0 + 788 | 0 + 789 | 0 + 790 | 0 + 791 | 0 + 792 | 0 + 793 | 0 + 794 | 0 + 795 | 0 + 796 | 0 + 797 | 0 + 798 | 0 + 799 | 0 + 800 | 1 + 801 | 0 + 802 | 0 + 803 | 0 + 804 | 0 + 805 | 0 + 806 | 0 + 807 | 0 + 808 | 0 + 809 | 0 + 810 | 0 + 811 | 0 + 812 | 0 + 813 | 0 + 814 | 0 + 815 | 0 + 816 | 0 + 817 | 0 + 818 | 0 + 819 | 0 + 820 | 0 + 821 | 0 + 822 | 0 + 823 | 0 + 824 | 0 + 825 | 0 + 826 | 0 + 827 | 0 + 828 | 0 + 829 | 0 + 830 | 0 + 831 | 0 + 832 | 0 + 833 | 0 + 834 | 0 + 835 | 0 + 836 | 0 + 837 | 0 + 838 | 0 + 839 | 0 + 840 | 0 + 841 | 0 + 842 | 0 + 843 | 0 + 844 | 0 + 845 | 0 + 846 | 0 + 847 | 0 + 848 | 0 + 849 | 0 + 850 | 0 + 851 | 0 + 852 | 0 + 853 | 0 + 854 | 0 + 855 | 0 + 856 | 0 + 857 | 0 + 858 | 0 + 859 | 0 + 860 | 0 + 861 | 0 + 862 | 0 + 863 | 0 + 864 | 0 + 865 | 0 + 866 | 0 + 867 | 0 + 868 | 0 + 869 | 0 + 870 | 0 + 871 | 0 + 872 | 0 + 873 | 0 + 874 | 0 + 875 | 0 + 876 | 0 + 877 | 0 + 878 | 0 + 879 | 0 + 880 | 0 + 881 | 0 + 882 | 0 + 883 | 0 + 884 | 0 + 885 | 0 + 886 | 0 + 887 | 0 + 888 | 0 + 889 | 0 + 890 | 0 + 891 | 0 + 892 | 0 + 893 | 0 + 894 | 0 + 895 | 0 + 896 | 0 + 897 | 0 + 898 | 0 + 899 | 0 + 900 | 1 + 901 | 0 + 902 | 0 + 903 | 0 + 904 | 0 + 905 | 0 + 906 | 0 + 907 | 0 + 908 | 0 + 909 | 0 + 910 | 0 + 911 | 0 + 912 | 0 + 913 | 0 + 914 | 0 + 915 | 0 + 916 | 0 + 917 | 0 + 918 | 0 + 919 | 0 + 920 | 0 + 921 | 0 + 922 | 0 + 923 | 0 + 924 | 0 + 925 | 0 + 926 | 0 + 927 | 0 + 928 | 0 + 929 | 0 + 930 | 0 + 931 | 0 + 932 | 0 + 933 | 0 + 934 | 0 + 935 | 0 + 936 | 0 + 937 | 0 + 938 | 0 + 939 | 0 + 940 | 0 + 941 | 0 + 942 | 0 + 943 | 0 + 944 | 0 + 945 | 0 + 946 | 0 + 947 | 0 + 948 | 0 + 949 | 0 + 950 | 0 + 951 | 0 + 952 | 0 + 953 | 0 + 954 | 0 + 955 | 0 + 956 | 0 + 957 | 0 + 958 | 0 + 959 | 0 + 960 | 0 + 961 | 0 + 962 | 0 + 963 | 0 + 
964 | 0 + 965 | 0 + 966 | 0 + 967 | 0 + 968 | 0 + 969 | 0 + 970 | 0 + 971 | 0 + 972 | 0 + 973 | 0 + 974 | 0 + 975 | 0 + 976 | 0 + 977 | 0 + 978 | 0 + 979 | 0 + 980 | 0 + 981 | 0 + 982 | 0 + 983 | 0 + 984 | 0 + 985 | 0 + 986 | 0 + 987 | 0 + 988 | 0 + 989 | 0 + 990 | 0 + 991 | 0 + 992 | 0 + 993 | 0 + 994 | 0 + 995 | 0 + 996 | 0 + 997 | 0 + 998 | 0 + 999 | 0 + 1000 | 1 + 1010 | 0 + 1020 | 0 + 1030 | 0 + 1040 | 0 + 1050 | 0 + 1060 | 0 + 1070 | 0 + 1080 | 0 + 1090 | 0 + 1100 | 1 + 1110 | 0 + 1120 | 0 + 1130 | 0 + 1140 | 0 + 1150 | 0 + 1160 | 0 + 1170 | 0 + 1180 | 0 + 1190 | 0 + 1200 | 1 + 1210 | 0 + 1220 | 0 + 1230 | 0 + 1240 | 0 + 1250 | 0 + 1260 | 0 + 1270 | 0 + 1280 | 0 + 1290 | 0 + 1300 | 1 + 1310 | 0 + 1320 | 0 + 1330 | 0 + 1340 | 0 + 1350 | 0 + 1360 | 0 + 1370 | 0 + 1380 | 0 + 1390 | 0 + 1400 | 1 + 1410 | 0 + 1420 | 0 + 1430 | 0 + 1440 | 0 + 1450 | 0 + 1460 | 0 + 1470 | 0 + 1480 | 0 + 1490 | 0 + 1500 | 1 + 1510 | 0 + 1520 | 0 + 1530 | 0 + 1540 | 0 + 1550 | 0 + 1560 | 0 + 1570 | 0 + 1580 | 0 + 1590 | 0 + 1600 | 1 + 1610 | 0 + 1620 | 0 + 1630 | 0 + 1640 | 0 + 1650 | 0 + 1660 | 0 + 1670 | 0 + 1680 | 0 + 1690 | 0 + 1700 | 1 + 1710 | 0 + 1720 | 0 + 1730 | 0 + 1740 | 0 + 1750 | 0 + 1760 | 0 + 1770 | 0 + 1780 | 0 + 1790 | 0 + 1800 | 1 + 1810 | 0 + 1820 | 0 + 1830 | 0 + 1840 | 0 + 1850 | 0 + 1860 | 0 + 1870 | 0 + 1880 | 0 + 1890 | 0 + 1900 | 1 + 1910 | 0 + 1920 | 0 + 1930 | 0 + 1940 | 0 + 1950 | 0 + 1960 | 0 + 1970 | 0 + 1980 | 0 + 1990 | 0 + 2000 | 1 + 2010 | 0 + 2020 | 0 + 2030 | 0 + 2040 | 0 + 2050 | 0 + 2060 | 0 + 2070 | 0 + 2080 | 0 + 2090 | 0 + 2100 | 1 + 2110 | 0 + 2120 | 0 + 2130 | 0 + 2140 | 0 + 2150 | 0 + 2160 | 0 + 2170 | 0 + 2180 | 0 + 2190 | 0 + 2200 | 1 + 2210 | 0 + 2220 | 0 + 2230 | 0 + 2240 | 0 + 2250 | 0 + 2260 | 0 + 2270 | 0 + 2280 | 0 + 2290 | 0 + 2300 | 1 + 2310 | 0 + 2320 | 0 + 2330 | 0 + 2340 | 0 + 2350 | 0 + 2360 | 0 + 2370 | 0 + 2380 | 0 + 2390 | 0 + 2400 | 1 + 2410 | 0 + 2420 | 0 + 2430 | 0 + 2440 | 0 + 2450 | 0 + 2460 | 0 + 2470 | 0 + 2480 | 0 + 
2490 | 0 + 2500 | 1 + 2510 | 0 + 2520 | 0 + 2530 | 0 + 2540 | 0 + 2550 | 0 + 2560 | 0 + 2570 | 0 + 2580 | 0 + 2590 | 0 + 2600 | 1 + 2610 | 0 + 2620 | 0 + 2630 | 0 + 2640 | 0 + 2650 | 0 + 2660 | 0 + 2670 | 0 + 2680 | 0 + 2690 | 0 + 2700 | 1 + 2710 | 0 + 2720 | 0 + 2730 | 0 + 2740 | 0 + 2750 | 0 + 2760 | 0 + 2770 | 0 + 2780 | 0 + 2790 | 0 + 2800 | 1 + 2810 | 0 + 2820 | 0 + 2830 | 0 + 2840 | 0 + 2850 | 0 + 2860 | 0 + 2870 | 0 + 2880 | 0 + 2890 | 0 + 2900 | 1 + 2910 | 0 + 2920 | 0 + 2930 | 0 + 2940 | 0 + 2950 | 0 + 2960 | 0 + 2970 | 0 + 2980 | 0 + 2990 | 0 + 3000 | 1 + 3010 | 0 + 3020 | 0 + 3030 | 0 + 3040 | 0 + 3050 | 0 + 3060 | 0 + 3070 | 0 + 3080 | 0 + 3090 | 0 + 3100 | 1 + 3110 | 0 + 3120 | 0 + 3130 | 0 + 3140 | 0 + 3150 | 0 + 3160 | 0 + 3170 | 0 + 3180 | 0 + 3190 | 0 + 3200 | 1 + 3210 | 0 + 3220 | 0 + 3230 | 0 + 3240 | 0 + 3250 | 0 + 3260 | 0 + 3270 | 0 + 3280 | 0 + 3290 | 0 + 3300 | 1 + 3310 | 0 + 3320 | 0 + 3330 | 0 + 3340 | 0 + 3350 | 0 + 3360 | 0 + 3370 | 0 + 3380 | 0 + 3390 | 0 + 3400 | 1 + 3410 | 0 + 3420 | 0 + 3430 | 0 + 3440 | 0 + 3450 | 0 + 3460 | 0 + 3470 | 0 + 3480 | 0 + 3490 | 0 + 3500 | 1 + 3510 | 0 + 3520 | 0 + 3530 | 0 + 3540 | 0 + 3550 | 0 + 3560 | 0 + 3570 | 0 + 3580 | 0 + 3590 | 0 + 3600 | 1 + 3610 | 0 + 3620 | 0 + 3630 | 0 + 3640 | 0 + 3650 | 0 + 3660 | 0 + 3670 | 0 + 3680 | 0 + 3690 | 0 + 3700 | 1 + 3710 | 0 + 3720 | 0 + 3730 | 0 + 3740 | 0 + 3750 | 0 + 3760 | 0 + 3770 | 0 + 3780 | 0 + 3790 | 0 + 3800 | 1 + 3810 | 0 + 3820 | 0 + 3830 | 0 + 3840 | 0 + 3850 | 0 + 3860 | 0 + 3870 | 0 + 3880 | 0 + 3890 | 0 + 3900 | 1 + 3910 | 0 + 3920 | 0 + 3930 | 0 + 3940 | 0 + 3950 | 0 + 3960 | 0 + 3970 | 0 + 3980 | 0 + 3990 | 0 + 4000 | 1 + 4010 | 0 + 4020 | 0 + 4030 | 0 + 4040 | 0 + 4050 | 0 + 4060 | 0 + 4070 | 0 + 4080 | 0 + 4090 | 0 + 4100 | 1 + 4110 | 0 + 4120 | 0 + 4130 | 0 + 4140 | 0 + 4150 | 0 + 4160 | 0 + 4170 | 0 + 4180 | 0 + 4190 | 0 + 4200 | 1 + 4210 | 0 + 4220 | 0 + 4230 | 0 + 4240 | 0 + 4250 | 0 + 4260 | 0 + 4270 | 0 + 4280 | 0 + 4290 | 0 + 4300 | 1 
+ 4310 | 0 + 4320 | 0 + 4330 | 0 + 4340 | 0 + 4350 | 0 + 4360 | 0 + 4370 | 0 + 4380 | 0 + 4390 | 0 + 4400 | 1 + 4410 | 0 + 4420 | 0 + 4430 | 0 + 4440 | 0 + 4450 | 0 + 4460 | 0 + 4470 | 0 + 4480 | 0 + 4490 | 0 + 4500 | 1 + 4510 | 0 + 4520 | 0 + 4530 | 0 + 4540 | 0 + 4550 | 0 + 4560 | 0 + 4570 | 0 + 4580 | 0 + 4590 | 0 + 4600 | 1 + 4610 | 0 + 4620 | 0 + 4630 | 0 + 4640 | 0 + 4650 | 0 + 4660 | 0 + 4670 | 0 + 4680 | 0 + 4690 | 0 + 4700 | 1 + 4710 | 0 + 4720 | 0 + 4730 | 0 + 4740 | 0 + 4750 | 0 + 4760 | 0 + 4770 | 0 + 4780 | 0 + 4790 | 0 + 4800 | 1 + 4810 | 0 + 4820 | 0 + 4830 | 0 + 4840 | 0 + 4850 | 0 + 4860 | 0 + 4870 | 0 + 4880 | 0 + 4890 | 0 + 4900 | 1 + 4910 | 0 + 4920 | 0 + 4930 | 0 + 4940 | 0 + 4950 | 0 + 4960 | 0 + 4970 | 0 + 4980 | 0 + 4990 | 0 + 5000 | 1 + 5010 | 0 + 5020 | 0 + 5030 | 0 + 5040 | 0 + 5050 | 0 + 5060 | 0 + 5070 | 0 + 5080 | 0 + 5090 | 0 + 5100 | 1 + 5110 | 0 + 5120 | 0 + 5130 | 0 + 5140 | 0 + 5150 | 0 + 5160 | 0 + 5170 | 0 + 5180 | 0 + 5190 | 0 + 5200 | 1 + 5210 | 0 + 5220 | 0 + 5230 | 0 + 5240 | 0 + 5250 | 0 + 5260 | 0 + 5270 | 0 + 5280 | 0 + 5290 | 0 + 5300 | 1 + 5310 | 0 + 5320 | 0 + 5330 | 0 + 5340 | 0 + 5350 | 0 + 5360 | 0 + 5370 | 0 + 5380 | 0 + 5390 | 0 + 5400 | 1 + 5410 | 0 + 5420 | 0 + 5430 | 0 + 5440 | 0 + 5450 | 0 + 5460 | 0 + 5470 | 0 + 5480 | 0 + 5490 | 0 + 5500 | 1 + 5510 | 0 + 5520 | 0 + 5530 | 0 + 5540 | 0 + 5550 | 0 + 5560 | 0 + 5570 | 0 + 5580 | 0 + 5590 | 0 + 5600 | 1 + 5610 | 0 + 5620 | 0 + 5630 | 0 + 5640 | 0 + 5650 | 0 + 5660 | 0 + 5670 | 0 + 5680 | 0 + 5690 | 0 + 5700 | 1 + 5710 | 0 + 5720 | 0 + 5730 | 0 + 5740 | 0 + 5750 | 0 + 5760 | 0 + 5770 | 0 + 5780 | 0 + 5790 | 0 + 5800 | 1 + 5810 | 0 + 5820 | 0 + 5830 | 0 + 5840 | 0 + 5850 | 0 + 5860 | 0 + 5870 | 0 + 5880 | 0 + 5890 | 0 + 5900 | 1 + 5910 | 0 + 5920 | 0 + 5930 | 0 + 5940 | 0 + 5950 | 0 + 5960 | 0 + 5970 | 0 + 5980 | 0 + 5990 | 0 + 6000 | 1 + 6010 | 0 + 6020 | 0 + 6030 | 0 + 6040 | 0 + 6050 | 0 + 6060 | 0 + 6070 | 0 + 6080 | 0 + 6090 | 0 + 6100 | 1 + 6110 | 0 + 6120 | 
0 + 6130 | 0 + 6140 | 0 + 6150 | 0 + 6160 | 0 + 6170 | 0 + 6180 | 0 + 6190 | 0 + 6200 | 1 + 6210 | 0 + 6220 | 0 + 6230 | 0 + 6240 | 0 + 6250 | 0 + 6260 | 0 + 6270 | 0 + 6280 | 0 + 6290 | 0 + 6300 | 1 + 6310 | 0 + 6320 | 0 + 6330 | 0 + 6340 | 0 + 6350 | 0 + 6360 | 0 + 6370 | 0 + 6380 | 0 + 6390 | 0 + 6400 | 1 + 6410 | 0 + 6420 | 0 + 6430 | 0 + 6440 | 0 + 6450 | 0 + 6460 | 0 + 6470 | 0 + 6480 | 0 + 6490 | 0 + 6500 | 1 + 6510 | 0 + 6520 | 0 + 6530 | 0 + 6540 | 0 + 6550 | 0 + 6560 | 0 + 6570 | 0 + 6580 | 0 + 6590 | 0 + 6600 | 1 + 6610 | 0 + 6620 | 0 + 6630 | 0 + 6640 | 0 + 6650 | 0 + 6660 | 0 + 6670 | 0 + 6680 | 0 + 6690 | 0 + 6700 | 1 + 6710 | 0 + 6720 | 0 + 6730 | 0 + 6740 | 0 + 6750 | 0 + 6760 | 0 + 6770 | 0 + 6780 | 0 + 6790 | 0 + 6800 | 1 + 6810 | 0 + 6820 | 0 + 6830 | 0 + 6840 | 0 + 6850 | 0 + 6860 | 0 + 6870 | 0 + 6880 | 0 + 6890 | 0 + 6900 | 1 + 6910 | 0 + 6920 | 0 + 6930 | 0 + 6940 | 0 + 6950 | 0 + 6960 | 0 + 6970 | 0 + 6980 | 0 + 6990 | 0 + 7000 | 1 + 7010 | 0 + 7020 | 0 + 7030 | 0 + 7040 | 0 + 7050 | 0 + 7060 | 0 + 7070 | 0 + 7080 | 0 + 7090 | 0 + 7100 | 1 + 7110 | 0 + 7120 | 0 + 7130 | 0 + 7140 | 0 + 7150 | 0 + 7160 | 0 + 7170 | 0 + 7180 | 0 + 7190 | 0 + 7200 | 1 + 7210 | 0 + 7220 | 0 + 7230 | 0 + 7240 | 0 + 7250 | 0 + 7260 | 0 + 7270 | 0 + 7280 | 0 + 7290 | 0 + 7300 | 1 + 7310 | 0 + 7320 | 0 + 7330 | 0 + 7340 | 0 + 7350 | 0 + 7360 | 0 + 7370 | 0 + 7380 | 0 + 7390 | 0 + 7400 | 1 + 7410 | 0 + 7420 | 0 + 7430 | 0 + 7440 | 0 + 7450 | 0 + 7460 | 0 + 7470 | 0 + 7480 | 0 + 7490 | 0 + 7500 | 1 + 7510 | 0 + 7520 | 0 + 7530 | 0 + 7540 | 0 + 7550 | 0 + 7560 | 0 + 7570 | 0 + 7580 | 0 + 7590 | 0 + 7600 | 1 + 7610 | 0 + 7620 | 0 + 7630 | 0 + 7640 | 0 + 7650 | 0 + 7660 | 0 + 7670 | 0 + 7680 | 0 + 7690 | 0 + 7700 | 1 + 7710 | 0 + 7720 | 0 + 7730 | 0 + 7740 | 0 + 7750 | 0 + 7760 | 0 + 7770 | 0 + 7780 | 0 + 7790 | 0 + 7800 | 1 + 7810 | 0 + 7820 | 0 + 7830 | 0 + 7840 | 0 + 7850 | 0 + 7860 | 0 + 7870 | 0 + 7880 | 0 + 7890 | 0 + 7900 | 1 + 7910 | 0 + 7920 | 0 + 7930 | 0 + 7940 
| 0 + 7950 | 0 + 7960 | 0 + 7970 | 0 + 7980 | 0 + 7990 | 0 + 8000 | 1 + 8010 | 0 + 8020 | 0 + 8030 | 0 + 8040 | 0 + 8050 | 0 + 8060 | 0 + 8070 | 0 + 8080 | 0 + 8090 | 0 + 8100 | 1 + 8110 | 0 + 8120 | 0 + 8130 | 0 + 8140 | 0 + 8150 | 0 + 8160 | 0 + 8170 | 0 + 8180 | 0 + 8190 | 0 + 8200 | 1 + 8210 | 0 + 8220 | 0 + 8230 | 0 + 8240 | 0 + 8250 | 0 + 8260 | 0 + 8270 | 0 + 8280 | 0 + 8290 | 0 + 8300 | 1 + 8310 | 0 + 8320 | 0 + 8330 | 0 + 8340 | 0 + 8350 | 0 + 8360 | 0 + 8370 | 0 + 8380 | 0 + 8390 | 0 + 8400 | 1 + 8410 | 0 + 8420 | 0 + 8430 | 0 + 8440 | 0 + 8450 | 0 + 8460 | 0 + 8470 | 0 + 8480 | 0 + 8490 | 0 + 8500 | 1 + 8510 | 0 + 8520 | 0 + 8530 | 0 + 8540 | 0 + 8550 | 0 + 8560 | 0 + 8570 | 0 + 8580 | 0 + 8590 | 0 + 8600 | 1 + 8610 | 0 + 8620 | 0 + 8630 | 0 + 8640 | 0 + 8650 | 0 + 8660 | 0 + 8670 | 0 + 8680 | 0 + 8690 | 0 + 8700 | 1 + 8710 | 0 + 8720 | 0 + 8730 | 0 + 8740 | 0 + 8750 | 0 + 8760 | 0 + 8770 | 0 + 8780 | 0 + 8790 | 0 + 8800 | 1 + 8810 | 0 + 8820 | 0 + 8830 | 0 + 8840 | 0 + 8850 | 0 + 8860 | 0 + 8870 | 0 + 8880 | 0 + 8890 | 0 + 8900 | 1 + 8910 | 0 + 8920 | 0 + 8930 | 0 + 8940 | 0 + 8950 | 0 + 8960 | 0 + 8970 | 0 + 8980 | 0 + 8990 | 0 + 9000 | 1 + 9010 | 0 + 9020 | 0 + 9030 | 0 + 9040 | 0 + 9050 | 0 + 9060 | 0 + 9070 | 0 + 9080 | 0 + 9090 | 0 + 9100 | 1 + 9110 | 0 + 9120 | 0 + 9130 | 0 + 9140 | 0 + 9150 | 0 + 9160 | 0 + 9170 | 0 + 9180 | 0 + 9190 | 0 + 9200 | 1 + 9210 | 0 + 9220 | 0 + 9230 | 0 + 9240 | 0 + 9250 | 0 + 9260 | 0 + 9270 | 0 + 9280 | 0 + 9290 | 0 + 9300 | 1 + 9310 | 0 + 9320 | 0 + 9330 | 0 + 9340 | 0 + 9350 | 0 + 9360 | 0 + 9370 | 0 + 9380 | 0 + 9390 | 0 + 9400 | 1 + 9410 | 0 + 9420 | 0 + 9430 | 0 + 9440 | 0 + 9450 | 0 + 9460 | 0 + 9470 | 0 + 9480 | 0 + 9490 | 0 + 9500 | 1 + 9510 | 0 + 9520 | 0 + 9530 | 0 + 9540 | 0 + 9550 | 0 + 9560 | 0 + 9570 | 0 + 9580 | 0 + 9590 | 0 + 9600 | 1 + 9610 | 0 + 9620 | 0 + 9630 | 0 + 9640 | 0 + 9650 | 0 + 9660 | 0 + 9670 | 0 + 9680 | 0 + 9690 | 0 + 9700 | 1 + 9710 | 0 + 9720 | 0 + 9730 | 0 + 9740 | 0 + 9750 | 0 + 
9760 | 0 + 9770 | 0 + 9780 | 0 + 9790 | 0 + 9800 | 1 + 9810 | 0 + 9820 | 0 + 9830 | 0 + 9840 | 0 + 9850 | 0 + 9860 | 0 + 9870 | 0 + 9880 | 0 + 9890 | 0 + 9900 | 1 + 9910 | 0 + 9920 | 0 + 9930 | 0 + 9940 | 0 + 9950 | 0 + 9960 | 0 + 9970 | 0 + 9980 | 0 + 9990 | 0 + 10000 | 1 + 10100 | 1 + 10200 | 0 + + diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.trunc.d b/cmd/dtrace/test/tst/common/llquantize/tst.trunc.d new file mode 100644 index 0000000..e3db030 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.trunc.d @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +#pragma D option quiet + +int i; + +tick-10ms +/i < 100/ +{ + @[i] = llquantize(i, 10, 1, 2, 10, 150); + @[i] = llquantize(i + 1, 10, 1, 2, 10, 150); + @[i] = llquantize(i + 2, 10, 1, 2, 10, 150); + @[i] = llquantize(i + 3, 10, 1, 2, 10, 150); + i++; +} + +tick-10ms +/i == 100/ +{ + exit(0); +} + +END +{ + trunc(@, 5); +} diff --git a/cmd/dtrace/test/tst/common/llquantize/tst.trunc.d.out b/cmd/dtrace/test/tst/common/llquantize/tst.trunc.d.out new file mode 100644 index 0000000..941c626 --- /dev/null +++ b/cmd/dtrace/test/tst/common/llquantize/tst.trunc.d.out @@ -0,0 +1,34 @@ + + 95 + value ------------- Distribution ------------- count + 80 | 0 + 90 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 600 + 100 | 0 + + 96 + value ------------- Distribution ------------- count + 80 | 0 + 90 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 600 + 100 | 0 + + 97 + value ------------- Distribution ------------- count + 80 | 0 + 90 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 450 + 100 |@@@@@@@@@@ 150 + 200 | 0 + + 98 + value ------------- Distribution ------------- count + 80 | 0 + 90 |@@@@@@@@@@@@@@@@@@@@ 300 + 100 |@@@@@@@@@@@@@@@@@@@@ 300 + 200 | 0 + + 99 + value ------------- Distribution ------------- count + 80 | 0 + 90 |@@@@@@@@@@ 150 + 100 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 450 + 200 | 0 + diff --git a/cmd/dtrace/test/tst/common/mdb/tst.dtracedcmd.ksh b/cmd/dtrace/test/tst/common/mdb/tst.dtracedcmd.ksh index 561f854..6ca9712 100644 --- a/cmd/dtrace/test/tst/common/mdb/tst.dtracedcmd.ksh +++ b/cmd/dtrace/test/tst/common/mdb/tst.dtracedcmd.ksh @@ -24,8 +24,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" -# # # This script primarily tests that the ::dtrace dcmd is not dumping @@ -36,7 +34,7 @@ script() { - $dtrace -o $dtraceout -s /dev/stdin <' @@ -31,7 +30,7 @@ if [ $# != 1 ]; then fi dtrace=$1 -CC=/usr/sfw/bin/gcc +CC=`which gcc` CFLAGS= doit() @@ -83,13 +82,37 @@ files=/usr/include/sys/*.h # because they include static globals (!) or function bodies (!!) in the header # file. Hopefully these remain sufficiently few that the O(#files * #badfiles) # algorithm, below, doesn't become a problem. (And yes, writing scripts in -# something other than ksh1888 would probably be a good idea.) If this script +# something other than ksh would probably be a good idea.) If this script # becomes a problem, kindly fix it by reducing the number of bad files! (That # is, fix it by fixing the broken file, not the broken script.) # -badfiles="ctype.h eri_msg.h ser_sync.h sbpro.h neti.h hook_event.h \ - bootconf.h bootstat.h dtrace.h dumphdr.h exacct_impl.h fasttrap.h \ - kobj.h kobj_impl.h ksyms.h lockstat.h smedia.h stat.h utsname.h" +badfiles="\ + bootconf.h \ + bootstat.h \ + ctype.h \ + dtrace.h \ + dumphdr.h \ + exacct_impl.h \ + fasttrap.h \ + hook_event.h \ + iscsi_authclient.h \ + kiconv_ja.h \ + kiconv_ja_jis_to_unicode.h \ + kiconv_ja_unicode_to_jis.h \ + kobj.h \ + kobj_impl.h \ + ksyms.h \ + lockstat.h \ + neti.h \ + rds.h \ + ser_sync.h \ + smbios_impl.h \ + smedia.h \ + sockfilter.h \ + stat.h \ + u8_textprep_data.h \ + utsname.h \ + vnic.h" for inc in $files; do file=`basename $inc` diff --git a/cmd/dtrace/test/tst/common/misc/tst.macroglob.ksh.out b/cmd/dtrace/test/tst/common/misc/tst.macroglob.ksh.out index 8a9ac6d..09d984d 100644 --- a/cmd/dtrace/test/tst/common/misc/tst.macroglob.ksh.out +++ b/cmd/dtrace/test/tst/common/misc/tst.macroglob.ksh.out @@ -4,12 +4,15 @@ read entry FUNCTION NAME read entry readlink entry +readlinkat entry readv entry FUNCTION NAME read entry readlink entry +readlinkat entry readv entry FUNCTION NAME 
readlink entry +readlinkat entry FUNCTION NAME pread64 entry diff --git a/cmd/dtrace/test/tst/common/misc/tst.schrock.ksh b/cmd/dtrace/test/tst/common/misc/tst.schrock.ksh index 494f6f8..bc29f71 100644 --- a/cmd/dtrace/test/tst/common/misc/tst.schrock.ksh +++ b/cmd/dtrace/test/tst/common/misc/tst.schrock.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -33,13 +32,12 @@ fi dtrace=$1 # -# /usr/ccs/bin/nm execs a 64-bit version of itself. DTrace uses libproc -# (which uses /proc) to find out when the traced process exits, but a -# 32-bit process can't examine a 64-bit one with libproc. The -# LD_NOEXEC_64 variable prevents nm from re-execing itself. +# Some variants of nm may exec a 64-bit version of themselves. DTrace uses +# libproc (which uses /proc) to find out when the traced process exits, but a +# 32-bit process can't examine a 64-bit one with libproc. The LD_NOEXEC_64 +# variable prevents nm from re-execing itself. 
# -LD_NOEXEC_64=tomeeisrad $dtrace -F -s /dev/stdin -c \ - '/usr/ccs/bin/nm /bin/ls' stat < Makefile < main.c < Makefile < altlib.c < Makefile < Makefile <' + exit 2 +fi + +libdira=${TMPDIR:-/tmp}/libdepa.$$ +libdirb=${TMPDIR:-/tmp}/libdepb.$$ +libdirc=${TMPDIR:-/tmp}/libdepc.$$ +dtrace=$1 + +setup_libs() +{ + mkdir $libdira + mkdir $libdirb + mkdir $libdirc + cat > $libdira/liba.$$.d < $libdirb/libb.$$.d < $libdirb/libc.$$.d < $libdirb/libd.$$.d < $libdirc/libe.$$.d < $libdirc/libf.$$.d <f = (foo_t *)alloca(sizeof (foo_t)); + + this->f->a[0] = 1; + this->f->a[1] = 2; + this->f->a[2] = 3; + this->f->b[0] = 'a'; + this->f->b[1] = 'b'; + this->f->b[2] = 0; + this->f->c[0].alpha = 5; + this->f->c[1].alpha = 6; + this->f->c[2].alpha = 7; + this->f->d[0] = 4; + this->f->d[1] = 0; + this->f->d[2] = 0; + + print(this->f->a); + print(this->f->b); + print(this->f->c); + print(*this->f); + + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/print/tst.array.d.out b/cmd/dtrace/test/tst/common/print/tst.array.d.out new file mode 100644 index 0000000..0702d4b --- /dev/null +++ b/cmd/dtrace/test/tst/common/print/tst.array.d.out @@ -0,0 +1,23 @@ +int [3] [ 0x1, 0x2, 0x3 ] +char [30] "ab" +bar_t [2] [ + bar_t { + int alpha = 0x5 + }, + bar_t { + int alpha = 0x6 + } +] +foo_t { + int [3] a = [ 0x1, 0x2, 0x3 ] + char [30] b = [ "ab" ] + bar_t [2] c = [ + bar_t { + int alpha = 0x5 + }, + bar_t { + int alpha = 0x6 + } + ] + char [3] d = [ '\004', '\0', '\0' ] +} diff --git a/cmd/dtrace/test/tst/common/print/tst.bitfield.d b/cmd/dtrace/test/tst/common/print/tst.bitfield.d new file mode 100644 index 0000000..ff77bb0 --- /dev/null +++ b/cmd/dtrace/test/tst/common/print/tst.bitfield.d @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011 by Delphix. All rights reserved. + */ + +#pragma D option quiet + +typedef struct forward forward_t; + +typedef struct foo { + int a:4; + int b:7; + int c:1; + int d:2; +} foo_t; + +BEGIN +{ + this->s = (foo_t *)alloca(sizeof (foo_t)); + + this->s->a = 1; + this->s->b = 5; + this->s->c = 0; + this->s->d = 2; + + print(*this->s); + + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/print/tst.bitfield.d.out b/cmd/dtrace/test/tst/common/print/tst.bitfield.d.out new file mode 100644 index 0000000..309444d --- /dev/null +++ b/cmd/dtrace/test/tst/common/print/tst.bitfield.d.out @@ -0,0 +1,6 @@ +foo_t { + int a :4 = 0x1 + int b :7 = 0x5 + int c :1 = 0 + int d :2 = 0x2 +} diff --git a/cmd/dtrace/test/tst/common/print/tst.primitive.d b/cmd/dtrace/test/tst/common/print/tst.primitive.d new file mode 100644 index 0000000..559dab1 --- /dev/null +++ b/cmd/dtrace/test/tst/common/print/tst.primitive.d @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011 by Delphix. All rights reserved. + */ + +#pragma D option quiet + +BEGIN +{ + i = (int)'a'; + + printf("\n"); + + print((char)'a'); + print((int)-1); + print((unsigned int)23); + print((short)456); + print((unsigned short)789); + print((long)1234); + print((ulong_t)56789); + print((void *)0x1234); + print("hello"); + + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/print/tst.primitive.d.out b/cmd/dtrace/test/tst/common/print/tst.primitive.d.out new file mode 100644 index 0000000..f7e4076 --- /dev/null +++ b/cmd/dtrace/test/tst/common/print/tst.primitive.d.out @@ -0,0 +1,11 @@ + +char 'a' +int 0xffffffff +unsigned int 0x17 +short 0x1c8 +unsigned short 0x315 +long 0x4d2 +ulong_t 0xddd5 +void * 0x1234 +string "hello" + diff --git a/cmd/dtrace/test/tst/common/print/tst.struct.d b/cmd/dtrace/test/tst/common/print/tst.struct.d new file mode 100644 index 0000000..2fb1c41 --- /dev/null +++ b/cmd/dtrace/test/tst/common/print/tst.struct.d @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011 by Delphix. All rights reserved. + */ + +#pragma D option quiet + +typedef struct forward forward_t; + +typedef struct foo { + int a; + void *b; + struct { + uint64_t alpha; + uint64_t beta; + } c; + ushort_t d; + int e; + forward_t *f; + void (*g)(); +} foo_t; + +BEGIN +{ + this->s = (foo_t *)alloca(sizeof (foo_t)); + + this->s->a = 1; + this->s->b = (void *)2; + this->s->c.alpha = 3; + this->s->c.beta = 4; + this->s->d = 5; + this->s->e = 6; + this->s->f = (void *)7; + this->s->g = (void *)8; + + print(*this->s); + + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/print/tst.struct.d.out b/cmd/dtrace/test/tst/common/print/tst.struct.d.out new file mode 100644 index 0000000..b7b2108 --- /dev/null +++ b/cmd/dtrace/test/tst/common/print/tst.struct.d.out @@ -0,0 +1,12 @@ +foo_t { + int a = 0x1 + void *b = 0x2 + struct c = { + uint64_t alpha = 0x3 + uint64_t beta = 0x4 + } + ushort_t d = 0x5 + int e = 0x6 + forward_t *f = 0x7 + int (*)() g = 0x8 +} diff --git a/cmd/dtrace/test/tst/common/printa/tst.largeusersym.ksh b/cmd/dtrace/test/tst/common/printa/tst.largeusersym.ksh index ed375fd..4c5df0a 100644 --- a/cmd/dtrace/test/tst/common/printa/tst.largeusersym.ksh +++ b/cmd/dtrace/test/tst/common/printa/tst.largeusersym.ksh @@ -50,7 +50,7 @@ main(int argc, char *argv[]) } EOF -cc -o test test.c +gcc -o test test.c if [ $? 
-ne 0 ]; then print -u2 "failed to compile test.c" exit 1 diff --git a/cmd/dtrace/test/tst/common/privs/tst.noprivdrop.ksh b/cmd/dtrace/test/tst/common/privs/tst.noprivdrop.ksh new file mode 100644 index 0000000..a5cd183 --- /dev/null +++ b/cmd/dtrace/test/tst/common/privs/tst.noprivdrop.ksh @@ -0,0 +1,72 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +ppriv -s A=basic,dtrace_user $$ + +# +# We expect some number of these profile probes to be silently dropped. +# Note that this test will fail if something is stuck on all CPUs that +# whomever is running the test happens to own. 
+# +count=$(/usr/sbin/dtrace -q -s /dev/stdin < 100/ +{ + printa("%@d", @); + exit(0); +} +EOF) + +cpus=`psrinfo | grep -- on-line | wc -l` +max=`expr $cpus \* 500` + +if [[ $count -gt $max ]]; then + echo "count ($count) is greater than allowed max ($max)" + exit 1 +fi + +echo "count ($count) is within allowed max ($max)" +exit 0 diff --git a/cmd/dtrace/test/tst/common/privs/tst.noprivrestrict.ksh b/cmd/dtrace/test/tst/common/privs/tst.noprivrestrict.ksh new file mode 100644 index 0000000..358ed92 --- /dev/null +++ b/cmd/dtrace/test/tst/common/privs/tst.noprivrestrict.ksh @@ -0,0 +1,61 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +ppriv -s A=basic,dtrace_user $$ + +# +# We expect at least one of these tick probes to error out because only +# dtrace_user is set, and we are attempting to access arguments. Note that +# this test will fail if something is stuck on CPU that whomever is running +# the test happens to own. +# +/usr/sbin/dtrace -q -s /dev/stdin < 100/ +{ + printf("error count is %d\n", errcnt); + exit(errcnt != 0 ? 
0 : 1); +} +EOF diff --git a/cmd/dtrace/test/tst/common/privs/tst.tick.ksh b/cmd/dtrace/test/tst/common/privs/tst.tick.ksh new file mode 100644 index 0000000..eaff59f --- /dev/null +++ b/cmd/dtrace/test/tst/common/privs/tst.tick.ksh @@ -0,0 +1,55 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +ppriv -s A=basic,dtrace_user $$ + +# +# We expect tick probes to fire if dtrace_user is set +# +/usr/sbin/dtrace -q -s /dev/stdin < 10 && (this->ms = (timestamp - start) / 1000000) > 2000/ +{ + printf("expected completion in 100 ms, found %d!\n", this->ms); + exit(1); +} + +tick-10ms +/ticks > 10/ +{ + printf("completed in %d ms\n", this->ms); + exit(0); +} +EOF diff --git a/cmd/dtrace/test/tst/common/profile-n/tst.ufunc.ksh b/cmd/dtrace/test/tst/common/profile-n/tst.ufunc.ksh index 69c0f84..478307a 100644 --- a/cmd/dtrace/test/tst/common/profile-n/tst.ufunc.ksh +++ b/cmd/dtrace/test/tst/common/profile-n/tst.ufunc.ksh @@ -23,7 +23,6 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" script() { @@ -61,10 +60,9 @@ child=$! 
# # The only thing we can be sure of here is that we caught some function in -# ksh doing work. (This actually goes one step further and assumes that we -# catch some non-static function in ksh.) +# ksh doing work. # -script | tee /dev/fd/2 | grep 'ksh`[a-zA-Z_]' > /dev/null +script | tee /dev/fd/2 | egrep '(ksh|libshell\.so\.[0-9])`[a-zA-Z_]' > /dev/null status=$? kill $child diff --git a/cmd/dtrace/test/tst/common/profile-n/tst.umod.ksh b/cmd/dtrace/test/tst/common/profile-n/tst.umod.ksh index 6ca823f..fe5649d 100644 --- a/cmd/dtrace/test/tst/common/profile-n/tst.umod.ksh +++ b/cmd/dtrace/test/tst/common/profile-n/tst.umod.ksh @@ -23,7 +23,6 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" script() { @@ -62,7 +61,7 @@ child=$! # # The only thing we can be sure of here is that ksh is doing some work. # -script | tee /dev/fd/2 | grep -w ksh > /dev/null +script | tee /dev/fd/2 | egrep '(ksh|libshell)' > /dev/null status=$? kill $child diff --git a/cmd/dtrace/test/tst/common/profile-n/tst.usym.ksh b/cmd/dtrace/test/tst/common/profile-n/tst.usym.ksh index b1a3ab9..36edf0e 100644 --- a/cmd/dtrace/test/tst/common/profile-n/tst.usym.ksh +++ b/cmd/dtrace/test/tst/common/profile-n/tst.usym.ksh @@ -23,7 +23,6 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" script() { @@ -63,7 +62,7 @@ child=$! # This test is essentially the same as that in the ufunc test; see that # test for the rationale. # -script | tee /dev/fd/2 | grep 'ksh`[a-zA-Z_]' > /dev/null +script | tee /dev/fd/2 | egrep '(ksh|libshell\.so\.[0-9])`[a-zA-Z_]' > /dev/null status=$? 
kill $child diff --git a/cmd/dtrace/test/tst/common/safety/tst.violentdeath.ksh b/cmd/dtrace/test/tst/common/safety/tst.violentdeath.ksh index 879774a..d701053 100644 --- a/cmd/dtrace/test/tst/common/safety/tst.violentdeath.ksh +++ b/cmd/dtrace/test/tst/common/safety/tst.violentdeath.ksh @@ -23,11 +23,10 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" script() { - $dtrace -x bufpolicy=ring -x bufsize=1k -s /dev/stdin <= 150/ +{ + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/tracemem/tst.dynsize.d.out b/cmd/dtrace/test/tst/common/tracemem/tst.dynsize.d.out new file mode 100644 index 0000000..6415893 --- /dev/null +++ b/cmd/dtrace/test/tst/common/tracemem/tst.dynsize.d.out @@ -0,0 +1,1313 @@ +-9: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +-8: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +-7: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +-6: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +-5: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +-4: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +-3: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +-2: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +-1: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +0: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +1: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 . + +2: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 .. + +3: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 ... + +4: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 .... + +5: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 ..... + +6: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 ...... + +7: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 ....... + +8: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 ........ + +9: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 ......... 
+ +10: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 .......... + +11: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 ........... + +12: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 ............ + +13: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 ............. + +14: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 .............. + +15: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ............... + +16: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +17: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 . + +18: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 .. + +19: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 ... + +20: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 .... + +21: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 ..... + +22: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 ...... + +23: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 ....... + +24: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 10: 00 00 00 00 00 00 00 00 ........ + +25: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 ......... + +26: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 .......... + +27: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 ........... + +28: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 ............ + +29: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 ............. + +30: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 .............. + +31: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ............... + +32: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +33: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 . + +34: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 .. 
+ +35: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 ... + +36: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 .... + +37: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 ..... + +38: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 ...... + +39: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 ....... + +40: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 ........ + +41: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 ......... + +42: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 .......... + +43: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 ........... + +44: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 ............ + +45: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 ............. + +46: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 .............. + +47: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ............... + +48: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +49: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 . + +50: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 .. 
+ +51: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 ... + +52: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 .... + +53: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 ..... + +54: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 ...... + +55: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 ....... + +56: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 ........ + +57: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 ......... + +58: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 .......... + +59: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 ........... + +60: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 ............ + +61: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 ............. + +62: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 .............. + +63: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ............... + +64: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +65: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 . + +66: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 .. + +67: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 ... + +68: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 .... 
+ +69: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 ..... + +70: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 ...... + +71: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 ....... + +72: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 ........ + +73: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 ......... 
+ +74: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 .......... + +75: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 ........... + +76: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 ............ + +77: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 ............. + +78: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 .............. 
+ +79: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ............... + +80: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +81: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 . + +82: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 .. + +83: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 ... + +84: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 .... + +85: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 ..... + +86: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 ...... + +87: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 ....... 
+ +88: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 ........ + +89: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 ......... + +90: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 .......... + +91: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 ........... + +92: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 ............ + +93: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 ............. + +94: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 .............. + +95: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ............... + +96: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +97: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 . + +98: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 .. + +99: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 ... + +100: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 .... + +101: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 ..... + +102: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 ...... + +103: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 ....... + +104: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 ........ + +105: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 ......... + +106: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 .......... + +107: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 ........... 
+ +108: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 ............ + +109: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 ............. + +110: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 .............. + +111: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ............... + +112: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +113: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 . + +114: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 .. 
+ +115: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 ... + +116: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 .... + +117: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 ..... + +118: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 ...... + +119: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 ....... + +120: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 ........ + +121: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 ......... + +122: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 .......... + +123: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 ........... + +124: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ 70: 00 00 00 00 00 00 00 00 00 00 00 00 ............ + +125: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 ............. + +126: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 .............. + +127: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ............... 
+ +128: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +129: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +130: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +131: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +132: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +133: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +134: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +135: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +136: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +137: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +138: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +139: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +140: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +141: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +142: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +143: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +144: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +145: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +146: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +147: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +148: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+ +149: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + +150: + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 10: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 40: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + + diff --git a/cmd/dtrace/test/tst/common/tracemem/tst.smallsize.d b/cmd/dtrace/test/tst/common/tracemem/tst.smallsize.d new file mode 100644 index 0000000..ae44770 --- /dev/null +++ b/cmd/dtrace/test/tst/common/tracemem/tst.smallsize.d @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#pragma D option quiet + +BEGIN +{ + tracemem(`utsname.sysname, 5); + exit(0); +} diff --git a/cmd/dtrace/test/tst/common/tracemem/tst.smallsize.d.out b/cmd/dtrace/test/tst/common/tracemem/tst.smallsize.d.out new file mode 100644 index 0000000..8cefb58 --- /dev/null +++ b/cmd/dtrace/test/tst/common/tracemem/tst.smallsize.d.out @@ -0,0 +1,4 @@ + + 0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef + 0: 53 75 6e 4f 53 SunOS + diff --git a/cmd/dtrace/test/tst/common/usdt/tst.badguess.ksh b/cmd/dtrace/test/tst/common/usdt/tst.badguess.ksh index 291fe83..3c41f66 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.badguess.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.badguess.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -61,12 +60,12 @@ main(int argc, char **argv) } EOF -cc -xarch=generic64 -c -o test64.o test.c +gcc -m64 -c -o test64.o test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c 64-bit" exit 1 fi -cc -xarch=generic -c -o test32.o test.c +gcc -m32 -c -o test32.o test.c if [ $? 
-ne 0 ]; then print -u2 "failed to compile test.c 32-bit" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.corruptenv.ksh b/cmd/dtrace/test/tst/common/usdt/tst.corruptenv.ksh index 68dbb03..c9bcb03 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.corruptenv.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.corruptenv.ksh @@ -60,10 +60,10 @@ cat > Makefile < Makefile < /dev/null +make > /dev/null if [ $? -ne 0 ]; then print -u2 "failed to build" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.dlclose2.ksh b/cmd/dtrace/test/tst/common/usdt/tst.dlclose2.ksh index c83d8bf..692c8d9 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.dlclose2.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.dlclose2.ksh @@ -24,7 +24,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -41,17 +40,17 @@ cat > Makefile < /dev/null +make > /dev/null if [ $? -ne 0 ]; then print -u2 "failed to build" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.dlclose3.ksh b/cmd/dtrace/test/tst/common/usdt/tst.dlclose3.ksh index 72f24ce..e950eb4 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.dlclose3.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.dlclose3.ksh @@ -24,7 +24,6 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # This test verifies that performing a dlclose(3dl) on a library doesn't @@ -46,17 +45,17 @@ cat > Makefile < /dev/null +make > /dev/null if [ $? -ne 0 ]; then print -u2 "failed to build" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.eliminate.ksh b/cmd/dtrace/test/tst/common/usdt/tst.eliminate.ksh index 687e435..3d50443 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.eliminate.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.eliminate.ksh @@ -23,7 +23,6 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # Make sure temporary symbols generated due to DTrace probes in static @@ -72,7 +71,7 @@ main(int argc, char **argv) } EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -82,7 +81,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.enabled.ksh b/cmd/dtrace/test/tst/common/usdt/tst.enabled.ksh index ba62be7..47ea79f 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.enabled.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.enabled.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -61,7 +60,7 @@ main(int argc, char **argv) } EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -71,7 +70,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.enabled2.ksh b/cmd/dtrace/test/tst/common/usdt/tst.enabled2.ksh index 3401648..9b71ac2 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.enabled2.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.enabled2.ksh @@ -23,7 +23,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" # # This test is primarily intended to verify a fix for SPARC, but there's no @@ -77,7 +76,7 @@ main(int argc, char **argv) } EOF -cc -c -xO2 test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -87,7 +86,7 @@ if [ $? 
-ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.entryreturn.ksh b/cmd/dtrace/test/tst/common/usdt/tst.entryreturn.ksh index 9d2646c..79e8266 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.entryreturn.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.entryreturn.ksh @@ -23,7 +23,6 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -70,7 +69,7 @@ provider test_prov { }; EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -80,7 +79,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.fork.ksh b/cmd/dtrace/test/tst/common/usdt/tst.fork.ksh index 9c12e6d..1264e3f 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.fork.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.fork.ksh @@ -24,7 +24,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -71,7 +70,7 @@ main(int argc, char **argv) } EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -81,7 +80,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? 
-ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.guess32.ksh b/cmd/dtrace/test/tst/common/usdt/tst.guess32.ksh index 68a8d01..59339f7 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.guess32.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.guess32.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -61,7 +60,7 @@ main(int argc, char **argv) } EOF -cc -xarch=generic -c test.c +gcc -m32 -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -71,7 +70,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -xarch=generic -o test test.o prov.o +gcc -m32 -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.guess64.ksh b/cmd/dtrace/test/tst/common/usdt/tst.guess64.ksh index 39de8e0..e270290 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.guess64.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.guess64.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -61,7 +60,7 @@ main(int argc, char **argv) } EOF -cc -xarch=generic64 -c test.c +gcc -m64 -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -71,7 +70,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -xarch=generic64 -o test test.o prov.o +gcc -m64 -o test test.o prov.o if [ $? 
-ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.header.ksh b/cmd/dtrace/test/tst/common/usdt/tst.header.ksh index 08a1912..f4679bc 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.header.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.header.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -65,7 +64,7 @@ main(int argc, char **argv) } EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -75,7 +74,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.include.ksh b/cmd/dtrace/test/tst/common/usdt/tst.include.ksh index 5683f47..5576ab8 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.include.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.include.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # Make sure defines _DTRACE_VERSION @@ -46,7 +45,7 @@ main(int argc, char **argv) } EOF -cc -xarch=generic -o test test.c +gcc -m32 -o test test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.linkpriv.ksh b/cmd/dtrace/test/tst/common/usdt/tst.linkpriv.ksh index ec07e05..bbe1a4a 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.linkpriv.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.linkpriv.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -62,7 +61,7 @@ provider test_prov { }; EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -72,7 +71,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.linkunpriv.ksh b/cmd/dtrace/test/tst/common/usdt/tst.linkunpriv.ksh index 01b2126..35d97af 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.linkunpriv.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.linkunpriv.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -64,7 +63,7 @@ provider test_prov { }; EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -74,7 +73,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.multiple.ksh b/cmd/dtrace/test/tst/common/usdt/tst.multiple.ksh index 0c8b072..852f5a0 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.multiple.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.multiple.ksh @@ -24,7 +24,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -65,7 +64,7 @@ main(int argc, char **argv) } EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -75,7 +74,7 @@ if [ $? 
-ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.nodtrace.ksh b/cmd/dtrace/test/tst/common/usdt/tst.nodtrace.ksh index a911bcd..c0c3465 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.nodtrace.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.nodtrace.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # Fake up a scenario where _DTRACE_VERSION is not defined by having our own # . This tests that dtrace -h will produce a header file which can @@ -70,12 +69,12 @@ main(int argc, char **argv) } EOF -cc -I. -xarch=generic -c test.c +gcc -I. -m32 -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 fi -cc -xarch=generic -o test test.o +gcc -m32 -o test test.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.noreap.ksh b/cmd/dtrace/test/tst/common/usdt/tst.noreap.ksh new file mode 100644 index 0000000..338dcdf --- /dev/null +++ b/cmd/dtrace/test/tst/common/usdt/tst.noreap.ksh @@ -0,0 +1,128 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +if [ $# != 1 ]; then + echo expected one argument: '<'dtrace-path'>' + exit 2 +fi + +dtrace=$1 +DIR=/var/tmp/dtest.$$ + +mkdir $DIR +cd $DIR + +cat > test.c < +#include + +int +main(int argc, char **argv) +{ + DTRACE_PROBE(test_prov, probe1); +} +EOF + +cat > prov.d < 10/ + { + exit(0); + } +EOF +} + +script 2>&1 | tee test.out + +# +# It should be true that our probe was not reaped after the provider was made +# defunct: the speculative tracing action prevents reaping of any ECB in the +# enabling. +# +status=0 + +if grep D_PDESC_INVAL test.out 2> /dev/null 1>&2 ; then + status=1 +else + grep D_PROC_GRAB test.out 2> /dev/null 1>&2 + status=$? +fi + +cd / +/usr/bin/rm -rf $DIR + +exit $status diff --git a/cmd/dtrace/test/tst/common/usdt/tst.noreapring.ksh b/cmd/dtrace/test/tst/common/usdt/tst.noreapring.ksh new file mode 100644 index 0000000..a2e5ede --- /dev/null +++ b/cmd/dtrace/test/tst/common/usdt/tst.noreapring.ksh @@ -0,0 +1,124 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +if [ $# != 1 ]; then + echo expected one argument: '<'dtrace-path'>' + exit 2 +fi + +dtrace=$1 +DIR=/var/tmp/dtest.$$ + +mkdir $DIR +cd $DIR + +cat > test.c < +#include + +int +main(int argc, char **argv) +{ + DTRACE_PROBE(test_prov, probe1); +} +EOF + +cat > prov.d < 10/ + { + exit(0); + } +EOF +} + +$dtrace -x bufpolicy=ring -ZwqP test_prov\* > /dev/null 2>&1 & +background=$! +echo launched ring buffered enabling as pid $background +script 2>&1 | tee test.out + +# +# It should be true that our probe was not reaped after the provider was made +# defunct: the active ring buffer in the earlier enabling prevents reaping of +# any of the earlier enabling's ECBs. +# +status=0 + +if grep D_PDESC_INVAL test.out 2> /dev/null 1>&2 ; then + status=1 +else + grep D_PROC_GRAB test.out 2> /dev/null 1>&2 + status=$? +fi + +kill $background +cd / +/usr/bin/rm -rf $DIR + +exit $status diff --git a/cmd/dtrace/test/tst/common/usdt/tst.onlyenabled.ksh b/cmd/dtrace/test/tst/common/usdt/tst.onlyenabled.ksh index 989d6d3..a1e939c 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.onlyenabled.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.onlyenabled.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -62,7 +61,7 @@ main(int argc, char **argv) } EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -72,7 +71,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? 
-ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.reap.ksh b/cmd/dtrace/test/tst/common/usdt/tst.reap.ksh new file mode 100644 index 0000000..f18c585 --- /dev/null +++ b/cmd/dtrace/test/tst/common/usdt/tst.reap.ksh @@ -0,0 +1,115 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2011, Joyent, Inc. All rights reserved. +# + +if [ $# != 1 ]; then + echo expected one argument: '<'dtrace-path'>' + exit 2 +fi + +dtrace=$1 +DIR=/var/tmp/dtest.$$ + +mkdir $DIR +cd $DIR + +cat > test.c < +#include + +int +main(int argc, char **argv) +{ + DTRACE_PROBE(test_prov, probe1); +} +EOF + +cat > prov.d < 10/ + { + exit(0); + } +EOF +} + +script 2>&1 | tee test.out + +# +# It should be true that our probe was reaped over the course of the enabling, +# causing the embedded DTrace invocation to fail on an invalid probe (that is, +# D_PDESC_INVAL) instead of an inability to grab the underlying process +# (D_PROC_GRAB). +# +grep D_PDESC_INVAL test.out 2> /dev/null 1>&2 +status=$? 
+ +cd / +/usr/bin/rm -rf $DIR + +exit $status diff --git a/cmd/dtrace/test/tst/common/usdt/tst.reeval.ksh b/cmd/dtrace/test/tst/common/usdt/tst.reeval.ksh index 7fad401..2f0ee33 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.reeval.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.reeval.ksh @@ -23,7 +23,6 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -52,7 +51,7 @@ provider test_prov { }; EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -62,7 +61,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.static.ksh b/cmd/dtrace/test/tst/common/usdt/tst.static.ksh index 1ebcdb9..85b0e55 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.static.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.static.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -63,7 +62,7 @@ provider test_prov { }; EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -73,7 +72,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.static2.ksh b/cmd/dtrace/test/tst/common/usdt/tst.static2.ksh index 7cf9004..07b7657 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.static2.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.static2.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" # Rebuilding an object file containing DOF changes slightly when the object # files containing the probes have already been modified. This tests that @@ -67,7 +66,7 @@ provider test_prov { }; EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -83,7 +82,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create final DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/usdt/tst.user.ksh b/cmd/dtrace/test/tst/common/usdt/tst.user.ksh index d5d9fdc..f52c1c3 100644 --- a/cmd/dtrace/test/tst/common/usdt/tst.user.ksh +++ b/cmd/dtrace/test/tst/common/usdt/tst.user.ksh @@ -24,7 +24,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -62,7 +61,7 @@ main(int argc, char **argv) } EOF -cc -c test.c +gcc -c test.c if [ $? -ne 0 ]; then print -u2 "failed to compile test.c" exit 1 @@ -72,7 +71,7 @@ if [ $? -ne 0 ]; then print -u2 "failed to create DOF" exit 1 fi -cc -o test test.o prov.o +gcc -o test test.o prov.o if [ $? -ne 0 ]; then print -u2 "failed to link final executable" exit 1 diff --git a/cmd/dtrace/test/tst/common/ustack/tst.spin.ksh b/cmd/dtrace/test/tst/common/ustack/tst.spin.ksh index 1a7e0e1..57c13d5 100644 --- a/cmd/dtrace/test/tst/common/ustack/tst.spin.ksh +++ b/cmd/dtrace/test/tst/common/ustack/tst.spin.ksh @@ -23,7 +23,6 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -#ident "%Z%%M% %I% %E% SMI" if [ $# != 1 ]; then echo expected one argument: '<'dtrace-path'>' @@ -35,7 +34,7 @@ dtrace=$1 rm -f $file -dir=`dirname $tst` +dir=`/bin/dirname $tst` $dtrace -o $file -c $dir/tst.spin.exe -s /dev/stdin < #include #include @@ -37,9 +35,9 @@ #include #ifdef _LP64 -static const char *_libctf_zlib = "/usr/lib/64/libz.so"; +static const char *_libctf_zlib = "/usr/lib/64/libz.so.1"; #else -static const char *_libctf_zlib = "/usr/lib/libz.so"; +static const char *_libctf_zlib = "/usr/lib/libz.so.1"; #endif static struct { diff --git a/lib/libdtrace/common/dt_aggregate.c b/lib/libdtrace/common/dt_aggregate.c index 2e66250..bb766f7 100644 --- a/lib/libdtrace/common/dt_aggregate.c +++ b/lib/libdtrace/common/dt_aggregate.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ #include #include @@ -204,6 +206,83 @@ dt_aggregate_lquantizedcmp(int64_t *lhs, int64_t *rhs) return (0); } +static void +dt_aggregate_llquantize(int64_t *existing, int64_t *new, size_t size) +{ + int i; + + for (i = 1; i < size / sizeof (int64_t); i++) + existing[i] = existing[i] + new[i]; +} + +static long double +dt_aggregate_llquantizedsum(int64_t *llquanta) +{ + int64_t arg = *llquanta++; + uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg); + uint16_t low = DTRACE_LLQUANTIZE_LOW(arg); + uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg); + uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg); + int bin = 0, order; + int64_t value = 1, next, step; + long double total; + + assert(nsteps >= factor); + assert(nsteps % factor == 0); + + for (order = 0; order < low; order++) + value *= factor; + + total = (long double)llquanta[bin++] * (long double)(value - 1); + + next = value * factor; + step = next > nsteps ? 
next / nsteps : 1; + + while (order <= high) { + assert(value < next); + total += (long double)llquanta[bin++] * (long double)(value); + + if ((value += step) != next) + continue; + + next = value * factor; + step = next > nsteps ? next / nsteps : 1; + order++; + } + + return (total + (long double)llquanta[bin] * (long double)value); +} + +static int +dt_aggregate_llquantizedcmp(int64_t *lhs, int64_t *rhs) +{ + long double lsum = dt_aggregate_llquantizedsum(lhs); + long double rsum = dt_aggregate_llquantizedsum(rhs); + int64_t lzero, rzero; + + if (lsum < rsum) + return (DT_LESSTHAN); + + if (lsum > rsum) + return (DT_GREATERTHAN); + + /* + * If they're both equal, then we will compare based on the weights at + * zero. If the weights at zero are equal, then this will be judged a + * tie and will be resolved based on the key comparison. + */ + lzero = lhs[1]; + rzero = rhs[1]; + + if (lzero < rzero) + return (DT_LESSTHAN); + + if (lzero > rzero) + return (DT_GREATERTHAN); + + return (0); +} + static int dt_aggregate_quantizedcmp(int64_t *lhs, int64_t *rhs) { @@ -582,6 +661,10 @@ hashnext: h->dtahe_aggregate = dt_aggregate_lquantize; break; + case DTRACEAGG_LLQUANTIZE: + h->dtahe_aggregate = dt_aggregate_llquantize; + break; + case DTRACEAGG_COUNT: case DTRACEAGG_SUM: case DTRACEAGG_AVG: @@ -849,6 +932,10 @@ dt_aggregate_valcmp(const void *lhs, const void *rhs) rval = dt_aggregate_lquantizedcmp(laddr, raddr); break; + case DTRACEAGG_LLQUANTIZE: + rval = dt_aggregate_llquantizedcmp(laddr, raddr); + break; + case DTRACEAGG_COUNT: case DTRACEAGG_SUM: case DTRACEAGG_MIN: diff --git a/lib/libdtrace/common/dt_cc.c b/lib/libdtrace/common/dt_cc.c index 24a386b..8b8bcf4 100644 --- a/lib/libdtrace/common/dt_cc.c +++ b/lib/libdtrace/common/dt_cc.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. 
*/ /* @@ -82,6 +84,7 @@ #include #include +#include #include #include @@ -676,13 +679,59 @@ dt_action_trace(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp) ap->dtad_kind = DTRACEACT_DIFEXPR; } +/* + * The print() action behaves identically to trace(), except that it stores the + * CTF type of the argument (if present) within the DOF for the DIFEXPR action. + * To do this, we set the 'dtsd_strdata' to point to the fully-qualified CTF + * type ID for the result of the DIF action. We use the ID instead of the name + * to handles complex types like arrays and function pointers that can't be + * resolved by ctf_type_lookup(). This is later processed by + * dtrace_dof_create() and turned into a reference into the string table so + * that we can get the type information when we process the data after the + * fact. + */ +static void +dt_action_print(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp) +{ + dtrace_actdesc_t *ap = dt_stmt_action(dtp, sdp); + dt_node_t *dret; + size_t len; + dt_module_t *dmp; + + if (dt_node_is_void(dnp->dn_args)) { + dnerror(dnp->dn_args, D_PRINT_VOID, + "print( ) may not be applied to a void expression\n"); + } + + if (dt_node_is_dynamic(dnp->dn_args)) { + dnerror(dnp->dn_args, D_PRINT_DYN, + "print( ) may not be applied to a dynamic expression\n"); + } + + dt_cg(yypcb, dnp->dn_args); + + dret = yypcb->pcb_dret; + dmp = dt_module_lookup_by_ctf(dtp, dret->dn_ctfp); + + len = snprintf(NULL, 0, "%s`%d", dmp->dm_name, dret->dn_type) + 1; + sdp->dtsd_strdata = dt_alloc(dtp, len); + if (sdp->dtsd_strdata == NULL) + longjmp(yypcb->pcb_jmpbuf, EDT_NOMEM); + (void) snprintf(sdp->dtsd_strdata, len, "%s`%d", dmp->dm_name, + dret->dn_type); + + ap->dtad_difo = dt_as(yypcb); + ap->dtad_kind = DTRACEACT_DIFEXPR; +} + static void dt_action_tracemem(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp) { dtrace_actdesc_t *ap = dt_stmt_action(dtp, sdp); dt_node_t *addr = dnp->dn_args; - dt_node_t *size = dnp->dn_args->dn_list; + 
dt_node_t *max = dnp->dn_args->dn_list; + dt_node_t *size; char n[DT_TYPE_NAMELEN]; @@ -694,17 +743,37 @@ dt_action_tracemem(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp) dt_node_type_name(addr, n, sizeof (n))); } - if (dt_node_is_posconst(size) == 0) { - dnerror(size, D_TRACEMEM_SIZE, "tracemem( ) argument #2 must " + if (dt_node_is_posconst(max) == 0) { + dnerror(max, D_TRACEMEM_SIZE, "tracemem( ) argument #2 must " "be a non-zero positive integral constant expression\n"); } + if ((size = max->dn_list) != NULL) { + if (size->dn_list != NULL) { + dnerror(size, D_TRACEMEM_ARGS, "tracemem ( ) prototype " + "mismatch: expected at most 3 args\n"); + } + + if (!dt_node_is_scalar(size)) { + dnerror(size, D_TRACEMEM_DYNSIZE, "tracemem ( ) " + "dynamic size (argument #3) must be of " + "scalar type\n"); + } + + dt_cg(yypcb, size); + ap->dtad_difo = dt_as(yypcb); + ap->dtad_difo->dtdo_rtype = dt_int_rtype; + ap->dtad_kind = DTRACEACT_TRACEMEM_DYNSIZE; + + ap = dt_stmt_action(dtp, sdp); + } + dt_cg(yypcb, addr); ap->dtad_difo = dt_as(yypcb); - ap->dtad_kind = DTRACEACT_DIFEXPR; + ap->dtad_kind = DTRACEACT_TRACEMEM; ap->dtad_difo->dtdo_rtype.dtdt_flags |= DIF_TF_BYREF; - ap->dtad_difo->dtdo_rtype.dtdt_size = size->dn_value; + ap->dtad_difo->dtdo_rtype.dtdt_size = max->dn_value; } static void @@ -1034,6 +1103,9 @@ dt_compile_fun(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp) case DT_ACT_TRACE: dt_action_trace(dtp, dnp->dn_expr, sdp); break; + case DT_ACT_PRINT: + dt_action_print(dtp, dnp->dn_expr, sdp); + break; case DT_ACT_TRACEMEM: dt_action_tracemem(dtp, dnp->dn_expr, sdp); break; @@ -1291,6 +1363,145 @@ dt_compile_agg(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp) argmax = 5; } + if (fid->di_id == DTRACEAGG_LLQUANTIZE) { + /* + * For log/linear quantizations, we have between one and five + * arguments in addition to the expression: + * + * arg1 => Factor + * arg2 => Low magnitude + * arg3 => High magnitude + * arg4 => Number of steps 
per magnitude + * arg5 => Quantization increment value (defaults to 1) + */ + dt_node_t *llarg = dnp->dn_aggfun->dn_args->dn_list; + uint64_t oarg, order, v; + dt_idsig_t *isp; + int i; + + struct { + char *str; /* string identifier */ + int badtype; /* error on bad type */ + int badval; /* error on bad value */ + int mismatch; /* error on bad match */ + int shift; /* shift value */ + uint16_t value; /* value itself */ + } args[] = { + { "factor", D_LLQUANT_FACTORTYPE, + D_LLQUANT_FACTORVAL, D_LLQUANT_FACTORMATCH, + DTRACE_LLQUANTIZE_FACTORSHIFT }, + { "low magnitude", D_LLQUANT_LOWTYPE, + D_LLQUANT_LOWVAL, D_LLQUANT_LOWMATCH, + DTRACE_LLQUANTIZE_LOWSHIFT }, + { "high magnitude", D_LLQUANT_HIGHTYPE, + D_LLQUANT_HIGHVAL, D_LLQUANT_HIGHMATCH, + DTRACE_LLQUANTIZE_HIGHSHIFT }, + { "linear steps per magnitude", D_LLQUANT_NSTEPTYPE, + D_LLQUANT_NSTEPVAL, D_LLQUANT_NSTEPMATCH, + DTRACE_LLQUANTIZE_NSTEPSHIFT }, + { NULL } + }; + + assert(arg == 0); + + for (i = 0; args[i].str != NULL; i++) { + if (llarg->dn_kind != DT_NODE_INT) { + dnerror(llarg, args[i].badtype, "llquantize( ) " + "argument #%d (%s) must be an " + "integer constant\n", i + 1, args[i].str); + } + + if ((uint64_t)llarg->dn_value > UINT16_MAX) { + dnerror(llarg, args[i].badval, "llquantize( ) " + "argument #%d (%s) must be an unsigned " + "16-bit quantity\n", i + 1, args[i].str); + } + + args[i].value = (uint16_t)llarg->dn_value; + + assert(!(arg & (UINT16_MAX << args[i].shift))); + arg |= ((uint64_t)args[i].value << args[i].shift); + llarg = llarg->dn_list; + } + + assert(arg != 0); + + if (args[0].value < 2) { + dnerror(dnp, D_LLQUANT_FACTORSMALL, "llquantize( ) " + "factor (argument #1) must be two or more\n"); + } + + if (args[1].value >= args[2].value) { + dnerror(dnp, D_LLQUANT_MAGRANGE, "llquantize( ) " + "high magnitude (argument #3) must be greater " + "than low magnitude (argument #2)\n"); + } + + if (args[3].value < args[0].value) { + dnerror(dnp, D_LLQUANT_FACTORNSTEPS, "llquantize( ) " + "factor 
(argument #1) must be less than or " + "equal to the number of linear steps per " + "magnitude (argument #4)\n"); + } + + for (v = args[0].value; v < args[3].value; v *= args[0].value) + continue; + + if ((args[3].value % args[0].value) || (v % args[3].value)) { + dnerror(dnp, D_LLQUANT_FACTOREVEN, "llquantize( ) " + "factor (argument #1) must evenly divide the " + "number of steps per magnitude (argument #4), " + "and the number of steps per magnitude must evenly " + "divide a power of the factor\n"); + } + + for (i = 0, order = 1; i < args[2].value; i++) { + if (order * args[0].value > order) { + order *= args[0].value; + continue; + } + + dnerror(dnp, D_LLQUANT_MAGTOOBIG, "llquantize( ) " + "factor (%d) raised to power of high magnitude " + "(%d) overflows 64-bits\n", args[0].value, + args[2].value); + } + + isp = (dt_idsig_t *)aid->di_data; + + if (isp->dis_auxinfo == 0) { + /* + * This is the first time we've seen an llquantize() + * for this aggregation; we'll store our argument + * as the auxiliary signature information. + */ + isp->dis_auxinfo = arg; + } else if ((oarg = isp->dis_auxinfo) != arg) { + /* + * If we have seen this llquantize() before and the + * argument doesn't match the original argument, pick + * the original argument apart to concisely report the + * mismatch. 
+ */ + int expected = 0, found = 0; + + for (i = 0; expected == found; i++) { + assert(args[i].str != NULL); + + expected = (oarg >> args[i].shift) & UINT16_MAX; + found = (arg >> args[i].shift) & UINT16_MAX; + } + + dnerror(dnp, args[i - 1].mismatch, "llquantize( ) " + "%s (argument #%d) doesn't match previous " + "declaration: expected %d, found %d\n", + args[i - 1].str, i, expected, found); + } + + incr = llarg; + argmax = 6; + } + if (fid->di_id == DTRACEAGG_QUANTIZE) { incr = dnp->dn_aggfun->dn_args->dn_list; argmax = 2; @@ -1913,25 +2124,23 @@ dt_lib_depend_free(dtrace_hdl_t *dtp) } } - /* - * Open all of the .d library files found in the specified directory and - * compile each one in topological order to cache its inlines and translators, - * etc. We silently ignore any missing directories and other files found - * therein. We only fail (and thereby fail dt_load_libs()) if we fail to - * compile a library and the error is something other than #pragma D depends_on. - * Dependency errors are silently ignored to permit a library directory to - * contain libraries which may not be accessible depending on our privileges. + * Open all the .d library files found in the specified directory and + * compile each one of them. We silently ignore any missing directories and + * other files found therein. We only fail (and thereby fail dt_load_libs()) if + * we fail to compile a library and the error is something other than #pragma D + * depends_on. Dependency errors are silently ignored to permit a library + * directory to contain libraries which may not be accessible depending on our + * privileges. 
*/ static int dt_load_libs_dir(dtrace_hdl_t *dtp, const char *path) { struct dirent *dp; - const char *p; + const char *p, *end; DIR *dirp; char fname[PATH_MAX]; - dtrace_prog_t *pgp; FILE *fp; void *rv; dt_lib_depend_t *dld; @@ -1955,9 +2164,28 @@ dt_load_libs_dir(dtrace_hdl_t *dtp, const char *path) continue; } + /* + * Skip files whose names match an already processed library + */ + for (dld = dt_list_next(&dtp->dt_lib_dep); dld != NULL; + dld = dt_list_next(dld)) { + end = strrchr(dld->dtld_library, '/'); + /* dt_lib_depend_add ensures this */ + assert(end != NULL); + if (strcmp(end + 1, dp->d_name) == 0) + break; + } + + if (dld != NULL) { + dt_dprintf("skipping library %s, already processed " + "library with the same name: %s", dp->d_name, + dld->dtld_library); + continue; + } + dtp->dt_filetag = fname; if (dt_lib_depend_add(dtp, &dtp->dt_lib_dep, fname) != 0) - goto err; + return (-1); /* preserve dt_errno */ rv = dt_compile(dtp, DT_CTX_DPROG, DTRACE_PROBESPEC_NAME, NULL, @@ -1966,7 +2194,7 @@ dt_load_libs_dir(dtrace_hdl_t *dtp, const char *path) if (rv != NULL && dtp->dt_errno && (dtp->dt_errno != EDT_COMPILER || dtp->dt_errtag != dt_errtag(D_PRAGMA_DEPEND))) - goto err; + return (-1); /* preserve dt_errno */ if (dtp->dt_errno) dt_dprintf("error parsing library %s: %s\n", @@ -1977,6 +2205,27 @@ dt_load_libs_dir(dtrace_hdl_t *dtp, const char *path) } (void) closedir(dirp); + + return (0); +} + +/* + * Perform a topological sorting of all the libraries found across the entire + * dt_lib_path. Once sorted, compile each one in topological order to cache its + * inlines and translators, etc. We silently ignore any missing directories and + * other files found therein. We only fail (and thereby fail dt_load_libs()) if + * we fail to compile a library and the error is something other than #pragma D + * depends_on.
Dependency errors are silently ignored to permit a library + * directory to contain libraries which may not be accessible depending on our + * privileges. + */ +static int +dt_load_libs_sort(dtrace_hdl_t *dtp) +{ + dtrace_prog_t *pgp; + FILE *fp; + dt_lib_depend_t *dld; + /* * Finish building the graph containing the library dependencies * and perform a topological sort to generate an ordered list @@ -2037,7 +2286,14 @@ dt_load_libs(dtrace_hdl_t *dtp) dtp->dt_cflags |= DTRACE_C_NOLIBS; - for (dirp = dt_list_next(&dtp->dt_lib_path); + /* + * /usr/lib/dtrace is always at the head of the list. The rest of the + * list is specified in the precedence order the user requested. Process + * everything other than the head first. DTRACE_C_NOLIBS has already + * been specified so dt_vopen will ensure that there is always one entry + * in dt_lib_path. + */ + for (dirp = dt_list_next(dt_list_next(&dtp->dt_lib_path)); dirp != NULL; dirp = dt_list_next(dirp)) { if (dt_load_libs_dir(dtp, dirp->dir_path) != 0) { dtp->dt_cflags &= ~DTRACE_C_NOLIBS; @@ -2045,6 +2301,16 @@ dt_load_libs(dtrace_hdl_t *dtp) } } + /* Handle /usr/lib/dtrace */ + dirp = dt_list_next(&dtp->dt_lib_path); + if (dt_load_libs_dir(dtp, dirp->dir_path) != 0) { + dtp->dt_cflags &= ~DTRACE_C_NOLIBS; + return (-1); /* errno is set for us */ + } + + if (dt_load_libs_sort(dtp) < 0) + return (-1); /* errno is set for us */ + return (0); } diff --git a/lib/libdtrace/common/dt_consume.c b/lib/libdtrace/common/dt_consume.c index 564189a..d3a554c 100644 --- a/lib/libdtrace/common/dt_consume.c +++ b/lib/libdtrace/common/dt_consume.c @@ -23,6 +23,11 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved.
+ */ + #include #include #include @@ -681,6 +686,121 @@ dt_print_lquantize(dtrace_hdl_t *dtp, FILE *fp, const void *addr, return (0); } +int +dt_print_llquantize(dtrace_hdl_t *dtp, FILE *fp, const void *addr, + size_t size, uint64_t normal) +{ + int i, first_bin, last_bin, bin = 1, order, levels; + uint16_t factor, low, high, nsteps; + const int64_t *data = addr; + int64_t value = 1, next, step; + char positives = 0, negatives = 0; + long double total = 0; + uint64_t arg; + char c[32]; + + if (size < sizeof (uint64_t)) + return (dt_set_errno(dtp, EDT_DMISMATCH)); + + arg = *data++; + size -= sizeof (uint64_t); + + factor = DTRACE_LLQUANTIZE_FACTOR(arg); + low = DTRACE_LLQUANTIZE_LOW(arg); + high = DTRACE_LLQUANTIZE_HIGH(arg); + nsteps = DTRACE_LLQUANTIZE_NSTEP(arg); + + /* + * We don't expect to be handed invalid llquantize() parameters here, + * but sanity check them (to a degree) nonetheless. + */ + if (size > INT32_MAX || factor < 2 || low >= high || + nsteps == 0 || factor > nsteps) + return (dt_set_errno(dtp, EDT_DMISMATCH)); + + levels = (int)size / sizeof (uint64_t); + + first_bin = 0; + last_bin = levels - 1; + + while (first_bin < levels && data[first_bin] == 0) + first_bin++; + + if (first_bin == levels) { + first_bin = 0; + last_bin = 1; + } else { + if (first_bin > 0) + first_bin--; + + while (last_bin > 0 && data[last_bin] == 0) + last_bin--; + + if (last_bin < levels - 1) + last_bin++; + } + + for (i = first_bin; i <= last_bin; i++) { + positives |= (data[i] > 0); + negatives |= (data[i] < 0); + total += dt_fabsl((long double)data[i]); + } + + if (dt_printf(dtp, fp, "\n%16s %41s %-9s\n", "value", + "------------- Distribution -------------", "count") < 0) + return (-1); + + for (order = 0; order < low; order++) + value *= factor; + + next = value * factor; + step = next > nsteps ? 
next / nsteps : 1; + + if (first_bin == 0) { + (void) snprintf(c, sizeof (c), "< %lld", value); + + if (dt_printf(dtp, fp, "%16s ", c) < 0) + return (-1); + + if (dt_print_quantline(dtp, fp, data[0], normal, + total, positives, negatives) < 0) + return (-1); + } + + while (order <= high) { + if (bin >= first_bin && bin <= last_bin) { + if (dt_printf(dtp, fp, "%16lld ", (long long)value) < 0) + return (-1); + + if (dt_print_quantline(dtp, fp, data[bin], + normal, total, positives, negatives) < 0) + return (-1); + } + + assert(value < next); + bin++; + + if ((value += step) != next) + continue; + + next = value * factor; + step = next > nsteps ? next / nsteps : 1; + order++; + } + + if (last_bin < bin) + return (0); + + assert(last_bin == bin); + (void) snprintf(c, sizeof (c), ">= %lld", value); + + if (dt_printf(dtp, fp, "%16s ", c) < 0) + return (-1); + + return (dt_print_quantline(dtp, fp, data[bin], normal, + total, positives, negatives)); +} + /*ARGSUSED*/ static int dt_print_average(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr, @@ -708,7 +828,7 @@ dt_print_stddev(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr, /*ARGSUSED*/ int dt_print_bytes(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr, - size_t nbytes, int width, int quiet) + size_t nbytes, int width, int quiet, int forceraw) { /* * If the byte stream is a series of printable characters, followed by @@ -721,6 +841,9 @@ dt_print_bytes(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr, if (nbytes == 0) return (0); + if (forceraw) + goto raw; + if (dtp->dt_options[DTRACEOPT_RAWBYTES] != DTRACEOPT_UNSET) goto raw; @@ -1397,6 +1520,9 @@ dt_print_datum(dtrace_hdl_t *dtp, FILE *fp, dtrace_recdesc_t *rec, case DTRACEAGG_LQUANTIZE: return (dt_print_lquantize(dtp, fp, addr, size, normal)); + case DTRACEAGG_LLQUANTIZE: + return (dt_print_llquantize(dtp, fp, addr, size, normal)); + case DTRACEAGG_AVG: return (dt_print_average(dtp, fp, addr, size, normal)); @@ -1428,7 +1554,7 @@ dt_print_datum(dtrace_hdl_t *dtp, FILE *fp, dtrace_recdesc_t 
*rec, (uint32_t)normal); break; default: - err = dt_print_bytes(dtp, fp, addr, size, 50, 0); + err = dt_print_bytes(dtp, fp, addr, size, 50, 0, 0); break; } @@ -1583,6 +1709,7 @@ dt_consume_cpu(dtrace_hdl_t *dtp, FILE *fp, int cpu, dtrace_bufdesc_t *buf, int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET); int rval, i, n; dtrace_epid_t last = DTRACE_EPIDNONE; + uint64_t tracememsize = 0; dtrace_probedata_t data; uint64_t drops; caddr_t addr; @@ -1751,6 +1878,13 @@ again: } } + if (act == DTRACEACT_TRACEMEM_DYNSIZE && + rec->dtrd_size == sizeof (uint64_t)) { + /* LINTED - alignment */ + tracememsize = *((unsigned long long *)addr); + continue; + } + rval = (*rfunc)(&data, rec, arg); if (rval == DTRACE_CONSUME_NEXT) @@ -1842,6 +1976,35 @@ again: goto nextrec; } + /* + * If this is a DIF expression, and the record has a + * format set, this indicates we have a CTF type name + * associated with the data and we should try to print + * it out by type. + */ + if (act == DTRACEACT_DIFEXPR) { + const char *strdata = dt_strdata_lookup(dtp, + rec->dtrd_format); + if (strdata != NULL) { + n = dtrace_print(dtp, fp, strdata, + addr, rec->dtrd_size); + + /* + * dtrace_print() will return -1 on + * error, or return the number of bytes + * consumed. It will return 0 if the + * type couldn't be determined, and we + * should fall through to the normal + * trace method. 
+ */ + if (n < 0) + return (-1); + + if (n > 0) + goto nextrec; + } + } + nofmt: if (act == DTRACEACT_PRINTA) { dt_print_aggdata_t pd; @@ -1910,6 +2073,23 @@ nofmt: goto nextrec; } + if (act == DTRACEACT_TRACEMEM) { + if (tracememsize == 0 || + tracememsize > rec->dtrd_size) { + tracememsize = rec->dtrd_size; + } + + n = dt_print_bytes(dtp, fp, addr, + tracememsize, 33, quiet, 1); + + tracememsize = 0; + + if (n < 0) + return (-1); + + goto nextrec; + } + switch (rec->dtrd_size) { case sizeof (uint64_t): n = dt_printf(dtp, fp, @@ -1933,7 +2113,7 @@ nofmt: break; default: n = dt_print_bytes(dtp, fp, addr, - rec->dtrd_size, 33, quiet); + rec->dtrd_size, 33, quiet, 0); break; } diff --git a/lib/libdtrace/common/dt_dof.c b/lib/libdtrace/common/dt_dof.c index a7eb8e4..04c4c89 100644 --- a/lib/libdtrace/common/dt_dof.c +++ b/lib/libdtrace/common/dt_dof.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -754,16 +755,23 @@ dtrace_dof_create(dtrace_hdl_t *dtp, dtrace_prog_t *pgp, uint_t flags) dofa[i].dofa_difo = DOF_SECIDX_NONE; /* - * If the first action in a statement has format data, - * add the format string to the global string table. + * If the first action in a statement has string data, + * add the string to the global string table. This can + * be due either to a printf() format string + * (dtsd_fmtdata) or a print() type string + * (dtsd_strdata). 
*/ if (sdp != NULL && ap == sdp->dtsd_action) { if (sdp->dtsd_fmtdata != NULL) { (void) dtrace_printf_format(dtp, sdp->dtsd_fmtdata, fmt, maxfmt + 1); strndx = dof_add_string(ddo, fmt); - } else + } else if (sdp->dtsd_strdata != NULL) { + strndx = dof_add_string(ddo, + sdp->dtsd_strdata); + } else { strndx = 0; /* use dtad_arg instead */ + } if ((next = dt_list_next(next)) != NULL) sdp = next->ds_desc; diff --git a/lib/libdtrace/common/dt_errtags.h b/lib/libdtrace/common/dt_errtags.h index 9e32dfd..473e2ad 100644 --- a/lib/libdtrace/common/dt_errtags.h +++ b/lib/libdtrace/common/dt_errtags.h @@ -24,11 +24,14 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. + */ + #ifndef _DT_ERRTAGS_H #define _DT_ERRTAGS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -187,8 +190,12 @@ typedef enum { D_PRINTA_AGGPROTO, /* printa() aggregation mismatch */ D_TRACE_VOID, /* trace() argument has void type */ D_TRACE_DYN, /* trace() argument has dynamic type */ + D_PRINT_VOID, /* print() argument has void type */ + D_PRINT_DYN, /* print() argument has dynamic type */ D_TRACEMEM_ADDR, /* tracemem() address bad type */ D_TRACEMEM_SIZE, /* tracemem() size bad type */ + D_TRACEMEM_ARGS, /* tracemem() illegal number of args */ + D_TRACEMEM_DYNSIZE, /* tracemem() dynamic size bad type */ D_STACK_PROTO, /* stack() prototype mismatch */ D_STACK_SIZE, /* stack() size argument bad type */ D_USTACK_FRAMES, /* ustack() frames arg bad type */ @@ -235,7 +242,24 @@ typedef enum { D_FREOPEN_INVALID, /* frename() filename is invalid */ D_LQUANT_MATCHBASE, /* lquantize() mismatch on base */ D_LQUANT_MATCHLIM, /* lquantize() mismatch on limit */ - D_LQUANT_MATCHSTEP /* lquantize() mismatch on step */ + D_LQUANT_MATCHSTEP, /* lquantize() mismatch on step */ + D_LLQUANT_FACTORTYPE, /* llquantize() bad factor type */ + D_LLQUANT_FACTORVAL, /* llquantize() bad
factor value */ + D_LLQUANT_FACTORMATCH, /* llquantize() mismatch on factor */ + D_LLQUANT_LOWTYPE, /* llquantize() bad low mag type */ + D_LLQUANT_LOWVAL, /* llquantize() bad low mag value */ + D_LLQUANT_LOWMATCH, /* llquantize() mismatch on low mag */ + D_LLQUANT_HIGHTYPE, /* llquantize() bad high mag type */ + D_LLQUANT_HIGHVAL, /* llquantize() bad high mag value */ + D_LLQUANT_HIGHMATCH, /* llquantize() mismatch on high mag */ + D_LLQUANT_NSTEPTYPE, /* llquantize() bad # steps type */ + D_LLQUANT_NSTEPVAL, /* llquantize() bad # steps value */ + D_LLQUANT_NSTEPMATCH, /* llquantize() mismatch on # steps */ + D_LLQUANT_MAGRANGE, /* llquantize() bad magnitude range */ + D_LLQUANT_FACTORNSTEPS, /* llquantize() # steps < factor */ + D_LLQUANT_FACTOREVEN, /* llquantize() bad # steps/factor */ + D_LLQUANT_FACTORSMALL, /* llquantize() factor too small */ + D_LLQUANT_MAGTOOBIG /* llquantize() high mag too large */ } dt_errtag_t; extern const char *dt_errtag(dt_errtag_t); diff --git a/lib/libdtrace/common/dt_impl.h b/lib/libdtrace/common/dt_impl.h index 1937ce0..b06fd64 100644 --- a/lib/libdtrace/common/dt_impl.h +++ b/lib/libdtrace/common/dt_impl.h @@ -24,6 +24,11 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved.
+ */ + #ifndef _DT_IMPL_H #define _DT_IMPL_H @@ -236,6 +241,8 @@ struct dtrace_hdl { dtrace_aggdesc_t **dt_aggdesc; /* aggregation descriptions */ int dt_maxformat; /* max format ID */ void **dt_formats; /* pointer to format array */ + int dt_maxstrdata; /* max strdata ID */ + char **dt_strdata; /* pointer to strdata array */ dt_aggregate_t dt_aggregate; /* aggregate */ dtrace_bufdesc_t dt_buf; /* staging buffer */ struct dt_pfdict *dt_pfdict; /* dictionary of printf conversions */ @@ -413,6 +420,7 @@ struct dtrace_hdl { #define DT_ACT_UMOD DT_ACT(26) /* umod() action */ #define DT_ACT_UADDR DT_ACT(27) /* uaddr() action */ #define DT_ACT_SETOPT DT_ACT(28) /* setopt() action */ +#define DT_ACT_PRINT DT_ACT(29) /* print() action */ /* * Sentinel to tell freopen() to restore the saved stdout. This must not @@ -596,10 +604,15 @@ extern void dt_aggid_destroy(dtrace_hdl_t *); extern void *dt_format_lookup(dtrace_hdl_t *, int); extern void dt_format_destroy(dtrace_hdl_t *); +extern const char *dt_strdata_lookup(dtrace_hdl_t *, int); +extern void dt_strdata_destroy(dtrace_hdl_t *); + extern int dt_print_quantize(dtrace_hdl_t *, FILE *, const void *, size_t, uint64_t); extern int dt_print_lquantize(dtrace_hdl_t *, FILE *, const void *, size_t, uint64_t); +extern int dt_print_llquantize(dtrace_hdl_t *, FILE *, + const void *, size_t, uint64_t); extern int dt_print_agg(const dtrace_aggdata_t *, void *); extern int dt_handle(dtrace_hdl_t *, dtrace_probedata_t *); diff --git a/lib/libdtrace/common/dt_map.c b/lib/libdtrace/common/dt_map.c index 1536186..8a3ce81 100644 --- a/lib/libdtrace/common/dt_map.c +++ b/lib/libdtrace/common/dt_map.c @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2011 by Delphix. All rights reserved. 
+ */ #include #include @@ -35,10 +37,81 @@ #include static int +dt_strdata_add(dtrace_hdl_t *dtp, dtrace_recdesc_t *rec, void ***data, int *max) +{ + int maxformat; + dtrace_fmtdesc_t fmt; + void *result; + + if (rec->dtrd_format == 0) + return (0); + + if (rec->dtrd_format <= *max && + (*data)[rec->dtrd_format - 1] != NULL) { + return (0); + } + + bzero(&fmt, sizeof (fmt)); + fmt.dtfd_format = rec->dtrd_format; + fmt.dtfd_string = NULL; + fmt.dtfd_length = 0; + + if (dt_ioctl(dtp, DTRACEIOC_FORMAT, &fmt) == -1) + return (dt_set_errno(dtp, errno)); + + if ((fmt.dtfd_string = dt_alloc(dtp, fmt.dtfd_length)) == NULL) + return (dt_set_errno(dtp, EDT_NOMEM)); + + if (dt_ioctl(dtp, DTRACEIOC_FORMAT, &fmt) == -1) { + free(fmt.dtfd_string); + return (dt_set_errno(dtp, errno)); + } + + while (rec->dtrd_format > (maxformat = *max)) { + int new_max = maxformat ? (maxformat << 1) : 1; + size_t nsize = new_max * sizeof (void *); + size_t osize = maxformat * sizeof (void *); + void **new_data = dt_zalloc(dtp, nsize); + + if (new_data == NULL) { + dt_free(dtp, fmt.dtfd_string); + return (dt_set_errno(dtp, EDT_NOMEM)); + } + + bcopy(*data, new_data, osize); + free(*data); + + *data = new_data; + *max = new_max; + } + + switch (rec->dtrd_action) { + case DTRACEACT_DIFEXPR: + result = fmt.dtfd_string; + break; + case DTRACEACT_PRINTA: + result = dtrace_printa_create(dtp, fmt.dtfd_string); + dt_free(dtp, fmt.dtfd_string); + break; + default: + result = dtrace_printf_create(dtp, fmt.dtfd_string); + dt_free(dtp, fmt.dtfd_string); + break; + } + + if (result == NULL) + return (-1); + + (*data)[rec->dtrd_format - 1] = result; + + return (0); +} + +static int dt_epid_add(dtrace_hdl_t *dtp, dtrace_epid_t id) { dtrace_id_t max; - int rval, i, maxformat; + int rval, i; dtrace_eprobedesc_t *enabled, *nenabled; dtrace_probedesc_t *probe; @@ -124,71 +197,23 @@ dt_epid_add(dtrace_hdl_t *dtp, dtrace_epid_t id) } for (i = 0; i < enabled->dtepd_nrecs; i++) { - dtrace_fmtdesc_t fmt; 
dtrace_recdesc_t *rec = &enabled->dtepd_rec[i]; - if (!DTRACEACT_ISPRINTFLIKE(rec->dtrd_action)) - continue; - - if (rec->dtrd_format == 0) - continue; - - if (rec->dtrd_format <= dtp->dt_maxformat && - dtp->dt_formats[rec->dtrd_format - 1] != NULL) - continue; - - bzero(&fmt, sizeof (fmt)); - fmt.dtfd_format = rec->dtrd_format; - fmt.dtfd_string = NULL; - fmt.dtfd_length = 0; - - if (dt_ioctl(dtp, DTRACEIOC_FORMAT, &fmt) == -1) { - rval = dt_set_errno(dtp, errno); - goto err; - } - - if ((fmt.dtfd_string = malloc(fmt.dtfd_length)) == NULL) { - rval = dt_set_errno(dtp, EDT_NOMEM); - goto err; - } - - if (dt_ioctl(dtp, DTRACEIOC_FORMAT, &fmt) == -1) { - rval = dt_set_errno(dtp, errno); - free(fmt.dtfd_string); - goto err; - } - - while (rec->dtrd_format > (maxformat = dtp->dt_maxformat)) { - int new_max = maxformat ? (maxformat << 1) : 1; - size_t nsize = new_max * sizeof (void *); - size_t osize = maxformat * sizeof (void *); - void **new_formats = malloc(nsize); - - if (new_formats == NULL) { - rval = dt_set_errno(dtp, EDT_NOMEM); - free(fmt.dtfd_string); + if (DTRACEACT_ISPRINTFLIKE(rec->dtrd_action)) { + if (dt_strdata_add(dtp, rec, &dtp->dt_formats, + &dtp->dt_maxformat) != 0) { + rval = -1; + goto err; + } + } else if (rec->dtrd_action == DTRACEACT_DIFEXPR) { + if (dt_strdata_add(dtp, rec, + (void ***)&dtp->dt_strdata, + &dtp->dt_maxstrdata) != 0) { + rval = -1; goto err; } - - bzero(new_formats, nsize); - bcopy(dtp->dt_formats, new_formats, osize); - free(dtp->dt_formats); - - dtp->dt_formats = new_formats; - dtp->dt_maxformat = new_max; } - dtp->dt_formats[rec->dtrd_format - 1] = - rec->dtrd_action == DTRACEACT_PRINTA ? 
- dtrace_printa_create(dtp, fmt.dtfd_string) : - dtrace_printf_create(dtp, fmt.dtfd_string); - - free(fmt.dtfd_string); - - if (dtp->dt_formats[rec->dtrd_format - 1] == NULL) { - rval = -1; /* dt_errno is set for us */ - goto err; - } } dtp->dt_pdesc[id] = probe; @@ -424,3 +449,28 @@ dt_aggid_destroy(dtrace_hdl_t *dtp) dtp->dt_aggdesc = NULL; dtp->dt_maxagg = 0; } + +const char * +dt_strdata_lookup(dtrace_hdl_t *dtp, int idx) +{ + if (idx == 0 || idx > dtp->dt_maxstrdata) + return (NULL); + + if (dtp->dt_strdata == NULL) + return (NULL); + + return (dtp->dt_strdata[idx - 1]); +} + +void +dt_strdata_destroy(dtrace_hdl_t *dtp) +{ + int i; + + for (i = 0; i < dtp->dt_maxstrdata; i++) { + free(dtp->dt_strdata[i]); + } + + free(dtp->dt_strdata); + dtp->dt_strdata = NULL; +} diff --git a/lib/libdtrace/common/dt_open.c b/lib/libdtrace/common/dt_open.c index 2b9cd7c..502a9d4 100644 --- a/lib/libdtrace/common/dt_open.c +++ b/lib/libdtrace/common/dt_open.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. 
*/ #include @@ -103,8 +105,13 @@ #define DT_VERS_1_6_1 DT_VERSION_NUMBER(1, 6, 1) #define DT_VERS_1_6_2 DT_VERSION_NUMBER(1, 6, 2) #define DT_VERS_1_6_3 DT_VERSION_NUMBER(1, 6, 3) -#define DT_VERS_LATEST DT_VERS_1_6_3 -#define DT_VERS_STRING "Sun D 1.6.3" +#define DT_VERS_1_7 DT_VERSION_NUMBER(1, 7, 0) +#define DT_VERS_1_7_1 DT_VERSION_NUMBER(1, 7, 1) +#define DT_VERS_1_8 DT_VERSION_NUMBER(1, 8, 0) +#define DT_VERS_1_8_1 DT_VERSION_NUMBER(1, 8, 1) +#define DT_VERS_1_9 DT_VERSION_NUMBER(1, 9, 0) +#define DT_VERS_LATEST DT_VERS_1_9 +#define DT_VERS_STRING "Sun D 1.9" const dt_version_t _dtrace_versions[] = { DT_VERS_1_0, /* D API 1.0.0 (PSARC 2001/466) Solaris 10 FCS */ @@ -120,6 +127,11 @@ const dt_version_t _dtrace_versions[] = { DT_VERS_1_6_1, /* D API 1.6.1 */ DT_VERS_1_6_2, /* D API 1.6.2 */ DT_VERS_1_6_3, /* D API 1.6.3 */ + DT_VERS_1_7, /* D API 1.7 */ + DT_VERS_1_7_1, /* D API 1.7.1 */ + DT_VERS_1_8, /* D API 1.8 */ + DT_VERS_1_8_1, /* D API 1.8.1 */ + DT_VERS_1_9, /* D API 1.9 */ 0 }; @@ -250,7 +262,10 @@ static const dt_ident_t _dtrace_globals[] = { { "jstack", DT_IDENT_ACTFUNC, 0, DT_ACT_JSTACK, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_func, "stack(...)" }, { "lltostr", DT_IDENT_FUNC, 0, DIF_SUBR_LLTOSTR, DT_ATTR_STABCMN, DT_VERS_1_0, - &dt_idops_func, "string(int64_t)" }, + &dt_idops_func, "string(int64_t, [int])" }, +{ "llquantize", DT_IDENT_AGGFUNC, 0, DTRACEAGG_LLQUANTIZE, DT_ATTR_STABCMN, + DT_VERS_1_7, &dt_idops_func, + "void(@, int32_t, int32_t, int32_t, int32_t, ...)" }, { "lquantize", DT_IDENT_AGGFUNC, 0, DTRACEAGG_LQUANTIZE, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_func, "void(@, int32_t, int32_t, ...)" }, @@ -292,6 +307,8 @@ static const dt_ident_t _dtrace_globals[] = { &dt_idops_type, "pid_t" }, { "ppid", DT_IDENT_SCALAR, 0, DIF_VAR_PPID, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_type, "pid_t" }, +{ "print", DT_IDENT_ACTFUNC, 0, DT_ACT_PRINT, DT_ATTR_STABCMN, DT_VERS_1_9, + &dt_idops_func, "void(@)" }, { "printa", DT_IDENT_ACTFUNC, 0, 
DT_ACT_PRINTA, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_func, "void(@, ...)" }, { "printf", DT_IDENT_ACTFUNC, 0, DT_ACT_PRINTF, DT_ATTR_STABCMN, DT_VERS_1_0, @@ -371,11 +388,15 @@ static const dt_ident_t _dtrace_globals[] = { { "timestamp", DT_IDENT_SCALAR, 0, DIF_VAR_TIMESTAMP, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_type, "uint64_t" }, +{ "tolower", DT_IDENT_FUNC, 0, DIF_SUBR_TOLOWER, DT_ATTR_STABCMN, DT_VERS_1_8, + &dt_idops_func, "string(const char *)" }, +{ "toupper", DT_IDENT_FUNC, 0, DIF_SUBR_TOUPPER, DT_ATTR_STABCMN, DT_VERS_1_8, + &dt_idops_func, "string(const char *)" }, { "trace", DT_IDENT_ACTFUNC, 0, DT_ACT_TRACE, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_func, "void(@)" }, { "tracemem", DT_IDENT_ACTFUNC, 0, DT_ACT_TRACEMEM, DT_ATTR_STABCMN, DT_VERS_1_0, - &dt_idops_func, "void(@, size_t)" }, + &dt_idops_func, "void(@, size_t, ...)" }, { "trunc", DT_IDENT_ACTFUNC, 0, DT_ACT_TRUNC, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_func, "void(...)" }, { "uaddr", DT_IDENT_ACTFUNC, 0, DT_ACT_UADDR, DT_ATTR_STABCMN, @@ -397,6 +418,8 @@ static const dt_ident_t _dtrace_globals[] = { &dt_idops_type, "uint32_t" }, { "usym", DT_IDENT_ACTFUNC, 0, DT_ACT_USYM, DT_ATTR_STABCMN, DT_VERS_1_2, &dt_idops_func, "_usymaddr(uintptr_t)" }, +{ "vmregs", DT_IDENT_ARRAY, 0, DIF_VAR_VMREGS, DT_ATTR_STABCMN, DT_VERS_1_7, + &dt_idops_regs, NULL }, { "vtimestamp", DT_IDENT_SCALAR, 0, DIF_VAR_VTIMESTAMP, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_type, "uint64_t" }, @@ -1339,6 +1362,7 @@ dtrace_close(dtrace_hdl_t *dtp) dt_epid_destroy(dtp); dt_aggid_destroy(dtp); dt_format_destroy(dtp); + dt_strdata_destroy(dtp); dt_buffered_destroy(dtp); dt_aggregate_destroy(dtp); free(dtp->dt_buf.dtbd_data); diff --git a/lib/libdtrace/common/dt_options.c b/lib/libdtrace/common/dt_options.c index 5353bfa..426f8cb 100644 --- a/lib/libdtrace/common/dt_options.c +++ b/lib/libdtrace/common/dt_options.c @@ -24,8 +24,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -837,30 +835,6 @@ dt_options_load(dtrace_hdl_t *dtp) return (0); } -/*ARGSUSED*/ -static int -dt_opt_preallocate(dtrace_hdl_t *dtp, const char *arg, uintptr_t option) -{ - dtrace_optval_t size; - void *p; - - if (arg == NULL || dt_optval_parse(arg, &size) != 0) - return (dt_set_errno(dtp, EDT_BADOPTVAL)); - - if (size > SIZE_MAX) - size = SIZE_MAX; - - if ((p = dt_zalloc(dtp, size)) == NULL) { - do { - size /= 2; - } while ((p = dt_zalloc(dtp, size)) == NULL); - } - - dt_free(dtp, p); - - return (0); -} - typedef struct dt_option { const char *o_name; int (*o_func)(dtrace_hdl_t *, const char *, uintptr_t); @@ -899,7 +873,6 @@ static const dt_option_t _dtrace_ctoptions[] = { { "linktype", dt_opt_linktype }, { "nolibs", dt_opt_cflags, DTRACE_C_NOLIBS }, { "pgmax", dt_opt_pgmax }, - { "preallocate", dt_opt_preallocate }, { "pspec", dt_opt_cflags, DTRACE_C_PSPEC }, { "stdc", dt_opt_stdc }, { "strip", dt_opt_dflags, DTRACE_D_STRIP }, diff --git a/lib/libdtrace/common/dt_parser.c b/lib/libdtrace/common/dt_parser.c index 6ad30a9..0571589 100644 --- a/lib/libdtrace/common/dt_parser.c +++ b/lib/libdtrace/common/dt_parser.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. 
*/ /* @@ -719,12 +720,19 @@ dt_node_type_name(const dt_node_t *dnp, char *buf, size_t len) size_t dt_node_type_size(const dt_node_t *dnp) { + ctf_id_t base; + if (dnp->dn_kind == DT_NODE_STRING) return (strlen(dnp->dn_string) + 1); if (dt_node_is_dynamic(dnp) && dnp->dn_ident != NULL) return (dt_ident_size(dnp->dn_ident)); + base = ctf_type_resolve(dnp->dn_ctfp, dnp->dn_type); + + if (ctf_type_kind(dnp->dn_ctfp, base) == CTF_K_FORWARD) + return (0); + return (ctf_type_size(dnp->dn_ctfp, dnp->dn_type)); } diff --git a/lib/libdtrace/common/dt_pragma.c b/lib/libdtrace/common/dt_pragma.c index 9cb3c3b..9fae5ac 100644 --- a/lib/libdtrace/common/dt_pragma.c +++ b/lib/libdtrace/common/dt_pragma.c @@ -21,14 +21,19 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ #include #include #include +#include #include #include +#include +#include + #include #include #include @@ -196,6 +201,29 @@ dt_pragma_binding(const char *prname, dt_node_t *dnp) dtp->dt_globals->dh_defer = &dt_pragma_apply; } +static void +dt_pragma_depends_finddep(dtrace_hdl_t *dtp, const char *lname, char *lib, + size_t len) +{ + dt_dirpath_t *dirp; + struct stat sbuf; + int found = 0; + + for (dirp = dt_list_next(&dtp->dt_lib_path); dirp != NULL; + dirp = dt_list_next(dirp)) { + (void) snprintf(lib, len, "%s/%s", dirp->dir_path, lname); + + if (stat(lib, &sbuf) == 0) { + found = 1; + break; + } + } + + if (!found) + xyerror(D_PRAGMA_DEPEND, + "failed to find dependency in libpath: %s", lname); +} + /* * The #pragma depends_on directive can be used to express a dependency on a * module, provider or library which if not present will cause processing to @@ -225,16 +253,13 @@ dt_pragma_depends(const char *prname, dt_node_t *cnp) if (yypcb->pcb_cflags & DTRACE_C_CTL) { assert(dtp->dt_filetag != NULL); - /* - * We have the file we are working on in dtp->dt_filetag - * so find that node and add the dependency in. 
- */ + dt_pragma_depends_finddep(dtp, nnp->dn_string, lib, + sizeof (lib)); + dld = dt_lib_depend_lookup(&dtp->dt_lib_dep, dtp->dt_filetag); assert(dld != NULL); - (void) snprintf(lib, sizeof (lib), "%s%s", - dld->dtld_libpath, nnp->dn_string); if ((dt_lib_depend_add(dtp, &dld->dtld_dependencies, lib)) != 0) { xyerror(D_PRAGMA_DEPEND, @@ -256,8 +281,8 @@ dt_pragma_depends(const char *prname, dt_node_t *cnp) dtp->dt_filetag); assert(dld != NULL); - (void) snprintf(lib, sizeof (lib), "%s%s", - dld->dtld_libpath, nnp->dn_string); + dt_pragma_depends_finddep(dtp, nnp->dn_string, lib, + sizeof (lib)); dld = dt_lib_depend_lookup(&dtp->dt_lib_dep_sorted, lib); assert(dld != NULL); diff --git a/lib/libdtrace/common/dt_print.c b/lib/libdtrace/common/dt_print.c new file mode 100644 index 0000000..261fc8c --- /dev/null +++ b/lib/libdtrace/common/dt_print.c @@ -0,0 +1,648 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011 by Delphix. All rights reserved. + */ + +/* + * DTrace print() action + * + * This file contains the post-processing logic for the print() action. 
The + * print action behaves identically to trace() in that it generates a + * DTRACEACT_DIFEXPR action, but the action argument field refers to a CTF type + * string stored in the DOF string table (similar to printf formats). We + * take the result of the trace action and post-process it in the fashion of + * MDB's ::print dcmd. + * + * This implementation differs from MDB's in the following ways: + * + * - We do not expose any options or flags. The behavior of print() is + * equivalent to "::print -tn". + * + * - MDB will display "holes" in structures (unused padding between + * members). + * + * - When printing arrays of structures, MDB will leave a trailing ',' + * after the last element. + * + * - MDB will print time_t types as date and time. + * + * - MDB will detect when an enum is actually the OR of several flags, + * and print it out with the constituent flags separated. + * + * - For large arrays, MDB will print the first few members and then + * print a "..." continuation line. + * + * - MDB will break and wrap arrays at 80 columns. + * + * - MDB prints out floats and doubles by hand, as it must run in kmdb + * context. We're able to leverage the printf() format strings, + * but the result is a slightly different format. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* determines whether the given integer CTF encoding is a character */ +#define CTF_IS_CHAR(e) \ + (((e).cte_format & (CTF_INT_CHAR | CTF_INT_SIGNED)) == \ + (CTF_INT_CHAR | CTF_INT_SIGNED) && (e).cte_bits == NBBY) +/* determines whether the given CTF kind is a struct or union */ +#define CTF_IS_STRUCTLIKE(k) \ + ((k) == CTF_K_STRUCT || (k) == CTF_K_UNION) + +/* + * Print structure passed down recursively through printing algorithm. 
+ */
+typedef struct dt_printarg {
+	caddr_t		pa_addr;	/* start of the traced data */
+	ctf_file_t	*pa_ctfp;	/* CTF container for type lookups */
+	int		pa_depth;	/* depth of the last member printed */
+	int		pa_nest;	/* indent added by enclosing arrays */
+	FILE		*pa_file;	/* stream receiving the output */
+} dt_printarg_t;
+
+static int dt_print_member(const char *, ctf_id_t, ulong_t, int, void *);
+
+/*
+ * Fill 'buf' with the name of CTF type 'id'.  When ctf_type_name() cannot
+ * produce a name, the numeric type ID wrapped in angle brackets is stored
+ * instead, so the caller always has something to print.
+ */
+static void
+dt_print_type_name(ctf_file_t *ctfp, ctf_id_t id, char *buf, size_t buflen)
+{
+	if (ctf_type_name(ctfp, id, buf, buflen) != NULL)
+		return;
+
+	(void) snprintf(buf, buflen, "<%ld>", id);
+}
+
+/*
+ * Emit the closing braces owed between the depth of the previously printed
+ * member (pa_depth) and the new 'depth'.  Nothing tells us when a struct or
+ * union ends, so the drop in depth is the only signal.  Every brace except
+ * the last one is followed by a newline; the caller supplies the final one.
+ */
+static void
+dt_print_trailing_braces(dt_printarg_t *pap, int depth)
+{
+	int level = pap->pa_depth;
+
+	while (level > depth) {
+		int width = (level + pap->pa_nest - 1) * 4;
+		const char *sep = (level == depth + 1) ? "" : "\n";
+
+		(void) fprintf(pap->pa_file, "%*s}%s", width, "", sep);
+		level--;
+	}
+}
+
+/*
+ * Emit four spaces of indentation for every level of member depth and array
+ * nesting currently in effect.
+ */
+static void
+dt_print_indent(dt_printarg_t *pap)
+{
+	int width = (pap->pa_depth + pap->pa_nest) * 4;
+
+	(void) fprintf(pap->pa_file, "%*s", width, "");
+}
+
+/*
+ * Print a bitfield.  It's worth noting that the D compiler support for
+ * bitfields is currently broken; printing "D`user_desc_t" (pulled in by the
+ * various D provider files) will produce incorrect results compared to
+ * "genunix`user_desc_t".
+ */
+static void
+print_bitfield(dt_printarg_t *pap, ulong_t off, ctf_encoding_t *ep)
+{
+	FILE *fp = pap->pa_file;
+	caddr_t addr = pap->pa_addr + off / NBBY;
+	uint64_t mask;
+	uint64_t value = 0;
+	size_t size = (ep->cte_bits + (NBBY - 1)) / NBBY;
+	uint8_t *buf = (uint8_t *)&value;
+	uint8_t shift;
+
+	/*
+	 * The field is staged in 'value', so anything wider than 64 bits
+	 * cannot be represented.  dt_print_int() sends integers larger than
+	 * 8 bytes here; report them instead of letting the copy below run
+	 * past the end of 'value'.
+	 */
+	if (size > sizeof (value)) {
+		(void) fprintf(fp, "<invalid size %u>", (uint_t)size);
+		return;
+	}
+
+	/*
+	 * Shifting a 64-bit quantity by 64 or more is undefined, so build the
+	 * all-ones mask explicitly for a full-width field.
+	 */
+	if (ep->cte_bits >= sizeof (value) * NBBY)
+		mask = ~(uint64_t)0;
+	else
+		mask = (1ULL << ep->cte_bits) - 1;
+
+	/*
+	 * On big-endian machines, we need to adjust the buf pointer to refer
+	 * to the lowest 'size' bytes in 'value', and we need to shift based on
+	 * the offset from the end of the data, not the offset of the start.
+	 */
+#ifdef _BIG_ENDIAN
+	buf += sizeof (value) - size;
+	off += ep->cte_bits;
+#endif
+	bcopy(addr, buf, size);
+	shift = off % NBBY;
+
+	/*
+	 * Offsets are counted from opposite ends on little- and
+	 * big-endian machines.
+	 */
+#ifdef _BIG_ENDIAN
+	shift = NBBY - shift;
+#endif
+
+	/*
+	 * If the bits we want do not begin on a byte boundary, shift the data
+	 * right so that the value is in the lowest 'cte_bits' of 'value'.
+	 */
+	if (off % NBBY != 0)
+		value >>= shift;
+	value &= mask;
+
+	(void) fprintf(fp, "%#llx", (u_longlong_t)value);
+}
+
+/*
+ * Dump the contents of memory as a fixed-size integer in hex.  'size' must be
+ * 1, 2, 4 or 8 bytes; any other size is reported in place of a value.
+ */
+static void
+dt_print_hex(FILE *fp, caddr_t addr, size_t size)
+{
+	switch (size) {
+	case sizeof (uint8_t):
+		(void) fprintf(fp, "%#x", *(uint8_t *)addr);
+		break;
+	case sizeof (uint16_t):
+		/* LINTED - alignment */
+		(void) fprintf(fp, "%#x", *(uint16_t *)addr);
+		break;
+	case sizeof (uint32_t):
+		/* LINTED - alignment */
+		(void) fprintf(fp, "%#x", *(uint32_t *)addr);
+		break;
+	case sizeof (uint64_t):
+		(void) fprintf(fp, "%#llx",
+		    /* LINTED - alignment */
+		    (unsigned long long)*(uint64_t *)addr);
+		break;
+	default:
+		(void) fprintf(fp, "<invalid size %u>", (uint_t)size);
+	}
+}
+
+/*
+ * Print an integer type.  Before dumping the contents via dt_print_hex(), we
+ * first check the encoding to see if it's part of a bitfield or a character.
+ */
+static void
+dt_print_int(ctf_id_t base, ulong_t off, dt_printarg_t *pap)
+{
+	FILE *fp = pap->pa_file;
+	ctf_file_t *ctfp = pap->pa_ctfp;
+	ctf_encoding_t e;
+	size_t size;
+	caddr_t addr = pap->pa_addr + off / NBBY;
+
+	if (ctf_type_encoding(ctfp, base, &e) == CTF_ERR) {
+		(void) fprintf(fp, "<unknown encoding>");
+		return;
+	}
+
+	/*
+	 * This comes from MDB - it's not clear under what circumstances this
+	 * would be found.
+	 */
+	if (e.cte_format & CTF_INT_VARARGS) {
+		(void) fprintf(fp, "...");
+		return;
+	}
+
+	/*
+	 * We print this as a bitfield if the bit encoding indicates it's not
+	 * an even power of two byte size, or is larger than 8 bytes.
+	 */
+	size = e.cte_bits / NBBY;
+	if (size > 8 || (e.cte_bits % NBBY) != 0 || (size & (size - 1)) != 0) {
+		print_bitfield(pap, off, &e);
+		return;
+	}
+
+	/*
+	 * If this is a character, print it out as such.  The byte is read as
+	 * unsigned: isprint() is only defined for values representable as
+	 * unsigned char (or EOF), and a negative value passed to "%03o" would
+	 * be printed as a full-width octal int rather than three digits.
+	 */
+	if (CTF_IS_CHAR(e)) {
+		unsigned char c = *(unsigned char *)addr;
+
+		if (isprint(c))
+			(void) fprintf(fp, "'%c'", c);
+		else if (c == 0)
+			(void) fprintf(fp, "'\\0'");
+		else
+			(void) fprintf(fp, "'\\%03o'", c);
+		return;
+	}
+
+	dt_print_hex(fp, addr, size);
+}
+
+/*
+ * Print a floating point (float, double, long double) value.
+ */
+static void
+dt_print_float(ctf_id_t base, ulong_t off, dt_printarg_t *pap)
+{
+	FILE *fp = pap->pa_file;
+	ctf_file_t *ctfp = pap->pa_ctfp;
+	ctf_encoding_t e;
+	caddr_t addr = pap->pa_addr + off / NBBY;
+
+	/*
+	 * Without an encoding we cannot tell how wide the value is.  Say so,
+	 * as dt_print_int() does, rather than leaving the value blank.
+	 */
+	if (ctf_type_encoding(ctfp, base, &e) != 0) {
+		(void) fprintf(fp, "<unknown encoding>");
+		return;
+	}
+
+	/*
+	 * Only print when the CTF format and bit width both match the
+	 * corresponding native C type; anything else is reported as unknown.
+	 */
+	if (e.cte_format == CTF_FP_SINGLE &&
+	    e.cte_bits == sizeof (float) * NBBY) {
+		/* LINTED - alignment */
+		(void) fprintf(fp, "%+.7e", *((float *)addr));
+	} else if (e.cte_format == CTF_FP_DOUBLE &&
+	    e.cte_bits == sizeof (double) * NBBY) {
+		/* LINTED - alignment */
+		(void) fprintf(fp, "%+.7e", *((double *)addr));
+	} else if (e.cte_format == CTF_FP_LDOUBLE &&
+	    e.cte_bits == sizeof (long double) * NBBY) {
+		/* LINTED - alignment */
+		(void) fprintf(fp, "%+.16LE", *((long double *)addr));
+	} else {
+		(void) fprintf(fp, "<unknown encoding>");
+	}
+}
+
+/*
+ * A pointer is printed as a fixed-size integer.  This is used both for
+ * pointers and functions.  The width comes from ctf_type_size(); a size that
+ * dt_print_hex() does not handle is reported by that function.
+ */
+static void
+dt_print_ptr(ctf_id_t base, ulong_t off, dt_printarg_t *pap)
+{
+	FILE *fp = pap->pa_file;
+	ctf_file_t *ctfp = pap->pa_ctfp;
+	caddr_t addr = pap->pa_addr + off / NBBY;
+	size_t size = ctf_type_size(ctfp, base);
+
+	dt_print_hex(fp, addr, size);
+}
+
+/*
+ * Print out an array.  This is somewhat complex, as we must manually visit
+ * each member, and recursively invoke ctf_type_visit() for each member.  If
+ * the members are non-structs, then we print them out directly:
+ *
+ *	[ 0x14, 0x2e, 0 ]
+ *
+ * If they are structs, then we print out the necessary leading and trailing
+ * braces, to end up with:
+ *
+ *	[
+ *	    type {
+ *	    ...
+ *	    },
+ *	    type {
+ *	    ...
+ *	    }
+ *	]
+ *
+ * We also use a heuristic to detect whether the array looks like a character
+ * array.
+ * If the encoding indicates it's a character, and we have all printable
+ * characters followed by a null byte, then we display it as a string:
+ *
+ *	[ "string" ]
+ */
+static void
+dt_print_array(ctf_id_t base, ulong_t off, dt_printarg_t *pap)
+{
+	FILE *fp = pap->pa_file;
+	ctf_file_t *ctfp = pap->pa_ctfp;
+	caddr_t addr = pap->pa_addr + off / NBBY;
+	ctf_arinfo_t car;
+	ssize_t eltsize;
+	ctf_encoding_t e;
+	int i;
+	boolean_t isstring;
+	int kind;
+	ctf_id_t rtype;
+
+	if (ctf_array_info(ctfp, base, &car) == CTF_ERR) {
+		(void) fprintf(fp, "0x%p", (void *)addr);
+		return;
+	}
+
+	if ((eltsize = ctf_type_size(ctfp, car.ctr_contents)) < 0 ||
+	    (rtype = ctf_type_resolve(ctfp, car.ctr_contents)) == CTF_ERR ||
+	    (kind = ctf_type_kind(ctfp, rtype)) == CTF_ERR) {
+		(void) fprintf(fp, "<invalid type %lu>", car.ctr_contents);
+		return;
+	}
+
+	/*
+	 * See if this looks like a string.  The bytes are examined as
+	 * unsigned char, since isprint() is undefined for negative values.
+	 * 'c' starts at zero so that an array with no elements does not
+	 * leave it unset for the check after the loop.
+	 */
+	isstring = B_FALSE;
+	if (kind == CTF_K_INTEGER &&
+	    ctf_type_encoding(ctfp, rtype, &e) != CTF_ERR && CTF_IS_CHAR(e)) {
+		unsigned char c = 0;
+
+		for (i = 0; i < car.ctr_nelems; i++) {
+			c = *((unsigned char *)addr + eltsize * i);
+			if (!isprint(c) || c == '\0')
+				break;
+		}
+
+		/* a string only if we stopped early, and on a terminator */
+		if (i != car.ctr_nelems && c == '\0')
+			isstring = B_TRUE;
+	}
+
+	/*
+	 * As a slight aesthetic optimization, if we are a top-level type, then
+	 * don't bother printing out the brackets.  This lets print("foo") look
+	 * like:
+	 *
+	 *	string "foo"
+	 *
+	 * As D will internally represent this as a char[256] array.
+	 */
+	if (!isstring || pap->pa_depth != 0)
+		(void) fprintf(fp, "[ ");
+
+	if (isstring)
+		(void) fprintf(fp, "\"");
+
+	for (i = 0; i < car.ctr_nelems; i++) {
+		if (isstring) {
+			char c = *((char *)addr + eltsize * i);
+			if (c == '\0')
+				break;
+			(void) fprintf(fp, "%c", c);
+		} else {
+			/*
+			 * Recursively invoke ctf_type_visit() on each member.
+			 * We setup a new printarg struct with 'pa_nest' set to
+			 * indicate that we are within a nested array.
+			 */
+			dt_printarg_t pa = *pap;
+			pa.pa_nest += pap->pa_depth + 1;
+			pa.pa_depth = 0;
+			pa.pa_addr = addr + eltsize * i;
+			(void) ctf_type_visit(ctfp, car.ctr_contents,
+			    dt_print_member, &pa);
+
+			dt_print_trailing_braces(&pa, 0);
+			if (i != car.ctr_nelems - 1)
+				(void) fprintf(fp, ", ");
+			else if (CTF_IS_STRUCTLIKE(kind))
+				(void) fprintf(fp, "\n");
+		}
+	}
+
+	if (isstring)
+		(void) fprintf(fp, "\"");
+
+	if (!isstring || pap->pa_depth != 0) {
+		if (CTF_IS_STRUCTLIKE(kind))
+			dt_print_indent(pap);
+		else
+			(void) fprintf(fp, " ");
+		(void) fprintf(fp, "]");
+	}
+}
+
+/*
+ * This is used by both structs and unions to print the leading brace.  The
+ * matching closing brace is produced later by dt_print_trailing_braces().
+ */
+/* ARGSUSED */
+static void
+dt_print_structlike(ctf_id_t id, ulong_t off, dt_printarg_t *pap)
+{
+	(void) fprintf(pap->pa_file, "{");
+}
+
+/*
+ * For enums, we try to print the enum name, and fall back to the value if it
+ * can't be determined.  We do not do any fancy flag processing like mdb.
+ */
+static void
+dt_print_enum(ctf_id_t base, ulong_t off, dt_printarg_t *pap)
+{
+	FILE *fp = pap->pa_file;
+	ctf_file_t *ctfp = pap->pa_ctfp;
+	caddr_t addr = pap->pa_addr + off / NBBY;
+	const char *ename;
+	ssize_t size;
+	int value;
+
+	/*
+	 * Load the traced value using the storage size that CTF records for
+	 * this enum; it is not necessarily sizeof (int).  Without this load
+	 * every enum would be reported as the enumerator whose value is 0.
+	 */
+	size = ctf_type_size(ctfp, base);
+	switch (size) {
+	case sizeof (uint8_t):
+		value = *(uint8_t *)addr;
+		break;
+	case sizeof (uint16_t):
+		/* LINTED - alignment */
+		value = *(uint16_t *)addr;
+		break;
+	case sizeof (int32_t):
+		/* LINTED - alignment */
+		value = *(int32_t *)addr;
+		break;
+	default:
+		(void) fprintf(fp, "<invalid enum size %u>", (uint_t)size);
+		return;
+	}
+
+	if ((ename = ctf_enum_name(ctfp, base, value)) != NULL)
+		(void) fprintf(fp, "%s", ename);
+	else
+		(void) fprintf(fp, "%d", value);
+}
+
+/*
+ * Forward declaration.  There's not much to do here without the complete
+ * type information, so just print out this fact and drive on.
+ */
+/* ARGSUSED */
+static void
+dt_print_tag(ctf_id_t base, ulong_t off, dt_printarg_t *pap)
+{
+	(void) fprintf(pap->pa_file, "<forward decl>");
+}
+
+typedef void dt_printarg_f(ctf_id_t, ulong_t, dt_printarg_t *);
+
+/*
+ * Per-kind value printers.  dt_print_member() indexes this table with
+ * (kind - 1) after checking that kind lies between CTF_K_INTEGER and
+ * CTF_K_FORWARD, so the entries must stay in CTF kind order.
+ */
+static dt_printarg_f *const dt_printfuncs[] = {
+	dt_print_int,		/* CTF_K_INTEGER */
+	dt_print_float,		/* CTF_K_FLOAT */
+	dt_print_ptr,		/* CTF_K_POINTER */
+	dt_print_array,		/* CTF_K_ARRAY */
+	dt_print_ptr,		/* CTF_K_FUNCTION */
+	dt_print_structlike,	/* CTF_K_STRUCT */
+	dt_print_structlike,	/* CTF_K_UNION */
+	dt_print_enum,		/* CTF_K_ENUM */
+	dt_print_tag		/* CTF_K_FORWARD */
+};
+
+/*
+ * Print one member of a structure.  This callback is invoked from
+ * ctf_type_visit() recursively.  'data' is the dt_printarg_t describing the
+ * output in progress; its pa_depth is updated to 'depth' on every call.
+ * Always returns 0.
+ */
+static int
+dt_print_member(const char *name, ctf_id_t id, ulong_t off, int depth,
+    void *data)
+{
+	char type[DT_TYPE_NAMELEN];
+	int kind;
+	dt_printarg_t *pap = data;
+	FILE *fp = pap->pa_file;
+	ctf_file_t *ctfp = pap->pa_ctfp;
+	boolean_t arraymember;
+	boolean_t brief;
+	ctf_encoding_t e;
+	ctf_id_t rtype;
+
+	dt_print_trailing_braces(pap, depth);
+	/*
+	 * dt_print_trailing_braces() doesn't include the trailing newline; add
+	 * it here if necessary.
+	 */
+	if (depth < pap->pa_depth)
+		(void) fprintf(fp, "\n");
+	pap->pa_depth = depth;
+
+	if ((rtype = ctf_type_resolve(ctfp, id)) == CTF_ERR ||
+	    (kind = ctf_type_kind(ctfp, rtype)) == CTF_ERR ||
+	    kind < CTF_K_INTEGER || kind > CTF_K_FORWARD) {
+		dt_print_indent(pap);
+		(void) fprintf(fp, "%s = <invalid type %lu>", name, id);
+		return (0);
+	}
+
+	dt_print_type_name(ctfp, id, type, sizeof (type));
+
+	arraymember = (pap->pa_nest != 0 && depth == 0);
+	brief = (arraymember && !CTF_IS_STRUCTLIKE(kind));
+
+	if (!brief) {
+		/*
+		 * If this is a direct array member and a struct (otherwise
+		 * brief would be true), then print a trailing newline, as the
+		 * array printing code doesn't include it because it might be a
+		 * simple type.
+		 */
+		if (arraymember)
+			(void) fprintf(fp, "\n");
+		dt_print_indent(pap);
+
+		/* always print the type */
+		(void) fprintf(fp, "%s", type);
+		if (name[0] != '\0') {
+			size_t tlen = strlen(type);
+
+			/*
+			 * For aesthetics, we don't include a space between the
+			 * type name and member name if the type is a pointer.
+			 * This will give us "void *foo =" instead of "void *
+			 * foo =".  Unions also have the odd behavior that the
+			 * type name is returned as "union ", with a trailing
+			 * space, so we also avoid printing a space if the type
+			 * name already ends with a space.  An empty type name
+			 * has no last character to inspect, so it simply gets
+			 * the separating space.
+			 */
+			if (tlen == 0 || (type[tlen - 1] != '*' &&
+			    type[tlen - 1] != ' ')) {
+				(void) fprintf(fp, " ");
+			}
+			(void) fprintf(fp, "%s", name);
+
+			/*
+			 * If this looks like a bitfield, or is an integer not
+			 * aligned on a byte boundary, print the number of
+			 * bits after the name.
+			 */
+			if (kind == CTF_K_INTEGER &&
+			    ctf_type_encoding(ctfp, id, &e) == 0) {
+				ulong_t bits = e.cte_bits;
+				ulong_t size = bits / NBBY;
+
+				if (bits % NBBY != 0 ||
+				    off % NBBY != 0 ||
+				    size > 8 ||
+				    size != ctf_type_size(ctfp, id)) {
+					(void) fprintf(fp, " :%lu", bits);
+				}
+			}
+
+			(void) fprintf(fp, " =");
+		}
+		(void) fprintf(fp, " ");
+	}
+
+	dt_printfuncs[kind - 1](rtype, off, pap);
+
+	/* direct simple array members are not separated by newlines */
+	if (!brief)
+		(void) fprintf(fp, "\n");
+
+	return (0);
+}
+
+/*
+ * Main print function invoked by dt_consume_cpu().  'typename' is expected to
+ * be "module`id", where id is a decimal CTF type ID.  Returns 'len' once the
+ * data has been printed, or 0 if the type could not be located, in which case
+ * nothing has been written to 'fp'.
+ */
+int
+dtrace_print(dtrace_hdl_t *dtp, FILE *fp, const char *typename,
+    caddr_t addr, size_t len)
+{
+	const char *s;
+	char *object;
+	dt_printarg_t pa;
+	ctf_id_t id;
+	dt_module_t *dmp;
+
+	/*
+	 * Split the fully-qualified type ID (module`id).  This should
+	 * always be the format, but if for some reason we don't find the
+	 * expected value, return 0 to fall back to the generic trace()
+	 * behavior.
+	 */
+	for (s = typename; *s != '\0' && *s != '`'; s++)
+		;
+
+	if (*s != '`')
+		return (0);
+
+	/* copy the module name (everything before the backtick) */
+	object = alloca(s - typename + 1);
+	bcopy(typename, object, s - typename);
+	object[s - typename] = '\0';
+	id = atoi(s + 1);
+
+	/*
+	 * Try to get the CTF kind for this id.  If something has gone horribly
+	 * wrong and we can't resolve the ID, bail out and let trace() do the
+	 * work.
+	 */
+	dmp = dt_module_lookup_by_name(dtp, object);
+	if (dmp == NULL || ctf_type_kind(dt_module_getctf(dtp, dmp),
+	    id) == CTF_ERR) {
+		return (0);
+	}
+
+	/* setup the print structure and kick off the main print routine */
+	pa.pa_addr = addr;
+	pa.pa_ctfp = dt_module_getctf(dtp, dmp);
+	pa.pa_nest = 0;
+	pa.pa_depth = 0;
+	pa.pa_file = fp;
+	(void) ctf_type_visit(pa.pa_ctfp, id, dt_print_member, &pa);
+
+	/* close any structs or unions still open after the last member */
+	dt_print_trailing_braces(&pa, 0);
+
+	return (len);
+}
diff --git a/lib/libdtrace/common/dt_printf.c b/lib/libdtrace/common/dt_printf.c index 5290478..eabc423 100644 --- a/lib/libdtrace/common/dt_printf.c +++ b/lib/libdtrace/common/dt_printf.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/ #include @@ -1300,6 +1301,14 @@ pfprint_lquantize(dtrace_hdl_t *dtp, FILE *fp, const char *format, return (dt_print_lquantize(dtp, fp, addr, size, normal)); } +/*ARGSUSED*/ +static int +pfprint_llquantize(dtrace_hdl_t *dtp, FILE *fp, const char *format, + const dt_pfargd_t *pfd, const void *addr, size_t size, uint64_t normal) +{ + return (dt_print_llquantize(dtp, fp, addr, size, normal)); +} + static int dt_printf_format(dtrace_hdl_t *dtp, FILE *fp, const dt_pfargv_t *pfv, const dtrace_recdesc_t *recs, uint_t nrecs, const void *buf, @@ -1485,6 +1494,9 @@ dt_printf_format(dtrace_hdl_t *dtp, FILE *fp, const dt_pfargv_t *pfv, case DTRACEAGG_LQUANTIZE: func = pfprint_lquantize; break; + case DTRACEAGG_LLQUANTIZE: + func = pfprint_llquantize; + break; case DTRACEACT_MOD: func = pfprint_mod; break; diff --git a/lib/libdtrace/common/dt_program.c b/lib/libdtrace/common/dt_program.c index 19f377d..7d725bd 100644 --- a/lib/libdtrace/common/dt_program.c +++ b/lib/libdtrace/common/dt_program.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -347,6 +348,7 @@ dtrace_stmt_destroy(dtrace_hdl_t *dtp, dtrace_stmtdesc_t *sdp) if (sdp->dtsd_fmtdata != NULL) dt_printf_destroy(sdp->dtsd_fmtdata); + dt_free(dtp, sdp->dtsd_strdata); dt_ecbdesc_release(dtp, sdp->dtsd_ecbdesc); dt_free(dtp, sdp); diff --git a/lib/libdtrace/common/dtrace.h b/lib/libdtrace/common/dtrace.h index 1c04120..87df1ca 100644 --- a/lib/libdtrace/common/dtrace.h +++ b/lib/libdtrace/common/dtrace.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _DTRACE_H #define _DTRACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -148,6 +150,7 @@ typedef struct dtrace_stmtdesc { dtrace_actdesc_t *dtsd_action_last; /* last action in action list */ void *dtsd_aggdata; /* aggregation data */ void *dtsd_fmtdata; /* type-specific output data */ + void *dtsd_strdata; /* type-specific string data */ void (*dtsd_callback)(); /* callback function for EPID */ void *dtsd_data; /* callback data pointer */ dtrace_attribute_t dtsd_descattr; /* probedesc attributes */ @@ -240,6 +243,18 @@ extern int dtrace_freopen(dtrace_hdl_t *, FILE *, void *, const void *, size_t); /* + * Type-specific output printing + * + * The print() action will associate a string data record that is actually the + * fully-qualified type name of the data traced by the DIFEXPR action. This is + * stored in the same 'format' record from the kernel, but we know by virtue of + * the fact that the action is still DIFEXPR that it is actually a reference to + * plain string data. + */ +extern int dtrace_print(dtrace_hdl_t *, FILE *, const char *, + caddr_t, size_t); + +/* * DTrace Work Interface */ typedef enum { diff --git a/lib/libdtrace/i386/regs.d.in b/lib/libdtrace/i386/regs.d.in index 3328f33..d18c5f7 100644 --- a/lib/libdtrace/i386/regs.d.in +++ b/lib/libdtrace/i386/regs.d.in @@ -23,8 +23,9 @@ * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2011 Joyent, Inc. All rights reserved. 
+ */ inline int R_GS = @GS@; #pragma D binding "1.0" R_GS @@ -115,3 +116,149 @@ inline int R_R14 = @REG_R14@; inline int R_R15 = @REG_R15@; #pragma D binding "1.0" R_R15 +enum vmregs_vmx { + VMX_VIRTUAL_PROCESSOR_ID = 0x00000000, + VMX_GUEST_ES_SELECTOR = 0x00000800, + VMX_GUEST_CS_SELECTOR = 0x00000802, + VMX_GUEST_SS_SELECTOR = 0x00000804, + VMX_GUEST_DS_SELECTOR = 0x00000806, + VMX_GUEST_FS_SELECTOR = 0x00000808, + VMX_GUEST_GS_SELECTOR = 0x0000080a, + VMX_GUEST_LDTR_SELECTOR = 0x0000080c, + VMX_GUEST_TR_SELECTOR = 0x0000080e, + VMX_HOST_ES_SELECTOR = 0x00000c00, + VMX_HOST_CS_SELECTOR = 0x00000c02, + VMX_HOST_SS_SELECTOR = 0x00000c04, + VMX_HOST_DS_SELECTOR = 0x00000c06, + VMX_HOST_FS_SELECTOR = 0x00000c08, + VMX_HOST_GS_SELECTOR = 0x00000c0a, + VMX_HOST_TR_SELECTOR = 0x00000c0c, + VMX_IO_BITMAP_A = 0x00002000, + VMX_IO_BITMAP_A_HIGH = 0x00002001, + VMX_IO_BITMAP_B = 0x00002002, + VMX_IO_BITMAP_B_HIGH = 0x00002003, + VMX_MSR_BITMAP = 0x00002004, + VMX_MSR_BITMAP_HIGH = 0x00002005, + VMX_VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VMX_VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, + VMX_VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VMX_VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, + VMX_VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + VMX_VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + VMX_TSC_OFFSET = 0x00002010, + VMX_TSC_OFFSET_HIGH = 0x00002011, + VMX_VIRTUAL_APIC_PAGE_ADDR = 0x00002012, + VMX_VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + VMX_APIC_ACCESS_ADDR = 0x00002014, + VMX_APIC_ACCESS_ADDR_HIGH = 0x00002015, + VMX_EPT_POINTER = 0x0000201a, + VMX_EPT_POINTER_HIGH = 0x0000201b, + VMX_GUEST_PHYSICAL_ADDRESS = 0x00002400, + VMX_GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, + VMX_VMCS_LINK_POINTER = 0x00002800, + VMX_VMCS_LINK_POINTER_HIGH = 0x00002801, + VMX_GUEST_IA32_DEBUGCTL = 0x00002802, + VMX_GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + VMX_GUEST_IA32_PAT = 0x00002804, + VMX_GUEST_IA32_PAT_HIGH = 0x00002805, + VMX_GUEST_PDPTR0 = 0x0000280a, + VMX_GUEST_PDPTR0_HIGH = 0x0000280b, + VMX_GUEST_PDPTR1 
= 0x0000280c, + VMX_GUEST_PDPTR1_HIGH = 0x0000280d, + VMX_GUEST_PDPTR2 = 0x0000280e, + VMX_GUEST_PDPTR2_HIGH = 0x0000280f, + VMX_GUEST_PDPTR3 = 0x00002810, + VMX_GUEST_PDPTR3_HIGH = 0x00002811, + VMX_HOST_IA32_PAT = 0x00002c00, + VMX_HOST_IA32_PAT_HIGH = 0x00002c01, + VMX_PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + VMX_CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + VMX_EXCEPTION_BITMAP = 0x00004004, + VMX_PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + VMX_PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + VMX_CR3_TARGET_COUNT = 0x0000400a, + VMX_VM_EXIT_CONTROLS = 0x0000400c, + VMX_VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VMX_VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VMX_VM_ENTRY_CONTROLS = 0x00004012, + VMX_VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VMX_VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VMX_VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VMX_VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, + VMX_TPR_THRESHOLD = 0x0000401c, + VMX_SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + VMX_PLE_GAP = 0x00004020, + VMX_PLE_WINDOW = 0x00004022, + VMX_VM_INSTRUCTION_ERROR = 0x00004400, + VMX_VM_EXIT_REASON = 0x00004402, + VMX_VM_EXIT_INTR_INFO = 0x00004404, + VMX_VM_EXIT_INTR_ERROR_CODE = 0x00004406, + VMX_IDT_VECTORING_INFO_FIELD = 0x00004408, + VMX_IDT_VECTORING_ERROR_CODE = 0x0000440a, + VMX_VM_EXIT_INSTRUCTION_LEN = 0x0000440c, + VMX_VMX_INSTRUCTION_INFO = 0x0000440e, + VMX_GUEST_ES_LIMIT = 0x00004800, + VMX_GUEST_CS_LIMIT = 0x00004802, + VMX_GUEST_SS_LIMIT = 0x00004804, + VMX_GUEST_DS_LIMIT = 0x00004806, + VMX_GUEST_FS_LIMIT = 0x00004808, + VMX_GUEST_GS_LIMIT = 0x0000480a, + VMX_GUEST_LDTR_LIMIT = 0x0000480c, + VMX_GUEST_TR_LIMIT = 0x0000480e, + VMX_GUEST_GDTR_LIMIT = 0x00004810, + VMX_GUEST_IDTR_LIMIT = 0x00004812, + VMX_GUEST_ES_AR_BYTES = 0x00004814, + VMX_GUEST_CS_AR_BYTES = 0x00004816, + VMX_GUEST_SS_AR_BYTES = 0x00004818, + VMX_GUEST_DS_AR_BYTES = 0x0000481a, + VMX_GUEST_FS_AR_BYTES = 0x0000481c, + VMX_GUEST_GS_AR_BYTES = 0x0000481e, + VMX_GUEST_LDTR_AR_BYTES = 0x00004820, + VMX_GUEST_TR_AR_BYTES = 
0x00004822, + VMX_GUEST_INTERRUPTIBILITY_INFO = 0x00004824, + VMX_GUEST_ACTIVITY_STATE = 0X00004826, + VMX_GUEST_SYSENTER_CS = 0x0000482A, + VMX_HOST_IA32_SYSENTER_CS = 0x00004c00, + VMX_CR0_GUEST_HOST_MASK = 0x00006000, + VMX_CR4_GUEST_HOST_MASK = 0x00006002, + VMX_CR0_READ_SHADOW = 0x00006004, + VMX_CR4_READ_SHADOW = 0x00006006, + VMX_CR3_TARGET_VALUE0 = 0x00006008, + VMX_CR3_TARGET_VALUE1 = 0x0000600a, + VMX_CR3_TARGET_VALUE2 = 0x0000600c, + VMX_CR3_TARGET_VALUE3 = 0x0000600e, + VMX_EXIT_QUALIFICATION = 0x00006400, + VMX_GUEST_LINEAR_ADDRESS = 0x0000640a, + VMX_GUEST_CR0 = 0x00006800, + VMX_GUEST_CR3 = 0x00006802, + VMX_GUEST_CR4 = 0x00006804, + VMX_GUEST_ES_BASE = 0x00006806, + VMX_GUEST_CS_BASE = 0x00006808, + VMX_GUEST_SS_BASE = 0x0000680a, + VMX_GUEST_DS_BASE = 0x0000680c, + VMX_GUEST_FS_BASE = 0x0000680e, + VMX_GUEST_GS_BASE = 0x00006810, + VMX_GUEST_LDTR_BASE = 0x00006812, + VMX_GUEST_TR_BASE = 0x00006814, + VMX_GUEST_GDTR_BASE = 0x00006816, + VMX_GUEST_IDTR_BASE = 0x00006818, + VMX_GUEST_DR7 = 0x0000681a, + VMX_GUEST_RSP = 0x0000681c, + VMX_GUEST_RIP = 0x0000681e, + VMX_GUEST_RFLAGS = 0x00006820, + VMX_GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + VMX_GUEST_SYSENTER_ESP = 0x00006824, + VMX_GUEST_SYSENTER_EIP = 0x00006826, + VMX_HOST_CR0 = 0x00006c00, + VMX_HOST_CR3 = 0x00006c02, + VMX_HOST_CR4 = 0x00006c04, + VMX_HOST_FS_BASE = 0x00006c06, + VMX_HOST_GS_BASE = 0x00006c08, + VMX_HOST_TR_BASE = 0x00006c0a, + VMX_HOST_GDTR_BASE = 0x00006c0c, + VMX_HOST_IDTR_BASE = 0x00006c0e, + VMX_HOST_IA32_SYSENTER_ESP = 0x00006c10, + VMX_HOST_IA32_SYSENTER_EIP = 0x00006c12, + VMX_HOST_RSP = 0x00006c14, + VMX_HOST_RIP = 0x00006c16 +}; diff --git a/man/man1m/dtrace.1m b/man/man1m/dtrace.1m index 1381044..fc71612 100644 --- a/man/man1m/dtrace.1m +++ b/man/man1m/dtrace.1m @@ -3,20 +3,20 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. 
You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH dtrace 1M "4 Aug 2009" "SunOS 5.11" "System Administration Commands" +.TH DTRACE 1M "Aug 4, 2009" .SH NAME dtrace \- DTrace dynamic tracing compiler and tracing utility .SH SYNOPSIS .LP .nf -\fBdtrace\fR [\fB-32\fR | \fB-64\fR] [\fB-aACeFGHhlqSvVwZ\fR] [\fB-b\fR \fIbufsz\fR] [\fB-c\fR \fIcmd\fR] - [\fB-D\fR \fIname\fR [\fI=value\fR]] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-o\fR \fIoutput\fR] - [\fB-s\fR \fIscript\fR] [\fB-U\fR \fIname\fR] [\fB-x\fR \fIarg\fR [\fI=val\fR]] - [\fB-X\fR a | c | s | t] [\fB-p\fR \fIpid\fR] - [\fB-P\fR \fIprovider\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-m\fR [\fIprovider:\fR] \fImodule\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-f\fR [[\fIprovider:\fR] \fImodule:\fR] \fIfunction\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]] +\fBdtrace\fR [\fB-32\fR | \fB-64\fR] [\fB-aACeFGHhlqSvVwZ\fR] [\fB-b\fR \fIbufsz\fR] [\fB-c\fR \fIcmd\fR] + [\fB-D\fR \fIname\fR [\fI=value\fR]] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-o\fR \fIoutput\fR] + [\fB-s\fR \fIscript\fR] [\fB-U\fR \fIname\fR] [\fB-x\fR \fIarg\fR [\fI=val\fR]] + [\fB-X\fR a | c | s | t] [\fB-p\fR \fIpid\fR] + [\fB-P\fR \fIprovider\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-m\fR [\fIprovider:\fR] \fImodule\fR [[\fIpredicate\fR] \fIaction\fR]] + [\fB-f\fR [[\fIprovider:\fR] \fImodule:\fR] \fIfunction\fR [[\fIpredicate\fR] 
\fIaction\fR]] + [\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]] [\fB-i\fR \fIprobe-id\fR [[\fIpredicate\fR] \fIaction\fR]] .fi @@ -97,7 +97,6 @@ appropriately quoted to avoid interpretation of meta-characters by the shell. The following options are supported: .sp .ne 2 -.mk .na \fB\fB-32\fR | \fB-64\fR\fR .ad @@ -118,7 +117,6 @@ format (ELF32 or ELF64) produced by the \fB-G\fR option. .sp .ne 2 -.mk .na \fB\fB-a\fR\fR .ad @@ -133,7 +131,6 @@ information about anonymous tracing. .sp .ne 2 -.mk .na \fB\fB-A\fR\fR .ad @@ -149,7 +146,6 @@ the specified probes for anonymous tracing and then exits. By default, .sp .ne 2 -.mk .na \fB\fB-b\fR \fIbufsz\fR\fR .ad @@ -163,7 +159,6 @@ size or exit depending on the setting of the \fBbufresize\fR property. .sp .ne 2 -.mk .na \fB\fB-c\fR \fIcmd\fR\fR .ad @@ -180,7 +175,6 @@ for more information on macro variables. .sp .ne 2 -.mk .na \fB\fB-C\fR\fR .ad @@ -195,7 +189,6 @@ the D compiler when invoking the C preprocessor, see \fB-X\fR. .sp .ne 2 -.mk .na \fB\fB-D\fR \fIname\fR \fB[=\fR\fIvalue\fR\fB]\fR\fR .ad @@ -209,7 +202,6 @@ option to each \fBcpp\fR invocation. .sp .ne 2 -.mk .na \fB\fB-e\fR\fR .ad @@ -225,7 +217,6 @@ instrumentation. .sp .ne 2 -.mk .na \fB\fB-f\fR\fB[[\fR\fIprovider\fR\fB:]\fR\fImodule\fR\fB:]\fR\fIfunction\fR\fB[ [\fR\fIpredicate\fR\fB]\fR\fIaction\fR\fB]]\fR\fR @@ -245,7 +236,6 @@ on the command line at a time. .sp .ne 2 -.mk .na \fB\fB-F\fR\fR .ad @@ -261,7 +251,6 @@ their output is prefixed with \fB<=\fR\&. 
.sp .ne 2 -.mk .na \fB\fB-G\fR\fR .ad @@ -279,7 +268,6 @@ whose name is \fB\fIfilename\fR.d\fR, then the ELF file is saved using the name .sp .ne 2 -.mk .na \fB\fB-H\fR\fR .ad @@ -293,7 +281,6 @@ invocation, causing it to display the list of pathnames, one for each line, to .sp .ne 2 -.mk .na \fB\fB-h\fR\fR .ad @@ -311,7 +298,6 @@ not present and the DTrace program is contained with a file whose name is .sp .ne 2 -.mk .na \fB\fB-i\fR \fIprobe-id\fR\fB[[\fR\fIpredicate\fR] \fIaction\fR\fB]\fR\fR .ad @@ -325,7 +311,6 @@ clause. You can specify more than one \fB-i\fR option at a time. .sp .ne 2 -.mk .na \fB\fB-I\fR \fIpath\fR\fR .ad @@ -340,7 +325,6 @@ list. .sp .ne 2 -.mk .na \fB\fB-L\fR \fIpath\fR\fR .ad @@ -354,7 +338,6 @@ search path. .sp .ne 2 -.mk .na \fB\fB-l\fR\fR .ad @@ -368,7 +351,6 @@ options. If none of these options are specified, this option lists all probes. .sp .ne 2 -.mk .na \fB\fB-m\fR [[\fIprovider:\fR] \fImodule:\fR [[\fIpredicate\fR] \fIaction\fR]]\fR @@ -387,7 +369,6 @@ on the command line at a time. .sp .ne 2 -.mk .na \fB\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]\fR @@ -407,7 +388,6 @@ specified on the command line at a time. .sp .ne 2 -.mk .na \fB\fB-o\fR \fIoutput\fR\fR .ad @@ -424,7 +404,6 @@ the default output file is \fBd.out\fR. .sp .ne 2 -.mk .na \fB\fB-p\fR \fIpid\fR\fR .ad @@ -441,7 +420,6 @@ Dynamic Tracing Guide\fR for more information on macro variables. .sp .ne 2 -.mk .na \fB\fB-P\fR \fIprovider\fR \fB[[\fR\fIpredicate\fR\fB]\fR \fIaction\fR]\fR .ad @@ -456,7 +434,6 @@ suffixed with an optional D probe clause. You can specify more than one .sp .ne 2 -.mk .na \fB\fB-q\fR\fR .ad @@ -471,7 +448,6 @@ data traced and formatted by D program statements such as \fBtrace()\fR and .sp .ne 2 -.mk .na \fB\fB-s\fR\fR .ad @@ -487,7 +463,6 @@ program is enabled and tracing begins. 
.sp .ne 2 -.mk .na \fB\fB-S\fR\fR .ad @@ -499,7 +474,6 @@ intermediate code generated for each D program to \fBstderr\fR. .sp .ne 2 -.mk .na \fB\fB-U\fR \fIname\fR\fR .ad @@ -512,7 +486,6 @@ invocation. .sp .ne 2 -.mk .na \fB\fB-v\fR\fR .ad @@ -526,7 +499,6 @@ further detail in the \fISolaris Dynamic Tracing Guide\fR. .sp .ne 2 -.mk .na \fB\fB-V\fR\fR .ad @@ -540,7 +512,6 @@ about DTrace versioning features. .sp .ne 2 -.mk .na \fB\fB-w\fR\fR .ad @@ -554,7 +525,6 @@ enabling of a D program that contains destructive actions. .sp .ne 2 -.mk .na \fB\fB-x\fR \fIarg\fR [\fI=val\fR]\fR .ad @@ -568,7 +538,6 @@ the option name and value with an equals sign (\fB=\fR). .sp .ne 2 -.mk .na \fB\fB-X\fR \fBa | c | s | t\fR\fR .ad @@ -582,12 +551,10 @@ depending upon the value of the argument letter. The \fB-X\fR option supports the following arguments: .sp .ne 2 -.mk .na \fB\fBa\fR\fR .ad .RS 5n -.rt Default. ISO C plus K&R compatibility extensions, with semantic changes required by ISO C. This is the default mode if \fB-X\fR is not specified. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in @@ -596,12 +563,10 @@ conjunction with the \fB-Xa\fR option. .sp .ne 2 -.mk .na \fB\fBc\fR\fR .ad .RS 5n -.rt Conformance. Strictly conformant ISO C, without K&R C compatibility extensions. The predefined macro \fB__STDC__\fR has a value of 1 when \fBcpp\fR is invoked in conjunction with the \fB-Xc\fR option. @@ -609,24 +574,20 @@ in conjunction with the \fB-Xc\fR option. .sp .ne 2 -.mk .na \fB\fBs\fR\fR .ad .RS 5n -.rt K&R C only. The macro \fB__STDC__\fR is not defined when \fBcpp\fR is invoked in conjunction with the \fB-Xs\fR option. .RE .sp .ne 2 -.mk .na \fB\fBt\fR\fR .ad .RS 5n -.rt Transition. ISO C plus K&R C compatibility extensions, without semantic changes required by ISO C. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction with the \fB-Xt\fR option. 
@@ -708,7 +669,6 @@ information about DTrace versioning. .sp .ne 2 -.mk .na \fB\fB-Z\fR\fR .ad @@ -735,12 +695,10 @@ in the \fISolaris Dynamic Tracing Guide\fR. The following exit values are returned: .sp .ne 2 -.mk .na \fB0\fR .ad .RS 5n -.rt Successful completion. .sp For D program requests, an exit status of \fB0\fR indicates that programs were @@ -751,12 +709,10 @@ tracing requests encountered errors or drops. .sp .ne 2 -.mk .na \fB\fB1\fR\fR .ad .RS 5n -.rt An error occurred. .sp For D program requests, an exit status of \fB1\fR indicates that program @@ -765,12 +721,10 @@ compilation failed or that the specified request could not be satisfied. .sp .ne 2 -.mk .na \fB\fB2\fR\fR .ad .RS 5n -.rt Invalid command line options or arguments were specified. .RE @@ -782,13 +736,12 @@ See \fBattributes\fR(5) for descriptions of the following attributes: .sp .TS -tab() box; -cw(2.75i) |cw(2.75i) -lw(2.75i) |lw(2.75i) -. -ATTRIBUTE TYPEATTRIBUTE VALUE +box; +c | c +l | l . +ATTRIBUTE TYPE ATTRIBUTE VALUE _ -Interface StabilitySee below. +Interface Stability See below. .TE .sp diff --git a/man/man1m/lockstat.1m b/man/man1m/lockstat.1m index 495b294..8c7eb17 100644 --- a/man/man1m/lockstat.1m +++ b/man/man1m/lockstat.1m @@ -3,16 +3,16 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH lockstat 1M "28 Feb 2008" "SunOS 5.11" "System Administration Commands" +.TH LOCKSTAT 1M "Feb 28, 2008" .SH NAME lockstat \- report kernel lock and profiling statistics .SH SYNOPSIS .LP .nf -\fBlockstat\fR [\fB-ACEHI\fR] [\fB-e\fR \fIevent_list\fR] [\fB-i\fR \fIrate\fR] - [\fB-b\fR | \fB-t\fR | \fB-h\fR | \fB-s\fR \fIdepth\fR] [\fB-n\fR \fInrecords\fR] - [\fB-l\fR \fIlock\fR [, \fIsize\fR]] [\fB-d\fR \fIduration\fR] - [\fB-f\fR \fIfunction\fR [, \fIsize\fR]] [\fB-T\fR] [\fB-ckgwWRpP\fR] [\fB-D\fR \fIcount\fR] +\fBlockstat\fR [\fB-ACEHI\fR] [\fB-e\fR \fIevent_list\fR] [\fB-i\fR \fIrate\fR] + [\fB-b\fR | \fB-t\fR | \fB-h\fR | \fB-s\fR \fIdepth\fR] [\fB-n\fR \fInrecords\fR] + [\fB-l\fR \fIlock\fR [, \fIsize\fR]] [\fB-d\fR \fIduration\fR] + [\fB-f\fR \fIfunction\fR [, \fIsize\fR]] [\fB-T\fR] [\fB-ckgwWRpP\fR] [\fB-D\fR \fIcount\fR] [\fB-o\fR \fIfilename\fR] [\fB-x\fR \fIopt\fR [=val]] \fIcommand\fR [\fIargs\fR] .fi @@ -61,7 +61,6 @@ The following options are supported: If no event selection options are specified, the default is \fB-C\fR. .sp .ne 2 -.mk .na \fB\fB-A\fR\fR .ad @@ -72,7 +71,6 @@ Watch all lock events. \fB-A\fR is equivalent to \fB-CH\fR. .sp .ne 2 -.mk .na \fB\fB-C\fR\fR .ad @@ -83,7 +81,6 @@ Watch contention events. .sp .ne 2 -.mk .na \fB\fB-E\fR\fR .ad @@ -94,7 +91,6 @@ Watch error events. .sp .ne 2 -.mk .na \fB\fB\fR\fB-e\fR \fIevent_list\fR\fR .ad @@ -107,7 +103,6 @@ arguments to get a brief description of all events. .sp .ne 2 -.mk .na \fB\fB-H\fR\fR .ad @@ -118,7 +113,6 @@ Watch hold events. .sp .ne 2 -.mk .na \fB\fB-I\fR\fR .ad @@ -129,7 +123,6 @@ Watch profiling interrupt events. .sp .ne 2 -.mk .na \fB\fB\fR\fB-i\fR \fIrate\fR\fR .ad @@ -143,7 +136,6 @@ Hz). 
.SS "Data Gathering" .sp .ne 2 -.mk .na \fB\fB-x\fR \fIarg\fR[=\fIval\fR]\fR .ad @@ -158,7 +150,6 @@ an equals sign (=). .SS "Data Gathering (Mutually Exclusive)" .sp .ne 2 -.mk .na \fB\fB-b\fR\fR .ad @@ -169,7 +160,6 @@ Basic statistics: lock, caller, number of events. .sp .ne 2 -.mk .na \fB\fB-h\fR\fR .ad @@ -180,7 +170,6 @@ Histogram: Timing plus time-distribution histograms. .sp .ne 2 -.mk .na \fB\fB\fR\fB-s\fR \fIdepth\fR\fR .ad @@ -191,7 +180,6 @@ Stack trace: Histogram plus stack traces up to \fIdepth\fR frames deep. .sp .ne 2 -.mk .na \fB\fB-t\fR\fR .ad @@ -203,7 +191,6 @@ Timing: Basic plus timing for all events [default]. .SS "Data Filtering" .sp .ne 2 -.mk .na \fB\fB\fR\fB-d\fR \fIduration\fR\fR .ad @@ -214,7 +201,6 @@ Only watch events longer than \fIduration\fR. .sp .ne 2 -.mk .na \fB\fB\fR\fB-f\fR \fIfunc[,size]\fR\fR .ad @@ -227,7 +213,6 @@ available, or \fB1\fR if not. .sp .ne 2 -.mk .na \fB\fB\fR\fB-l\fR \fIlock[,size]\fR\fR .ad @@ -240,7 +225,6 @@ symbol size is not available. .sp .ne 2 -.mk .na \fB\fB\fR\fB-n\fR \fInrecords\fR\fR .ad @@ -251,7 +235,6 @@ Maximum number of data records. .sp .ne 2 -.mk .na \fB\fB-T\fR\fR .ad @@ -263,7 +246,6 @@ Trace (rather than sample) events [off by default]. .SS "Data Reporting" .sp .ne 2 -.mk .na \fB\fB-c\fR\fR .ad @@ -274,7 +256,6 @@ Coalesce lock data for lock arrays (for example, \fBpse_mutex[]\fR). .sp .ne 2 -.mk .na \fB\fB\fR\fB-D\fR \fIcount\fR\fR .ad @@ -285,7 +266,6 @@ Only display the top \fIcount\fR events of each type. .sp .ne 2 -.mk .na \fB\fB-g\fR\fR .ad @@ -304,7 +284,6 @@ issue (1), the default data gathering mode when using \fB-g\fR is \fB-s\fR .sp .ne 2 -.mk .na \fB\fB-k\fR\fR .ad @@ -315,7 +294,6 @@ Coalesce PCs within functions. .sp .ne 2 -.mk .na \fB\fB\fR\fB-o\fR \fIfilename\fR\fR .ad @@ -326,7 +304,6 @@ Direct output to \fIfilename\fR. .sp .ne 2 -.mk .na \fB\fB-P\fR\fR .ad @@ -337,7 +314,6 @@ Sort data by (\fIcount * time\fR) product. 
.sp .ne 2 -.mk .na \fB\fB-p\fR\fR .ad @@ -348,7 +324,6 @@ Parsable output format. .sp .ne 2 -.mk .na \fB\fB-R\fR\fR .ad @@ -359,7 +334,6 @@ Display rates (events per second) rather than counts. .sp .ne 2 -.mk .na \fB\fB-W\fR\fR .ad @@ -370,7 +344,6 @@ Whichever: distinguish events only by caller, not by lock. .sp .ne 2 -.mk .na \fB\fB-w\fR\fR .ad @@ -385,7 +358,6 @@ Wherever: distinguish events only by lock, not by caller. The following headers appear over various columns of data. .sp .ne 2 -.mk .na \fB\fBCount\fR or \fBops/s\fR\fR .ad @@ -397,7 +369,6 @@ was specified. .sp .ne 2 -.mk .na \fB\fBindv\fR\fR .ad @@ -408,7 +379,6 @@ Percentage of all events represented by this individual event. .sp .ne 2 -.mk .na \fB\fBgenr\fR\fR .ad @@ -419,7 +389,6 @@ Percentage of all events generated by this function. .sp .ne 2 -.mk .na \fB\fBcuml\fR\fR .ad @@ -430,7 +399,6 @@ Cumulative percentage; a running total of the individuals. .sp .ne 2 -.mk .na \fB\fBrcnt\fR\fR .ad @@ -443,7 +411,6 @@ for shared locks (rwlocks held as reader). .sp .ne 2 -.mk .na \fB\fBnsec\fR\fR .ad @@ -455,7 +422,6 @@ For the profiling event, duration means interrupt latency. .sp .ne 2 -.mk .na \fB\fBLock\fR\fR .ad @@ -466,7 +432,6 @@ Address of the lock; displayed symbolically if possible. .sp .ne 2 -.mk .na \fB\fBCPU+PIL\fR\fR .ad @@ -478,7 +443,6 @@ Address of the lock; displayed symbolically if possible. 
.sp .ne 2 -.mk .na \fB\fBCaller\fR\fR .ad @@ -800,7 +764,7 @@ Count indv cuml rcnt nsec Lock Caller Profiling interrupt: 229 events in 10.042 seconds (23 events/sec) Count indv cuml rcnt nsec Hottest CPU+PIL Caller - + ------------------------------------------------------------------------- 89 39% 39% 1.00 426 cpu[0]+6 sync_stream_buf 64 28% 67% 1.00 398 cpu[0]+6 sbus_intr_wrapper @@ -855,7 +819,7 @@ Count indv cuml rcnt nsec CPU+PIL Caller 1929 40% 40% 0.00 3215 cpu[0] usec_delay+0x78 nsec ------ Time Distribution ------ count Stack 4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1872 ata_wait+0x90 - 8192 | 27 acersb_get_intr_status+0x34 + 8192 | 27 acersb_get_intr_status+0x34 16384 | 29 ata_set_feature+0x124 32768 | 1 ata_disk_start+0x15c ata_hba_start+0xbc diff --git a/man/man1m/plockstat.1m b/man/man1m/plockstat.1m index 18a9d2e..b24f36d 100644 --- a/man/man1m/plockstat.1m +++ b/man/man1m/plockstat.1m @@ -3,19 +3,19 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH plockstat 1M "26 Jan 2009" "SunOS 5.11" "System Administration Commands" +.TH PLOCKSTAT 1M "Jan 26, 2009" .SH NAME plockstat \- report user-level lock statistics .SH SYNOPSIS .LP .nf -\fBplockstat\fR [\fB-vACHV\fR] [\fB-n\fR \fIcount\fR] [\fB-s\fR \fIdepth\fR] [\fB-e\fR \fIsecs\fR] +\fBplockstat\fR [\fB-vACHV\fR] [\fB-n\fR \fIcount\fR] [\fB-s\fR \fIdepth\fR] [\fB-e\fR \fIsecs\fR] [\fB-x\fR \fIarg\fR [=val]] \fIcommand\fR [\fIarg\fR]... .fi .LP .nf -\fBplockstat\fR [\fB-vACHV\fR] [\fB-n\fR \fIcount\fR] [\fB-s\fR \fIdepth\fR] [\fB-e\fR \fIsecs\fR] +\fBplockstat\fR [\fB-vACHV\fR] [\fB-n\fR \fIcount\fR] [\fB-s\fR \fIdepth\fR] [\fB-e\fR \fIsecs\fR] [\fB-x\fR \fIarg\fR [=val]] \fB-p\fR \fIpid\fR .fi @@ -44,100 +44,82 @@ security features. The following options are supported: .sp .ne 2 -.mk .na \fB\fB-A\fR\fR .ad .RS 16n -.rt Watch all lock events. This option is equivalent to \fB-CH\fR. .RE .sp .ne 2 -.mk .na \fB\fB-C\fR\fR .ad .RS 16n -.rt Watch contention events. .RE .sp .ne 2 -.mk .na \fB\fB-H\fR\fR .ad .RS 16n -.rt Watch hold events. .RE .sp .ne 2 -.mk .na \fB\fB-e\fR \fIsecs\fR\fR .ad .RS 16n -.rt Exit after the number of seconds specified have elapsed. .RE .sp .ne 2 -.mk .na \fB\fB-n\fR \fIcount\fR\fR .ad .RS 16n -.rt Display only the specified number of entries for each output category. .RE .sp .ne 2 -.mk .na \fB\fB-s\fR \fIdepth\fR\fR .ad .RS 16n -.rt Record a stack trace rather than just the calling function. .RE .sp .ne 2 -.mk .na \fB\fB-p\fR \fIpid\fR\fR .ad .RS 16n -.rt Specify a process ID from which \fBplockstat\fR is to gather data. .RE .sp .ne 2 -.mk .na \fB\fB-v\fR\fR .ad .RS 16n -.rt Print out a message to indicate that tracing has started. 
.RE .sp .ne 2 -.mk .na \fB\fB-x\fR \fIarg\fR[=\fIval\fR]\fR .ad .RS 16n -.rt Enable or modify a DTrace runtime option or D compiler option. The list of options is found in the \fISolaris Dynamic Tracing Guide\fR. Boolean options are enabled by specifying their name. Options with values are set by separating @@ -146,12 +128,10 @@ the option name and value with an equals sign (\fB=\fR). .sp .ne 2 -.mk .na \fB\fB-V\fR\fR .ad .RS 16n -.rt Print the Dtrace commands used to gather the data. The output can then be used directly with the \fBdtrace\fR(1M) command. .RE @@ -162,56 +142,46 @@ directly with the \fBdtrace\fR(1M) command. The following operands are supported: .sp .ne 2 -.mk .na \fB\fIarg\fR\fR .ad .RS 11n -.rt A string to be passed as an argument to \fIcommand\fR. .RE .sp .ne 2 -.mk .na \fB\fIcommand\fR\fR .ad .RS 11n -.rt The name of a utility to be invoked. .RE .sp .ne 2 -.mk .na \fB\fIcount\fR\fR .ad .RS 11n -.rt A positive integer value. .RE .sp .ne 2 -.mk .na \fB\fIpid\fR\fR .ad .RS 11n -.rt A process identifier for a process to be monitored. .RE .sp .ne 2 -.mk .na \fB\fIsecs\fR\fR .ad .RS 11n -.rt Duration specified as a positive integer number of seconds. .RE @@ -221,23 +191,19 @@ Duration specified as a positive integer number of seconds. The following exit values are returned: .sp .ne 2 -.mk .na \fB\fB0\fR\fR .ad .RS 6n -.rt Successful completion. .RE .sp .ne 2 -.mk .na \fB>\fB0\fR\fR .ad .RS 6n -.rt An error occurred. .RE @@ -249,13 +215,12 @@ See \fBattributes\fR(5) for descriptions of the following attributes: .sp .TS -tab() box; -cw(2.75i) |cw(2.75i) -lw(2.75i) |lw(2.75i) -. -ATTRIBUTE TYPEATTRIBUTE VALUE +box; +c | c +l | l . +ATTRIBUTE TYPE ATTRIBUTE VALUE _ -Interface StabilitySee below. +Interface Stability See below. .TE .sp diff --git a/tools/ctf/cvt/dwarf.c b/tools/ctf/cvt/dwarf.c index a7e97df..ce8f6f9 100644 --- a/tools/ctf/cvt/dwarf.c +++ b/tools/ctf/cvt/dwarf.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DWARF to tdata conversion * @@ -1832,8 +1830,11 @@ dw_read(tdata_t *td, Elf *elf, const char *filename) } if ((rc = dwarf_next_cu_header(dw.dw_dw, &hdrlen, &vers, &abboff, - &addrsz, &nxthdr, &dw.dw_err)) != DW_DLV_OK || - (cu = die_sibling(&dw, NULL)) == NULL || + &addrsz, &nxthdr, &dw.dw_err)) != DW_DLV_OK) + terminate("file does not contain valid DWARF data: %s\n", + dwarf_errmsg(dw.dw_err)); + + if ((cu = die_sibling(&dw, NULL)) == NULL || (child = die_child(&dw, cu)) == NULL) terminate("file does not contain dwarf type data " "(try compiling with -g)\n"); -- cgit v1.1 From 83cf4eb03dfb4f9701ef8fe7a4c2f67086c8a588 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 18 Jul 2012 10:19:51 +0000 Subject: Update vendor/illumos/dist to pre libzfs_core state (zfs part) illumos-gate revision 13742:b6bbdd77139c Obtained from: ssh://anonhg@hg.illumos.org/illumos-gate --- cmd/zdb/zdb.c | 91 +- cmd/zfs/zfs_main.c | 2998 +++++++++++++++++++++++++++++---- cmd/zpool/zpool_main.c | 604 ++++++- cmd/ztest/ztest.c | 1154 +++++++++---- lib/libnvpair/libnvpair.c | 5 + lib/libuutil/common/uu_list.c | 7 +- lib/libzfs/common/libzfs.h | 92 +- lib/libzfs/common/libzfs_config.c | 35 + lib/libzfs/common/libzfs_dataset.c | 663 ++++++-- lib/libzfs/common/libzfs_graph.c | 653 ------- lib/libzfs/common/libzfs_impl.h | 4 +- lib/libzfs/common/libzfs_import.c | 18 +- lib/libzfs/common/libzfs_iter.c | 462 +++++ lib/libzfs/common/libzfs_pool.c | 299 +++- lib/libzfs/common/libzfs_sendrecv.c | 629 ++++--- lib/libzfs/common/libzfs_status.c | 18 +- lib/libzfs/common/libzfs_util.c | 18 +- lib/libzpool/common/kernel.c | 6 +- lib/libzpool/common/sys/zfs_context.h | 20 + lib/libzpool/common/taskq.c | 103 +- man/man1m/zdb.1m | 509 +++++- man/man1m/zfs.1m | 612 ++++--- man/man1m/zpool.1m | 349 ++-- man/man1m/zstreamdump.1m | 15 +- 24 files changed, 6937 insertions(+), 2427 deletions(-) delete mode 100644 lib/libzfs/common/libzfs_graph.c create mode 100644 
lib/libzfs/common/libzfs_iter.c diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index c6e219d..ea211bf 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -54,6 +56,7 @@ #include #include #include +#include #undef ZFS_MAXNAMELEN #undef verify #include @@ -63,7 +66,8 @@ #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ zio_checksum_table[(idx)].ci_name : "UNKNOWN") #define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ - dmu_ot[(idx)].ot_name : "UNKNOWN") + dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ + dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES) #ifndef lint @@ -102,13 +106,16 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-CumdibcsDvhL] poolname [object...]\n" - " %s [-div] dataset [object...]\n" - " %s -m [-L] poolname [vdev [metaslab...]]\n" - " %s -R poolname vdev:offset:size[:flags]\n" - " %s -S poolname\n" - " %s -l [-u] device\n" - " %s -C\n\n", + "Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " + "poolname [object...]\n" + " %s [-divPA] [-e -p path...] 
dataset [object...]\n" + " %s -m [-LXFPA] [-t txg] [-e [-p path...]] " + "poolname [vdev [metaslab...]]\n" + " %s -R [-A] [-e [-p path...]] poolname " + "vdev:offset:size[:flags]\n" + " %s -S [-PA] [-e [-p path...]] poolname\n" + " %s -l [-uA] device\n" + " %s -C [-A] [-U config]\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " @@ -150,7 +157,7 @@ usage(void) "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n"); - (void) fprintf(stderr, " -P print numbers parsable\n"); + (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " @@ -1085,7 +1092,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; - zdb_nicenum(ds->ds_used_bytes, used); + zdb_nicenum(ds->ds_referenced_bytes, used); zdb_nicenum(ds->ds_compressed_bytes, compressed); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed); zdb_nicenum(ds->ds_unique_bytes, unique); @@ -1129,6 +1136,44 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) /* ARGSUSED */ static int +dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (bp->blk_birth != 0) { + sprintf_blkptr(blkbuf, bp); + (void) printf("\t%s\n", blkbuf); + } + return (0); +} + +static void +dump_bptree(objset_t *os, uint64_t obj, char *name) +{ + char bytes[32]; + bptree_phys_t *bt; + dmu_buf_t *db; + + if (dump_opt['d'] < 3) + return; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + zdb_nicenum(bt->bt_bytes, bytes); + (void) printf("\n %s: %llu datasets, %s\n", + name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); + dmu_buf_rele(db, FTAG); + + 
if (dump_opt['d'] < 5) + return; + + (void) printf("\n"); + + (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); +} + +/* ARGSUSED */ +static int dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { char blkbuf[BP_SPRINTF_LEN]; @@ -1880,11 +1925,13 @@ typedef struct zdb_blkstats { */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) -#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static char *zdb_ot_extname[] = { "deferred free", "dedup ditto", + "other", "Total", }; @@ -1965,9 +2012,10 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, type = BP_GET_TYPE(bp); - zdb_count_block(zcb, zilog, bp, type); + zdb_count_block(zcb, zilog, bp, + (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); - is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata); + is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { int ioerr; @@ -2192,6 +2240,12 @@ dump_block_stats(spa_t *spa) count_block_cb, &zcb, NULL); (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, count_block_cb, &zcb, NULL); + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, + &zcb, NULL)); + } if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; @@ -2368,7 +2422,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || - BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) + BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); @@ -2473,7 +2527,14 @@ dump_zpool(spa_t *spa) dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees"); if (spa_version(spa) >= 
SPA_VERSION_DEADLISTS) { dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj, - "Pool frees"); + "Pool snapshot frees"); + } + + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + dump_bptree(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, + "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9516697..b64905f 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -21,6 +21,10 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2012 Milan Jurik. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include @@ -41,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -51,7 +56,11 @@ #include #include +#include +#include #include +#include +#include #include "zfs_iter.h" #include "zfs_util.h" @@ -61,7 +70,6 @@ libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; -const char *pypath = "/usr/lib/zfs/pyzfs.py"; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); @@ -82,8 +90,10 @@ static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); -static int zfs_do_python(int argc, char **argv); +static int zfs_do_allow(int argc, char **argv); +static int zfs_do_unallow(int argc, char **argv); static int zfs_do_hold(int argc, char **argv); +static int zfs_do_holds(int argc, char **argv); static int zfs_do_release(int argc, char **argv); static int zfs_do_diff(int argc, char **argv); @@ -131,7 +141,7 @@ typedef enum { HELP_HOLD, HELP_HOLDS, HELP_RELEASE, - HELP_DIFF + HELP_DIFF, } zfs_help_t; typedef struct zfs_command { @@ -176,12 
+186,12 @@ static zfs_command_t command_table[] = { { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, - { "allow", zfs_do_python, HELP_ALLOW }, + { "allow", zfs_do_allow, HELP_ALLOW }, { NULL }, - { "unallow", zfs_do_python, HELP_UNALLOW }, + { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, - { "holds", zfs_do_python, HELP_HOLDS }, + { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { "diff", zfs_do_diff, HELP_DIFF }, }; @@ -203,11 +213,13 @@ get_usage(zfs_help_t idx) "\tcreate [-ps] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: - return (gettext("\tdestroy [-rRf] \n" - "\tdestroy [-rRd] \n")); + return (gettext("\tdestroy [-fnpRrv] \n" + "\tdestroy [-dnpRrv] " + "@[%][,...]\n")); case HELP_GET: return (gettext("\tget [-rHp] [-d max] " - "[-o \"all\" | field[,...]] [-s source[,...]]\n" + "[-o \"all\" | field[,...]] [-t type[,...]] " + "[-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot] ...\n")); case HELP_INHERIT: @@ -231,14 +243,15 @@ get_usage(zfs_help_t idx) "snapshot>\n" "\treceive [-vnFu] [-d | -e] \n")); case HELP_RENAME: - return (gettext("\trename " + return (gettext("\trename [-f] " "\n" - "\trename -p \n" + "\trename [-f] -p \n" "\trename -r ")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: - return (gettext("\tsend [-RDp] [-[iI] snapshot] \n")); + return (gettext("\tsend [-DnPpRrv] [-[iI] snapshot] " + "\n")); case HELP_SET: return (gettext("\tset " " ...\n")); @@ -417,6 +430,8 @@ usage(boolean_t requested) (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "written@"); + (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); @@ -562,7 +577,7 @@ 
zfs_do_clone(int argc, char **argv) zfs_handle_t *zhp = NULL; boolean_t parents = B_FALSE; nvlist_t *props; - int ret; + int ret = 0; int c; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) @@ -729,7 +744,6 @@ zfs_do_create(int argc, char **argv) (void) fprintf(stderr, gettext("missing size " "argument\n")); goto badusage; - break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -862,15 +876,23 @@ badusage: */ typedef struct destroy_cbdata { boolean_t cb_first; - int cb_force; - int cb_recurse; - int cb_error; - int cb_needforce; - int cb_doclones; - boolean_t cb_closezhp; + boolean_t cb_force; + boolean_t cb_recurse; + boolean_t cb_error; + boolean_t cb_doclones; zfs_handle_t *cb_target; - char *cb_snapname; boolean_t cb_defer_destroy; + boolean_t cb_verbose; + boolean_t cb_parsable; + boolean_t cb_dryrun; + nvlist_t *cb_nvl; + + /* first snap in contiguous run */ + zfs_handle_t *cb_firstsnap; + /* previous snap in contiguous run */ + zfs_handle_t *cb_prevsnap; + int64_t cb_snapused; + char *cb_snapspec; } destroy_cbdata_t; /* @@ -900,7 +922,7 @@ destroy_check_dependent(zfs_handle_t *zhp, void *data) (void) fprintf(stderr, gettext("use '-r' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; - cbp->cb_error = 1; + cbp->cb_error = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); @@ -921,7 +943,8 @@ destroy_check_dependent(zfs_handle_t *zhp, void *data) (void) fprintf(stderr, gettext("use '-R' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; - cbp->cb_error = 1; + cbp->cb_error = B_TRUE; + cbp->cb_dryrun = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); @@ -935,7 +958,20 @@ out: static int destroy_callback(zfs_handle_t *zhp, void *data) { - destroy_cbdata_t *cbp = data; + destroy_cbdata_t *cb = data; + const char *name = zfs_get_name(zhp); + + if (cb->cb_verbose) { + if (cb->cb_parsable) { + (void) printf("destroy\t%s\n", name); + } else if (cb->cb_dryrun) { + (void) 
printf(gettext("would destroy %s\n"), + name); + } else { + (void) printf(gettext("will destroy %s\n"), + name); + } + } /* * Ignore pools (which we've already flagged as an error before getting @@ -947,13 +983,12 @@ destroy_callback(zfs_handle_t *zhp, void *data) return (0); } - /* - * Bail out on the first error. - */ - if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 || - zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) { - zfs_close(zhp); - return (-1); + if (!cb->cb_dryrun) { + if (zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || + zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { + zfs_close(zhp); + return (-1); + } } zfs_close(zhp); @@ -961,39 +996,142 @@ destroy_callback(zfs_handle_t *zhp, void *data) } static int -destroy_snap_clones(zfs_handle_t *zhp, void *arg) +destroy_print_cb(zfs_handle_t *zhp, void *arg) { - destroy_cbdata_t *cbp = arg; - char thissnap[MAXPATHLEN]; - zfs_handle_t *szhp; - boolean_t closezhp = cbp->cb_closezhp; - int rv; - - (void) snprintf(thissnap, sizeof (thissnap), - "%s@%s", zfs_get_name(zhp), cbp->cb_snapname); + destroy_cbdata_t *cb = arg; + const char *name = zfs_get_name(zhp); + int err = 0; + + if (nvlist_exists(cb->cb_nvl, name)) { + if (cb->cb_firstsnap == NULL) + cb->cb_firstsnap = zfs_handle_dup(zhp); + if (cb->cb_prevsnap != NULL) + zfs_close(cb->cb_prevsnap); + /* this snap continues the current range */ + cb->cb_prevsnap = zfs_handle_dup(zhp); + if (cb->cb_verbose) { + if (cb->cb_parsable) { + (void) printf("destroy\t%s\n", name); + } else if (cb->cb_dryrun) { + (void) printf(gettext("would destroy %s\n"), + name); + } else { + (void) printf(gettext("will destroy %s\n"), + name); + } + } + } else if (cb->cb_firstsnap != NULL) { + /* end of this range */ + uint64_t used = 0; + err = zfs_get_snapused_int(cb->cb_firstsnap, + cb->cb_prevsnap, &used); + cb->cb_snapused += used; + zfs_close(cb->cb_firstsnap); + cb->cb_firstsnap = NULL; + zfs_close(cb->cb_prevsnap); + cb->cb_prevsnap = NULL; + } + 
zfs_close(zhp); + return (err); +} - libzfs_print_on_error(g_zfs, B_FALSE); - szhp = zfs_open(g_zfs, thissnap, ZFS_TYPE_SNAPSHOT); - libzfs_print_on_error(g_zfs, B_TRUE); - if (szhp) { - /* - * Destroy any clones of this snapshot - */ - if (zfs_iter_dependents(szhp, B_FALSE, destroy_callback, - cbp) != 0) { - zfs_close(szhp); - if (closezhp) - zfs_close(zhp); - return (-1); +static int +destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) +{ + int err = 0; + assert(cb->cb_firstsnap == NULL); + assert(cb->cb_prevsnap == NULL); + err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb); + if (cb->cb_firstsnap != NULL) { + uint64_t used = 0; + if (err == 0) { + err = zfs_get_snapused_int(cb->cb_firstsnap, + cb->cb_prevsnap, &used); } - zfs_close(szhp); + cb->cb_snapused += used; + zfs_close(cb->cb_firstsnap); + cb->cb_firstsnap = NULL; + zfs_close(cb->cb_prevsnap); + cb->cb_prevsnap = NULL; } + return (err); +} - cbp->cb_closezhp = B_TRUE; - rv = zfs_iter_filesystems(zhp, destroy_snap_clones, arg); - if (closezhp) - zfs_close(zhp); - return (rv); +static int +snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + int err = 0; + + /* Check for clones. 
*/ + if (!cb->cb_doclones) { + cb->cb_target = zhp; + cb->cb_first = B_TRUE; + err = zfs_iter_dependents(zhp, B_TRUE, + destroy_check_dependent, cb); + } + + if (err == 0) { + if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) + nomem(); + } + zfs_close(zhp); + return (err); +} + +static int +gather_snapshots(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + int err = 0; + + err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); + if (err == ENOENT) + err = 0; + if (err != 0) + goto out; + + if (cb->cb_verbose) { + err = destroy_print_snapshots(zhp, cb); + if (err != 0) + goto out; + } + + if (cb->cb_recurse) + err = zfs_iter_filesystems(zhp, gather_snapshots, cb); + +out: + zfs_close(zhp); + return (err); +} + +static int +destroy_clones(destroy_cbdata_t *cb) +{ + nvpair_t *pair; + for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); + pair != NULL; + pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { + zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), + ZFS_TYPE_SNAPSHOT); + if (zhp != NULL) { + boolean_t defer = cb->cb_defer_destroy; + int err = 0; + + /* + * We can't defer destroy non-snapshots, so set it to + * false while destroying the clones. 
+ */ + cb->cb_defer_destroy = B_FALSE; + err = zfs_iter_dependents(zhp, B_FALSE, + destroy_callback, cb); + cb->cb_defer_destroy = defer; + zfs_close(zhp); + if (err != 0) + return (err); + } + } + return (0); } static int @@ -1002,25 +1140,35 @@ zfs_do_destroy(int argc, char **argv) destroy_cbdata_t cb = { 0 }; int c; zfs_handle_t *zhp; - char *cp; + char *at; zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, "dfrR")) != -1) { + while ((c = getopt(argc, argv, "vpndfrR")) != -1) { switch (c) { + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'p': + cb.cb_verbose = B_TRUE; + cb.cb_parsable = B_TRUE; + break; + case 'n': + cb.cb_dryrun = B_TRUE; + break; case 'd': cb.cb_defer_destroy = B_TRUE; type = ZFS_TYPE_SNAPSHOT; break; case 'f': - cb.cb_force = 1; + cb.cb_force = B_TRUE; break; case 'r': - cb.cb_recurse = 1; + cb.cb_recurse = B_TRUE; break; case 'R': - cb.cb_recurse = 1; - cb.cb_doclones = 1; + cb.cb_recurse = B_TRUE; + cb.cb_doclones = B_TRUE; break; case '?': default: @@ -1035,7 +1183,7 @@ zfs_do_destroy(int argc, char **argv) /* check number of arguments */ if (argc == 0) { - (void) fprintf(stderr, gettext("missing path argument\n")); + (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { @@ -1043,91 +1191,117 @@ zfs_do_destroy(int argc, char **argv) usage(B_FALSE); } - /* - * If we are doing recursive destroy of a snapshot, then the - * named snapshot may not exist. Go straight to libzfs. - */ - if (cb.cb_recurse && (cp = strchr(argv[0], '@'))) { - int ret; + at = strchr(argv[0], '@'); + if (at != NULL) { + int err = 0; + + /* Build the list of snaps to destroy in cb_nvl. 
*/ + if (nvlist_alloc(&cb.cb_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); - *cp = '\0'; - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) + *at = '\0'; + zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return (1); + + cb.cb_snapspec = at + 1; + if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || + cb.cb_error) { + zfs_close(zhp); + nvlist_free(cb.cb_nvl); return (1); - *cp = '@'; - cp++; + } - if (cb.cb_doclones) { - boolean_t defer = cb.cb_defer_destroy; + if (nvlist_empty(cb.cb_nvl)) { + (void) fprintf(stderr, gettext("could not find any " + "snapshots to destroy; check snapshot names.\n")); + zfs_close(zhp); + nvlist_free(cb.cb_nvl); + return (1); + } - /* - * Temporarily ignore the defer_destroy setting since - * it's not supported for clones. - */ - cb.cb_defer_destroy = B_FALSE; - cb.cb_snapname = cp; - if (destroy_snap_clones(zhp, &cb) != 0) { - zfs_close(zhp); - return (1); + if (cb.cb_verbose) { + char buf[16]; + zfs_nicenum(cb.cb_snapused, buf, sizeof (buf)); + if (cb.cb_parsable) { + (void) printf("reclaim\t%llu\n", + cb.cb_snapused); + } else if (cb.cb_dryrun) { + (void) printf(gettext("would reclaim %s\n"), + buf); + } else { + (void) printf(gettext("will reclaim %s\n"), + buf); } - cb.cb_defer_destroy = defer; } - ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy); - zfs_close(zhp); - if (ret) { - (void) fprintf(stderr, - gettext("no snapshots destroyed\n")); + if (!cb.cb_dryrun) { + if (cb.cb_doclones) + err = destroy_clones(&cb); + if (err == 0) { + err = zfs_destroy_snaps_nvl(zhp, cb.cb_nvl, + cb.cb_defer_destroy); + } } - return (ret != 0); - } - /* Open the given dataset */ - if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) - return (1); + zfs_close(zhp); + nvlist_free(cb.cb_nvl); + if (err != 0) + return (1); + } else { + /* Open the given dataset */ + if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) + return (1); - cb.cb_target = zhp; + cb.cb_target = zhp; - /* - * Perform an 
explicit check for pools before going any further. - */ - if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && - zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { - (void) fprintf(stderr, gettext("cannot destroy '%s': " - "operation does not apply to pools\n"), - zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use 'zfs destroy -r " - "%s' to destroy all datasets in the pool\n"), - zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use 'zpool destroy %s' " - "to destroy the pool itself\n"), zfs_get_name(zhp)); - zfs_close(zhp); - return (1); - } + /* + * Perform an explicit check for pools before going any further. + */ + if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "operation does not apply to pools\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zfs destroy -r " + "%s' to destroy all datasets in the pool\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zpool destroy %s' " + "to destroy the pool itself\n"), zfs_get_name(zhp)); + zfs_close(zhp); + return (1); + } - /* - * Check for any dependents and/or clones. - */ - cb.cb_first = B_TRUE; - if (!cb.cb_doclones && !cb.cb_defer_destroy && - zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, - &cb) != 0) { - zfs_close(zhp); - return (1); - } + /* + * Check for any dependents and/or clones. + */ + cb.cb_first = B_TRUE; + if (!cb.cb_doclones && + zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, + &cb) != 0) { + zfs_close(zhp); + return (1); + } - if (cb.cb_error || (!cb.cb_defer_destroy && - (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) { - zfs_close(zhp); - return (1); - } + if (cb.cb_error) { + zfs_close(zhp); + return (1); + } - /* - * Do the real thing. The callback will close the handle regardless of - * whether it succeeds or not. 
- */ + if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, + &cb) != 0) { + zfs_close(zhp); + return (1); + } - if (destroy_callback(zhp, &cb) != 0) - return (1); + /* + * Do the real thing. The callback will close the + * handle regardless of whether it succeeds or not. + */ + if (destroy_callback(zhp, &cb) != 0) + return (1); + } return (0); } @@ -1229,6 +1403,17 @@ get_callback(zfs_handle_t *zhp, void *data) zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); + } else if (zfs_prop_written(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_written(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); } else { if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { @@ -1273,9 +1458,10 @@ static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; - int i, c, flags = 0; + int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; + int types = ZFS_TYPE_DATASET; char *value, *fields; - int ret; + int ret = 0; int limit = 0; zprop_list_t fake_name = { 0 }; @@ -1290,7 +1476,7 @@ zfs_do_get(int argc, char **argv) cb.cb_type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) { + while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; @@ -1408,6 +1594,37 @@ zfs_do_get(int argc, char **argv) } break; + case 't': + types = 0; + flags &= ~ZFS_ITER_PROP_LISTSNAPS; + while (*optarg != '\0') { + static char *type_subopts[] = { "filesystem", + "volume", "snapshot", "all", NULL }; + + switch (getsubopt(&optarg, type_subopts, + &value)) { + case 0: + types |= ZFS_TYPE_FILESYSTEM; + break; + case 1: + types |= ZFS_TYPE_VOLUME; + break; + case 2: + types |= ZFS_TYPE_SNAPSHOT; + break; + case 3: + types = 
ZFS_TYPE_DATASET; + break; + + default: + (void) fprintf(stderr, + gettext("invalid type '%s'\n"), + value); + usage(B_FALSE); + } + } + break; + case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -1451,7 +1668,7 @@ zfs_do_get(int argc, char **argv) cb.cb_first = B_TRUE; /* run for each object */ - ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, + ret = zfs_for_each(argc, argv, flags, types, NULL, &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) @@ -1512,7 +1729,7 @@ zfs_do_inherit(int argc, char **argv) zfs_prop_t prop; inherit_cbdata_t cb = { 0 }; char *propname; - int ret; + int ret = 0; int flags = 0; boolean_t received = B_FALSE; @@ -1718,7 +1935,7 @@ zfs_do_upgrade(int argc, char **argv) { boolean_t all = B_FALSE; boolean_t showversions = B_FALSE; - int ret; + int ret = 0; upgrade_cbdata_t cb = { 0 }; char c; int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; @@ -1773,8 +1990,8 @@ zfs_do_upgrade(int argc, char **argv) "---------------\n"); (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); - (void) printf(gettext(" 3 Case insensitive and File system " - "unique identifier (FUID)\n")); + (void) printf(gettext(" 3 Case insensitive and filesystem " + "user identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); (void) printf(gettext(" 5 System attributes\n")); @@ -1823,88 +2040,728 @@ zfs_do_upgrade(int argc, char **argv) return (ret); } +#define USTYPE_USR_BIT (0) +#define USTYPE_GRP_BIT (1) +#define USTYPE_PSX_BIT (2) +#define USTYPE_SMB_BIT (3) + +#define USTYPE_USR (1 << USTYPE_USR_BIT) +#define USTYPE_GRP (1 << USTYPE_GRP_BIT) + +#define USTYPE_PSX (1 << USTYPE_PSX_BIT) +#define USTYPE_SMB (1 << USTYPE_SMB_BIT) + +#define USTYPE_PSX_USR (USTYPE_PSX | USTYPE_USR) +#define USTYPE_SMB_USR (USTYPE_SMB | USTYPE_USR) +#define USTYPE_PSX_GRP (USTYPE_PSX | USTYPE_GRP) +#define USTYPE_SMB_GRP 
(USTYPE_SMB | USTYPE_GRP) +#define USTYPE_ALL (USTYPE_PSX_USR | USTYPE_SMB_USR \ + | USTYPE_PSX_GRP | USTYPE_SMB_GRP) + + +#define USPROP_USED_BIT (0) +#define USPROP_QUOTA_BIT (1) + +#define USPROP_USED (1 << USPROP_USED_BIT) +#define USPROP_QUOTA (1 << USPROP_QUOTA_BIT) + +typedef struct us_node { + nvlist_t *usn_nvl; + uu_avl_node_t usn_avlnode; + uu_list_node_t usn_listnode; +} us_node_t; + +typedef struct us_cbdata { + nvlist_t **cb_nvlp; + uu_avl_pool_t *cb_avl_pool; + uu_avl_t *cb_avl; + boolean_t cb_numname; + boolean_t cb_nicenum; + boolean_t cb_sid2posix; + zfs_userquota_prop_t cb_prop; + zfs_sort_column_t *cb_sortcol; + size_t cb_max_typelen; + size_t cb_max_namelen; + size_t cb_max_usedlen; + size_t cb_max_quotalen; +} us_cbdata_t; + +typedef struct { + zfs_sort_column_t *si_sortcol; + boolean_t si_num_name; + boolean_t si_parsable; +} us_sort_info_t; + +static int +us_compare(const void *larg, const void *rarg, void *unused) +{ + const us_node_t *l = larg; + const us_node_t *r = rarg; + int rc = 0; + us_sort_info_t *si = (us_sort_info_t *)unused; + zfs_sort_column_t *sortcol = si->si_sortcol; + boolean_t num_name = si->si_num_name; + nvlist_t *lnvl = l->usn_nvl; + nvlist_t *rnvl = r->usn_nvl; + + for (; sortcol != NULL; sortcol = sortcol->sc_next) { + char *lvstr = ""; + char *rvstr = ""; + uint32_t lv32 = 0; + uint32_t rv32 = 0; + uint64_t lv64 = 0; + uint64_t rv64 = 0; + zfs_prop_t prop = sortcol->sc_prop; + const char *propname = NULL; + boolean_t reverse = sortcol->sc_reverse; + + switch (prop) { + case ZFS_PROP_TYPE: + propname = "type"; + (void) nvlist_lookup_uint32(lnvl, propname, &lv32); + (void) nvlist_lookup_uint32(rnvl, propname, &rv32); + if (rv32 != lv32) + rc = (rv32 > lv32) ? 1 : -1; + break; + case ZFS_PROP_NAME: + propname = "name"; + if (num_name) { + (void) nvlist_lookup_uint32(lnvl, propname, + &lv32); + (void) nvlist_lookup_uint32(rnvl, propname, + &rv32); + if (rv32 != lv32) + rc = (rv32 > lv32) ? 
1 : -1; + } else { + (void) nvlist_lookup_string(lnvl, propname, + &lvstr); + (void) nvlist_lookup_string(rnvl, propname, + &rvstr); + rc = strcmp(lvstr, rvstr); + } + break; + + case ZFS_PROP_USED: + case ZFS_PROP_QUOTA: + if (ZFS_PROP_USED == prop) + propname = "used"; + else + propname = "quota"; + (void) nvlist_lookup_uint64(lnvl, propname, &lv64); + (void) nvlist_lookup_uint64(rnvl, propname, &rv64); + if (rv64 != lv64) + rc = (rv64 > lv64) ? 1 : -1; + } + + if (rc) + if (rc < 0) + return (reverse ? 1 : -1); + else + return (reverse ? -1 : 1); + } + + return (rc); +} + +static inline const char * +us_type2str(unsigned field_type) +{ + switch (field_type) { + case USTYPE_PSX_USR: + return ("POSIX User"); + case USTYPE_PSX_GRP: + return ("POSIX Group"); + case USTYPE_SMB_USR: + return ("SMB User"); + case USTYPE_SMB_GRP: + return ("SMB Group"); + default: + return ("Undefined"); + } +} + /* * zfs userspace */ static int userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) { - zfs_userquota_prop_t *typep = arg; - zfs_userquota_prop_t p = *typep; + us_cbdata_t *cb = (us_cbdata_t *)arg; + zfs_userquota_prop_t prop = cb->cb_prop; char *name = NULL; - char *ug, *propname; + char *propname; char namebuf[32]; char sizebuf[32]; + us_node_t *node; + uu_avl_pool_t *avl_pool = cb->cb_avl_pool; + uu_avl_t *avl = cb->cb_avl; + uu_avl_index_t idx; + nvlist_t *props; + us_node_t *n; + zfs_sort_column_t *sortcol = cb->cb_sortcol; + unsigned type; + const char *typestr; + size_t namelen; + size_t typelen; + size_t sizelen; + us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; if (domain == NULL || domain[0] == '\0') { - if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) { + /* POSIX */ + if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { + type = USTYPE_PSX_GRP; struct group *g = getgrgid(rid); if (g) name = g->gr_name; } else { + type = USTYPE_PSX_USR; struct passwd *p = getpwuid(rid); if (p) name = p->pw_name; } + } else { + char 
sid[ZFS_MAXNAMELEN+32]; + uid_t id; + uint64_t classes; + int err = 0; + directory_error_t e; + + (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); + /* SMB */ + if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { + type = USTYPE_SMB_GRP; + err = sid_to_id(sid, B_FALSE, &id); + } else { + type = USTYPE_SMB_USR; + err = sid_to_id(sid, B_TRUE, &id); + } + + if (err == 0) { + rid = id; + + e = directory_name_from_sid(NULL, sid, &name, &classes); + if (e != NULL) { + directory_error_free(e); + return (NULL); + } + + if (name == NULL) + name = sid; + } } - if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) - ug = "group"; - else - ug = "user"; +/* + * if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) + * ug = "group"; + * else + * ug = "user"; + */ - if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED) + if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) propname = "used"; else propname = "quota"; - if (name == NULL) { - (void) snprintf(namebuf, sizeof (namebuf), - "%llu", (longlong_t)rid); + (void) snprintf(namebuf, sizeof (namebuf), "%u", rid); + if (name == NULL) name = namebuf; - } - zfs_nicenum(space, sizebuf, sizeof (sizebuf)); - (void) printf("%s %s %s%c%s %s\n", propname, ug, domain, - domain[0] ? '-' : ' ', name, sizebuf); + if (cb->cb_nicenum) + zfs_nicenum(space, sizebuf, sizeof (sizebuf)); + else + (void) sprintf(sizebuf, "%llu", space); - return (0); -} + node = safe_malloc(sizeof (us_node_t)); + uu_avl_node_init(node, &node->usn_avlnode, avl_pool); -static int -zfs_do_userspace(int argc, char **argv) -{ - zfs_handle_t *zhp; - zfs_userquota_prop_t p; - int error; + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { + free(node); + return (-1); + } - /* - * Try the python version. If the execv fails, we'll continue - * and do a simplistic implementation. 
- */ - (void) execv(pypath, argv-1); + if (nvlist_add_uint32(props, "type", type) != 0) + nomem(); - (void) printf("internal error: %s not found\n" - "falling back on built-in implementation, " - "some features will not work\n", pypath); + if (cb->cb_numname) { + if (nvlist_add_uint32(props, "name", rid) != 0) + nomem(); + namelen = strlen(namebuf); + } else { + if (nvlist_add_string(props, "name", name) != 0) + nomem(); + namelen = strlen(name); + } - if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL) - return (1); + typestr = us_type2str(type); + typelen = strlen(gettext(typestr)); + if (typelen > cb->cb_max_typelen) + cb->cb_max_typelen = typelen; - (void) printf("PROP TYPE NAME VALUE\n"); + if (namelen > cb->cb_max_namelen) + cb->cb_max_namelen = namelen; - for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { - error = zfs_userspace(zhp, p, userspace_cb, &p); - if (error) - break; + sizelen = strlen(sizebuf); + if (0 == strcmp(propname, "used")) { + if (sizelen > cb->cb_max_usedlen) + cb->cb_max_usedlen = sizelen; + } else { + if (sizelen > cb->cb_max_quotalen) + cb->cb_max_quotalen = sizelen; } - return (error); -} -/* - * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...] - * [-s property [-s property]...] [-S property [-S property]...] - * ... 
+ node->usn_nvl = props; + + n = uu_avl_find(avl, node, &sortinfo, &idx); + if (n == NULL) + uu_avl_insert(avl, node, idx); + else { + nvlist_free(props); + free(node); + node = n; + props = node->usn_nvl; + } + + if (nvlist_add_uint64(props, propname, space) != 0) + nomem(); + + return (0); +} + +static inline boolean_t +usprop_check(zfs_userquota_prop_t p, unsigned types, unsigned props) +{ + unsigned type; + unsigned prop; + + switch (p) { + case ZFS_PROP_USERUSED: + type = USTYPE_USR; + prop = USPROP_USED; + break; + case ZFS_PROP_USERQUOTA: + type = USTYPE_USR; + prop = USPROP_QUOTA; + break; + case ZFS_PROP_GROUPUSED: + type = USTYPE_GRP; + prop = USPROP_USED; + break; + case ZFS_PROP_GROUPQUOTA: + type = USTYPE_GRP; + prop = USPROP_QUOTA; + break; + default: /* ALL */ + return (B_TRUE); + }; + + return (type & types && prop & props); +} + +#define USFIELD_TYPE (1 << 0) +#define USFIELD_NAME (1 << 1) +#define USFIELD_USED (1 << 2) +#define USFIELD_QUOTA (1 << 3) +#define USFIELD_ALL (USFIELD_TYPE | USFIELD_NAME | USFIELD_USED | USFIELD_QUOTA) + +static int +parsefields(unsigned *fieldsp, char **names, unsigned *bits, size_t len) +{ + char *field = optarg; + char *delim; + + do { + int i; + boolean_t found = B_FALSE; + delim = strchr(field, ','); + if (delim != NULL) + *delim = '\0'; + + for (i = 0; i < len; i++) + if (0 == strcmp(field, names[i])) { + found = B_TRUE; + *fieldsp |= bits[i]; + break; + } + + if (!found) { + (void) fprintf(stderr, gettext("invalid type '%s'" + "for -t option\n"), field); + return (-1); + } + + field = delim + 1; + } while (delim); + + return (0); +} + + +static char *type_names[] = { "posixuser", "smbuser", "posixgroup", "smbgroup", + "all" }; +static unsigned type_bits[] = { + USTYPE_PSX_USR, + USTYPE_SMB_USR, + USTYPE_PSX_GRP, + USTYPE_SMB_GRP, + USTYPE_ALL +}; + +static char *us_field_names[] = { "type", "name", "used", "quota" }; +static unsigned us_field_bits[] = { + USFIELD_TYPE, + USFIELD_NAME, + USFIELD_USED, + 
USFIELD_QUOTA +}; + +static void +print_us_node(boolean_t scripted, boolean_t parseable, unsigned fields, + size_t type_width, size_t name_width, size_t used_width, + size_t quota_width, us_node_t *node) +{ + nvlist_t *nvl = node->usn_nvl; + nvpair_t *nvp = NULL; + char valstr[ZFS_MAXNAMELEN]; + boolean_t first = B_TRUE; + boolean_t quota_found = B_FALSE; + + if (fields & USFIELD_QUOTA && !nvlist_exists(nvl, "quota")) + if (nvlist_add_string(nvl, "quota", "none") != 0) + nomem(); + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + char *pname = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + uint32_t val32 = 0; + uint64_t val64 = 0; + char *strval = NULL; + unsigned field = 0; + unsigned width = 0; + int i; + for (i = 0; i < 4; i++) { + if (0 == strcmp(pname, us_field_names[i])) { + field = us_field_bits[i]; + break; + } + } + + if (!(field & fields)) + continue; + + switch (type) { + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &val32); + break; + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &val64); + break; + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &strval); + break; + default: + (void) fprintf(stderr, "Invalid data type\n"); + } + + if (!first) + if (scripted) + (void) printf("\t"); + else + (void) printf(" "); + + switch (field) { + case USFIELD_TYPE: + strval = (char *)us_type2str(val32); + width = type_width; + break; + case USFIELD_NAME: + if (type == DATA_TYPE_UINT64) { + (void) sprintf(valstr, "%llu", val64); + strval = valstr; + } + width = name_width; + break; + case USFIELD_USED: + case USFIELD_QUOTA: + if (type == DATA_TYPE_UINT64) { + (void) nvpair_value_uint64(nvp, &val64); + if (parseable) + (void) sprintf(valstr, "%llu", val64); + else + zfs_nicenum(val64, valstr, + sizeof (valstr)); + strval = valstr; + } + + if (field == USFIELD_USED) + width = used_width; + else { + quota_found = B_FALSE; + width = quota_width; + } + + break; + } + + if (field == USFIELD_QUOTA && !quota_found) + 
(void) printf("%*s", width, strval); + else { + if (type == DATA_TYPE_STRING) + (void) printf("%-*s", width, strval); + else + (void) printf("%*s", width, strval); + } + + first = B_FALSE; + + } + + (void) printf("\n"); +} + +static void +print_us(boolean_t scripted, boolean_t parsable, unsigned fields, + unsigned type_width, unsigned name_width, unsigned used_width, + unsigned quota_width, boolean_t rmnode, uu_avl_t *avl) +{ + static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" }; + us_node_t *node; + const char *col; + int i; + size_t width[4] = { type_width, name_width, used_width, quota_width }; + + if (!scripted) { + boolean_t first = B_TRUE; + for (i = 0; i < 4; i++) { + unsigned field = us_field_bits[i]; + if (!(field & fields)) + continue; + + col = gettext(us_field_hdr[i]); + if (field == USFIELD_TYPE || field == USFIELD_NAME) + (void) printf(first?"%-*s":" %-*s", width[i], + col); + else + (void) printf(first?"%*s":" %*s", width[i], + col); + first = B_FALSE; + } + (void) printf("\n"); + } + + for (node = uu_avl_first(avl); node != NULL; + node = uu_avl_next(avl, node)) { + print_us_node(scripted, parsable, fields, type_width, + name_width, used_width, used_width, node); + if (rmnode) + nvlist_free(node->usn_nvl); + } +} + +static int +zfs_do_userspace(int argc, char **argv) +{ + zfs_handle_t *zhp; + zfs_userquota_prop_t p; + uu_avl_pool_t *avl_pool; + uu_avl_t *avl_tree; + uu_avl_walk_t *walk; + + char *cmd; + boolean_t scripted = B_FALSE; + boolean_t prtnum = B_FALSE; + boolean_t parseable = B_FALSE; + boolean_t sid2posix = B_FALSE; + int error = 0; + int c; + zfs_sort_column_t *default_sortcol = NULL; + zfs_sort_column_t *sortcol = NULL; + unsigned types = USTYPE_PSX_USR | USTYPE_SMB_USR; + unsigned fields = 0; + unsigned props = USPROP_USED | USPROP_QUOTA; + us_cbdata_t cb; + us_node_t *node; + boolean_t resort_avl = B_FALSE; + + if (argc < 2) + usage(B_FALSE); + + cmd = argv[0]; + if (0 == strcmp(cmd, "groupspace")) + /* toggle default 
group types */ + types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; + + /* check options */ + while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { + switch (c) { + case 'n': + prtnum = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'p': + parseable = B_TRUE; + break; + case 'o': + if (parsefields(&fields, us_field_names, us_field_bits, + 4) != 0) + return (1); + break; + case 's': + if (zfs_add_sort_column(&sortcol, optarg, + B_FALSE) != 0) { + (void) fprintf(stderr, + gettext("invalid property '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case 'S': + if (zfs_add_sort_column(&sortcol, optarg, + B_TRUE) != 0) { + (void) fprintf(stderr, + gettext("invalid property '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case 't': + if (parsefields(&types, type_names, type_bits, 5)) + return (1); + break; + case 'i': + sid2posix = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* ok, now we have sorted by default colums (type,name) avl tree */ + if (sortcol) { + zfs_sort_column_t *sc; + for (sc = sortcol; sc; sc = sc->sc_next) { + if (sc->sc_prop == ZFS_PROP_QUOTA) { + resort_avl = B_TRUE; + break; + } + } + } + + if (!fields) + fields = USFIELD_ALL; + + if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL) + return (1); + + if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), + offsetof(us_node_t, usn_avlnode), + us_compare, UU_DEFAULT)) == NULL) + nomem(); + if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) + nomem(); + + if (sortcol && !resort_avl) + cb.cb_sortcol = sortcol; + else { + (void) zfs_add_sort_column(&default_sortcol, "type", B_FALSE); + (void) zfs_add_sort_column(&default_sortcol, "name", B_FALSE); + cb.cb_sortcol = default_sortcol; + } + 
cb.cb_numname = prtnum; + cb.cb_nicenum = !parseable; + cb.cb_avl_pool = avl_pool; + cb.cb_avl = avl_tree; + cb.cb_sid2posix = sid2posix; + cb.cb_max_typelen = strlen(gettext("TYPE")); + cb.cb_max_namelen = strlen(gettext("NAME")); + cb.cb_max_usedlen = strlen(gettext("USED")); + cb.cb_max_quotalen = strlen(gettext("QUOTA")); + + for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { + if (!usprop_check(p, types, props)) + continue; + + cb.cb_prop = p; + error = zfs_userspace(zhp, p, userspace_cb, &cb); + if (error) + break; + } + + + if (resort_avl) { + us_node_t *node; + us_node_t *rmnode; + uu_list_pool_t *listpool; + uu_list_t *list; + uu_avl_index_t idx = 0; + uu_list_index_t idx2 = 0; + listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), + offsetof(us_node_t, usn_listnode), NULL, + UU_DEFAULT); + list = uu_list_create(listpool, NULL, UU_DEFAULT); + + node = uu_avl_first(avl_tree); + uu_list_node_init(node, &node->usn_listnode, listpool); + while (node != NULL) { + rmnode = node; + node = uu_avl_next(avl_tree, node); + uu_avl_remove(avl_tree, rmnode); + if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) { + uu_list_insert(list, rmnode, idx2); + } + } + + for (node = uu_list_first(list); node != NULL; + node = uu_list_next(list, node)) { + us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; + if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == + NULL) + uu_avl_insert(avl_tree, node, idx); + } + + uu_list_destroy(list); + } + + /* print & free node`s nvlist memory */ + print_us(scripted, parseable, fields, cb.cb_max_typelen, + cb.cb_max_namelen, cb.cb_max_usedlen, + cb.cb_max_quotalen, B_TRUE, cb.cb_avl); + + if (sortcol) + zfs_free_sort_columns(sortcol); + zfs_free_sort_columns(default_sortcol); + + /* + * Finally, clean up the AVL tree. 
+ */ + if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(cb.cb_avl, node); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(avl_tree); + uu_avl_pool_destroy(avl_pool); + + return (error); +} + +/* + * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...] + * [-s property [-s property]...] [-S property [-S property]...] + * ... * * -r Recurse over all children * -d Limit recursion by depth. @@ -2005,6 +2862,13 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) else propstr = property; right_justify = B_TRUE; + } else if (zfs_prop_written(pl->pl_user_prop)) { + if (zfs_prop_get_written(zhp, pl->pl_user_prop, + property, sizeof (property), B_FALSE) != 0) + propstr = "-"; + else + propstr = property; + right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) @@ -2065,7 +2929,7 @@ zfs_do_list(int argc, char **argv) list_cbdata_t cb = { 0 }; char *value; int limit = 0; - int ret; + int ret = 0; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; @@ -2180,8 +3044,8 @@ zfs_do_list(int argc, char **argv) } /* - * zfs rename - * zfs rename -p + * zfs rename [-f] + * zfs rename [-f] -p * zfs rename -r * * Renames the given dataset to another of the same type. 
@@ -2194,12 +3058,13 @@ zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; int c; - int ret; + int ret = 0; boolean_t recurse = B_FALSE; boolean_t parents = B_FALSE; + boolean_t force_unmount = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "pr")) != -1) { + while ((c = getopt(argc, argv, "prf")) != -1) { switch (c) { case 'p': parents = B_TRUE; @@ -2207,6 +3072,9 @@ zfs_do_rename(int argc, char **argv) case 'r': recurse = B_TRUE; break; + case 'f': + force_unmount = B_TRUE; + break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -2257,7 +3125,7 @@ zfs_do_rename(int argc, char **argv) return (1); } - ret = (zfs_rename(zhp, argv[1], recurse) != 0); + ret = (zfs_rename(zhp, argv[1], recurse, force_unmount) != 0); zfs_close(zhp); return (ret); @@ -2273,7 +3141,7 @@ static int zfs_do_promote(int argc, char **argv) { zfs_handle_t *zhp; - int ret; + int ret = 0; /* check options */ if (argc > 1 && argv[1][0] == '-') { @@ -2394,7 +3262,7 @@ rollback_check(zfs_handle_t *zhp, void *data) static int zfs_do_rollback(int argc, char **argv) { - int ret; + int ret = 0; int c; boolean_t force = B_FALSE; rollback_cbdata_t cb = { 0 }; @@ -2512,7 +3380,7 @@ static int zfs_do_set(int argc, char **argv) { set_cbdata_t cb; - int ret; + int ret = 0; /* check for options */ if (argc > 1 && argv[1][0] == '-') { @@ -2566,7 +3434,7 @@ static int zfs_do_snapshot(int argc, char **argv) { boolean_t recursive = B_FALSE; - int ret; + int ret = 0; char c; nvlist_t *props; @@ -2616,9 +3484,6 @@ usage: } /* - * zfs send [-vDp] -R [-i|-I <@snap>] - * zfs send [-vDp] [-i|-I <@snap>] - * * Send a backup stream to stdout. 
*/ static int @@ -2630,11 +3495,11 @@ zfs_do_send(int argc, char **argv) zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; - nvlist_t *dbgnv; + nvlist_t *dbgnv = NULL; boolean_t extraverbose = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpvnP")) != -1) { switch (c) { case 'i': if (fromname) @@ -2653,14 +3518,22 @@ zfs_do_send(int argc, char **argv) case 'p': flags.props = B_TRUE; break; + case 'P': + flags.parsable = B_TRUE; + flags.verbose = B_TRUE; + break; case 'v': if (flags.verbose) extraverbose = B_TRUE; flags.verbose = B_TRUE; + flags.progress = B_TRUE; break; case 'D': flags.dedup = B_TRUE; break; + case 'n': + flags.dryrun = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -2686,7 +3559,7 @@ zfs_do_send(int argc, char **argv) usage(B_FALSE); } - if (isatty(STDOUT_FILENO)) { + if (!flags.dryrun && isatty(STDOUT_FILENO)) { (void) fprintf(stderr, gettext("Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n")); @@ -2705,97 +3578,1389 @@ zfs_do_send(int argc, char **argv) if (zhp == NULL) return (1); - /* - * If they specified the full path to the snapshot, chop off - * everything except the short name of the snapshot, but special - * case if they specify the origin. - */ - if (fromname && (cp = strchr(fromname, '@')) != NULL) { - char origin[ZFS_MAXNAMELEN]; - zprop_source_t src; + /* + * If they specified the full path to the snapshot, chop off + * everything except the short name of the snapshot, but special + * case if they specify the origin. 
+ */ + if (fromname && (cp = strchr(fromname, '@')) != NULL) { + char origin[ZFS_MAXNAMELEN]; + zprop_source_t src; + + (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, + origin, sizeof (origin), &src, NULL, 0, B_FALSE); + + if (strcmp(origin, fromname) == 0) { + fromname = NULL; + flags.fromorigin = B_TRUE; + } else { + *cp = '\0'; + if (cp != fromname && strcmp(argv[0], fromname)) { + (void) fprintf(stderr, + gettext("incremental source must be " + "in same filesystem\n")); + usage(B_FALSE); + } + fromname = cp + 1; + if (strchr(fromname, '@') || strchr(fromname, '/')) { + (void) fprintf(stderr, + gettext("invalid incremental source\n")); + usage(B_FALSE); + } + } + } + + if (flags.replicate && fromname == NULL) + flags.doall = B_TRUE; + + err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, + extraverbose ? &dbgnv : NULL); + + if (extraverbose && dbgnv != NULL) { + /* + * dump_nvlist prints to stdout, but that's been + * redirected to a file. Make it print to stderr + * instead. + */ + (void) dup2(STDERR_FILENO, STDOUT_FILENO); + dump_nvlist(dbgnv, 0); + nvlist_free(dbgnv); + } + zfs_close(zhp); + + return (err != 0); +} + +/* + * zfs receive [-vnFu] [-d | -e] + * + * Restore a backup stream from stdin. 
+ */ +static int +zfs_do_receive(int argc, char **argv) +{ + int c, err; + recvflags_t flags = { 0 }; + + /* check options */ + while ((c = getopt(argc, argv, ":denuvF")) != -1) { + switch (c) { + case 'd': + flags.isprefix = B_TRUE; + break; + case 'e': + flags.isprefix = B_TRUE; + flags.istail = B_TRUE; + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'u': + flags.nomount = B_TRUE; + break; + case 'v': + flags.verbose = B_TRUE; + break; + case 'F': + flags.force = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing snapshot argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (isatty(STDIN_FILENO)) { + (void) fprintf(stderr, + gettext("Error: Backup stream can not be read " + "from a terminal.\n" + "You must redirect standard input.\n")); + return (1); + } + + err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL); + + return (err != 0); +} + +/* + * allow/unallow stuff + */ +/* copied from zfs/sys/dsl_deleg.h */ +#define ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? 
*/ +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" +#define ZFS_DELEG_PERM_DIFF "diff" + +#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE + +static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { + { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, + { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, + { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, + { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, + { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, + { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, + { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, + { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, + { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, + { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, + { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, + { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, + { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, + { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, + { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, + + { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, + { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, + { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, + { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, + { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, + { NULL, ZFS_DELEG_NOTE_NONE } +}; + +/* permission structure */ +typedef struct deleg_perm { + zfs_deleg_who_type_t dp_who_type; + const char *dp_name; + boolean_t dp_local; + boolean_t dp_descend; +} deleg_perm_t; + +/* */ +typedef struct deleg_perm_node { + deleg_perm_t dpn_perm; + + uu_avl_node_t dpn_avl_node; +} deleg_perm_node_t; + +typedef struct fs_perm fs_perm_t; + +/* permissions set */ +typedef struct who_perm { + zfs_deleg_who_type_t who_type; + const char *who_name; /* id */ + char who_ug_name[256]; /* 
user/group name */ + fs_perm_t *who_fsperm; /* uplink */ + + uu_avl_t *who_deleg_perm_avl; /* permissions */ +} who_perm_t; + +/* */ +typedef struct who_perm_node { + who_perm_t who_perm; + uu_avl_node_t who_avl_node; +} who_perm_node_t; + +typedef struct fs_perm_set fs_perm_set_t; +/* fs permissions */ +struct fs_perm { + const char *fsp_name; + + uu_avl_t *fsp_sc_avl; /* sets,create */ + uu_avl_t *fsp_uge_avl; /* user,group,everyone */ + + fs_perm_set_t *fsp_set; /* uplink */ +}; + +/* */ +typedef struct fs_perm_node { + fs_perm_t fspn_fsperm; + uu_avl_t *fspn_avl; + + uu_list_node_t fspn_list_node; +} fs_perm_node_t; + +/* top level structure */ +struct fs_perm_set { + uu_list_pool_t *fsps_list_pool; + uu_list_t *fsps_list; /* list of fs_perms */ + + uu_avl_pool_t *fsps_named_set_avl_pool; + uu_avl_pool_t *fsps_who_perm_avl_pool; + uu_avl_pool_t *fsps_deleg_perm_avl_pool; +}; + +static inline const char * +deleg_perm_type(zfs_deleg_note_t note) +{ + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + case ZFS_DELEG_NOTE_GROUPUSED: + case ZFS_DELEG_NOTE_USERPROP: + case ZFS_DELEG_NOTE_USERQUOTA: + case ZFS_DELEG_NOTE_USERUSED: + /* other */ + return (gettext("other")); + default: + return (gettext("subcommand")); + } +} + +static int inline +who_type2weight(zfs_deleg_who_type_t who_type) +{ + int res; + switch (who_type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + res = 0; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + res = 1; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + res = 2; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + res = 3; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + res = 4; + break; + default: + res = -1; + } + + return (res); +} + +/* ARGSUSED */ +static int +who_perm_compare(const void *larg, const void *rarg, void *unused) +{ + const who_perm_node_t *l = larg; + const who_perm_node_t *r = rarg; 
+ zfs_deleg_who_type_t ltype = l->who_perm.who_type; + zfs_deleg_who_type_t rtype = r->who_perm.who_type; + int lweight = who_type2weight(ltype); + int rweight = who_type2weight(rtype); + int res = lweight - rweight; + if (res == 0) + res = strncmp(l->who_perm.who_name, r->who_perm.who_name, + ZFS_MAX_DELEG_NAME-1); + + if (res == 0) + return (0); + if (res > 0) + return (1); + else + return (-1); +} + +/* ARGSUSED */ +static int +deleg_perm_compare(const void *larg, const void *rarg, void *unused) +{ + const deleg_perm_node_t *l = larg; + const deleg_perm_node_t *r = rarg; + int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, + ZFS_MAX_DELEG_NAME-1); + + if (res == 0) + return (0); + + if (res > 0) + return (1); + else + return (-1); +} + +static inline void +fs_perm_set_init(fs_perm_set_t *fspset) +{ + bzero(fspset, sizeof (fs_perm_set_t)); + + if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", + sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), + NULL, UU_DEFAULT)) == NULL) + nomem(); + if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( + "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( + who_perm_node_t, who_avl_node), who_perm_compare, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( + "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( + who_perm_node_t, who_avl_node), who_perm_compare, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( + "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( + deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) + == NULL) + nomem(); +} + +static inline void fs_perm_fini(fs_perm_t *); +static inline void who_perm_fini(who_perm_t *); + +static inline void +fs_perm_set_fini(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = 
uu_list_first(fspset->fsps_list); + + while (node != NULL) { + fs_perm_node_t *next_node = + uu_list_next(fspset->fsps_list, node); + fs_perm_t *fsperm = &node->fspn_fsperm; + fs_perm_fini(fsperm); + uu_list_remove(fspset->fsps_list, node); + free(node); + node = next_node; + } + + uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); + uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); + uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); +} + +static inline void +deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, + const char *name) +{ + deleg_perm->dp_who_type = type; + deleg_perm->dp_name = name; +} + +static inline void +who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, + zfs_deleg_who_type_t type, const char *name) +{ + uu_avl_pool_t *pool; + pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; + + bzero(who_perm, sizeof (who_perm_t)); + + if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, + UU_DEFAULT)) == NULL) + nomem(); + + who_perm->who_type = type; + who_perm->who_name = name; + who_perm->who_fsperm = fsperm; +} + +static inline void +who_perm_fini(who_perm_t *who_perm) +{ + deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); + + while (node != NULL) { + deleg_perm_node_t *next_node = + uu_avl_next(who_perm->who_deleg_perm_avl, node); + + uu_avl_remove(who_perm->who_deleg_perm_avl, node); + free(node); + node = next_node; + } + + uu_avl_destroy(who_perm->who_deleg_perm_avl); +} + +static inline void +fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) +{ + uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; + uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; + + bzero(fsperm, sizeof (fs_perm_t)); + + if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) + == NULL) + nomem(); + + if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) + == NULL) + nomem(); + + fsperm->fsp_set = fspset; + fsperm->fsp_name = fsname; +} + +static inline 
void +fs_perm_fini(fs_perm_t *fsperm) +{ + who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); + while (node != NULL) { + who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, + node); + who_perm_t *who_perm = &node->who_perm; + who_perm_fini(who_perm); + uu_avl_remove(fsperm->fsp_sc_avl, node); + free(node); + node = next_node; + } + + node = uu_avl_first(fsperm->fsp_uge_avl); + while (node != NULL) { + who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, + node); + who_perm_t *who_perm = &node->who_perm; + who_perm_fini(who_perm); + uu_avl_remove(fsperm->fsp_uge_avl, node); + free(node); + node = next_node; + } + + uu_avl_destroy(fsperm->fsp_sc_avl); + uu_avl_destroy(fsperm->fsp_uge_avl); +} + +static void inline +set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, + zfs_deleg_who_type_t who_type, const char *name, char locality) +{ + uu_avl_index_t idx = 0; + + deleg_perm_node_t *found_node = NULL; + deleg_perm_t *deleg_perm = &node->dpn_perm; + + deleg_perm_init(deleg_perm, who_type, name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) + uu_avl_insert(avl, node, idx); + else { + node = found_node; + deleg_perm = &node->dpn_perm; + } + + + switch (locality) { + case ZFS_DELEG_LOCAL: + deleg_perm->dp_local = B_TRUE; + break; + case ZFS_DELEG_DESCENDENT: + deleg_perm->dp_descend = B_TRUE; + break; + case ZFS_DELEG_NA: + break; + default: + assert(B_FALSE); /* invalid locality */ + } +} + +static inline int +parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) +{ + nvpair_t *nvp = NULL; + fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; + uu_avl_t *avl = who_perm->who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_perm->who_type; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *name = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; + deleg_perm_node_t *node = + safe_malloc(sizeof 
(deleg_perm_node_t)); + + assert(type == DATA_TYPE_BOOLEAN); + + uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); + set_deleg_perm_node(avl, node, who_type, name, locality); + } + + return (0); +} + +static inline int +parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + fs_perm_set_t *fspset = fsperm->fsp_set; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *name = nvpair_name(nvp); + uu_avl_t *avl = NULL; + uu_avl_pool_t *avl_pool; + zfs_deleg_who_type_t perm_type = name[0]; + char perm_locality = name[1]; + const char *perm_name = name + 3; + boolean_t is_set = B_TRUE; + who_perm_t *who_perm = NULL; + + assert('$' == name[2]); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + switch (perm_type) { + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_NAMED_SET: + case ZFS_DELEG_NAMED_SET_SETS: + avl_pool = fspset->fsps_named_set_avl_pool; + avl = fsperm->fsp_sc_avl; + break; + case ZFS_DELEG_USER: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + avl_pool = fspset->fsps_who_perm_avl_pool; + avl = fsperm->fsp_uge_avl; + break; + } + + if (is_set) { + who_perm_node_t *found_node = NULL; + who_perm_node_t *node = safe_malloc( + sizeof (who_perm_node_t)); + who_perm = &node->who_perm; + uu_avl_index_t idx = 0; + + uu_avl_node_init(node, &node->who_avl_node, avl_pool); + who_perm_init(who_perm, fsperm, perm_type, perm_name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) { + if (avl == fsperm->fsp_uge_avl) { + uid_t rid = 0; + struct passwd *p = NULL; + struct group *g = NULL; + const char *nice_name = NULL; + + switch (perm_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + rid = atoi(perm_name); + p = getpwuid(rid); + if (p) + nice_name = p->pw_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + rid = 
atoi(perm_name); + g = getgrgid(rid); + if (g) + nice_name = g->gr_name; + break; + } + + if (nice_name != NULL) + (void) strlcpy( + node->who_perm.who_ug_name, + nice_name, 256); + } + + uu_avl_insert(avl, node, idx); + } else { + node = found_node; + who_perm = &node->who_perm; + } + } + + (void) parse_who_perm(who_perm, nvl2, perm_locality); + } + + return (0); +} + +static inline int +parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + uu_avl_index_t idx = 0; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *fsname = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + fs_perm_t *fsperm = NULL; + fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); + if (node == NULL) + nomem(); + + fsperm = &node->fspn_fsperm; + + assert(DATA_TYPE_NVLIST == type); + + uu_list_node_init(node, &node->fspn_list_node, + fspset->fsps_list_pool); + + idx = uu_list_numnodes(fspset->fsps_list); + fs_perm_init(fsperm, fspset, fsname); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + (void) parse_fs_perm(fsperm, nvl2); + + uu_list_insert(fspset->fsps_list, node, idx); + } + + return (0); +} + +static inline const char * +deleg_perm_comment(zfs_deleg_note_t note) +{ + const char *str = ""; + + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + case ZFS_DELEG_NOTE_ALLOW: + str = gettext("Must also have the permission that is being" + "\n\t\t\t\tallowed"); + break; + case ZFS_DELEG_NOTE_CLONE: + str = gettext("Must also have the 'create' ability and 'mount'" + "\n\t\t\t\tability in the origin file system"); + break; + case ZFS_DELEG_NOTE_CREATE: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DESTROY: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DIFF: + str = gettext("Allows lookup of paths within a dataset;" + "\n\t\t\t\tgiven an object number. 
Ordinary users need this" + "\n\t\t\t\tin order to use zfs diff"); + break; + case ZFS_DELEG_NOTE_HOLD: + str = gettext("Allows adding a user hold to a snapshot"); + break; + case ZFS_DELEG_NOTE_MOUNT: + str = gettext("Allows mount/umount of ZFS datasets"); + break; + case ZFS_DELEG_NOTE_PROMOTE: + str = gettext("Must also have the 'mount'\n\t\t\t\tand" + " 'promote' ability in the origin file system"); + break; + case ZFS_DELEG_NOTE_RECEIVE: + str = gettext("Must also have the 'mount' and 'create'" + " ability"); + break; + case ZFS_DELEG_NOTE_RELEASE: + str = gettext("Allows releasing a user hold which\n\t\t\t\t" + "might destroy the snapshot"); + break; + case ZFS_DELEG_NOTE_RENAME: + str = gettext("Must also have the 'mount' and 'create'" + "\n\t\t\t\tability in the new parent"); + break; + case ZFS_DELEG_NOTE_ROLLBACK: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SEND: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SHARE: + str = gettext("Allows sharing file systems over NFS or SMB" + "\n\t\t\t\tprotocols"); + break; + case ZFS_DELEG_NOTE_SNAPSHOT: + str = gettext(""); + break; +/* + * case ZFS_DELEG_NOTE_VSCAN: + * str = gettext(""); + * break; + */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + str = gettext("Allows accessing any groupquota@... property"); + break; + case ZFS_DELEG_NOTE_GROUPUSED: + str = gettext("Allows reading any groupused@... property"); + break; + case ZFS_DELEG_NOTE_USERPROP: + str = gettext("Allows changing any user property"); + break; + case ZFS_DELEG_NOTE_USERQUOTA: + str = gettext("Allows accessing any userquota@... property"); + break; + case ZFS_DELEG_NOTE_USERUSED: + str = gettext("Allows reading any userused@... 
property"); + break; + /* other */ + default: + str = ""; + } + + return (str); +} + +struct allow_opts { + boolean_t local; + boolean_t descend; + boolean_t user; + boolean_t group; + boolean_t everyone; + boolean_t create; + boolean_t set; + boolean_t recursive; /* unallow only */ + boolean_t prt_usage; + + boolean_t prt_perms; + char *who; + char *perms; + const char *dataset; +}; + +static inline int +prop_cmp(const void *a, const void *b) +{ + const char *str1 = *(const char **)a; + const char *str2 = *(const char **)b; + return (strcmp(str1, str2)); +} + +static void +allow_usage(boolean_t un, boolean_t requested, const char *msg) +{ + const char *opt_desc[] = { + "-h", gettext("show this help message and exit"), + "-l", gettext("set permission locally"), + "-d", gettext("set permission for descents"), + "-u", gettext("set permission for user"), + "-g", gettext("set permission for group"), + "-e", gettext("set permission for everyone"), + "-c", gettext("set create time permission"), + "-s", gettext("define permission set"), + /* unallow only */ + "-r", gettext("remove permissions recursively"), + }; + size_t unallow_size = sizeof (opt_desc) / sizeof (char *); + size_t allow_size = unallow_size - 2; + const char *props[ZFS_NUM_PROPS]; + int i; + size_t count = 0; + FILE *fp = requested ? stdout : stderr; + zprop_desc_t *pdtbl = zfs_prop_get_table(); + const char *fmt = gettext("%-16s %-14s\t%s\n"); + + (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : + HELP_ALLOW)); + (void) fprintf(fp, gettext("Options:\n")); + for (int i = 0; i < (un ? 
unallow_size : allow_size); i++) { + const char *opt = opt_desc[i++]; + const char *optdsc = opt_desc[i]; + (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); + } + + (void) fprintf(fp, gettext("\nThe following permissions are " + "supported:\n\n")); + (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), + gettext("NOTES")); + for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { + const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; + zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; + const char *perm_type = deleg_perm_type(perm_note); + const char *perm_comment = deleg_perm_comment(perm_note); + (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); + } + + for (i = 0; i < ZFS_NUM_PROPS; i++) { + zprop_desc_t *pd = &pdtbl[i]; + if (pd->pd_visible != B_TRUE) + continue; + + if (pd->pd_attr == PROP_READONLY) + continue; + + props[count++] = pd->pd_name; + } + props[count] = NULL; + + qsort(props, count, sizeof (char *), prop_cmp); + + for (i = 0; i < count; i++) + (void) fprintf(fp, fmt, props[i], gettext("property"), ""); + + if (msg != NULL) + (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); + + exit(requested ? 0 : 2); +} + +static inline const char * +munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, + char **permsp) +{ + if (un && argc == expected_argc - 1) + *permsp = NULL; + else if (argc == expected_argc) + *permsp = argv[argc - 2]; + else + allow_usage(un, B_FALSE, + gettext("wrong number of parameters\n")); + + return (argv[argc - 1]); +} + +static void +parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) +{ + int uge_sum = opts->user + opts->group + opts->everyone; + int csuge_sum = opts->create + opts->set + uge_sum; + int ldcsuge_sum = csuge_sum + opts->local + opts->descend; + int all_sum = un ? 
ldcsuge_sum + opts->recursive : ldcsuge_sum; + + if (uge_sum > 1) + allow_usage(un, B_FALSE, + gettext("-u, -g, and -e are mutually exclusive\n")); + + if (opts->prt_usage) + if (argc == 0 && all_sum == 0) + allow_usage(un, B_TRUE, NULL); + else + usage(B_FALSE); + + if (opts->set) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -s\n")); + + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + if (argv[0][0] != '@') + allow_usage(un, B_FALSE, + gettext("invalid set name: missing '@' prefix\n")); + opts->who = argv[0]; + } else if (opts->create) { + if (ldcsuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -c\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (opts->everyone) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -e\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") + == 0) { + opts->everyone = B_TRUE; + argc--; + argv++; + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (argc == 1 && !un) { + opts->prt_perms = B_TRUE; + opts->dataset = argv[argc-1]; + } else { + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + opts->who = argv[0]; + } + + if (!opts->local && !opts->descend) { + opts->local = B_TRUE; + opts->descend = B_TRUE; + } +} + +static void +store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, + const char *who, char *perms, nvlist_t *top_nvl) +{ + int i; + char ld[2] = { '\0', '\0' }; + char who_buf[ZFS_MAXNAMELEN+32]; + char base_type; + char set_type; + nvlist_t *base_nvl = NULL; + nvlist_t *set_nvl = NULL; + nvlist_t *nvl; + + if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + switch (type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + 
set_type = ZFS_DELEG_NAMED_SET_SETS; + base_type = ZFS_DELEG_NAMED_SET; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + set_type = ZFS_DELEG_CREATE_SETS; + base_type = ZFS_DELEG_CREATE; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + set_type = ZFS_DELEG_USER_SETS; + base_type = ZFS_DELEG_USER; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + set_type = ZFS_DELEG_GROUP_SETS; + base_type = ZFS_DELEG_GROUP; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + set_type = ZFS_DELEG_EVERYONE_SETS; + base_type = ZFS_DELEG_EVERYONE; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + } + + if (perms != NULL) { + char *curr = perms; + char *end = curr + strlen(perms); + + while (curr < end) { + char *delim = strchr(curr, ','); + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + if (curr[0] == '@') + nvl = set_nvl; + else + nvl = base_nvl; + + (void) nvlist_add_boolean(nvl, curr); + if (delim != end) + *delim = ','; + curr = delim + 1; + } + + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (!nvlist_empty(base_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$%s", + base_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + base_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + base_nvl); + } + + + if (!nvlist_empty(set_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$%s", + set_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + set_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + set_nvl); + } + } + } else { + for (i = 0; i < 2; i++) { + char 
locality = ld[i]; + if (locality == 0) + continue; + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", base_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", base_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", set_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", set_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + } + } +} + +static int +construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) +{ + if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + if (opts->set) { + store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, + opts->descend, opts->who, opts->perms, *nvlp); + } else if (opts->create) { + store_allow_perm(ZFS_DELEG_CREATE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else if (opts->everyone) { + store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else { + char *curr = opts->who; + char *end = curr + strlen(curr); + + while (curr < end) { + const char *who; + zfs_deleg_who_type_t who_type; + char *endch; + char *delim = strchr(curr, ','); + char errbuf[256]; + char id[64]; + struct passwd *p = NULL; + struct group *g = NULL; + + uid_t rid; + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + rid = (uid_t)strtol(curr, &endch, 0); + if (opts->user) { + who_type = ZFS_DELEG_USER; + if (*endch != '\0') + p = getpwnam(curr); + else + p = getpwuid(rid); + + if (p != NULL) + rid = p->pw_uid; + else { + (void) snprintf(errbuf, 256, gettext( + "invalid user %s"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else if (opts->group) { + who_type = ZFS_DELEG_GROUP; + if (*endch != '\0') + g = getgrnam(curr); + else + g = getgrgid(rid); + + if (g != NULL) + rid = g->gr_gid; + else { + (void) snprintf(errbuf, 256, gettext( + "invalid group %s"), curr); + 
allow_usage(un, B_TRUE, errbuf); + } + } else { + if (*endch != '\0') { + p = getpwnam(curr); + } else { + p = getpwuid(rid); + } + + if (p == NULL) + if (*endch != '\0') { + g = getgrnam(curr); + } else { + g = getgrgid(rid); + } + + if (p != NULL) { + who_type = ZFS_DELEG_USER; + rid = p->pw_uid; + } else if (g != NULL) { + who_type = ZFS_DELEG_GROUP; + rid = g->gr_gid; + } else { + (void) snprintf(errbuf, 256, gettext( + "invalid user/group %s"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } + + (void) sprintf(id, "%u", rid); + who = id; + + store_allow_perm(who_type, opts->local, + opts->descend, who, opts->perms, *nvlp); + curr = delim + 1; + } + } + + return (0); +} + +static void +print_set_creat_perms(uu_avl_t *who_avl) +{ + const char *sc_title[] = { + gettext("Permission sets:\n"), + gettext("Create time permissions:\n"), + NULL + }; + const char **title_ptr = sc_title; + who_perm_node_t *who_node = NULL; + int prev_weight = -1; + + for (who_node = uu_avl_first(who_avl); who_node != NULL; + who_node = uu_avl_next(who_avl, who_node)) { + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + const char *who_name = who_node->who_perm.who_name; + int weight = who_type2weight(who_type); + boolean_t first = B_TRUE; + deleg_perm_node_t *deleg_node; + + if (prev_weight != weight) { + (void) printf(*title_ptr++); + prev_weight = weight; + } + + if (who_name == NULL || strnlen(who_name, 1) == 0) + (void) printf("\t"); + else + (void) printf("\t%s ", who_name); + + for (deleg_node = uu_avl_first(avl); deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (first) { + (void) printf("%s", + deleg_node->dpn_perm.dp_name); + first = B_FALSE; + } else + (void) printf(",%s", + deleg_node->dpn_perm.dp_name); + } + + (void) printf("\n"); + } +} - (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, - origin, sizeof (origin), &src, NULL, 0, B_FALSE); +static void inline 
+print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, + const char *title) +{ + who_perm_node_t *who_node = NULL; + boolean_t prt_title = B_TRUE; + uu_avl_walk_t *walk; - if (strcmp(origin, fromname) == 0) { - fromname = NULL; - flags.fromorigin = B_TRUE; - } else { - *cp = '\0'; - if (cp != fromname && strcmp(argv[0], fromname)) { - (void) fprintf(stderr, - gettext("incremental source must be " - "in same filesystem\n")); - usage(B_FALSE); - } - fromname = cp + 1; - if (strchr(fromname, '@') || strchr(fromname, '/')) { - (void) fprintf(stderr, - gettext("invalid incremental source\n")); - usage(B_FALSE); + if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((who_node = uu_avl_walk_next(walk)) != NULL) { + const char *who_name = who_node->who_perm.who_name; + const char *nice_who_name = who_node->who_perm.who_ug_name; + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + char delim = ' '; + deleg_perm_node_t *deleg_node; + boolean_t prt_who = B_TRUE; + + for (deleg_node = uu_avl_first(avl); + deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (local != deleg_node->dpn_perm.dp_local || + descend != deleg_node->dpn_perm.dp_descend) + continue; + + if (prt_who) { + const char *who = NULL; + if (prt_title) { + prt_title = B_FALSE; + (void) printf(title); + } + + switch (who_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + who = gettext("user"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + who = gettext("group"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + who = gettext("everyone"); + who_name = NULL; + } + + prt_who = B_FALSE; + if (who_name == NULL) + (void) printf("\t%s", who); + else + (void) printf("\t%s %s", who, who_name); } + + (void) printf("%c%s", delim, + 
deleg_node->dpn_perm.dp_name); + delim = ','; } - } - if (flags.replicate && fromname == NULL) - flags.doall = B_TRUE; + if (!prt_who) + (void) printf("\n"); + } - err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0, - extraverbose ? &dbgnv : NULL); + uu_avl_walk_end(walk); +} - if (extraverbose) { - /* - * dump_nvlist prints to stdout, but that's been - * redirected to a file. Make it print to stderr - * instead. - */ - (void) dup2(STDERR_FILENO, STDOUT_FILENO); - dump_nvlist(dbgnv, 0); - nvlist_free(dbgnv); +static void +print_fs_perms(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = NULL; + char buf[ZFS_MAXNAMELEN+32]; + const char *dsname = buf; + + for (node = uu_list_first(fspset->fsps_list); node != NULL; + node = uu_list_next(fspset->fsps_list, node)) { + uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; + uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; + int left = 0; + + (void) snprintf(buf, ZFS_MAXNAMELEN+32, + gettext("---- Permissions on %s "), + node->fspn_fsperm.fsp_name); + (void) printf(dsname); + left = 70 - strlen(buf); + while (left-- > 0) + (void) printf("-"); + (void) printf("\n"); + + print_set_creat_perms(sc_avl); + print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, + gettext("Local permissions:\n")); + print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, + gettext("Descendent permissions:\n")); + print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, + gettext("Local+Descendent permissions:\n")); } - zfs_close(zhp); +} - return (err != 0); +static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; + +struct deleg_perms { + boolean_t un; + nvlist_t *nvl; +}; + +static int +set_deleg_perms(zfs_handle_t *zhp, void *data) +{ + struct deleg_perms *perms = (struct deleg_perms *)data; + zfs_type_t zfs_type = zfs_get_type(zhp); + + if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) + return (0); + + return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); } -/* - * zfs receive [-vnFu] [-d | -e] - * - * Restore a backup stream 
from stdin. - */ static int -zfs_do_receive(int argc, char **argv) +zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) { - int c, err; - recvflags_t flags = { 0 }; + zfs_handle_t *zhp; + nvlist_t *perm_nvl = NULL; + nvlist_t *update_perm_nvl = NULL; + int error = 1; + int c; + struct allow_opts opts = { 0 }; - /* check options */ - while ((c = getopt(argc, argv, ":denuvF")) != -1) { + const char *optstr = un ? "ldugecsrh" : "ldugecsh"; + + /* check opts */ + while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { + case 'l': + opts.local = B_TRUE; + break; case 'd': - flags.isprefix = B_TRUE; + opts.descend = B_TRUE; break; - case 'e': - flags.isprefix = B_TRUE; - flags.istail = B_TRUE; + case 'u': + opts.user = B_TRUE; break; - case 'n': - flags.dryrun = B_TRUE; + case 'g': + opts.group = B_TRUE; break; - case 'u': - flags.nomount = B_TRUE; + case 'e': + opts.everyone = B_TRUE; break; - case 'v': - flags.verbose = B_TRUE; + case 's': + opts.set = B_TRUE; break; - case 'F': - flags.force = B_TRUE; + case 'c': + opts.create = B_TRUE; + break; + case 'r': + opts.recursive = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; + case 'h': + opts.prt_usage = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -2806,27 +4971,81 @@ zfs_do_receive(int argc, char **argv) argc -= optind; argv += optind; - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing snapshot argument\n")); - usage(B_FALSE); + /* check arguments */ + parse_allow_args(argc, argv, un, &opts); + + /* try to open the dataset */ + if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) == NULL) { + (void) fprintf(stderr, "Failed to open dataset: %s\n", + opts.dataset); + return (-1); } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); + + if 
(zfs_get_fsacl(zhp, &perm_nvl) != 0) + goto cleanup2; + + fs_perm_set_init(&fs_perm_set); + if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { + (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); + goto cleanup1; } - if (isatty(STDIN_FILENO)) { - (void) fprintf(stderr, - gettext("Error: Backup stream can not be read " - "from a terminal.\n" - "You must redirect standard input.\n")); - return (1); + if (opts.prt_perms) + print_fs_perms(&fs_perm_set); + else { + (void) construct_fsacl_list(un, &opts, &update_perm_nvl); + if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) + goto cleanup0; + + if (un && opts.recursive) { + struct deleg_perms data = { un, update_perm_nvl }; + if (zfs_iter_filesystems(zhp, set_deleg_perms, + &data) != 0) + goto cleanup0; + } } - err = zfs_receive(g_zfs, argv[0], flags, STDIN_FILENO, NULL); + error = 0; - return (err != 0); +cleanup0: + nvlist_free(perm_nvl); + if (update_perm_nvl != NULL) + nvlist_free(update_perm_nvl); +cleanup1: + fs_perm_set_fini(&fs_perm_set); +cleanup2: + zfs_close(zhp); + + return (error); +} + +/* + * zfs allow [-r] [-t] ... + * + * -r Recursively hold + * -t Temporary hold (hidden option) + * + * Apply a user-hold with the given tag to the list of snapshots. + */ +static int +zfs_do_allow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); +} + +/* + * zfs unallow [-r] [-t] ... + * + * -r Recursively hold + * -t Temporary hold (hidden option) + * + * Apply a user-hold with the given tag to the list of snapshots. 
+ */ +static int +zfs_do_unallow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); } static int @@ -2936,6 +5155,200 @@ zfs_do_release(int argc, char **argv) return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } +typedef struct holds_cbdata { + boolean_t cb_recursive; + const char *cb_snapname; + nvlist_t **cb_nvlp; + size_t cb_max_namelen; + size_t cb_max_taglen; +} holds_cbdata_t; + +#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y" +#define DATETIME_BUF_LEN (32) +/* + * + */ +static void +print_holds(boolean_t scripted, size_t nwidth, size_t tagwidth, nvlist_t *nvl) +{ + int i; + nvpair_t *nvp = NULL; + char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; + const char *col; + + if (!scripted) { + for (i = 0; i < 3; i++) { + col = gettext(hdr_cols[i]); + if (i < 2) + (void) printf("%-*s ", i ? tagwidth : nwidth, + col); + else + (void) printf("%s\n", col); + } + } + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + char *zname = nvpair_name(nvp); + nvlist_t *nvl2; + nvpair_t *nvp2 = NULL; + (void) nvpair_value_nvlist(nvp, &nvl2); + while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { + char tsbuf[DATETIME_BUF_LEN]; + char *tagname = nvpair_name(nvp2); + uint64_t val = 0; + time_t time; + struct tm t; + char sep = scripted ? '\t' : ' '; + size_t sepnum = scripted ? 1 : 2; + + (void) nvpair_value_uint64(nvp2, &val); + time = (time_t)val; + (void) localtime_r(&time, &t); + (void) strftime(tsbuf, DATETIME_BUF_LEN, + gettext(STRFTIME_FMT_STR), &t); + + (void) printf("%-*s%*c%-*s%*c%s\n", nwidth, zname, + sepnum, sep, tagwidth, tagname, sepnum, sep, tsbuf); + } + } +} + +/* + * Generic callback function to list a dataset or snapshot. 
+ */ +static int +holds_callback(zfs_handle_t *zhp, void *data) +{ + holds_cbdata_t *cbp = data; + nvlist_t *top_nvl = *cbp->cb_nvlp; + nvlist_t *nvl = NULL; + nvpair_t *nvp = NULL; + const char *zname = zfs_get_name(zhp); + size_t znamelen = strnlen(zname, ZFS_MAXNAMELEN); + + if (cbp->cb_recursive) { + const char *snapname; + char *delim = strchr(zname, '@'); + if (delim == NULL) + return (0); + + snapname = delim + 1; + if (strcmp(cbp->cb_snapname, snapname)) + return (0); + } + + if (zfs_get_holds(zhp, &nvl) != 0) + return (-1); + + if (znamelen > cbp->cb_max_namelen) + cbp->cb_max_namelen = znamelen; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *tag = nvpair_name(nvp); + size_t taglen = strnlen(tag, MAXNAMELEN); + if (taglen > cbp->cb_max_taglen) + cbp->cb_max_taglen = taglen; + } + + return (nvlist_add_nvlist(top_nvl, zname, nvl)); +} + +/* + * zfs holds [-r] ... + * + * -r Recursively hold + */ +static int +zfs_do_holds(int argc, char **argv) +{ + int errors = 0; + int c; + int i; + boolean_t scripted = B_FALSE; + boolean_t recursive = B_FALSE; + const char *opts = "rH"; + nvlist_t *nvl; + + int types = ZFS_TYPE_SNAPSHOT; + holds_cbdata_t cb = { 0 }; + + int limit = 0; + int ret = 0; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (recursive) { + types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; + flags |= ZFS_ITER_RECURSE; + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) + usage(B_FALSE); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + for (i = 0; i < argc; ++i) { + char *snapshot = argv[i]; + const char *delim; + const char *snapname; + + delim = strchr(snapshot, '@'); + if (delim == NULL) { + (void) fprintf(stderr, 
+ gettext("'%s' is not a snapshot\n"), snapshot); + ++errors; + continue; + } + snapname = delim + 1; + if (recursive) + snapshot[delim - snapshot] = '\0'; + + cb.cb_recursive = recursive; + cb.cb_snapname = snapname; + cb.cb_nvlp = &nvl; + + /* + * 1. collect holds data, set format options + */ + ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, + holds_callback, &cb); + if (ret != 0) + ++errors; + } + + /* + * 2. print holds data + */ + print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl); + + if (nvlist_empty(nvl)) + (void) printf(gettext("no datasets available\n")); + + nvlist_free(nvl); + + return (0 != errors); +} + #define CHECK_SPINNER 30 #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 5 /* seconds */ @@ -3437,7 +5850,7 @@ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) { zfs_handle_t *zhp; - int ret; + int ret = 0; struct stat64 statbuf; struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; @@ -3820,15 +6233,6 @@ zfs_do_unshare(int argc, char **argv) return (unshare_unmount(OP_SHARE, argc, argv)); } -/* ARGSUSED */ -static int -zfs_do_python(int argc, char **argv) -{ - (void) execv(pypath, argv-1); - (void) printf("internal error: %s not found\n", pypath); - return (-1); -} - /* * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. 
@@ -3839,7 +6243,7 @@ manual_mount(int argc, char **argv) zfs_handle_t *zhp; char mountpoint[ZFS_MAXPROPLEN]; char mntopts[MNT_LINE_MAX] = { '\0' }; - int ret; + int ret = 0; int c; int flags = 0; char *dataset, *path; @@ -3989,7 +6393,7 @@ zfs_do_diff(int argc, char **argv) char *tosnap = NULL; char *fromsnap = NULL; char *atp, *copy; - int err; + int err = 0; int c; while ((c = getopt(argc, argv, "FHt")) != -1) { @@ -4059,7 +6463,7 @@ zfs_do_diff(int argc, char **argv) int main(int argc, char **argv) { - int ret; + int ret = 0; int i; char *progname; char *cmdname; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 8aa985b..bc300b3 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -21,6 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012 by Frederik Wessels. All rights reserved. 
*/ #include @@ -40,6 +43,7 @@ #include #include #include +#include #include #include @@ -47,6 +51,7 @@ #include "zpool_util.h" #include "zfs_comutil.h" +#include "zfeature_common.h" #include "statcommon.h" @@ -63,6 +68,9 @@ static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); +static int zpool_do_reopen(int, char **); + +static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); @@ -121,7 +129,9 @@ typedef enum { HELP_UPGRADE, HELP_GET, HELP_SET, - HELP_SPLIT + HELP_SPLIT, + HELP_REGUID, + HELP_REOPEN } zpool_help_t; @@ -154,6 +164,7 @@ static zpool_command_t command_table[] = { { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, + { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, @@ -165,6 +176,7 @@ static zpool_command_t command_table[] = { { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, + { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "get", zpool_do_get, HELP_GET }, @@ -189,7 +201,7 @@ get_usage(zpool_help_t idx) { case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: - return (gettext("\tcreate [-fn] [-o property=value] ... \n" + return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... 
\n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_DESTROY: @@ -225,6 +237,8 @@ get_usage(zpool_help_t idx) { "[new-device]\n")); case HELP_REMOVE: return (gettext("\tremove ...\n")); + case HELP_REOPEN: + return (""); /* Undocumented command */ case HELP_SCRUB: return (gettext("\tscrub [-s] ...\n")); case HELP_STATUS: @@ -243,6 +257,8 @@ get_usage(zpool_help_t idx) { return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); + case HELP_REGUID: + return (gettext("\treguid \n")); } abort(); @@ -316,6 +332,12 @@ usage(boolean_t requested) /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); + + (void) fprintf(fp, "\t%-15s ", "feature@..."); + (void) fprintf(fp, "YES disabled | enabled | active\n"); + + (void) fprintf(fp, gettext("\nThe feature@ properties must be " + "appended with a feature name.\nSee zpool-features(5).\n")); } /* @@ -382,12 +404,16 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, proplist = *props; if (poolprop) { - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) { + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL && + !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } - normnm = zpool_prop_to_name(prop); + if (zpool_prop_feature(propname)) + normnm = propname; + else + normnm = zpool_prop_to_name(prop); } else { if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { normnm = zfs_prop_to_name(fprop); @@ -559,7 +585,7 @@ zpool_do_remove(int argc, char **argv) } /* - * zpool create [-fn] [-o property=value] ... + * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * @@ -568,8 +594,10 @@ zpool_do_remove(int argc, char **argv) * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. 
By default it's - * '/' + * '/' * -o Set property=value. + * -d Don't automatically enable all supported pool features + * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The @@ -582,6 +610,7 @@ zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; + boolean_t enable_all_pool_feat = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; @@ -593,7 +622,7 @@ zpool_do_create(int argc, char **argv) char *propval; /* check options */ - while ((c = getopt(argc, argv, ":fnR:m:o:O:")) != -1) { + while ((c = getopt(argc, argv, ":fndR:m:o:O:")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -601,6 +630,9 @@ zpool_do_create(int argc, char **argv) case 'n': dryrun = B_TRUE; break; + case 'd': + enable_all_pool_feat = B_FALSE; + break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( @@ -628,6 +660,21 @@ zpool_do_create(int argc, char **argv) if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; + + /* + * If the user is creating a pool that doesn't support + * feature flags, don't enable any features. + */ + if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { + char *end; + u_longlong_t ver; + + ver = strtoull(propval, &end, 10); + if (*end == '\0' && + ver < SPA_VERSION_FEATURES) { + enable_all_pool_feat = B_FALSE; + } + } break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { @@ -693,7 +740,6 @@ zpool_do_create(int argc, char **argv) goto errout; } - if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); @@ -775,6 +821,27 @@ zpool_do_create(int argc, char **argv) /* * Hand off to libzfs. 
*/ + if (enable_all_pool_feat) { + int i; + for (i = 0; i < SPA_FEATURES; i++) { + char propname[MAXPATHLEN]; + zfeature_info_t *feat = &spa_feature_table[i]; + + (void) snprintf(propname, sizeof (propname), + "feature@%s", feat->fi_uname); + + /* + * Skip feature if user specified it manually + * on the command line. + */ + if (nvlist_exists(props, propname)) + continue; + + if (add_prop_list(propname, ZFS_FEATURE_ENABLED, + &props, B_TRUE) != 0) + goto errout; + } + } if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, poolname, @@ -1106,6 +1173,10 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, (void) printf(gettext("newer version")); break; + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &cb.cb_guid) == 0); @@ -1223,6 +1294,10 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) (void) printf(gettext("newer version")); break; + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; @@ -1326,6 +1401,7 @@ show_import(nvlist_t *config) const char *health; uint_t vsc; int namewidth; + char *comment; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); @@ -1342,9 +1418,9 @@ show_import(nvlist_t *config) reason = zpool_import_status(config, &msgid); - (void) printf(gettext(" pool: %s\n"), name); - (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); - (void) printf(gettext(" state: %s"), health); + (void) printf(gettext(" pool: %s\n"), name); + (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); + (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); @@ -1353,58 +1429,73 @@ show_import(nvlist_t *config) case 
ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext("status: One or more devices are missing " - "from the system.\n")); + (void) printf(gettext(" status: One or more devices are " + "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: - (void) printf(gettext("status: One or more devices contains " + (void) printf(gettext(" status: One or more devices contains " "corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: - (void) printf(gettext("status: The pool data is corrupted.\n")); + (void) printf( + gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: - (void) printf(gettext("status: One or more devices " + (void) printf(gettext(" status: One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: - (void) printf(gettext("status: The pool metadata is " + (void) printf(gettext(" status: The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext("status: The pool is formatted using an " + (void) printf(gettext(" status: The pool is formatted using an " "older on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext("status: The pool is formatted using an " + (void) printf(gettext(" status: The pool is formatted using an " "incompatible version.\n")); break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("status: The pool uses the following " + "feature(s) not supported on this sytem:\n")); + zpool_print_unsup_feat(config); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("status: The pool can only be accessed " + "in read-only mode on this system. 
It\n\tcannot be " + "accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + zpool_print_unsup_feat(config); + break; + case ZPOOL_STATUS_HOSTID_MISMATCH: - (void) printf(gettext("status: The pool was last accessed by " + (void) printf(gettext(" status: The pool was last accessed by " "another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: - (void) printf(gettext("status: One or more devices are " + (void) printf(gettext(" status: One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: - (void) printf(gettext("status: An intent log record cannot be " + (void) printf(gettext(" status: An intent log record cannot be " "read.\n")); break; case ZPOOL_STATUS_RESILVERING: - (void) printf(gettext("status: One or more devices were being " + (void) printf(gettext(" status: One or more devices were being " "resilvered.\n")); break; @@ -1420,43 +1511,61 @@ show_import(nvlist_t *config) */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER) - (void) printf(gettext("action: The pool can be " + (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) - (void) printf(gettext("action: The pool can be " + (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); else - (void) printf(gettext("action: The pool can be " + (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } else if (vs->vs_state == VDEV_STATE_DEGRADED) { - (void) printf(gettext("action: The pool can be imported " + (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. 
The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext("action: The pool cannot be " + (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("action: The pool cannot be " + "imported. Access the pool on a system that " + "supports\n\tthe required feature(s), or recreate " + "the pool from backup.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("action: The pool cannot be " + "imported in read-write mode. Import the pool " + "with\n" + "\t\"-o readonly=on\", access the pool on a system " + "that supports the\n\trequired feature(s), or " + "recreate the pool from backup.\n")); + break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext("action: The pool cannot be " + (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; default: - (void) printf(gettext("action: The pool cannot be " + (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } + /* Print the comment attached to the pool. 
*/ + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + (void) printf(gettext("comment: %s\n"), comment); + /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": @@ -1474,10 +1583,10 @@ show_import(nvlist_t *config) } if (msgid != NULL) - (void) printf(gettext(" see: http://www.sun.com/msg/%s\n"), + (void) printf(gettext(" see: http://illumos.org/msg/%s\n"), msgid); - (void) printf(gettext("config:\n\n")); + (void) printf(gettext(" config:\n\n")); namewidth = max_width(NULL, nvroot, 0, 0); if (namewidth < 10) @@ -1515,9 +1624,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, ZPOOL_CONFIG_POOL_STATE, &state) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); - if (version > SPA_VERSION) { + if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " - "is formatted using a newer ZFS version\n"), name); + "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (state != POOL_STATE_EXPORTED && !(flags & ZFS_IMPORT_ANY_HOST)) { @@ -1966,10 +2075,10 @@ error: } typedef struct iostat_cbdata { - zpool_list_t *cb_list; - int cb_verbose; - int cb_iteration; + boolean_t cb_verbose; int cb_namewidth; + int cb_iteration; + zpool_list_t *cb_list; } iostat_cbdata_t; static void @@ -2078,10 +2187,15 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, return; for (c = 0; c < children; c++) { - uint64_t ishole = B_FALSE; + uint64_t ishole = B_FALSE, islog = B_FALSE; - if (nvlist_lookup_uint64(newchild[c], - ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, + &islog); + + if (ishole || islog) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); @@ -2091,6 +2205,31 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t 
*oldnv, } /* + * Log device section + */ + + if (num_logs(newnv) > 0) { + (void) printf("%-*s - - - - - " + "-\n", cb->cb_namewidth, "logs"); + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE; + (void) nvlist_lookup_uint64(newchild[c], + ZPOOL_CONFIG_IS_LOG, &islog); + + if (islog) { + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + B_FALSE); + print_vdev_stats(zhp, vname, oldnv ? + oldchild[c] : NULL, newchild[c], + cb, depth + 2); + free(vname); + } + } + + } + + /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, @@ -2179,7 +2318,8 @@ get_namewidth(zpool_handle_t *zhp, void *data) if (!cb->cb_verbose) cb->cb_namewidth = strlen(zpool_get_name(zhp)); else - cb->cb_namewidth = max_width(zhp, nvroot, 0, 0); + cb->cb_namewidth = max_width(zhp, nvroot, 0, + cb->cb_namewidth); } /* @@ -2408,8 +2548,9 @@ zpool_do_iostat(int argc, char **argv) } typedef struct list_cbdata { + boolean_t cb_verbose; + int cb_namewidth; boolean_t cb_scripted; - boolean_t cb_first; zprop_list_t *cb_proplist; } list_cbdata_t; @@ -2417,30 +2558,50 @@ typedef struct list_cbdata { * Given a list of columns to display, output appropriate headers for each one. */ static void -print_header(zprop_list_t *pl) +print_header(list_cbdata_t *cb) { + zprop_list_t *pl = cb->cb_proplist; + char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; + size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { - if (pl->pl_prop == ZPROP_INVAL) - continue; + width = pl->pl_width; + if (first && cb->cb_verbose) { + /* + * Reset the width to accommodate the verbose listing + * of devices. 
+ */ + width = cb->cb_namewidth; + } if (!first) (void) printf(" "); else first = B_FALSE; - header = zpool_prop_column_name(pl->pl_prop); - right_justify = zpool_prop_align_right(pl->pl_prop); + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + header = zpool_prop_column_name(pl->pl_prop); + right_justify = zpool_prop_align_right(pl->pl_prop); + } else { + int i; + + for (i = 0; pl->pl_user_prop[i] != '\0'; i++) + headerbuf[i] = toupper(pl->pl_user_prop[i]); + headerbuf[i] = '\0'; + header = headerbuf; + } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) - (void) printf("%*s", pl->pl_width, header); + (void) printf("%*s", width, header); else - (void) printf("%-*s", pl->pl_width, header); + (void) printf("%-*s", width, header); + } (void) printf("\n"); @@ -2451,17 +2612,28 @@ print_header(zprop_list_t *pl) * to the described layout. */ static void -print_pool(zpool_handle_t *zhp, zprop_list_t *pl, int scripted) +print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { + zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; - int width; + size_t width; for (; pl != NULL; pl = pl->pl_next) { + + width = pl->pl_width; + if (first && cb->cb_verbose) { + /* + * Reset the width to accommodate the verbose listing + * of devices. 
+ */ + width = cb->cb_namewidth; + } + if (!first) { - if (scripted) + if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); @@ -2471,25 +2643,32 @@ print_pool(zpool_handle_t *zhp, zprop_list_t *pl, int scripted) right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { - if (zpool_get_prop(zhp, pl->pl_prop, property, + if (pl->pl_prop == ZPOOL_PROP_EXPANDSZ && + zpool_get_prop_int(zhp, pl->pl_prop, NULL) == 0) + propstr = "-"; + else if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); + } else if ((zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop)) && + zpool_prop_get_feature(zhp, pl->pl_user_prop, property, + sizeof (property)) == 0) { + propstr = property; } else { propstr = "-"; } - width = pl->pl_width; /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. 
*/ - if (scripted || (pl->pl_next == NULL && !right_justify)) + if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", width, propstr); @@ -2500,6 +2679,101 @@ print_pool(zpool_handle_t *zhp, zprop_list_t *pl, int scripted) (void) printf("\n"); } +static void +print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted) +{ + char propval[64]; + boolean_t fixed; + size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); + + zfs_nicenum(value, propval, sizeof (propval)); + + if (prop == ZPOOL_PROP_EXPANDSZ && value == 0) + (void) strlcpy(propval, "-", sizeof (propval)); + + if (scripted) + (void) printf("\t%s", propval); + else + (void) printf(" %*s", width, propval); +} + +void +print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, + list_cbdata_t *cb, int depth) +{ + nvlist_t **child; + vdev_stat_t *vs; + uint_t c, children; + char *vname; + boolean_t scripted = cb->cb_scripted; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + if (name != NULL) { + if (scripted) + (void) printf("\t%s", name); + else if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - depth), ""); + + /* only toplevel vdevs have capacity stats */ + if (vs->vs_space == 0) { + if (scripted) + (void) printf("\t-\t-\t-"); + else + (void) printf(" - - -"); + } else { + print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, + scripted); + print_one_column(ZPOOL_PROP_CAPACITY, vs->vs_alloc, + scripted); + print_one_column(ZPOOL_PROP_FREE, + vs->vs_space - vs->vs_alloc, scripted); + } + print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, + scripted); + (void) printf("\n"); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t 
ishole = B_FALSE; + + if (nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + print_list_stats(zhp, vname, child[c], cb, depth + 2); + free(vname); + } + + /* + * Include level 2 ARC devices in iostat output + */ + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) != 0) + return; + + if (children > 0) { + (void) printf("%-*s - - - - - " + "-\n", cb->cb_namewidth, "cache"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, child[c], + B_FALSE); + print_list_stats(zhp, vname, child[c], cb, depth + 2); + free(vname); + } + } +} + + /* * Generic callback function to list a pool. */ @@ -2507,14 +2781,18 @@ int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; + nvlist_t *config; + nvlist_t *nvroot; - if (cbp->cb_first) { - if (!cbp->cb_scripted) - print_header(cbp->cb_proplist); - cbp->cb_first = B_FALSE; - } + config = zpool_get_config(zhp, NULL); - print_pool(zhp, cbp->cb_proplist, cbp->cb_scripted); + print_pool(zhp, cbp); + if (!cbp->cb_verbose) + return (0); + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + print_list_stats(zhp, NULL, nvroot, cbp, 0); return (0); } @@ -2538,12 +2816,15 @@ zpool_do_list(int argc, char **argv) int ret; list_cbdata_t cb = { 0 }; static char default_props[] = - "name,size,allocated,free,capacity,dedupratio,health,altroot"; + "name,size,allocated,free,expandsize,capacity,dedupratio," + "health,altroot"; char *props = default_props; unsigned long interval = 0, count = 0; + zpool_list_t *list; + boolean_t first = B_TRUE; /* check options */ - while ((c = getopt(argc, argv, ":Ho:T:")) != -1) { + while ((c = getopt(argc, argv, ":Ho:T:v")) != -1) { switch (c) { case 'H': cb.cb_scripted = B_TRUE; @@ -2554,6 +2835,9 @@ zpool_do_list(int argc, char **argv) case 'T': get_timestamp_arg(*optarg); break; + case 'v': + cb.cb_verbose = 
B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -2574,21 +2858,32 @@ zpool_do_list(int argc, char **argv) if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); - cb.cb_first = B_TRUE; + if ((list = pool_list_get(argc, argv, &cb.cb_proplist, &ret)) == NULL) + return (1); + + if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { + (void) printf(gettext("no pools available\n")); + zprop_free_list(cb.cb_proplist); + return (0); + } for (;;) { + pool_list_update(list); + + if (pool_list_count(list) == 0) + break; + + cb.cb_namewidth = 0; + (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); - ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, - list_callback, &cb); - - if (argc == 0 && cb.cb_first && !cb.cb_scripted) { - (void) printf(gettext("no pools available\n")); - zprop_free_list(cb.cb_proplist); - return (0); + if (!cb.cb_scripted && (first || cb.cb_verbose)) { + print_header(&cb); + first = B_FALSE; } + ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; @@ -3162,6 +3457,82 @@ zpool_do_clear(int argc, char **argv) return (ret); } +/* + * zpool reguid + */ +int +zpool_do_reguid(int argc, char **argv) +{ + int c; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; + + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + ret = 
zpool_reguid(zhp); + + zpool_close(zhp); + return (ret); +} + + +/* + * zpool reopen + * + * Reopen the pool so that the kernel can update the sizes of all vdevs. + * + * NOTE: This command is currently undocumented. If the command is ever + * exposed then the appropriate usage() messages will need to be made. + */ +int +zpool_do_reopen(int argc, char **argv) +{ + int ret = 0; + zpool_handle_t *zhp; + char *pool; + + argc--; + argv++; + + if (argc != 1) + return (2); + + pool = argv[0]; + if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) + return (1); + + ret = zpool_reopen(zhp); + zpool_close(zhp); + return (ret); +} + typedef struct scrub_cbdata { int cb_type; int cb_argc; @@ -3249,7 +3620,7 @@ print_scan_status(pool_scan_stat_t *ps) double fraction_done; char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; - (void) printf(gettext(" scan: ")); + (void) printf(gettext(" scan: ")); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || @@ -3433,14 +3804,20 @@ print_dedup_stats(nvlist_t *config) /* * If the pool was faulted then we may not have been able to - * obtain the config. Otherwise, if have anything in the dedup + * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, - (uint64_t **)&ddo, &c) != 0 || ddo->ddo_count == 0) + (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); + (void) printf(gettext(" dedup: ")); + if (ddo->ddo_count == 0) { + (void) printf(gettext("no DDT entries\n")); + return; + } + (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n", (u_longlong_t)ddo->ddo_count, (u_longlong_t)ddo->ddo_dspace, @@ -3459,7 +3836,7 @@ print_dedup_stats(nvlist_t *config) * pool: tank * status: DEGRADED * reason: One or more devices ... 
- * see: http://www.sun.com/msg/ZFS-xxxx-01 + * see: http://illumos.org/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK @@ -3620,6 +3997,31 @@ status_callback(zpool_handle_t *zhp, void *data) "backup.\n")); break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("status: The pool cannot be accessed on " + "this system because it uses the\n\tfollowing feature(s) " + "not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + (void) printf(gettext("action: Access the pool from a system " + "that supports the required feature(s),\n\tor restore the " + "pool from backup.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("status: The pool can only be accessed " + "in read-only mode on this system. It\n\tcannot be " + "accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + (void) printf(gettext("action: The pool cannot be accessed in " + "read-write mode. 
Import the pool with\n" + "\t\"-o readonly=on\", access the pool from a system that " + "supports the\n\trequired feature(s), or restore the " + "pool from backup.\n")); + break; + case ZPOOL_STATUS_FAULTED_DEV_R: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors.\n\tSufficient " @@ -3667,7 +4069,7 @@ status_callback(zpool_handle_t *zhp, void *data) } if (msgid != NULL) - (void) printf(gettext(" see: http://www.sun.com/msg/%s\n"), + (void) printf(gettext(" see: http://illumos.org/msg/%s\n"), msgid); if (config != NULL) { @@ -3844,7 +4246,8 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); - if (!cbp->cb_newer && version < SPA_VERSION) { + if (!cbp->cb_newer && SPA_VERSION_IS_SUPPORTED(version) && + version != SPA_VERSION) { if (!cbp->cb_all) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " @@ -3867,13 +4270,14 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) "'%s'\n\n"), zpool_get_name(zhp)); } } - } else if (cbp->cb_newer && version > SPA_VERSION) { + } else if (cbp->cb_newer && !SPA_VERSION_IS_SUPPORTED(version)) { assert(!cbp->cb_all); if (cbp->cb_first) { (void) printf(gettext("The following pools are " - "formatted using a newer software version and\n" - "cannot be accessed on the current system.\n\n")); + "formatted using an unsupported software version " + "and\ncannot be accessed on the current " + "system.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; @@ -3957,8 +4361,8 @@ zpool_do_upgrade(int argc, char **argv) break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); - if (*end != '\0' || cb.cb_version > SPA_VERSION || - cb.cb_version < SPA_VERSION_1) { + if (*end != '\0' || + !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); @@ -4003,8 +4407,8 @@ zpool_do_upgrade(int 
argc, char **argv) } } - (void) printf(gettext("This system is currently running " - "ZFS pool version %llu.\n\n"), SPA_VERSION); + (void) printf(gettext("This system supports ZFS pool feature " + "flags.\n\n")); cb.cb_first = B_TRUE; if (showversions) { (void) printf(gettext("The following versions are " @@ -4255,13 +4659,26 @@ get_callback(zpool_handle_t *zhp, void *data) pl == cbp->cb_proplist) continue; - if (zpool_get_prop(zhp, pl->pl_prop, - value, sizeof (value), &srctype) != 0) - continue; + if (pl->pl_prop == ZPROP_INVAL && + (zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop))) { + srctype = ZPROP_SRC_LOCAL; - zprop_print_one_property(zpool_get_name(zhp), cbp, - zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, - NULL); + if (zpool_prop_get_feature(zhp, pl->pl_user_prop, + value, sizeof (value)) == 0) { + zprop_print_one_property(zpool_get_name(zhp), + cbp, pl->pl_user_prop, value, srctype, + NULL, NULL); + } + } else { + if (zpool_get_prop(zhp, pl->pl_prop, value, + sizeof (value), &srctype) != 0) + continue; + + zprop_print_one_property(zpool_get_name(zhp), cbp, + zpool_prop_to_name(pl->pl_prop), value, srctype, + NULL, NULL); + } } return (0); } @@ -4273,8 +4690,11 @@ zpool_do_get(int argc, char **argv) zprop_list_t fake_name = { 0 }; int ret; - if (argc < 3) + if (argc < 2) { + (void) fprintf(stderr, gettext("missing property " + "argument\n")); usage(B_FALSE); + } cb.cb_first = B_TRUE; cb.cb_sources = ZPROP_SRC_ALL; @@ -4284,7 +4704,7 @@ zpool_do_get(int argc, char **argv) cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; - if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist, + if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index b2d81b5..d9bcb04 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ /* @@ -49,7 +51,9 @@ * At random times, the child self-immolates with a SIGKILL. * This is the software equivalent of pulling the power cord. * The parent then runs the test again, using the existing - * storage pool, as many times as desired. + * storage pool, as many times as desired. If backwards compatibility + * testing is enabled ztest will sometimes run the "older" version + * of ztest after a SIGKILL. * * (6) To verify that we don't have future leaks or temporal incursions, * many of the functional tests record the transaction group number @@ -66,9 +70,15 @@ * You can ask more more vdevs [-v], datasets [-d], or threads [-t] * to increase the pool capacity, fanout, and overall stress level. * - * The -N(okill) option will suppress kills, so each child runs to completion. - * This can be useful when you're trying to distinguish temporal incursions - * from plain old race conditions. + * Use the -k option to set the desired frequency of kills. + * + * When ztest invokes itself it passes all relevant information through a + * temporary file which is mmap-ed in the child process. This allows shared + * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always + * stored at offset 0 of this file and contains information on the size and + * number of shared structures in the file. The information stored in this file + * must remain backwards compatible with older versions of ztest so that + * ztest can invoke them during backwards compatibility testing (-B).
*/ #include @@ -96,6 +106,7 @@ #include #include #include +#include #include #include #include @@ -108,28 +119,82 @@ #include #include -static char cmdname[] = "ztest"; -static char *zopt_pool = cmdname; - -static uint64_t zopt_vdevs = 5; -static uint64_t zopt_vdevtime; -static int zopt_ashift = SPA_MINBLOCKSHIFT; -static int zopt_mirrors = 2; -static int zopt_raidz = 4; -static int zopt_raidz_parity = 1; -static size_t zopt_vdev_size = SPA_MINDEVSIZE; -static int zopt_datasets = 7; -static int zopt_threads = 23; -static uint64_t zopt_passtime = 60; /* 60 seconds */ -static uint64_t zopt_killrate = 70; /* 70% kill rate */ -static int zopt_verbose = 0; -static int zopt_init = 1; -static char *zopt_dir = "/tmp"; -static uint64_t zopt_time = 300; /* 5 minutes */ -static uint64_t zopt_maxloops = 50; /* max loops during spa_freeze() */ +#define ZTEST_FD_DATA 3 +#define ZTEST_FD_RAND 4 + +typedef struct ztest_shared_hdr { + uint64_t zh_hdr_size; + uint64_t zh_opts_size; + uint64_t zh_size; + uint64_t zh_stats_size; + uint64_t zh_stats_count; + uint64_t zh_ds_size; + uint64_t zh_ds_count; +} ztest_shared_hdr_t; + +static ztest_shared_hdr_t *ztest_shared_hdr; + +typedef struct ztest_shared_opts { + char zo_pool[MAXNAMELEN]; + char zo_dir[MAXNAMELEN]; + char zo_alt_ztest[MAXNAMELEN]; + char zo_alt_libpath[MAXNAMELEN]; + uint64_t zo_vdevs; + uint64_t zo_vdevtime; + size_t zo_vdev_size; + int zo_ashift; + int zo_mirrors; + int zo_raidz; + int zo_raidz_parity; + int zo_datasets; + int zo_threads; + uint64_t zo_passtime; + uint64_t zo_killrate; + int zo_verbose; + int zo_init; + uint64_t zo_time; + uint64_t zo_maxloops; + uint64_t zo_metaslab_gang_bang; +} ztest_shared_opts_t; + +static const ztest_shared_opts_t ztest_opts_defaults = { + .zo_pool = { 'z', 't', 'e', 's', 't', '\0' }, + .zo_dir = { '/', 't', 'm', 'p', '\0' }, + .zo_alt_ztest = { '\0' }, + .zo_alt_libpath = { '\0' }, + .zo_vdevs = 5, + .zo_ashift = SPA_MINBLOCKSHIFT, + .zo_mirrors = 2, + .zo_raidz = 4, + 
.zo_raidz_parity = 1, + .zo_vdev_size = SPA_MINDEVSIZE, + .zo_datasets = 7, + .zo_threads = 23, + .zo_passtime = 60, /* 60 seconds */ + .zo_killrate = 70, /* 70% kill rate */ + .zo_verbose = 0, + .zo_init = 1, + .zo_time = 300, /* 5 minutes */ + .zo_maxloops = 50, /* max loops during spa_freeze() */ + .zo_metaslab_gang_bang = 32 << 10 +}; + +extern uint64_t metaslab_gang_bang; +extern uint64_t metaslab_df_alloc_threshold; + +static ztest_shared_opts_t *ztest_shared_opts; +static ztest_shared_opts_t ztest_opts; + +typedef struct ztest_shared_ds { + uint64_t zd_seq; +} ztest_shared_ds_t; + +static ztest_shared_ds_t *ztest_shared_ds; +#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) #define BT_MAGIC 0x123456789abcdefULL -#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1) +#define MAXFAULTS() \ + (MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, @@ -201,9 +266,10 @@ typedef struct ztest_od { * Per-dataset state. 
*/ typedef struct ztest_ds { + ztest_shared_ds_t *zd_shared; objset_t *zd_os; + rwlock_t zd_zilog_lock; zilog_t *zd_zilog; - uint64_t zd_seq; ztest_od_t *zd_od; /* debugging aid */ char zd_name[MAXNAMELEN]; mutex_t zd_dirobj_lock; @@ -220,11 +286,17 @@ typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ - uint64_t zi_call_count; /* per-pass count */ - uint64_t zi_call_time; /* per-pass time */ - uint64_t zi_call_next; /* next time to call this function */ } ztest_info_t; +typedef struct ztest_shared_callstate { + uint64_t zc_count; /* per-pass count */ + uint64_t zc_time; /* per-pass time */ + uint64_t zc_next; /* next time to call this function */ +} ztest_shared_callstate_t; + +static ztest_shared_callstate_t *ztest_shared_callstate; +#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) + /* * Note: these aren't static because we want dladdr() to work. */ @@ -235,6 +307,7 @@ ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_zil_commit; +ztest_func_t ztest_zil_remount; ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_prealloc; @@ -254,6 +327,7 @@ ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; +ztest_func_t ztest_reguid; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -270,6 +344,7 @@ ztest_info_t ztest_info[] = { { ztest_zap_parallel, 100, &zopt_always }, { ztest_split_pool, 1, &zopt_always }, { ztest_zil_commit, 1, &zopt_incessant }, + { ztest_zil_remount, 1, &zopt_sometimes }, { ztest_dmu_read_write_zcopy, 1, &zopt_often }, { ztest_dmu_objset_create_destroy, 1, &zopt_often }, { ztest_dsl_prop_get_set, 1, &zopt_often }, @@ 
-283,13 +358,16 @@ ztest_info_t ztest_info[] = { { ztest_fault_inject, 1, &zopt_sometimes }, { ztest_ddt_repair, 1, &zopt_sometimes }, { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, + { ztest_reguid, 1, &zopt_sometimes }, { ztest_spa_rename, 1, &zopt_rarely }, { ztest_scrub, 1, &zopt_rarely }, { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, { ztest_vdev_attach_detach, 1, &zopt_rarely }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, - { ztest_vdev_add_remove, 1, &zopt_vdevtime }, - { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, + { ztest_vdev_add_remove, 1, + &ztest_opts.zo_vdevtime }, + { ztest_vdev_aux_add_remove, 1, + &ztest_opts.zo_vdevtime }, }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -307,8 +385,7 @@ typedef struct ztest_cb_list { * Stuff we need to share writably between parent and child. */ typedef struct ztest_shared { - char *zs_pool; - spa_t *zs_spa; + boolean_t zs_do_init; hrtime_t zs_proc_start; hrtime_t zs_proc_stop; hrtime_t zs_thread_start; @@ -319,12 +396,11 @@ typedef struct ztest_shared { uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; - mutex_t zs_vdev_lock; - rwlock_t zs_name_lock; - ztest_info_t zs_info[ZTEST_FUNCS]; uint64_t zs_splits; uint64_t zs_mirrors; - ztest_ds_t zs_zd[]; + uint64_t zs_metaslab_sz; + uint64_t zs_metaslab_df_alloc_threshold; + uint64_t zs_guid; } ztest_shared_t; #define ID_PARALLEL -1ULL @@ -332,20 +408,19 @@ typedef struct ztest_shared { static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; ztest_shared_t *ztest_shared; -uint64_t *ztest_seq; -static int ztest_random_fd; -static int ztest_dump_core = 1; +static spa_t *ztest_spa = NULL; +static ztest_ds_t *ztest_ds; +static mutex_t ztest_vdev_lock; +static rwlock_t ztest_name_lock; + +static boolean_t ztest_dump_core = B_TRUE; static boolean_t ztest_exiting; /* Global commit callback list */ static ztest_cb_list_t zcl; -extern uint64_t metaslab_gang_bang; -extern uint64_t 
metaslab_df_alloc_threshold; -static uint64_t metaslab_sz; - enum ztest_object { ZTEST_META_DNODE = 0, ZTEST_DIROBJ, @@ -458,12 +533,14 @@ nicenumtoull(const char *buf) static void usage(boolean_t requested) { + const ztest_shared_opts_t *zo = &ztest_opts_defaults; + char nice_vdev_size[10]; char nice_gang_bang[10]; FILE *fp = requested ? stdout : stderr; - nicenum(zopt_vdev_size, nice_vdev_size); - nicenum(metaslab_gang_bang, nice_gang_bang); + nicenum(zo->zo_vdev_size, nice_vdev_size); + nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang); (void) fprintf(fp, "Usage: %s\n" "\t[-v vdevs (default: %llu)]\n" @@ -484,39 +561,43 @@ usage(boolean_t requested) "\t[-T time (default: %llu sec)] total run time\n" "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" "\t[-P passtime (default: %llu sec)] time per pass\n" + "\t[-B alt_ztest (default: )] alternate ztest path\n" "\t[-h] (print help)\n" "", - cmdname, - (u_longlong_t)zopt_vdevs, /* -v */ + zo->zo_pool, + (u_longlong_t)zo->zo_vdevs, /* -v */ nice_vdev_size, /* -s */ - zopt_ashift, /* -a */ - zopt_mirrors, /* -m */ - zopt_raidz, /* -r */ - zopt_raidz_parity, /* -R */ - zopt_datasets, /* -d */ - zopt_threads, /* -t */ + zo->zo_ashift, /* -a */ + zo->zo_mirrors, /* -m */ + zo->zo_raidz, /* -r */ + zo->zo_raidz_parity, /* -R */ + zo->zo_datasets, /* -d */ + zo->zo_threads, /* -t */ nice_gang_bang, /* -g */ - zopt_init, /* -i */ - (u_longlong_t)zopt_killrate, /* -k */ - zopt_pool, /* -p */ - zopt_dir, /* -f */ - (u_longlong_t)zopt_time, /* -T */ - (u_longlong_t)zopt_maxloops, /* -F */ - (u_longlong_t)zopt_passtime); /* -P */ + zo->zo_init, /* -i */ + (u_longlong_t)zo->zo_killrate, /* -k */ + zo->zo_pool, /* -p */ + zo->zo_dir, /* -f */ + (u_longlong_t)zo->zo_time, /* -T */ + (u_longlong_t)zo->zo_maxloops, /* -F */ + (u_longlong_t)zo->zo_passtime); exit(requested ? 
0 : 1); } static void process_options(int argc, char **argv) { + char *path; + ztest_shared_opts_t *zo = &ztest_opts; + int opt; uint64_t value; + char altdir[MAXNAMELEN] = { 0 }; - /* By default, test gang blocks for blocks 32K and greater */ - metaslab_gang_bang = 32 << 10; + bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:")) != EOF) { + "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:")) != EOF) { value = 0; switch (opt) { case 'v': @@ -537,58 +618,71 @@ process_options(int argc, char **argv) } switch (opt) { case 'v': - zopt_vdevs = value; + zo->zo_vdevs = value; break; case 's': - zopt_vdev_size = MAX(SPA_MINDEVSIZE, value); + zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); break; case 'a': - zopt_ashift = value; + zo->zo_ashift = value; break; case 'm': - zopt_mirrors = value; + zo->zo_mirrors = value; break; case 'r': - zopt_raidz = MAX(1, value); + zo->zo_raidz = MAX(1, value); break; case 'R': - zopt_raidz_parity = MIN(MAX(value, 1), 3); + zo->zo_raidz_parity = MIN(MAX(value, 1), 3); break; case 'd': - zopt_datasets = MAX(1, value); + zo->zo_datasets = MAX(1, value); break; case 't': - zopt_threads = MAX(1, value); + zo->zo_threads = MAX(1, value); break; case 'g': - metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value); + zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, + value); break; case 'i': - zopt_init = value; + zo->zo_init = value; break; case 'k': - zopt_killrate = value; + zo->zo_killrate = value; break; case 'p': - zopt_pool = strdup(optarg); + (void) strlcpy(zo->zo_pool, optarg, + sizeof (zo->zo_pool)); break; case 'f': - zopt_dir = strdup(optarg); + path = realpath(optarg, NULL); + if (path == NULL) { + (void) fprintf(stderr, "error: %s: %s\n", + optarg, strerror(errno)); + usage(B_FALSE); + } else { + (void) strlcpy(zo->zo_dir, path, + sizeof (zo->zo_dir)); + } break; case 'V': - zopt_verbose++; + zo->zo_verbose++; break; case 'E': - zopt_init = 0; + zo->zo_init = 0; break; 
case 'T': - zopt_time = value; + zo->zo_time = value; break; case 'P': - zopt_passtime = MAX(1, value); + zo->zo_passtime = MAX(1, value); break; case 'F': - zopt_maxloops = MAX(1, value); + zo->zo_maxloops = MAX(1, value); + break; + case 'B': + (void) strlcpy(altdir, optarg, sizeof (altdir)); break; case 'h': usage(B_TRUE); @@ -600,17 +694,59 @@ process_options(int argc, char **argv) } } - zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1); + zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); - zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs : + zo->zo_vdevtime = + (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : UINT64_MAX >> 2); + + if (strlen(altdir) > 0) { + char cmd[MAXNAMELEN]; + char realaltdir[MAXNAMELEN]; + char *bin; + char *ztest; + char *isa; + int isalen; + + (void) realpath(getexecname(), cmd); + if (0 != access(altdir, F_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate ztest path: %s", + altdir); + } + VERIFY(NULL != realpath(altdir, realaltdir)); + + /* + * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest". + * We want to extract <isa> to determine if we should use + * 32 or 64 bit binaries.
+ */ + bin = strstr(cmd, "/usr/bin/"); + ztest = strstr(bin, "/ztest"); + isa = bin + 9; + isalen = ztest - isa; + (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), + "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); + (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), + "%s/usr/lib/%.*s", realaltdir, isalen, isa); + + if (0 != access(zo->zo_alt_ztest, X_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate ztest: %s", + zo->zo_alt_ztest); + } else if (0 != access(zo->zo_alt_libpath, X_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate lib directory %s", + zo->zo_alt_libpath); + } + } } static void ztest_kill(ztest_shared_t *zs) { - zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa)); - zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa)); + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); (void) kill(getpid(), SIGKILL); } @@ -622,7 +758,7 @@ ztest_random(uint64_t range) if (range == 0) return (0); - if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r)) + if (read(ZTEST_FD_RAND, &r, sizeof (r)) != sizeof (r)) fatal(1, "short read from /dev/urandom"); return (r % range); @@ -638,9 +774,9 @@ ztest_record_enospc(const char *s) static uint64_t ztest_get_ashift(void) { - if (zopt_ashift == 0) + if (ztest_opts.zo_ashift == 0) return (SPA_MINBLOCKSHIFT + ztest_random(3)); - return (zopt_ashift); + return (ztest_opts.zo_ashift); } static nvlist_t * @@ -658,12 +794,14 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) if (aux != NULL) { vdev = ztest_shared->zs_vdev_aux; - (void) sprintf(path, ztest_aux_template, - zopt_dir, zopt_pool, aux, vdev); + (void) snprintf(path, sizeof (pathbuf), + ztest_aux_template, ztest_opts.zo_dir, + ztest_opts.zo_pool, aux, vdev); } else { vdev = ztest_shared->zs_vdev_next_leaf++; - (void) sprintf(path, ztest_dev_template, - 
zopt_dir, zopt_pool, vdev); + (void) snprintf(path, sizeof (pathbuf), + ztest_dev_template, ztest_opts.zo_dir, + ztest_opts.zo_pool, vdev); } } @@ -701,7 +839,7 @@ make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r) VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, VDEV_TYPE_RAIDZ) == 0); VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, - zopt_raidz_parity) == 0); + ztest_opts.zo_raidz_parity) == 0); VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, child, r) == 0); @@ -839,7 +977,7 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval), 1, &curval, setpoint), ==, 0); - if (zopt_verbose >= 6) { + if (ztest_opts.zo_verbose >= 6) { VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); (void) printf("%s %s = %s at '%s'\n", osname, propname, valname, setpoint); @@ -849,9 +987,9 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, } static int -ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value) +ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) { - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; nvlist_t *props = NULL; int error; @@ -973,13 +1111,17 @@ ztest_range_unlock(rl_t *rl) } static void -ztest_zd_init(ztest_ds_t *zd, objset_t *os) +ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) { zd->zd_os = os; zd->zd_zilog = dmu_objset_zil(os); - zd->zd_seq = 0; + zd->zd_shared = szd; dmu_objset_name(os, zd->zd_name); + if (zd->zd_shared != NULL) + zd->zd_shared->zd_seq = 0; + + VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0); VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0); for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) @@ -1959,6 +2101,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) if (ztest_random(2) == 0) io_type = ZTEST_IO_WRITE_TAG; + (void) rw_rdlock(&zd->zd_zilog_lock); + switch (io_type) { case 
ZTEST_IO_WRITE_TAG: @@ -1994,6 +2138,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) break; } + (void) rw_unlock(&zd->zd_zilog_lock); + umem_free(data, blocksize); } @@ -2048,6 +2194,8 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id) { zilog_t *zilog = zd->zd_zilog; + (void) rw_rdlock(&zd->zd_zilog_lock); + zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); /* @@ -2056,9 +2204,35 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id) * will verify that the log really does contain this record. */ mutex_enter(&zilog->zl_lock); - ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq); - zd->zd_seq = zilog->zl_commit_lr_seq; + ASSERT(zd->zd_shared != NULL); + ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); + zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); + + (void) rw_unlock(&zd->zd_zilog_lock); +} + +/* + * This function is designed to simulate the operations that occur during a + * mount/unmount operation. We hold the dataset across these operations in an + * attempt to expose any implicit assumptions about ZIL management. + */ +/* ARGSUSED */ +void +ztest_zil_remount(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + + (void) rw_wrlock(&zd->zd_zilog_lock); + + /* zfsvfs_teardown() */ + zil_close(zd->zd_zilog); + + /* zfsvfs_setup() */ + VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); + zil_replay(os, zd, ztest_replay_vector); + + (void) rw_unlock(&zd->zd_zilog_lock); } /* @@ -2069,7 +2243,7 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id) void ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; + ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa; nvlist_t *nvroot; @@ -2093,15 +2267,15 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) * Attempt to create an existing pool. It shouldn't matter * what's in the nvroot; we should fail with EEXIST. 
*/ - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL)); + VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); nvlist_free(nvroot); - VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); - VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool)); + VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); + VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); spa_close(spa, FTAG); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } static vdev_t * @@ -2148,14 +2322,15 @@ void ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; nvlist_t *nvroot; int error; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz; + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); + leaves = + MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2180,9 +2355,9 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) * dmu_objset_destroy() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ - VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0); + VERIFY(rw_wrlock(&ztest_name_lock) == 0); error = spa_vdev_remove(spa, guid, B_FALSE); - VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0); + VERIFY(rw_unlock(&ztest_name_lock) == 0); if (error && error != EEXIST) fatal(0, "spa_vdev_remove() = %d", error); @@ -2192,8 +2367,10 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) /* * Make 1/4 of the devices be log devices. 
*/ - nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1); + nvroot = make_vdev_root(NULL, NULL, + ztest_opts.zo_vdev_size, 0, + ztest_random(4) == 0, ztest_opts.zo_raidz, + zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -2204,7 +2381,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) fatal(0, "spa_vdev_add() = %d", error); } - VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } /* @@ -2215,7 +2392,7 @@ void ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; spa_aux_vdev_t *sav; char *aux; @@ -2230,7 +2407,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) aux = ZPOOL_CONFIG_L2CACHE; } - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2247,8 +2424,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) for (;;) { char path[MAXPATHLEN]; int c; - (void) sprintf(path, ztest_aux_template, zopt_dir, - zopt_pool, aux, zs->zs_vdev_aux); + (void) snprintf(path, sizeof (path), ztest_aux_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, aux, + zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) @@ -2267,7 +2445,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) * Add a new device. 
*/ nvlist_t *nvroot = make_vdev_root(NULL, aux, - (zopt_vdev_size * 5) / 4, 0, 0, 0, 0, 1); + (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1); error = spa_vdev_add(spa, nvroot); if (error != 0) fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); @@ -2286,7 +2464,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); } - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } /* @@ -2297,17 +2475,17 @@ void ztest_split_pool(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; nvlist_t *tree, **child, *config, *split, **schild; uint_t c, children, schildren = 0, lastlogid = 0; int error = 0; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); /* ensure we have a useable config; mirrors of raidz aren't supported */ - if (zs->zs_mirrors < 3 || zopt_raidz > 1) { - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } @@ -2366,9 +2544,9 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) spa_config_exit(spa, SCL_VDEV, FTAG); - (void) rw_wrlock(&zs->zs_name_lock); + (void) rw_wrlock(&ztest_name_lock); error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); nvlist_free(config); @@ -2381,7 +2559,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) ++zs->zs_splits; --zs->zs_mirrors; } - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } @@ -2393,7 +2571,7 @@ void ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; @@ 
-2410,8 +2588,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) int oldvd_is_log; int error, expected_error; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); - leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2437,12 +2615,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); ASSERT(oldvd->vdev_children >= zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / zopt_raidz]; + oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; } - if (zopt_raidz > 1) { + if (ztest_opts.zo_raidz > 1) { ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); - ASSERT(oldvd->vdev_children == zopt_raidz); - oldvd = oldvd->vdev_child[leaf % zopt_raidz]; + ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); + oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; } /* @@ -2471,7 +2649,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP) fatal(0, "detach (%s) returned %d", oldpath, error); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } @@ -2485,7 +2663,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + leaf); + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + leaf); if (ztest_random(2) == 0) newpath[strlen(newpath) - 1] = 'b'; newvd = vdev_lookup_by_path(rvd, newpath); @@ -2564,7 +2743,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) (longlong_t)newsize, replacing, error, expected_error); } - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } /* @@ -2587,7 +2766,7 @@ grow_vdev(vdev_t *vd, void *arg) fsize = lseek(fd, 
0, SEEK_END); (void) ftruncate(fd, *newsize); - if (zopt_verbose >= 6) { + if (ztest_opts.zo_verbose >= 6) { (void) printf("%s grew from %lu to %lu bytes\n", vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); } @@ -2623,7 +2802,7 @@ online_vdev(vdev_t *vd, void *arg) * vdev_open fails is by checking the returned newstate. */ if (error || newstate != VDEV_STATE_HEALTHY) { - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { (void) printf("Unable to expand vdev, state %llu, " "error %d\n", (u_longlong_t)newstate, error); } @@ -2638,7 +2817,7 @@ online_vdev(vdev_t *vd, void *arg) * trying to online it. */ if (generation != spa->spa_config_generation) { - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { (void) printf("vdev configuration has changed, " "guid %llu, state %llu, expected gen %llu, " "got gen %llu\n", @@ -2684,8 +2863,7 @@ vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) void ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; vdev_t *vd, *tvd; metaslab_class_t *mc; metaslab_group_t *mg; @@ -2693,7 +2871,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) uint64_t top; uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); spa_config_enter(spa, SCL_STATE, spa, RW_READER); top = ztest_random_vdev_top(spa, B_TRUE); @@ -2719,16 +2897,16 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) * original size, and it has a valid psize. 
*/ if (tvd->vdev_state != VDEV_STATE_HEALTHY || - psize == 0 || psize >= 4 * zopt_vdev_size) { + psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { spa_config_exit(spa, SCL_STATE, spa); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } ASSERT(psize > 0); newsize = psize + psize / 8; ASSERT3U(newsize, >, psize); - if (zopt_verbose >= 6) { + if (ztest_opts.zo_verbose >= 6) { (void) printf("Expanding LUN %s from %lu to %lu\n", vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); } @@ -2741,12 +2919,12 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || vdev_walk_tree(tvd, online_vdev, NULL) != NULL || tvd->vdev_state != VDEV_STATE_HEALTHY) { - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not expand LUN because " "the vdev configuration changed.\n"); } spa_config_exit(spa, SCL_STATE, spa); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } @@ -2775,12 +2953,12 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) new_class_space = metaslab_class_get_space(mc); if (tvd->vdev_mg != mg || mg->mg_class != mc) { - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not verify LUN expansion due to " "intervening vdev offline or remove.\n"); } spa_config_exit(spa, SCL_STATE, spa); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } @@ -2798,7 +2976,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) fatal(0, "LUN expansion failed: class_space %llu <= %llu\n", old_class_space, new_class_space); - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { char oldnumbuf[6], newnumbuf[6]; nicenum(old_class_space, oldnumbuf); @@ -2808,7 +2986,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) } spa_config_exit(spa, SCL_STATE, spa); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + 
VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } /* @@ -2835,7 +3013,8 @@ ztest_dataset_create(char *dsname) if (err || zilset < 80) return (err); - (void) printf("Setting dataset %s to sync always\n", dsname); + if (ztest_opts.zo_verbose >= 6) + (void) printf("Setting dataset %s to sync always\n", dsname); return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, ZFS_SYNC_ALWAYS, B_FALSE)); } @@ -2907,7 +3086,6 @@ ztest_snapshot_destroy(char *osname, uint64_t id) void ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; ztest_ds_t zdtmp; int iters; int error; @@ -2915,10 +3093,10 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) char name[MAXNAMELEN]; zilog_t *zilog; - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu", - zs->zs_pool, (u_longlong_t)id); + ztest_opts.zo_pool, (u_longlong_t)id); /* * If this dataset exists from a previous run, process its replay log @@ -2927,7 +3105,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) */ if (ztest_random(2) == 0 && dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { - ztest_zd_init(&zdtmp, os); + ztest_zd_init(&zdtmp, NULL, os); zil_replay(os, &zdtmp, ztest_replay_vector); ztest_zd_fini(&zdtmp); dmu_objset_disown(os, FTAG); @@ -2953,7 +3131,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); return; } fatal(0, "dmu_objset_create(%s) = %d", name, error); @@ -2962,7 +3140,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) VERIFY3U(0, ==, dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); - ztest_zd_init(&zdtmp, os); + ztest_zd_init(&zdtmp, NULL, os); /* * Open the intent log for it. 
@@ -3002,7 +3180,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) dmu_objset_disown(os, FTAG); ztest_zd_fini(&zdtmp); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -3011,12 +3189,10 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) void ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); (void) ztest_snapshot_destroy(zd->zd_name, id); (void) ztest_snapshot_create(zd->zd_name, id); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -3061,7 +3237,6 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t id) void ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; objset_t *clone; dsl_dataset_t *ds; char snap1name[MAXNAMELEN]; @@ -3072,7 +3247,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) char *osname = zd->zd_name; int error; - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); ztest_dsl_dataset_cleanup(osname, id); @@ -3152,7 +3327,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) out: ztest_dsl_dataset_cleanup(osname, id); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -3351,7 +3526,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { - if (zopt_verbose >= 7) { + if (ztest_opts.zo_verbose >= 7) { (void) printf("freeing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, @@ -3360,7 +3535,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) } VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { - if (zopt_verbose >= 7) { + if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, @@ -3598,7 +3773,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, 
uint64_t id) * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); - if (zopt_verbose >= 7) { + if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, @@ -4239,37 +4414,35 @@ ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) ZFS_PROP_COPIES, ZFS_PROP_DEDUP }; - ztest_shared_t *zs = ztest_shared; - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* ARGSUSED */ void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; nvlist_t *props = NULL; - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); - (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO, + (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); - VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0); + VERIFY3U(spa_prop_get(ztest_spa, &props), ==, 0); - if (zopt_verbose >= 6) + if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); nvlist_free(props); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -4287,7 +4460,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) char tag[100]; char osname[MAXNAMELEN]; - (void) rw_rdlock(&ztest_shared->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); dmu_objset_name(os, osname); @@ -4384,7 +4557,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT); out: - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -4395,7 +4568,7 @@ void ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = 
zs->zs_spa; + spa_t *spa = ztest_spa; int fd; uint64_t offset; uint64_t leaves; @@ -4412,11 +4585,11 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) uint64_t guid0 = 0; boolean_t islog = B_FALSE; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); maxfaults = MAXFAULTS(); - leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; mirror_save = zs->zs_mirrors; - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); ASSERT(leaves >= 1); @@ -4439,9 +4612,11 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, sizeof (path0), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + zs->zs_splits); + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + zs->zs_splits); (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + leaf); + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); if (vd0 != NULL && vd0->vdev_top->vdev_islog) @@ -4510,12 +4685,12 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) * leaving the dataset in an inconsistent state. 
*/ if (islog) - (void) rw_wrlock(&ztest_shared->zs_name_lock); + (void) rw_wrlock(&ztest_name_lock); VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); if (islog) - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } else { (void) vdev_online(spa, guid0, 0, NULL); } @@ -4542,9 +4717,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) if (offset >= fsize) continue; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); if (mirror_save != zs->zs_mirrors) { - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); (void) close(fd); return; } @@ -4553,9 +4728,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) fatal(1, "can't inject bad word at 0x%llx in %s", offset, pathrand); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); - if (zopt_verbose >= 7) + if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%llx\n", pathrand, (u_longlong_t)offset); } @@ -4570,7 +4745,7 @@ void ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; objset_t *os = zd->zd_os; ztest_od_t od[1]; uint64_t object, blocksize, txg, pattern, psize; @@ -4593,19 +4768,19 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) * Take the name lock as writer to prevent anyone else from changing * the pool and dataset properies we need to maintain during this test. 
*/ - (void) rw_wrlock(&zs->zs_name_lock); + (void) rw_wrlock(&ztest_name_lock); if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, B_FALSE) != 0 || ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, B_FALSE) != 0) { - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); return; } object = od[0].od_object; blocksize = od[0].od_blocksize; - pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os); + pattern = zs->zs_guid ^ dmu_objset_fsid_guid(os); ASSERT(object != 0); @@ -4613,7 +4788,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) dmu_tx_hold_write(tx, object, 0, copies * blocksize); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); return; } @@ -4657,7 +4832,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) zio_buf_free(buf, psize); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -4667,8 +4842,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) void ztest_scrub(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ @@ -4676,19 +4850,42 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id) } /* + * Change the guid for the pool. + */ +/* ARGSUSED */ +void +ztest_reguid(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + uint64_t orig, load; + + orig = spa_guid(spa); + load = spa_load_guid(spa); + if (spa_change_guid(spa) != 0) + return; + + if (ztest_opts.zo_verbose >= 3) { + (void) printf("Changed guid old %llu -> %llu\n", + (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); + } + + VERIFY3U(orig, !=, spa_guid(spa)); + VERIFY3U(load, ==, spa_load_guid(spa)); +} + +/* * Rename the pool to a different name and then rename it back. 
*/ /* ARGSUSED */ void ztest_spa_rename(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; char *oldname, *newname; spa_t *spa; - (void) rw_wrlock(&zs->zs_name_lock); + (void) rw_wrlock(&ztest_name_lock); - oldname = zs->zs_pool; + oldname = ztest_opts.zo_pool; newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); (void) strcpy(newname, oldname); (void) strcat(newname, "_tmp"); @@ -4708,7 +4905,7 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_t id) */ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); - ASSERT(spa == zs->zs_spa); + ASSERT(spa == ztest_spa); spa_close(spa, FTAG); /* @@ -4721,12 +4918,12 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_t id) */ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); - ASSERT(spa == zs->zs_spa); + ASSERT(spa == ztest_spa); spa_close(spa, FTAG); umem_free(newname, strlen(newname) + 1); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -4757,19 +4954,19 @@ ztest_run_zdb(char *pool) "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s", isalen, isa, - zopt_verbose >= 3 ? "s" : "", - zopt_verbose >= 4 ? "v" : "", + ztest_opts.zo_verbose >= 3 ? "s" : "", + ztest_opts.zo_verbose >= 4 ? 
"v" : "", spa_config_path, pool); free(isa); - if (zopt_verbose >= 5) + if (ztest_opts.zo_verbose >= 5) (void) printf("Executing %s\n", strstr(zdb, "zdb ")); fp = popen(zdb, "r"); while (fgets(zbuf, sizeof (zbuf), fp) != NULL) - if (zopt_verbose >= 3) + if (ztest_opts.zo_verbose >= 3) (void) printf("%s", zbuf); status = pclose(fp); @@ -4789,12 +4986,12 @@ ztest_walk_pool_directory(char *header) { spa_t *spa = NULL; - if (zopt_verbose >= 6) + if (ztest_opts.zo_verbose >= 6) (void) printf("%s\n", header); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) - if (zopt_verbose >= 6) + if (ztest_opts.zo_verbose >= 6) (void) printf("\t%s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); } @@ -4806,7 +5003,7 @@ ztest_spa_import_export(char *oldname, char *newname) uint64_t pool_guid; spa_t *spa; - if (zopt_verbose >= 4) { + if (ztest_opts.zo_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", oldname, newname); } @@ -4881,7 +5078,7 @@ ztest_spa_import_export(char *oldname, char *newname) static void ztest_resume(spa_t *spa) { - if (spa_suspended(spa) && zopt_verbose >= 6) + if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) (void) printf("resuming from suspended state\n"); spa_vdev_state_enter(spa, SCL_NONE); vdev_clear(spa, NULL); @@ -4919,10 +5116,10 @@ ztest_deadman_thread(void *arg) } static void -ztest_execute(ztest_info_t *zi, uint64_t id) +ztest_execute(int test, ztest_info_t *zi, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets]; + ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; + ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); hrtime_t functime = gethrtime(); for (int i = 0; i < zi->zi_iters; i++) @@ -4930,10 +5127,10 @@ ztest_execute(ztest_info_t *zi, uint64_t id) functime = gethrtime() - functime; - atomic_add_64(&zi->zi_call_count, 1); - atomic_add_64(&zi->zi_call_time, functime); + atomic_add_64(&zc->zc_count, 1); + 
atomic_add_64(&zc->zc_time, functime); - if (zopt_verbose >= 4) { + if (ztest_opts.zo_verbose >= 4) { Dl_info dli; (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%6.2f sec in %s\n", @@ -4944,11 +5141,13 @@ ztest_execute(ztest_info_t *zi, uint64_t id) static void * ztest_thread(void *arg) { + int rand; uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; uint64_t call_next; hrtime_t now; ztest_info_t *zi; + ztest_shared_callstate_t *zc; while ((now = gethrtime()) < zs->zs_thread_stop) { /* @@ -4966,13 +5165,16 @@ ztest_thread(void *arg) /* * Pick a random function to execute. */ - zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)]; - call_next = zi->zi_call_next; + rand = ztest_random(ZTEST_FUNCS); + zi = &ztest_info[rand]; + zc = ZTEST_GET_SHARED_CALLSTATE(rand); + call_next = zc->zc_next; if (now >= call_next && - atomic_cas_64(&zi->zi_call_next, call_next, call_next + - ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) - ztest_execute(zi, id); + atomic_cas_64(&zc->zc_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { + ztest_execute(rand, zi, id); + } } return (NULL); @@ -4985,13 +5187,13 @@ ztest_dataset_name(char *dsname, char *pool, int d) } static void -ztest_dataset_destroy(ztest_shared_t *zs, int d) +ztest_dataset_destroy(int d) { char name[MAXNAMELEN]; - ztest_dataset_name(name, zs->zs_pool, d); + ztest_dataset_name(name, ztest_opts.zo_pool, d); - if (zopt_verbose >= 3) + if (ztest_opts.zo_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); /* @@ -4999,8 +5201,10 @@ ztest_dataset_destroy(ztest_shared_t *zs, int d) * ztest thread t operates on dataset (t % zopt_datasets), * so there may be more than one thing to clean up. 
*/ - for (int t = d; t < zopt_threads; t += zopt_datasets) + for (int t = d; t < ztest_opts.zo_threads; + t += ztest_opts.zo_datasets) { ztest_dsl_dataset_cleanup(name, t); + } (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); @@ -5028,31 +5232,31 @@ ztest_dataset_dirobj_verify(ztest_ds_t *zd) } static int -ztest_dataset_open(ztest_shared_t *zs, int d) +ztest_dataset_open(int d) { - ztest_ds_t *zd = &zs->zs_zd[d]; - uint64_t committed_seq = zd->zd_seq; + ztest_ds_t *zd = &ztest_ds[d]; + uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; objset_t *os; zilog_t *zilog; char name[MAXNAMELEN]; int error; - ztest_dataset_name(name, zs->zs_pool, d); + ztest_dataset_name(name, ztest_opts.zo_pool, d); - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); error = ztest_dataset_create(name); if (error == ENOSPC) { - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); ztest_record_enospc(FTAG); return (error); } ASSERT(error == 0 || error == EEXIST); VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); - ztest_zd_init(zd, os); + ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); zilog = zd->zd_zilog; @@ -5067,7 +5271,7 @@ ztest_dataset_open(ztest_shared_t *zs, int d) ztest_dataset_dirobj_verify(zd); - if (zopt_verbose >= 6) + if (ztest_opts.zo_verbose >= 6) (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", zd->zd_name, (u_longlong_t)zilog->zl_parse_blk_count, @@ -5085,9 +5289,9 @@ ztest_dataset_open(ztest_shared_t *zs, int d) } static void -ztest_dataset_close(ztest_shared_t *zs, int d) +ztest_dataset_close(int d) { - ztest_ds_t *zd = &zs->zs_zd[d]; + ztest_ds_t *zd = &ztest_ds[d]; zil_close(zd->zd_zilog); dmu_objset_rele(zd->zd_os, zd); @@ -5103,6 +5307,7 @@ ztest_run(ztest_shared_t *zs) { thread_t *tid; spa_t *spa; + objset_t *os; thread_t resume_tid; int error; @@ -5111,15 +5316,18 @@ 
ztest_run(ztest_shared_t *zs) /* * Initialize parent/child shared state. */ - VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); - VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); + VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0); zs->zs_thread_start = gethrtime(); - zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC; + zs->zs_thread_stop = + zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); zs->zs_thread_kill = zs->zs_thread_stop; - if (ztest_random(100) < zopt_killrate) - zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC); + if (ztest_random(100) < ztest_opts.zo_killrate) { + zs->zs_thread_kill -= + ztest_random(ztest_opts.zo_passtime * NANOSEC); + } (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL); @@ -5130,8 +5338,13 @@ ztest_run(ztest_shared_t *zs) * Open our pool. */ kernel_init(FREAD | FWRITE); - VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0); - zs->zs_spa = spa; + VERIFY(spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0); + spa->spa_debug = B_TRUE; + ztest_spa = spa; + + VERIFY3U(0, ==, dmu_objset_hold(ztest_opts.zo_pool, FTAG, &os)); + zs->zs_guid = dmu_objset_fsid_guid(os); + dmu_objset_rele(os, FTAG); spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; @@ -5176,21 +5389,23 @@ ztest_run(ztest_shared_t *zs) * If we got any ENOSPC errors on the previous run, destroy something. 
*/ if (zs->zs_enospc_count != 0) { - int d = ztest_random(zopt_datasets); - ztest_dataset_destroy(zs, d); + int d = ztest_random(ztest_opts.zo_datasets); + ztest_dataset_destroy(d); } zs->zs_enospc_count = 0; - tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL); + tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t), + UMEM_NOFAIL); - if (zopt_verbose >= 4) + if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); /* * Kick off all the tests that run in parallel. */ - for (int t = 0; t < zopt_threads; t++) { - if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0) + for (int t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets && + ztest_dataset_open(t) != 0) return; VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, THR_BOUND, &tid[t]) == 0); @@ -5200,10 +5415,10 @@ ztest_run(ztest_shared_t *zs) * Wait for all of the tests to complete. We go in reverse order * so we don't close datasets while threads are still using them. 
*/ - for (int t = zopt_threads - 1; t >= 0; t--) { + for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) { VERIFY(thr_join(tid[t], NULL, NULL) == 0); - if (t < zopt_datasets) - ztest_dataset_close(zs, t); + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); } txg_wait_synced(spa_get_dsl(spa), 0); @@ -5211,7 +5426,7 @@ ztest_run(ztest_shared_t *zs) zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); - umem_free(tid, zopt_threads * sizeof (thread_t)); + umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t)); /* Kill the resume thread */ ztest_exiting = B_TRUE; @@ -5232,7 +5447,7 @@ ztest_run(ztest_shared_t *zs) */ mutex_enter(&spa_namespace_lock); for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) - if (zopt_verbose > 3) + if (ztest_opts.zo_verbose > 3) (void) printf("spa_next: found %s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); @@ -5242,9 +5457,10 @@ ztest_run(ztest_shared_t *zs) */ if (ztest_random(2) == 0) { char name[MAXNAMELEN]; - (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool); - ztest_spa_import_export(zs->zs_pool, name); - ztest_spa_import_export(name, zs->zs_pool); + (void) snprintf(name, MAXNAMELEN, "%s_import", + ztest_opts.zo_pool); + ztest_spa_import_export(ztest_opts.zo_pool, name); + ztest_spa_import_export(name, ztest_opts.zo_pool); } kernel_fini(); @@ -5253,23 +5469,23 @@ ztest_run(ztest_shared_t *zs) (void) _mutex_destroy(&zcl.zcl_callbacks_lock); - (void) rwlock_destroy(&zs->zs_name_lock); - (void) _mutex_destroy(&zs->zs_vdev_lock); + (void) rwlock_destroy(&ztest_name_lock); + (void) _mutex_destroy(&ztest_vdev_lock); } static void -ztest_freeze(ztest_shared_t *zs) +ztest_freeze(void) { - ztest_ds_t *zd = &zs->zs_zd[0]; + ztest_ds_t *zd = &ztest_ds[0]; spa_t *spa; int numloops = 0; - if (zopt_verbose >= 3) + if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); kernel_init(FREAD | FWRITE); - VERIFY3U(0, ==, 
spa_open(zs->zs_pool, &spa, FTAG)); - VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(0)); /* * Force the first log block to be transactionally allocated. @@ -5296,7 +5512,8 @@ ztest_freeze(ztest_shared_t *zs) * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. */ - while (ztest_random(10) != 0 && numloops++ < zopt_maxloops) { + while (ztest_random(10) != 0 && + numloops++ < ztest_opts.zo_maxloops) { ztest_dmu_write_parallel(zd, 0); ztest_dmu_object_alloc_free(zd, 0); txg_wait_synced(spa_get_dsl(spa), 0); @@ -5311,7 +5528,7 @@ ztest_freeze(ztest_shared_t *zs) /* * Close our dataset and close the pool. */ - ztest_dataset_close(zs, 0); + ztest_dataset_close(0); spa_close(spa, FTAG); kernel_fini(); @@ -5319,9 +5536,9 @@ ztest_freeze(ztest_shared_t *zs) * Open and close the pool and dataset to induce log replay. */ kernel_init(FREAD | FWRITE); - VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); - VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); - ztest_dataset_close(zs, 0); + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(0)); + ztest_dataset_close(0); spa_close(spa, FTAG); kernel_fini(); } @@ -5356,15 +5573,11 @@ make_random_props() { nvlist_t *props; - if (ztest_random(2) == 0) - return (NULL); - VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + if (ztest_random(2) == 0) + return (props); VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); - (void) printf("props:\n"); - dump_nvlist(props, 4); - return (props); } @@ -5378,38 +5591,212 @@ ztest_init(ztest_shared_t *zs) spa_t *spa; nvlist_t *nvroot, *props; - VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); - VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); + VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0); 
kernel_init(FREAD | FWRITE); /* * Create the storage pool. */ - (void) spa_destroy(zs->zs_pool); + (void) spa_destroy(ztest_opts.zo_pool); ztest_shared->zs_vdev_next_leaf = 0; zs->zs_splits = 0; - zs->zs_mirrors = zopt_mirrors; - nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - 0, zopt_raidz, zs->zs_mirrors, 1); + zs->zs_mirrors = ztest_opts.zo_mirrors; + nvroot = make_vdev_root(NULL, NULL, ztest_opts.zo_vdev_size, 0, + 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); props = make_random_props(); - VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL)); + for (int i = 0; i < SPA_FEATURES; i++) { + char buf[1024]; + (void) snprintf(buf, sizeof (buf), "feature@%s", + spa_feature_table[i].fi_uname); + VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); + } + VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, + NULL, NULL)); nvlist_free(nvroot); - VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); - metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + spa_close(spa, FTAG); kernel_fini(); - ztest_run_zdb(zs->zs_pool); + ztest_run_zdb(ztest_opts.zo_pool); + + ztest_freeze(); + + ztest_run_zdb(ztest_opts.zo_pool); - ztest_freeze(zs); + (void) rwlock_destroy(&ztest_name_lock); + (void) _mutex_destroy(&ztest_vdev_lock); +} + +static void +setup_fds(void) +{ + int fd; - ztest_run_zdb(zs->zs_pool); + char *tmp = tempnam(NULL, NULL); + fd = open(tmp, O_RDWR | O_CREAT, 0700); + ASSERT3U(fd, ==, ZTEST_FD_DATA); + (void) unlink(tmp); + free(tmp); - (void) rwlock_destroy(&zs->zs_name_lock); - (void) _mutex_destroy(&zs->zs_vdev_lock); + fd = open("/dev/urandom", O_RDONLY); + ASSERT3U(fd, ==, ZTEST_FD_RAND); +} + +static int +shared_data_size(ztest_shared_hdr_t *hdr) +{ + int size; + + size = hdr->zh_hdr_size; + size += hdr->zh_opts_size; + size += hdr->zh_size; + size += hdr->zh_stats_size * 
hdr->zh_stats_count; + size += hdr->zh_ds_size * hdr->zh_ds_count; + + return (size); +} + +static void +setup_hdr(void) +{ + int size; + ztest_shared_hdr_t *hdr; + + hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0); + ASSERT(hdr != MAP_FAILED); + + VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, sizeof (ztest_shared_hdr_t))); + + hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); + hdr->zh_opts_size = sizeof (ztest_shared_opts_t); + hdr->zh_size = sizeof (ztest_shared_t); + hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); + hdr->zh_stats_count = ZTEST_FUNCS; + hdr->zh_ds_size = sizeof (ztest_shared_ds_t); + hdr->zh_ds_count = ztest_opts.zo_datasets; + + size = shared_data_size(hdr); + VERIFY3U(0, ==, ftruncate(ZTEST_FD_DATA, size)); + + (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); +} + +static void +setup_data(void) +{ + int size, offset; + ztest_shared_hdr_t *hdr; + uint8_t *buf; + + hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), + PROT_READ, MAP_SHARED, ZTEST_FD_DATA, 0); + ASSERT(hdr != MAP_FAILED); + + size = shared_data_size(hdr); + + (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); + hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED, ZTEST_FD_DATA, 0); + ASSERT(hdr != MAP_FAILED); + buf = (uint8_t *)hdr; + + offset = hdr->zh_hdr_size; + ztest_shared_opts = (void *)&buf[offset]; + offset += hdr->zh_opts_size; + ztest_shared = (void *)&buf[offset]; + offset += hdr->zh_size; + ztest_shared_callstate = (void *)&buf[offset]; + offset += hdr->zh_stats_size * hdr->zh_stats_count; + ztest_shared_ds = (void *)&buf[offset]; +} + +static boolean_t +exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) +{ + pid_t pid; + int status; + char cmdbuf[MAXPATHLEN]; + + pid = fork(); + + if (cmd == NULL) { + (void) strlcpy(cmdbuf, getexecname(), sizeof (cmdbuf)); + cmd = 
cmdbuf; + } + + if (pid == -1) + fatal(1, "fork failed"); + + if (pid == 0) { /* child */ + char *emptyargv[2] = { cmd, NULL }; + + struct rlimit rl = { 1024, 1024 }; + (void) setrlimit(RLIMIT_NOFILE, &rl); + (void) enable_extended_FILE_stdio(-1, -1); + if (libpath != NULL) + VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); + (void) execv(cmd, emptyargv); + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "exec failed: %s", cmd); + } + + while (waitpid(pid, &status, 0) != pid) + continue; + if (statusp != NULL) + *statusp = status; + + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != 0) { + (void) fprintf(stderr, "child exited with code %d\n", + WEXITSTATUS(status)); + exit(2); + } + return (B_FALSE); + } else if (WIFSIGNALED(status)) { + if (!ignorekill || WTERMSIG(status) != SIGKILL) { + (void) fprintf(stderr, "child died with signal %d\n", + WTERMSIG(status)); + exit(3); + } + return (B_TRUE); + } else { + (void) fprintf(stderr, "something strange happened to child\n"); + exit(4); + /* NOTREACHED */ + } +} + +static void +ztest_run_init(void) +{ + ztest_shared_t *zs = ztest_shared; + + ASSERT(ztest_opts.zo_init != 0); + + /* + * Blow away any existing copy of zpool.cache + */ + (void) remove(spa_config_path); + + /* + * Create and initialize our storage pool. 
+ */ + for (int i = 1; i <= ztest_opts.zo_init; i++) { + bzero(zs, sizeof (ztest_shared_t)); + if (ztest_opts.zo_verbose >= 3 && + ztest_opts.zo_init != 1) { + (void) printf("ztest_init(), pass %d\n", i); + } + ztest_init(zs); + } } int @@ -5417,63 +5804,92 @@ main(int argc, char **argv) { int kills = 0; int iters = 0; + int older = 0; + int newer = 0; ztest_shared_t *zs; - size_t shared_size; ztest_info_t *zi; + ztest_shared_callstate_t *zc; char timebuf[100]; char numbuf[6]; spa_t *spa; + char cmd[MAXNAMELEN]; + boolean_t hasalt; + + boolean_t ischild = (0 == lseek(ZTEST_FD_DATA, 0, SEEK_CUR)); + ASSERT(ischild || errno == EBADF); (void) setvbuf(stdout, NULL, _IOLBF, 0); - ztest_random_fd = open("/dev/urandom", O_RDONLY); + if (!ischild) { + process_options(argc, argv); - process_options(argc, argv); + setup_fds(); + setup_hdr(); + setup_data(); + bcopy(&ztest_opts, ztest_shared_opts, + sizeof (*ztest_shared_opts)); + } else { + setup_data(); + bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); + } + ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); /* Override location of zpool.cache */ - (void) asprintf((char **)&spa_config_path, "%s/zpool.cache", zopt_dir); + (void) asprintf((char **)&spa_config_path, "%s/zpool.cache", + ztest_opts.zo_dir); - /* - * Blow away any existing copy of zpool.cache - */ - if (zopt_init != 0) - (void) remove(spa_config_path); + ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), + UMEM_NOFAIL); + zs = ztest_shared; + + if (ischild) { + metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang; + metaslab_df_alloc_threshold = + zs->zs_metaslab_df_alloc_threshold; - shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t); + if (zs->zs_do_init) + ztest_run_init(); + else + ztest_run(zs); + exit(0); + } - zs = ztest_shared = (void *)mmap(0, - P2ROUNDUP(shared_size, getpagesize()), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); + hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); - 
if (zopt_verbose >= 1) { + if (ztest_opts.zo_verbose >= 1) { (void) printf("%llu vdevs, %d datasets, %d threads," " %llu seconds...\n", - (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads, - (u_longlong_t)zopt_time); + (u_longlong_t)ztest_opts.zo_vdevs, + ztest_opts.zo_datasets, + ztest_opts.zo_threads, + (u_longlong_t)ztest_opts.zo_time); } - /* - * Create and initialize our storage pool. - */ - for (int i = 1; i <= zopt_init; i++) { - bzero(zs, sizeof (ztest_shared_t)); - if (zopt_verbose >= 3 && zopt_init != 1) - (void) printf("ztest_init(), pass %d\n", i); - zs->zs_pool = zopt_pool; - ztest_init(zs); + (void) strlcpy(cmd, getexecname(), sizeof (cmd)); + + zs->zs_do_init = B_TRUE; + if (strlen(ztest_opts.zo_alt_ztest) != 0) { + if (ztest_opts.zo_verbose >= 1) { + (void) printf("Executing older ztest for " + "initialization: %s\n", ztest_opts.zo_alt_ztest); + } + VERIFY(!exec_child(ztest_opts.zo_alt_ztest, + ztest_opts.zo_alt_libpath, B_FALSE, NULL)); + } else { + VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); } + zs->zs_do_init = B_FALSE; - zs->zs_pool = zopt_pool; zs->zs_proc_start = gethrtime(); - zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC; + zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; for (int f = 0; f < ZTEST_FUNCS; f++) { - zi = &zs->zs_info[f]; - *zi = ztest_info[f]; + zi = &ztest_info[f]; + zc = ZTEST_GET_SHARED_CALLSTATE(f); if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) - zi->zi_call_next = UINT64_MAX; + zc->zc_next = UINT64_MAX; else - zi->zi_call_next = zs->zs_proc_start + + zc->zc_next = zs->zs_proc_start + ztest_random(2 * zi->zi_interval[0] + 1); } @@ -5484,60 +5900,43 @@ main(int argc, char **argv) */ while (gethrtime() < zs->zs_proc_stop) { int status; - pid_t pid; + boolean_t killed; /* * Initialize the workload counters for each function. 
*/ for (int f = 0; f < ZTEST_FUNCS; f++) { - zi = &zs->zs_info[f]; - zi->zi_call_count = 0; - zi->zi_call_time = 0; + zc = ZTEST_GET_SHARED_CALLSTATE(f); + zc->zc_count = 0; + zc->zc_time = 0; } /* Set the allocation switch size */ - metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1; - - pid = fork(); + zs->zs_metaslab_df_alloc_threshold = + ztest_random(zs->zs_metaslab_sz / 4) + 1; - if (pid == -1) - fatal(1, "fork failed"); - - if (pid == 0) { /* child */ - struct rlimit rl = { 1024, 1024 }; - (void) setrlimit(RLIMIT_NOFILE, &rl); - (void) enable_extended_FILE_stdio(-1, -1); - ztest_run(zs); - exit(0); - } - - while (waitpid(pid, &status, 0) != pid) - continue; - - if (WIFEXITED(status)) { - if (WEXITSTATUS(status) != 0) { - (void) fprintf(stderr, - "child exited with code %d\n", - WEXITSTATUS(status)); - exit(2); + if (!hasalt || ztest_random(2) == 0) { + if (hasalt && ztest_opts.zo_verbose >= 1) { + (void) printf("Executing newer ztest: %s\n", + cmd); } - } else if (WIFSIGNALED(status)) { - if (WTERMSIG(status) != SIGKILL) { - (void) fprintf(stderr, - "child died with signal %d\n", - WTERMSIG(status)); - exit(3); - } - kills++; + newer++; + killed = exec_child(cmd, NULL, B_TRUE, &status); } else { - (void) fprintf(stderr, "something strange happened " - "to child\n"); - exit(4); + if (hasalt && ztest_opts.zo_verbose >= 1) { + (void) printf("Executing older ztest: %s\n", + ztest_opts.zo_alt_ztest); + } + older++; + killed = exec_child(ztest_opts.zo_alt_ztest, + ztest_opts.zo_alt_libpath, B_TRUE, &status); } + if (killed) + kills++; iters++; - if (zopt_verbose >= 1) { + if (ztest_opts.zo_verbose >= 1) { hrtime_t now = gethrtime(); now = MIN(now, zs->zs_proc_stop); @@ -5552,10 +5951,10 @@ main(int argc, char **argv) 100.0 * zs->zs_alloc / zs->zs_space, numbuf, 100.0 * (now - zs->zs_proc_start) / - (zopt_time * NANOSEC), timebuf); + (ztest_opts.zo_time * NANOSEC), timebuf); } - if (zopt_verbose >= 2) { + if (ztest_opts.zo_verbose >= 2) { (void) 
printf("\nWorkload summary:\n\n"); (void) printf("%7s %9s %s\n", "Calls", "Time", "Function"); @@ -5564,11 +5963,12 @@ main(int argc, char **argv) for (int f = 0; f < ZTEST_FUNCS; f++) { Dl_info dli; - zi = &zs->zs_info[f]; - print_time(zi->zi_call_time, timebuf); + zi = &ztest_info[f]; + zc = ZTEST_GET_SHARED_CALLSTATE(f); + print_time(zc->zc_time, timebuf); (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%7llu %9s %s\n", - (u_longlong_t)zi->zi_call_count, timebuf, + (u_longlong_t)zc->zc_count, timebuf, dli.dli_sname); } (void) printf("\n"); @@ -5580,22 +5980,28 @@ main(int argc, char **argv) * instead of 'ztest'. Do a blind rename in case this happened. */ kernel_init(FREAD); - if (spa_open(zopt_pool, &spa, FTAG) == 0) { + if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) { spa_close(spa, FTAG); } else { char tmpname[MAXNAMELEN]; kernel_fini(); kernel_init(FREAD | FWRITE); (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp", - zopt_pool); - (void) spa_rename(tmpname, zopt_pool); + ztest_opts.zo_pool); + (void) spa_rename(tmpname, ztest_opts.zo_pool); } kernel_fini(); - ztest_run_zdb(zopt_pool); + ztest_run_zdb(ztest_opts.zo_pool); } - if (zopt_verbose >= 1) { + if (ztest_opts.zo_verbose >= 1) { + if (hasalt) { + (void) printf("%d runs of older ztest: %s\n", older, + ztest_opts.zo_alt_ztest); + (void) printf("%d runs of newer ztest: %s\n", newer, + cmd); + } (void) printf("%d killed, %d completed, %.0f%% kill rate\n", kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } diff --git a/lib/libnvpair/libnvpair.c b/lib/libnvpair/libnvpair.c index 16bce48..c2e5a1b 100644 --- a/lib/libnvpair/libnvpair.c +++ b/lib/libnvpair/libnvpair.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include @@ -803,6 +804,10 @@ dump_nvlist(nvlist_t *list, int indent) while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN: + (void) printf("%*s%s\n", indent, "", nvpair_name(elem)); + break; + case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(elem, &bool_value); (void) printf("%*s%s: %s\n", indent, "", diff --git a/lib/libuutil/common/uu_list.c b/lib/libuutil/common/uu_list.c index 35c7ba8..93795e5 100644 --- a/lib/libuutil/common/uu_list.c +++ b/lib/libuutil/common/uu_list.c @@ -18,13 +18,14 @@ * * CDDL HEADER END */ + /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2011 Jason King. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "libuutil_common.h" #include @@ -318,6 +319,8 @@ uu_list_find(uu_list_t *lp, void *elem, void *private, uu_list_index_t *out) uu_compare_fn_t *func = lp->ul_pool->ulp_cmp; uu_list_node_impl_t *np; + uu_set_error(UU_ERROR_NONE); + if (func == NULL) { if (out != NULL) *out = 0; diff --git a/lib/libzfs/common/libzfs.h b/lib/libzfs/common/libzfs.h index ea34cc9..4dc039c 100644 --- a/lib/libzfs/common/libzfs.h +++ b/lib/libzfs/common/libzfs.h @@ -21,6 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ #ifndef _LIBZFS_H @@ -229,6 +232,8 @@ typedef struct splitflags { */ extern int zpool_scan(zpool_handle_t *, pool_scan_func_t); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); +extern int zpool_reguid(zpool_handle_t *); +extern int zpool_reopen(zpool_handle_t *); extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); @@ -286,6 +291,15 @@ typedef enum { ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ /* + * If the pool has unsupported features but can still be opened in + * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the + * pool has unsupported features but cannot be opened at all, its + * status is ZPOOL_STATUS_UNSUP_FEAT_READ. + */ + ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ + ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ + + /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. @@ -317,6 +331,7 @@ extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); * Statistics and configuration functions. */ extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); +extern nvlist_t *zpool_get_features(zpool_handle_t *); extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); @@ -329,6 +344,7 @@ extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); +extern void zpool_print_unsup_feat(nvlist_t *config); /* * Search for pools to import @@ -380,6 +396,7 @@ extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, * underlying datasets, only the references to them. 
*/ extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); +extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); extern void zfs_close(zfs_handle_t *); extern zfs_type_t zfs_get_type(const zfs_handle_t *); extern const char *zfs_get_name(const zfs_handle_t *); @@ -413,12 +430,22 @@ extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); +extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue); +extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal); +extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, + char *buf, size_t len); +extern int zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, + uint64_t *usedp); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); extern const char *zfs_prop_values(zfs_prop_t); extern int zfs_prop_is_string(zfs_prop_t prop); extern nvlist_t *zfs_get_user_props(zfs_handle_t *); extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); +extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); + typedef struct zprop_list { int pl_prop; @@ -436,10 +463,19 @@ extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" +#define ZFS_FEATURE_DISABLED "disabled" +#define ZFS_FEATURE_ENABLED "enabled" +#define ZFS_FEATURE_ACTIVE "active" + +#define ZFS_UNSUPPORTED_INACTIVE "inactive" +#define ZFS_UNSUPPORTED_READONLY "readonly" + /* * zpool property management */ extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); +extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, + size_t); extern const char *zpool_prop_default_string(zpool_prop_t); 
extern uint64_t zpool_prop_default_numeric(zpool_prop_t); extern const char *zpool_prop_column_name(zpool_prop_t); @@ -493,6 +529,7 @@ extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); typedef struct get_all_cb { zfs_handle_t **cb_handles; @@ -513,79 +550,92 @@ extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, extern int zfs_create_ancestors(libzfs_handle_t *, const char *); extern int zfs_destroy(zfs_handle_t *, boolean_t); extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); +extern int zfs_destroy_snaps_nvl(zfs_handle_t *, nvlist_t *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); -extern int zfs_rename(zfs_handle_t *, const char *, boolean_t); +extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t); typedef struct sendflags { /* print informational messages (ie, -v was specified) */ - int verbose : 1; + boolean_t verbose; /* recursive send (ie, -R) */ - int replicate : 1; + boolean_t replicate; /* for incrementals, do all intermediate snapshots */ - int doall : 1; /* (ie, -I) */ + boolean_t doall; /* if dataset is a clone, do incremental from its origin */ - int fromorigin : 1; + boolean_t fromorigin; /* do deduplication */ - int dedup : 1; + boolean_t dedup; /* send properties (ie, -p) */ - int props : 1; + boolean_t props; + + /* do not send (no-op, ie. -n) */ + boolean_t dryrun; + + /* parsable verbose output (ie. -P) */ + boolean_t parsable; + + /* show progress (ie. 
-v) */ + boolean_t progress; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); -extern int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - sendflags_t flags, int outfd, snapfilter_cb_t filter_func, - void *cb_arg, nvlist_t **debugnvp); +extern int zfs_send(zfs_handle_t *, const char *, const char *, + sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, boolean_t, boolean_t, int, uint64_t, uint64_t); extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); +extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); -extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, - zfs_userspace_cb_t func, void *arg); +extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, + zfs_userspace_cb_t, void *); + +extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); +extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ - int verbose : 1; + boolean_t verbose; /* the destination is a prefix, not the exact fs (ie, -d) */ - int isprefix : 1; + boolean_t isprefix; /* * Only the tail of the sent snapshot path is appended to the * destination to determine the received snapshot name (ie, -e). 
*/ - int istail : 1; + boolean_t istail; /* do not actually do the recv, just check if it would work (ie, -n) */ - int dryrun : 1; + boolean_t dryrun; /* rollback/destroy filesystems as necessary (eg, -F) */ - int force : 1; + boolean_t force; /* set "canmount=off" on all modified filesystems */ - int canmountoff : 1; + boolean_t canmountoff; /* byteswap flag is used internally; callers need not specify */ - int byteswap : 1; + boolean_t byteswap; /* do not mount file systems as they are extracted (private) */ - int nomount : 1; + boolean_t nomount; } recvflags_t; -extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t, +extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { diff --git a/lib/libzfs/common/libzfs_config.c b/lib/libzfs/common/libzfs_config.c index dc27238..f756da2 100644 --- a/lib/libzfs/common/libzfs_config.c +++ b/lib/libzfs/common/libzfs_config.c @@ -18,12 +18,17 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* * The pool configuration repository is stored in /etc/zfs/zpool.cache as a * single packed nvlist. While it would be nice to just read in this * file from userland, this wouldn't work from a local zone. So we have to have @@ -218,6 +223,36 @@ zpool_get_config(zpool_handle_t *zhp, nvlist_t **oldconfig) } /* + * Retrieves a list of enabled features and their refcounts and caches it in + * the pool handle. 
+ */ +nvlist_t * +zpool_get_features(zpool_handle_t *zhp) +{ + nvlist_t *config, *features; + + config = zpool_get_config(zhp, NULL); + + if (config == NULL || !nvlist_exists(config, + ZPOOL_CONFIG_FEATURE_STATS)) { + int error; + boolean_t missing = B_FALSE; + + error = zpool_refresh_stats(zhp, &missing); + + if (error != 0 || missing) + return (NULL); + + config = zpool_get_config(zhp, NULL); + } + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + &features) == 0); + + return (features); +} + +/* * Refresh the vdev statistics associated with the given pool. This is used in * iostat to show configuration changes and determine the delta from the last * time the function was called. This function can fail, in case the pool has diff --git a/lib/libzfs/common/libzfs_dataset.c b/lib/libzfs/common/libzfs_dataset.c index b7c1360..c1767cb 100644 --- a/lib/libzfs/common/libzfs_dataset.c +++ b/lib/libzfs/common/libzfs_dataset.c @@ -21,6 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. 
*/ #include @@ -132,6 +135,7 @@ zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, namecheck_err_t why; char what; + (void) zfs_prop_get_table(); if (dataset_namecheck(path, &why, &what) != 0) { if (hdl != NULL) { switch (why) { @@ -493,7 +497,7 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path) return (zhp); } -static zfs_handle_t * +zfs_handle_t * make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); @@ -510,6 +514,53 @@ make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) return (zhp); } +zfs_handle_t * +zfs_handle_dup(zfs_handle_t *zhp_orig) +{ + zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = zhp_orig->zfs_hdl; + zhp->zpool_hdl = zhp_orig->zpool_hdl; + (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, + sizeof (zhp->zfs_name)); + zhp->zfs_type = zhp_orig->zfs_type; + zhp->zfs_head_type = zhp_orig->zfs_head_type; + zhp->zfs_dmustats = zhp_orig->zfs_dmustats; + if (zhp_orig->zfs_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + if (zhp_orig->zfs_user_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_user_props, + &zhp->zfs_user_props, 0) != 0) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + if (zhp_orig->zfs_recvd_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_recvd_props, + &zhp->zfs_recvd_props, 0)) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; + if (zhp_orig->zfs_mntopts != NULL) { + zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, + zhp_orig->zfs_mntopts); + } + zhp->zfs_props_table = zhp_orig->zfs_props_table; + return (zhp); +} + /* * Opens the given snapshot, filesystem, or volume. The 'types' * argument is a mask of acceptable types. 
The function will print an @@ -873,6 +924,12 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, goto error; } continue; + } else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); + goto error; } if (prop == ZPROP_INVAL) { @@ -1846,8 +1903,6 @@ zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, err = zfs_prop_get(zhp, prop, propbuf, proplen, NULL, NULL, 0, literal); zfs_unset_recvd_props_mode(zhp, &cookie); - } else if (zfs_prop_userquota(propname)) { - return (-1); } else { nvlist_t *propval; char *recvdval; @@ -1862,6 +1917,120 @@ zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, return (err == 0 ? 0 : -1); } +static int +get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) +{ + nvlist_t *value; + nvpair_t *pair; + + value = zfs_get_clones_nvl(zhp); + if (value == NULL) + return (-1); + + propbuf[0] = '\0'; + for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; + pair = nvlist_next_nvpair(value, pair)) { + if (propbuf[0] != '\0') + (void) strlcat(propbuf, ",", proplen); + (void) strlcat(propbuf, nvpair_name(pair), proplen); + } + + return (0); +} + +struct get_clones_arg { + uint64_t numclones; + nvlist_t *value; + const char *origin; + char buf[ZFS_MAXNAMELEN]; +}; + +int +get_clones_cb(zfs_handle_t *zhp, void *arg) +{ + struct get_clones_arg *gca = arg; + + if (gca->numclones == 0) { + zfs_close(zhp); + return (0); + } + + if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), + NULL, NULL, 0, B_TRUE) != 0) + goto out; + if (strcmp(gca->buf, gca->origin) == 0) { + if (nvlist_add_boolean(gca->value, zfs_get_name(zhp)) != 0) { + zfs_close(zhp); + return (no_memory(zhp->zfs_hdl)); + } + gca->numclones--; + } + +out: + (void) zfs_iter_children(zhp, get_clones_cb, gca); + zfs_close(zhp); + return (0); +} + +nvlist_t * 
+zfs_get_clones_nvl(zfs_handle_t *zhp) +{ + nvlist_t *nv, *value; + + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { + struct get_clones_arg gca; + + /* + * if this is a snapshot, then the kernel wasn't able + * to get the clones. Do it by slowly iterating. + */ + if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) + return (NULL); + if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { + nvlist_free(nv); + return (NULL); + } + + gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); + gca.value = value; + gca.origin = zhp->zfs_name; + + if (gca.numclones != 0) { + zfs_handle_t *root; + char pool[ZFS_MAXNAMELEN]; + char *cp = pool; + + /* get the pool name */ + (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); + (void) strsep(&cp, "/@"); + root = zfs_open(zhp->zfs_hdl, pool, + ZFS_TYPE_FILESYSTEM); + + (void) get_clones_cb(root, &gca); + } + + if (gca.numclones != 0 || + nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || + nvlist_add_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { + nvlist_free(nv); + nvlist_free(value); + return (NULL); + } + nvlist_free(nv); + nvlist_free(value); + verify(0 == nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), &nv)); + } + + verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0); + + return (value); +} + /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. 
Otherwise, numbers are converted to a @@ -1990,6 +2159,11 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, return (-1); break; + case ZFS_PROP_CLONES: + if (get_clones_string(zhp, propbuf, proplen) != 0) + return (-1); + break; + case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: @@ -2018,6 +2192,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, } break; + case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); @@ -2106,6 +2281,17 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, } break; + case ZFS_PROP_GUID: + /* + * GUIDs are stored as numbers, but they are identifiers. + * We don't want them to be pretty printed, because pretty + * printing mangles the ID into a truncated and useless value. + */ + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); + break; + default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: @@ -2349,7 +2535,7 @@ zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, int err; zfs_cmd_t zc = { 0 }; - (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); err = userquota_propname_decode(propname, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), @@ -2401,144 +2587,95 @@ zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, return (0); } -/* - * Returns the name of the given zfs handle. - */ -const char * -zfs_get_name(const zfs_handle_t *zhp) +int +zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue) { - return (zhp->zfs_name); -} + int err; + zfs_cmd_t zc = { 0 }; + const char *snapname; -/* - * Returns the type of the given zfs handle. 
- */ -zfs_type_t -zfs_get_type(const zfs_handle_t *zhp) -{ - return (zhp->zfs_type); -} + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -static int -zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) -{ - int rc; - uint64_t orig_cookie; + snapname = strchr(propname, '@') + 1; + if (strchr(snapname, '@')) { + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + } else { + /* snapname is the short name, append it to zhp's fsname */ + char *cp; + + (void) strlcpy(zc.zc_value, zhp->zfs_name, + sizeof (zc.zc_value)); + cp = strchr(zc.zc_value, '@'); + if (cp != NULL) + *cp = '\0'; + (void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value)); + (void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value)); + } - orig_cookie = zc->zc_cookie; -top: - (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); - rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); + err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc); + if (err) + return (err); - if (rc == -1) { - switch (errno) { - case ENOMEM: - /* expand nvlist memory and try again */ - if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { - zcmd_free_nvlists(zc); - return (-1); - } - zc->zc_cookie = orig_cookie; - goto top; - /* - * An errno value of ESRCH indicates normal completion. - * If ENOENT is returned, then the underlying dataset - * has been removed since we obtained the handle. 
- */ - case ESRCH: - case ENOENT: - rc = 1; - break; - default: - rc = zfs_standard_error(zhp->zfs_hdl, errno, - dgettext(TEXT_DOMAIN, - "cannot iterate filesystems")); - break; - } - } - return (rc); + *propvalue = zc.zc_cookie; + return (0); } -/* - * Iterate over all child filesystems - */ int -zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) +zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal) { - zfs_cmd_t zc = { 0 }; - zfs_handle_t *nzhp; - int ret; + int err; + uint64_t propvalue; - if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) - return (0); + err = zfs_prop_get_written_int(zhp, propname, &propvalue); - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); - - while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, - &zc)) == 0) { - /* - * Silently ignore errors, as the only plausible explanation is - * that the pool has since been removed. - */ - if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, - &zc)) == NULL) { - continue; - } + if (err) + return (err); - if ((ret = func(nzhp, data)) != 0) { - zcmd_free_nvlists(&zc); - return (ret); - } + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", propvalue); + } else { + zfs_nicenum(propvalue, propbuf, proplen); } - zcmd_free_nvlists(&zc); - return ((ret < 0) ? 
ret : 0); + return (0); } -/* - * Iterate over all snapshots - */ int -zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data) +zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, + uint64_t *usedp) { + int err; zfs_cmd_t zc = { 0 }; - zfs_handle_t *nzhp; - int ret; - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) - return (0); + (void) strlcpy(zc.zc_name, lastsnap->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, firstsnap->zfs_name, sizeof (zc.zc_value)); - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); - while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, - &zc)) == 0) { + err = ioctl(lastsnap->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_SNAPS, &zc); + if (err) + return (err); - if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, - &zc)) == NULL) { - continue; - } + *usedp = zc.zc_cookie; - if ((ret = func(nzhp, data)) != 0) { - zcmd_free_nvlists(&zc); - return (ret); - } - } - zcmd_free_nvlists(&zc); - return ((ret < 0) ? ret : 0); + return (0); } /* - * Iterate over all children, snapshots and filesystems + * Returns the name of the given zfs handle. */ -int -zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) +const char * +zfs_get_name(const zfs_handle_t *zhp) { - int ret; - - if ((ret = zfs_iter_filesystems(zhp, func, data)) != 0) - return (ret); + return (zhp->zfs_name); +} - return (zfs_iter_snapshots(zhp, func, data)); +/* + * Returns the type of the given zfs handle. + */ +zfs_type_t +zfs_get_type(const zfs_handle_t *zhp) +{ + return (zhp->zfs_type); } /* @@ -2564,18 +2701,19 @@ is_descendant(const char *ds1, const char *ds2) /* * Given a complete name, return just the portion that refers to the parent. - * Can return NULL if this is a pool. + * Will return -1 if there is no parent (path is just the name of the + * pool). 
*/ static int parent_name(const char *path, char *buf, size_t buflen) { - char *loc; + char *slashp; - if ((loc = strrchr(path, '/')) == NULL) - return (-1); + (void) strlcpy(buf, path, buflen); - (void) strncpy(buf, path, MIN(buflen, loc - path)); - buf[loc - path] = '\0'; + if ((slashp = strrchr(buf, '/')) == NULL) + return (-1); + *slashp = '\0'; return (0); } @@ -2974,9 +3112,8 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer) } struct destroydata { - char *snapname; - boolean_t gotone; - boolean_t closezhp; + nvlist_t *nvl; + const char *snapname; }; static int @@ -2985,24 +3122,19 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) struct destroydata *dd = arg; zfs_handle_t *szhp; char name[ZFS_MAXNAMELEN]; - boolean_t closezhp = dd->closezhp; int rv = 0; - (void) strlcpy(name, zhp->zfs_name, sizeof (name)); - (void) strlcat(name, "@", sizeof (name)); - (void) strlcat(name, dd->snapname, sizeof (name)); + (void) snprintf(name, sizeof (name), + "%s@%s", zhp->zfs_name, dd->snapname); szhp = make_dataset_handle(zhp->zfs_hdl, name); if (szhp) { - dd->gotone = B_TRUE; + verify(nvlist_add_boolean(dd->nvl, name) == 0); zfs_close(szhp); } - dd->closezhp = B_TRUE; - if (!dd->gotone) - rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); - if (closezhp) - zfs_close(zhp); + rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd); + zfs_close(zhp); return (rv); } @@ -3012,29 +3144,45 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) int zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { - zfs_cmd_t zc = { 0 }; int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; - (void) zfs_check_snap_cb(zhp, &dd); + verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); + (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); - if (!dd.gotone) { - return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, + if (nvlist_next_nvpair(dd.nvl, NULL) == NULL) { + ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), - 
zhp->zfs_name, snapname)); + zhp->zfs_name, snapname); + } else { + ret = zfs_destroy_snaps_nvl(zhp, dd.nvl, defer); } + nvlist_free(dd.nvl); + return (ret); +} + +/* + * Destroys all the snapshots named in the nvlist. They must be underneath + * the zhp (either snapshots of it, or snapshots of its descendants). + */ +int +zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer) +{ + int ret; + zfs_cmd_t zc = { 0 }; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + if (zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, snaps) != 0) + return (-1); zc.zc_defer_destroy = defer; - ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc); + ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS_NVL, &zc); if (ret != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot destroy '%s@%s'"), zc.zc_name, snapname); + "cannot destroy snapshots in %s"), zc.zc_name); switch (errno) { case EEXIST: @@ -3070,7 +3218,7 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), target); - /* validate the target name */ + /* validate the target/clone name */ if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); @@ -3407,46 +3555,11 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) } /* - * Iterate over all dependents for a given dataset. This includes both - * hierarchical dependents (children) and data dependents (snapshots and - * clones). The bulk of the processing occurs in get_dependents() in - * libzfs_graph.c. 
- */ -int -zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, - zfs_iter_f func, void *data) -{ - char **dependents; - size_t count; - int i; - zfs_handle_t *child; - int ret = 0; - - if (get_dependents(zhp->zfs_hdl, allowrecursion, zhp->zfs_name, - &dependents, &count) != 0) - return (-1); - - for (i = 0; i < count; i++) { - if ((child = make_dataset_handle(zhp->zfs_hdl, - dependents[i])) == NULL) - continue; - - if ((ret = func(child, data)) != 0) - break; - } - - for (i = 0; i < count; i++) - free(dependents[i]); - free(dependents); - - return (ret); -} - -/* * Renames the given dataset. */ int -zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) +zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, + boolean_t force_unmount) { int ret; zfs_cmd_t zc = { 0 }; @@ -3558,7 +3671,8 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) } } else { - if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL) + if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, + force_unmount ? 
MS_FORCE : 0)) == NULL) return (-1); if (changelist_haszonedchild(cl)) { @@ -3891,7 +4005,7 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, int error; zfs_useracct_t buf[100]; - (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_objset_type = type; zc.zc_nvlist_dst = (uintptr_t)buf; @@ -4019,6 +4133,193 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, return (0); } +int +zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + int nvsz = 2048; + void *nvbuf; + int err = 0; + char errbuf[ZFS_MAXNAMELEN+32]; + + assert(zhp->zfs_type == ZFS_TYPE_VOLUME || + zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + +tryagain: + + nvbuf = malloc(nvsz); + if (nvbuf == NULL) { + err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); + goto out; + } + + zc.zc_nvlist_dst_size = nvsz; + zc.zc_nvlist_dst = (uintptr_t)nvbuf; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN); + + if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), + zc.zc_name); + switch (errno) { + case ENOMEM: + free(nvbuf); + nvsz = zc.zc_nvlist_dst_size; + goto tryagain; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } else { + /* success */ + int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); + if (rc) { + (void) snprintf(errbuf, sizeof (errbuf), dgettext( + TEXT_DOMAIN, "cannot get permissions on '%s'"), + zc.zc_name); + err = zfs_standard_error_fmt(hdl, rc, errbuf); + } + } + + free(nvbuf); +out: + 
return (err); +} + +int +zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char *nvbuf; + char errbuf[ZFS_MAXNAMELEN+32]; + size_t nvsz; + int err; + + assert(zhp->zfs_type == ZFS_TYPE_VOLUME || + zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + + err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); + assert(err == 0); + + nvbuf = malloc(nvsz); + + err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); + assert(err == 0); + + zc.zc_nvlist_src_size = nvsz; + zc.zc_nvlist_src = (uintptr_t)nvbuf; + zc.zc_perm_action = un; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), + zc.zc_name); + switch (errno) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } + + free(nvbuf); + + return (err); +} + +int +zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + int nvsz = 2048; + void *nvbuf; + int err = 0; + char errbuf[ZFS_MAXNAMELEN+32]; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + +tryagain: + + nvbuf = malloc(nvsz); + if (nvbuf == NULL) { + err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); + goto out; + } + + zc.zc_nvlist_dst_size = nvsz; + zc.zc_nvlist_dst = (uintptr_t)nvbuf; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN); + + if (zfs_ioctl(hdl, ZFS_IOC_GET_HOLDS, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), + zc.zc_name); + switch (errno) { + case ENOMEM: + free(nvbuf); + 
nvsz = zc.zc_nvlist_dst_size; + goto tryagain; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } else { + /* success */ + int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); + if (rc) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), + zc.zc_name); + err = zfs_standard_error_fmt(hdl, rc, errbuf); + } + } + + free(nvbuf); +out: + return (err); +} + uint64_t zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) { diff --git a/lib/libzfs/common/libzfs_graph.c b/lib/libzfs/common/libzfs_graph.c deleted file mode 100644 index bc21c51..0000000 --- a/lib/libzfs/common/libzfs_graph.c +++ /dev/null @@ -1,653 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -/* - * Iterate over all children of the current object. This includes the normal - * dataset hierarchy, but also arbitrary hierarchies due to clones. We want to - * walk all datasets in the pool, and construct a directed graph of the form: - * - * home - * | - * +----+----+ - * | | - * v v ws - * bar baz | - * | | - * v v - * @yesterday ----> foo - * - * In order to construct this graph, we have to walk every dataset in the pool, - * because the clone parent is stored as a property of the child, not the - * parent. The parent only keeps track of the number of clones. - * - * In the normal case (without clones) this would be rather expensive. To avoid - * unnecessary computation, we first try a walk of the subtree hierarchy - * starting from the initial node. At each dataset, we construct a node in the - * graph and an edge leading from its parent. If we don't see any snapshots - * with a non-zero clone count, then we are finished. - * - * If we do find a cloned snapshot, then we finish the walk of the current - * subtree, but indicate that we need to do a complete walk. We then perform a - * global walk of all datasets, avoiding the subtree we already processed. - * - * At the end of this, we'll end up with a directed graph of all relevant (and - * possible some irrelevant) datasets in the system. We need to both find our - * limiting subgraph and determine a safe ordering in which to destroy the - * datasets. We do a topological ordering of our graph starting at our target - * dataset, and then walk the results in reverse. - * - * It's possible for the graph to have cycles if, for example, the user renames - * a clone to be the parent of its origin snapshot. The user can request to - * generate an error in this case, or ignore the cycle and continue. - * - * When removing datasets, we want to destroy the snapshots in chronological - * order (because this is the most efficient method). 
In order to accomplish - * this, we store the creation transaction group with each vertex and keep each - * vertex's edges sorted according to this value. The topological sort will - * automatically walk the snapshots in the correct order. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "libzfs_impl.h" -#include "zfs_namecheck.h" - -#define MIN_EDGECOUNT 4 - -/* - * Vertex structure. Indexed by dataset name, this structure maintains a list - * of edges to other vertices. - */ -struct zfs_edge; -typedef struct zfs_vertex { - char zv_dataset[ZFS_MAXNAMELEN]; - struct zfs_vertex *zv_next; - int zv_visited; - uint64_t zv_txg; - struct zfs_edge **zv_edges; - int zv_edgecount; - int zv_edgealloc; -} zfs_vertex_t; - -enum { - VISIT_SEEN = 1, - VISIT_SORT_PRE, - VISIT_SORT_POST -}; - -/* - * Edge structure. Simply maintains a pointer to the destination vertex. There - * is no need to store the source vertex, since we only use edges in the context - * of the source vertex. - */ -typedef struct zfs_edge { - zfs_vertex_t *ze_dest; - struct zfs_edge *ze_next; -} zfs_edge_t; - -#define ZFS_GRAPH_SIZE 1027 /* this could be dynamic some day */ - -/* - * Graph structure. Vertices are maintained in a hash indexed by dataset name. - */ -typedef struct zfs_graph { - zfs_vertex_t **zg_hash; - size_t zg_size; - size_t zg_nvertex; - const char *zg_root; - int zg_clone_count; -} zfs_graph_t; - -/* - * Allocate a new edge pointing to the target vertex. - */ -static zfs_edge_t * -zfs_edge_create(libzfs_handle_t *hdl, zfs_vertex_t *dest) -{ - zfs_edge_t *zep = zfs_alloc(hdl, sizeof (zfs_edge_t)); - - if (zep == NULL) - return (NULL); - - zep->ze_dest = dest; - - return (zep); -} - -/* - * Destroy an edge. - */ -static void -zfs_edge_destroy(zfs_edge_t *zep) -{ - free(zep); -} - -/* - * Allocate a new vertex with the given name. 
- */ -static zfs_vertex_t * -zfs_vertex_create(libzfs_handle_t *hdl, const char *dataset) -{ - zfs_vertex_t *zvp = zfs_alloc(hdl, sizeof (zfs_vertex_t)); - - if (zvp == NULL) - return (NULL); - - assert(strlen(dataset) < ZFS_MAXNAMELEN); - - (void) strlcpy(zvp->zv_dataset, dataset, sizeof (zvp->zv_dataset)); - - if ((zvp->zv_edges = zfs_alloc(hdl, - MIN_EDGECOUNT * sizeof (void *))) == NULL) { - free(zvp); - return (NULL); - } - - zvp->zv_edgealloc = MIN_EDGECOUNT; - - return (zvp); -} - -/* - * Destroy a vertex. Frees up any associated edges. - */ -static void -zfs_vertex_destroy(zfs_vertex_t *zvp) -{ - int i; - - for (i = 0; i < zvp->zv_edgecount; i++) - zfs_edge_destroy(zvp->zv_edges[i]); - - free(zvp->zv_edges); - free(zvp); -} - -/* - * Given a vertex, add an edge to the destination vertex. - */ -static int -zfs_vertex_add_edge(libzfs_handle_t *hdl, zfs_vertex_t *zvp, - zfs_vertex_t *dest) -{ - zfs_edge_t *zep = zfs_edge_create(hdl, dest); - - if (zep == NULL) - return (-1); - - if (zvp->zv_edgecount == zvp->zv_edgealloc) { - void *ptr; - - if ((ptr = zfs_realloc(hdl, zvp->zv_edges, - zvp->zv_edgealloc * sizeof (void *), - zvp->zv_edgealloc * 2 * sizeof (void *))) == NULL) - return (-1); - - zvp->zv_edges = ptr; - zvp->zv_edgealloc *= 2; - } - - zvp->zv_edges[zvp->zv_edgecount++] = zep; - - return (0); -} - -static int -zfs_edge_compare(const void *a, const void *b) -{ - const zfs_edge_t *ea = *((zfs_edge_t **)a); - const zfs_edge_t *eb = *((zfs_edge_t **)b); - - if (ea->ze_dest->zv_txg < eb->ze_dest->zv_txg) - return (-1); - if (ea->ze_dest->zv_txg > eb->ze_dest->zv_txg) - return (1); - return (0); -} - -/* - * Sort the given vertex edges according to the creation txg of each vertex. - */ -static void -zfs_vertex_sort_edges(zfs_vertex_t *zvp) -{ - if (zvp->zv_edgecount == 0) - return; - - qsort(zvp->zv_edges, zvp->zv_edgecount, sizeof (void *), - zfs_edge_compare); -} - -/* - * Construct a new graph object. 
We allow the size to be specified as a - * parameter so in the future we can size the hash according to the number of - * datasets in the pool. - */ -static zfs_graph_t * -zfs_graph_create(libzfs_handle_t *hdl, const char *dataset, size_t size) -{ - zfs_graph_t *zgp = zfs_alloc(hdl, sizeof (zfs_graph_t)); - - if (zgp == NULL) - return (NULL); - - zgp->zg_size = size; - if ((zgp->zg_hash = zfs_alloc(hdl, - size * sizeof (zfs_vertex_t *))) == NULL) { - free(zgp); - return (NULL); - } - - zgp->zg_root = dataset; - zgp->zg_clone_count = 0; - - return (zgp); -} - -/* - * Destroy a graph object. We have to iterate over all the hash chains, - * destroying each vertex in the process. - */ -static void -zfs_graph_destroy(zfs_graph_t *zgp) -{ - int i; - zfs_vertex_t *current, *next; - - for (i = 0; i < zgp->zg_size; i++) { - current = zgp->zg_hash[i]; - while (current != NULL) { - next = current->zv_next; - zfs_vertex_destroy(current); - current = next; - } - } - - free(zgp->zg_hash); - free(zgp); -} - -/* - * Graph hash function. Classic bernstein k=33 hash function, taken from - * usr/src/cmd/sgs/tools/common/strhash.c - */ -static size_t -zfs_graph_hash(zfs_graph_t *zgp, const char *str) -{ - size_t hash = 5381; - int c; - - while ((c = *str++) != 0) - hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ - - return (hash % zgp->zg_size); -} - -/* - * Given a dataset name, finds the associated vertex, creating it if necessary. 
- */ -static zfs_vertex_t * -zfs_graph_lookup(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset, - uint64_t txg) -{ - size_t idx = zfs_graph_hash(zgp, dataset); - zfs_vertex_t *zvp; - - for (zvp = zgp->zg_hash[idx]; zvp != NULL; zvp = zvp->zv_next) { - if (strcmp(zvp->zv_dataset, dataset) == 0) { - if (zvp->zv_txg == 0) - zvp->zv_txg = txg; - return (zvp); - } - } - - if ((zvp = zfs_vertex_create(hdl, dataset)) == NULL) - return (NULL); - - zvp->zv_next = zgp->zg_hash[idx]; - zvp->zv_txg = txg; - zgp->zg_hash[idx] = zvp; - zgp->zg_nvertex++; - - return (zvp); -} - -/* - * Given two dataset names, create an edge between them. For the source vertex, - * mark 'zv_visited' to indicate that we have seen this vertex, and not simply - * created it as a destination of another edge. If 'dest' is NULL, then this - * is an individual vertex (i.e. the starting vertex), so don't add an edge. - */ -static int -zfs_graph_add(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *source, - const char *dest, uint64_t txg) -{ - zfs_vertex_t *svp, *dvp; - - if ((svp = zfs_graph_lookup(hdl, zgp, source, 0)) == NULL) - return (-1); - svp->zv_visited = VISIT_SEEN; - if (dest != NULL) { - dvp = zfs_graph_lookup(hdl, zgp, dest, txg); - if (dvp == NULL) - return (-1); - if (zfs_vertex_add_edge(hdl, svp, dvp) != 0) - return (-1); - } - - return (0); -} - -/* - * Iterate over all children of the given dataset, adding any vertices - * as necessary. Returns -1 if there was an error, or 0 otherwise. - * This is a simple recursive algorithm - the ZFS namespace typically - * is very flat. We manually invoke the necessary ioctl() calls to - * avoid the overhead and additional semantics of zfs_open(). - */ -static int -iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) -{ - zfs_cmd_t zc = { 0 }; - zfs_vertex_t *zvp; - - /* - * Look up the source vertex, and avoid it if we've seen it before. 
- */ - zvp = zfs_graph_lookup(hdl, zgp, dataset, 0); - if (zvp == NULL) - return (-1); - if (zvp->zv_visited == VISIT_SEEN) - return (0); - - /* - * Iterate over all children - */ - for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { - /* - * Get statistics for this dataset, to determine the type of the - * dataset and clone statistics. If this fails, the dataset has - * since been removed, and we're pretty much screwed anyway. - */ - zc.zc_objset_stats.dds_origin[0] = '\0'; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) - continue; - - if (zc.zc_objset_stats.dds_origin[0] != '\0') { - if (zfs_graph_add(hdl, zgp, - zc.zc_objset_stats.dds_origin, zc.zc_name, - zc.zc_objset_stats.dds_creation_txg) != 0) - return (-1); - /* - * Count origins only if they are contained in the graph - */ - if (isa_child_of(zc.zc_objset_stats.dds_origin, - zgp->zg_root)) - zgp->zg_clone_count--; - } - - /* - * Add an edge between the parent and the child. - */ - if (zfs_graph_add(hdl, zgp, dataset, zc.zc_name, - zc.zc_objset_stats.dds_creation_txg) != 0) - return (-1); - - /* - * Recursively visit child - */ - if (iterate_children(hdl, zgp, zc.zc_name)) - return (-1); - } - - /* - * Now iterate over all snapshots. - */ - bzero(&zc, sizeof (zc)); - - for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0; - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { - - /* - * Get statistics for this dataset, to determine the type of the - * dataset and clone statistics. If this fails, the dataset has - * since been removed, and we're pretty much screwed anyway. - */ - if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) - continue; - - /* - * Add an edge between the parent and the child. 
- */ - if (zfs_graph_add(hdl, zgp, dataset, zc.zc_name, - zc.zc_objset_stats.dds_creation_txg) != 0) - return (-1); - - zgp->zg_clone_count += zc.zc_objset_stats.dds_num_clones; - } - - zvp->zv_visited = VISIT_SEEN; - - return (0); -} - -/* - * Returns false if there are no snapshots with dependent clones in this - * subtree or if all of those clones are also in this subtree. Returns - * true if there is an error or there are external dependents. - */ -static boolean_t -external_dependents(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) -{ - zfs_cmd_t zc = { 0 }; - - /* - * Check whether this dataset is a clone or has clones since - * iterate_children() only checks the children. - */ - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) - return (B_TRUE); - - if (zc.zc_objset_stats.dds_origin[0] != '\0') { - if (zfs_graph_add(hdl, zgp, - zc.zc_objset_stats.dds_origin, zc.zc_name, - zc.zc_objset_stats.dds_creation_txg) != 0) - return (B_TRUE); - if (isa_child_of(zc.zc_objset_stats.dds_origin, dataset)) - zgp->zg_clone_count--; - } - - if ((zc.zc_objset_stats.dds_num_clones) || - iterate_children(hdl, zgp, dataset)) - return (B_TRUE); - - return (zgp->zg_clone_count != 0); -} - -/* - * Construct a complete graph of all necessary vertices. First, iterate over - * only our object's children. If no cloned snapshots are found, or all of - * the cloned snapshots are in this subtree then return a graph of the subtree. - * Otherwise, start at the root of the pool and iterate over all datasets. - */ -static zfs_graph_t * -construct_graph(libzfs_handle_t *hdl, const char *dataset) -{ - zfs_graph_t *zgp = zfs_graph_create(hdl, dataset, ZFS_GRAPH_SIZE); - int ret = 0; - - if (zgp == NULL) - return (zgp); - - if ((strchr(dataset, '/') == NULL) || - (external_dependents(hdl, zgp, dataset))) { - /* - * Determine pool name and try again. 
- */ - int len = strcspn(dataset, "/@") + 1; - char *pool = zfs_alloc(hdl, len); - - if (pool == NULL) { - zfs_graph_destroy(zgp); - return (NULL); - } - (void) strlcpy(pool, dataset, len); - - if (iterate_children(hdl, zgp, pool) == -1 || - zfs_graph_add(hdl, zgp, pool, NULL, 0) != 0) { - free(pool); - zfs_graph_destroy(zgp); - return (NULL); - } - free(pool); - } - - if (ret == -1 || zfs_graph_add(hdl, zgp, dataset, NULL, 0) != 0) { - zfs_graph_destroy(zgp); - return (NULL); - } - - return (zgp); -} - -/* - * Given a graph, do a recursive topological sort into the given array. This is - * really just a depth first search, so that the deepest nodes appear first. - * hijack the 'zv_visited' marker to avoid visiting the same vertex twice. - */ -static int -topo_sort(libzfs_handle_t *hdl, boolean_t allowrecursion, char **result, - size_t *idx, zfs_vertex_t *zgv) -{ - int i; - - if (zgv->zv_visited == VISIT_SORT_PRE && !allowrecursion) { - /* - * If we've already seen this vertex as part of our depth-first - * search, then we have a cyclic dependency, and we must return - * an error. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "recursive dependency at '%s'"), - zgv->zv_dataset); - return (zfs_error(hdl, EZFS_RECURSIVE, - dgettext(TEXT_DOMAIN, - "cannot determine dependent datasets"))); - } else if (zgv->zv_visited >= VISIT_SORT_PRE) { - /* - * If we've already processed this as part of the topological - * sort, then don't bother doing so again. 
- */ - return (0); - } - - zgv->zv_visited = VISIT_SORT_PRE; - - /* avoid doing a search if we don't have to */ - zfs_vertex_sort_edges(zgv); - for (i = 0; i < zgv->zv_edgecount; i++) { - if (topo_sort(hdl, allowrecursion, result, idx, - zgv->zv_edges[i]->ze_dest) != 0) - return (-1); - } - - /* we may have visited this in the course of the above */ - if (zgv->zv_visited == VISIT_SORT_POST) - return (0); - - if ((result[*idx] = zfs_alloc(hdl, - strlen(zgv->zv_dataset) + 1)) == NULL) - return (-1); - - (void) strcpy(result[*idx], zgv->zv_dataset); - *idx += 1; - zgv->zv_visited = VISIT_SORT_POST; - return (0); -} - -/* - * The only public interface for this file. Do the dirty work of constructing a - * child list for the given object. Construct the graph, do the toplogical - * sort, and then return the array of strings to the caller. - * - * The 'allowrecursion' parameter controls behavior when cycles are found. If - * it is set, the the cycle is ignored and the results returned as if the cycle - * did not exist. If it is not set, then the routine will generate an error if - * a cycle is found. - */ -int -get_dependents(libzfs_handle_t *hdl, boolean_t allowrecursion, - const char *dataset, char ***result, size_t *count) -{ - zfs_graph_t *zgp; - zfs_vertex_t *zvp; - - if ((zgp = construct_graph(hdl, dataset)) == NULL) - return (-1); - - if ((*result = zfs_alloc(hdl, - zgp->zg_nvertex * sizeof (char *))) == NULL) { - zfs_graph_destroy(zgp); - return (-1); - } - - if ((zvp = zfs_graph_lookup(hdl, zgp, dataset, 0)) == NULL) { - free(*result); - zfs_graph_destroy(zgp); - return (-1); - } - - *count = 0; - if (topo_sort(hdl, allowrecursion, *result, count, zvp) != 0) { - free(*result); - zfs_graph_destroy(zgp); - return (-1); - } - - /* - * Get rid of the last entry, which is our starting vertex and not - * strictly a dependent. 
- */ - assert(*count > 0); - free((*result)[*count - 1]); - (*count)--; - - zfs_graph_destroy(zgp); - - return (0); -} diff --git a/lib/libzfs/common/libzfs_impl.h b/lib/libzfs/common/libzfs_impl.h index c9b09a2..b1eae47 100644 --- a/lib/libzfs/common/libzfs_impl.h +++ b/lib/libzfs/common/libzfs_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _LIBFS_IMPL_H @@ -115,7 +116,7 @@ struct zpool_handle { diskaddr_t zpool_start_block; }; -typedef enum { +typedef enum { PROTO_NFS = 0, PROTO_SMB = 1, PROTO_END = 2 @@ -147,6 +148,7 @@ int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); int get_dependents(libzfs_handle_t *, boolean_t, const char *, char ***, size_t *); +zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, diff --git a/lib/libzfs/common/libzfs_import.c b/lib/libzfs/common/libzfs_import.c index e137035..414aa2f 100644 --- a/lib/libzfs/common/libzfs_import.c +++ b/lib/libzfs/common/libzfs_import.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. 
*/ /* @@ -437,7 +439,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) uint_t i, nspares, nl2cache; boolean_t config_seen; uint64_t best_txg; - char *name, *hostname; + char *name, *hostname, *comment; uint64_t version, guid; uint_t children = 0; nvlist_t **child = NULL; @@ -526,6 +528,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) * version * pool guid * name + * comment (if available) * pool state * hostid (if available) * hostname (if available) @@ -547,11 +550,24 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) if (nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, name) != 0) goto nomem; + + /* + * COMMENT is optional, don't bail if it's not + * there, instead, set it to NULL. + */ + if (nvlist_lookup_string(tmp, + ZPOOL_CONFIG_COMMENT, &comment) != 0) + comment = NULL; + else if (nvlist_add_string(config, + ZPOOL_CONFIG_COMMENT, comment) != 0) + goto nomem; + verify(nvlist_lookup_uint64(tmp, ZPOOL_CONFIG_POOL_STATE, &state) == 0); if (nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) != 0) goto nomem; + hostid = 0; if (nvlist_lookup_uint64(tmp, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { diff --git a/lib/libzfs/common/libzfs_iter.c b/lib/libzfs/common/libzfs_iter.c new file mode 100644 index 0000000..212383d --- /dev/null +++ b/lib/libzfs/common/libzfs_iter.c @@ -0,0 +1,462 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" + +int +zfs_iter_clones(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + nvlist_t *nvl = zfs_get_clones_nvl(zhp); + nvpair_t *pair; + + if (nvl == NULL) + return (0); + + for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) { + zfs_handle_t *clone = zfs_open(zhp->zfs_hdl, nvpair_name(pair), + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (clone != NULL) { + int err = func(clone, data); + if (err != 0) + return (err); + } + } + return (0); +} + +static int +zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) +{ + int rc; + uint64_t orig_cookie; + + orig_cookie = zc->zc_cookie; +top: + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); + + if (rc == -1) { + switch (errno) { + case ENOMEM: + /* expand nvlist memory and try again */ + if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { + zcmd_free_nvlists(zc); + return (-1); + } + zc->zc_cookie = orig_cookie; + goto top; + /* + * An errno value of ESRCH indicates normal completion. + * If ENOENT is returned, then the underlying dataset + * has been removed since we obtained the handle. 
+ */ + case ESRCH: + case ENOENT: + rc = 1; + break; + default: + rc = zfs_standard_error(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, + "cannot iterate filesystems")); + break; + } + } + return (rc); +} + +/* + * Iterate over all child filesystems + */ +int +zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + zfs_cmd_t zc = { 0 }; + zfs_handle_t *nzhp; + int ret; + + if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) + return (0); + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, + &zc)) == 0) { + /* + * Silently ignore errors, as the only plausible explanation is + * that the pool has since been removed. + */ + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { + continue; + } + + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); + return (ret); + } + } + zcmd_free_nvlists(&zc); + return ((ret < 0) ? ret : 0); +} + +/* + * Iterate over all snapshots + */ +int +zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + zfs_cmd_t zc = { 0 }; + zfs_handle_t *nzhp; + int ret; + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) + return (0); + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, + &zc)) == 0) { + + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { + continue; + } + + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); + return (ret); + } + } + zcmd_free_nvlists(&zc); + return ((ret < 0) ? 
ret : 0); +} + +/* + * Routines for dealing with the sorted snapshot functionality + */ +typedef struct zfs_node { + zfs_handle_t *zn_handle; + avl_node_t zn_avlnode; +} zfs_node_t; + +static int +zfs_sort_snaps(zfs_handle_t *zhp, void *data) +{ + avl_tree_t *avl = data; + zfs_node_t *node; + zfs_node_t search; + + search.zn_handle = zhp; + node = avl_find(avl, &search, NULL); + if (node) { + /* + * If this snapshot was renamed while we were creating the + * AVL tree, it's possible that we already inserted it under + * its old name. Remove the old handle before adding the new + * one. + */ + zfs_close(node->zn_handle); + avl_remove(avl, node); + free(node); + } + + node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t)); + node->zn_handle = zhp; + avl_add(avl, node); + + return (0); +} + +static int +zfs_snapshot_compare(const void *larg, const void *rarg) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + uint64_t lcreate, rcreate; + + /* + * Sort them according to creation time. We use the hidden + * CREATETXG property to get an absolute ordering of snapshots. 
+ */ + lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); + rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); + + if (lcreate < rcreate) + return (-1); + else if (lcreate > rcreate) + return (+1); + else + return (0); +} + +int +zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data) +{ + int ret = 0; + zfs_node_t *node; + avl_tree_t avl; + void *cookie = NULL; + + avl_create(&avl, zfs_snapshot_compare, + sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode)); + + ret = zfs_iter_snapshots(zhp, zfs_sort_snaps, &avl); + + for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node)) + ret |= callback(node->zn_handle, data); + + while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL) + free(node); + + avl_destroy(&avl); + + return (ret); +} + +typedef struct { + char *ssa_first; + char *ssa_last; + boolean_t ssa_seenfirst; + boolean_t ssa_seenlast; + zfs_iter_f ssa_func; + void *ssa_arg; +} snapspec_arg_t; + +static int +snapspec_cb(zfs_handle_t *zhp, void *arg) { + snapspec_arg_t *ssa = arg; + char *shortsnapname; + int err = 0; + + if (ssa->ssa_seenlast) + return (0); + shortsnapname = zfs_strdup(zhp->zfs_hdl, + strchr(zfs_get_name(zhp), '@') + 1); + + if (!ssa->ssa_seenfirst && strcmp(shortsnapname, ssa->ssa_first) == 0) + ssa->ssa_seenfirst = B_TRUE; + + if (ssa->ssa_seenfirst) { + err = ssa->ssa_func(zhp, ssa->ssa_arg); + } else { + zfs_close(zhp); + } + + if (strcmp(shortsnapname, ssa->ssa_last) == 0) + ssa->ssa_seenlast = B_TRUE; + free(shortsnapname); + + return (err); +} + +/* + * spec is a string like "A,B%C,D" + * + * <snaps>, where <snaps> can be: + * <snap> (single snapshot) + * <snap>%<snap> (range of snapshots, inclusive) + * %<snap> (range of snapshots, starting with earliest) + * <snap>% (range of snapshots, ending with last) + * % (all snapshots) + * <snaps>[,...] (comma separated list of the above) + * + * If a snapshot can not be opened, continue trying to open the others, but + * return ENOENT at the end.
+ */ +int +zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, + zfs_iter_f func, void *arg) +{ + char buf[ZFS_MAXNAMELEN]; + char *comma_separated, *cp; + int err = 0; + int ret = 0; + + (void) strlcpy(buf, spec_orig, sizeof (buf)); + cp = buf; + + while ((comma_separated = strsep(&cp, ",")) != NULL) { + char *pct = strchr(comma_separated, '%'); + if (pct != NULL) { + snapspec_arg_t ssa = { 0 }; + ssa.ssa_func = func; + ssa.ssa_arg = arg; + + if (pct == comma_separated) + ssa.ssa_seenfirst = B_TRUE; + else + ssa.ssa_first = comma_separated; + *pct = '\0'; + ssa.ssa_last = pct + 1; + + /* + * If there is a lastname specified, make sure it + * exists. + */ + if (ssa.ssa_last[0] != '\0') { + char snapname[ZFS_MAXNAMELEN]; + (void) snprintf(snapname, sizeof (snapname), + "%s@%s", zfs_get_name(fs_zhp), + ssa.ssa_last); + if (!zfs_dataset_exists(fs_zhp->zfs_hdl, + snapname, ZFS_TYPE_SNAPSHOT)) { + ret = ENOENT; + continue; + } + } + + err = zfs_iter_snapshots_sorted(fs_zhp, + snapspec_cb, &ssa); + if (ret == 0) + ret = err; + if (ret == 0 && (!ssa.ssa_seenfirst || + (ssa.ssa_last[0] != '\0' && !ssa.ssa_seenlast))) { + ret = ENOENT; + } + } else { + char snapname[ZFS_MAXNAMELEN]; + zfs_handle_t *snap_zhp; + (void) snprintf(snapname, sizeof (snapname), "%s@%s", + zfs_get_name(fs_zhp), comma_separated); + snap_zhp = make_dataset_handle(fs_zhp->zfs_hdl, + snapname); + if (snap_zhp == NULL) { + ret = ENOENT; + continue; + } + err = func(snap_zhp, arg); + if (ret == 0) + ret = err; + } + } + + return (ret); +} + +/* + * Iterate over all children, snapshots and filesystems + */ +int +zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + int ret; + + if ((ret = zfs_iter_filesystems(zhp, func, data)) != 0) + return (ret); + + return (zfs_iter_snapshots(zhp, func, data)); +} + + +typedef struct iter_stack_frame { + struct iter_stack_frame *next; + zfs_handle_t *zhp; +} iter_stack_frame_t; + +typedef struct iter_dependents_arg { + boolean_t first; + 
boolean_t allowrecursion; + iter_stack_frame_t *stack; + zfs_iter_f func; + void *data; +} iter_dependents_arg_t; + +static int +iter_dependents_cb(zfs_handle_t *zhp, void *arg) +{ + iter_dependents_arg_t *ida = arg; + int err; + boolean_t first = ida->first; + ida->first = B_FALSE; + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { + err = zfs_iter_clones(zhp, iter_dependents_cb, ida); + } else { + iter_stack_frame_t isf; + iter_stack_frame_t *f; + + /* + * check if there is a cycle by seeing if this fs is already + * on the stack. + */ + for (f = ida->stack; f != NULL; f = f->next) { + if (f->zhp->zfs_dmustats.dds_guid == + zhp->zfs_dmustats.dds_guid) { + if (ida->allowrecursion) { + zfs_close(zhp); + return (0); + } else { + zfs_error_aux(zhp->zfs_hdl, + dgettext(TEXT_DOMAIN, + "recursive dependency at '%s'"), + zfs_get_name(zhp)); + err = zfs_error(zhp->zfs_hdl, + EZFS_RECURSIVE, + dgettext(TEXT_DOMAIN, + "cannot determine dependent " + "datasets")); + zfs_close(zhp); + return (err); + } + } + } + + isf.zhp = zhp; + isf.next = ida->stack; + ida->stack = &isf; + err = zfs_iter_filesystems(zhp, iter_dependents_cb, ida); + if (err == 0) + err = zfs_iter_snapshots(zhp, iter_dependents_cb, ida); + ida->stack = isf.next; + } + if (!first && err == 0) + err = ida->func(zhp, ida->data); + return (err); +} + +int +zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, + zfs_iter_f func, void *data) +{ + iter_dependents_arg_t ida; + ida.allowrecursion = allowrecursion; + ida.stack = NULL; + ida.func = func; + ida.data = data; + ida.first = B_TRUE; + return (iter_dependents_cb(zfs_handle_dup(zhp), &ida)); +} diff --git a/lib/libzfs/common/libzfs_pool.c b/lib/libzfs/common/libzfs_pool.c index 7df7e91..df89a2b 100644 --- a/lib/libzfs/common/libzfs_pool.c +++ b/lib/libzfs/common/libzfs_pool.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -41,6 +43,7 @@ #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_comutil.h" +#include "zfeature_common.h" static int read_efi_label(nvlist_t *config, diskaddr_t *sb); @@ -233,6 +236,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_CACHEFILE: + case ZPOOL_PROP_COMMENT: if (zhp->zpool_props != NULL || zpool_get_all_props(zhp) == 0) { (void) strlcpy(buf, @@ -270,6 +274,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: + case ZPOOL_PROP_FREEING: + case ZPOOL_PROP_EXPANDSZ: (void) zfs_nicenum(intval, buf, len); break; @@ -294,6 +300,12 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, (void) strlcpy(buf, zpool_state_to_name(intval, vs->vs_aux), len); break; + case ZPOOL_PROP_VERSION: + if (intval >= SPA_VERSION_FEATURES) { + (void) snprintf(buf, len, "-"); + break; + } + /* FALLTHROUGH */ default: (void) snprintf(buf, len, "%llu", intval); } @@ -357,8 +369,8 @@ pool_uses_efi(nvlist_t *config) return (B_FALSE); } -static boolean_t -pool_is_bootable(zpool_handle_t *zhp) +boolean_t +zpool_is_bootable(zpool_handle_t *zhp) { char bootfs[ZPOOL_MAXNAMELEN]; @@ -382,7 +394,7 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, zpool_prop_t prop; char *strval; uint64_t intval; - char *slash; + char *slash, *check; struct stat64 statbuf; zpool_handle_t *zhp; nvlist_t *nvroot; @@ -396,10 +408,48 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); + prop = zpool_name_to_prop(propname); + if (prop == ZPROP_INVAL && zpool_prop_feature(propname)) { + int err; + zfeature_info_t *feature; + char *fname = strchr(propname, '@') + 1; + + err = zfeature_lookup_name(fname, 
&feature); + if (err != 0) { + ASSERT3U(err, ==, ENOENT); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid feature '%s'"), fname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvpair_type(elem) != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set to " + "'enabled'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvlist_add_uint64(retprops, propname, 0) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + /* * Make sure this property is valid and applies to this type. */ - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) { + if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); @@ -422,7 +472,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, */ switch (prop) { case ZPOOL_PROP_VERSION: - if (intval < version || intval > SPA_VERSION) { + if (intval < version || + !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %d is invalid."), propname, intval); @@ -541,6 +592,26 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, *slash = '/'; break; + case ZPOOL_PROP_COMMENT: + for (check = strval; *check != '\0'; check++) { + if (!isprint(*check)) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "comment may only have printable " + "characters")); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + } + if (strlen(strval) > ZPROP_MAX_COMMENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "comment must not exceed %d characters"), + ZPROP_MAX_COMMENT); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto 
error; + } + break; case ZPOOL_PROP_READONLY: if (!flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -624,10 +695,77 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; + nvlist_t *features = NULL; + zprop_list_t **last; + boolean_t firstexpand = (NULL == *plp); if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0) return (-1); + last = plp; + while (*last != NULL) + last = &(*last)->pl_next; + + if ((*plp)->pl_all) + features = zpool_get_features(zhp); + + if ((*plp)->pl_all && firstexpand) { + for (int i = 0; i < SPA_FEATURES; i++) { + zprop_list_t *entry = zfs_alloc(hdl, + sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_INVAL; + entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", + spa_feature_table[i].fi_uname); + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + } + + /* add any unsupported features */ + for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { + char *propname; + boolean_t found; + zprop_list_t *entry; + + if (zfeature_is_supported(nvpair_name(nvp))) + continue; + + propname = zfs_asprintf(hdl, "unsupported@%s", + nvpair_name(nvp)); + + /* + * Before adding the property to the list make sure that no + * other pool already added the same property. 
+ */ + found = B_FALSE; + entry = *plp; + while (entry != NULL) { + if (entry->pl_user_prop != NULL && + strcmp(propname, entry->pl_user_prop) == 0) { + found = B_TRUE; + break; + } + entry = entry->pl_next; + } + if (found) { + free(propname); + continue; + } + + entry = zfs_alloc(hdl, sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_INVAL; + entry->pl_user_prop = propname; + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed) @@ -644,6 +782,66 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) return (0); } +/* + * Get the state for the given feature on the given ZFS pool. + */ +int +zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, + size_t len) +{ + uint64_t refcount; + boolean_t found = B_FALSE; + nvlist_t *features = zpool_get_features(zhp); + boolean_t supported; + const char *feature = strchr(propname, '@') + 1; + + supported = zpool_prop_feature(propname); + ASSERT(supported || zfs_prop_unsupported(propname)); + + /* + * Convert from feature name to feature guid. This conversion is + * unnecessary for unsupported@... properties because they already + * use guids.
+ */ + if (supported) { + int ret; + zfeature_info_t *fi; + + ret = zfeature_lookup_name(feature, &fi); + if (ret != 0) { + (void) strlcpy(buf, "-", len); + return (ENOTSUP); + } + feature = fi->fi_guid; + } + + if (nvlist_lookup_uint64(features, feature, &refcount) == 0) + found = B_TRUE; + + if (supported) { + if (!found) { + (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); + } else { + if (refcount == 0) + (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); + else + (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); + } + } else { + if (found) { + if (refcount == 0) { + (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); + } else { + (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); + } + } else { + (void) strlcpy(buf, "-", len); + return (ENOTSUP); + } + } + + return (0); +} /* * Don't start the slice at the default block of 34; many storage @@ -1071,7 +1269,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) return (zfs_error(hdl, EZFS_BADVERSION, msg)); } - if (pool_is_bootable(zhp) && nvlist_lookup_nvlist_array(nvroot, + if (zpool_is_bootable(zhp) && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { uint64_t s; @@ -1230,8 +1428,10 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, if (!hdl->libzfs_printerr || config == NULL) return; - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0) + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { return; + } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; @@ -1287,6 +1487,7 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; @@ -1409,6 
+1610,30 @@ print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, } } +void +zpool_print_unsup_feat(nvlist_t *config) +{ + nvlist_t *nvinfo, *unsup_feat; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == + 0); + verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT, + &unsup_feat) == 0); + + for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(unsup_feat, nvp)) { + char *desc; + + verify(nvpair_type(nvp) == DATA_TYPE_STRING); + verify(nvpair_value_string(nvp, &desc) == 0); + + if (strlen(desc) > 0) + (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); + else + (void) printf("\t%s\n", nvpair_name(nvp)); + } +} + /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from @@ -1515,6 +1740,22 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, switch (error) { case ENOTSUP: + if (nv != NULL && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && + nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { + (void) printf(dgettext(TEXT_DOMAIN, "This " + "pool uses the following feature(s) not " + "supported by this system:\n")); + zpool_print_unsup_feat(nv); + if (nvlist_exists(nvinfo, + ZPOOL_CONFIG_CAN_RDONLY)) { + (void) printf(dgettext(TEXT_DOMAIN, + "All unsupported features are only " + "required for writing to the pool." + "\nThe pool can be imported using " + "'-o readonly=on'.\n")); + } + } /* * Unsupported version. */ @@ -2355,7 +2596,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, uint_t children; nvlist_t *config_root; libzfs_handle_t *hdl = zhp->zpool_hdl; - boolean_t rootpool = pool_is_bootable(zhp); + boolean_t rootpool = zpool_is_bootable(zhp); if (replacing) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, @@ -2967,6 +3208,46 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) } /* + * Change the GUID for a pool. 
+ */ +int +zpool_reguid(zpool_handle_t *zhp) +{ + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + zfs_cmd_t zc = { 0 }; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, msg)); +} + +/* + * Reopen the pool. + */ +int +zpool_reopen(zpool_handle_t *zhp) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), + zhp->zpool_name); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0) + return (0); + return (zpool_standard_error(hdl, errno, msg)); +} + +/* * Convert from a devid string to a path. */ static char * @@ -3599,7 +3880,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) if (zhp) { nvlist_t *nvroot; - if (pool_is_bootable(zhp)) { + if (zpool_is_bootable(zhp)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "EFI labeled devices are not supported on root " "pools.")); diff --git a/lib/libzfs/common/libzfs_sendrecv.c b/lib/libzfs/common/libzfs_sendrecv.c index 3093ab9..a02a41b 100644 --- a/lib/libzfs/common/libzfs_sendrecv.c +++ b/lib/libzfs/common/libzfs_sendrecv.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ #include @@ -36,6 +38,7 @@ #include #include #include +#include #include @@ -50,7 +53,7 @@ /* in libzfs_dataset.c */ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); -static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t, +static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *); static const zio_cksum_t zero_cksum = { 0 }; @@ -61,6 +64,12 @@ typedef struct dedup_arg { libzfs_handle_t *dedup_hdl; } dedup_arg_t; +typedef struct progress_arg { + zfs_handle_t *pa_zhp; + int pa_fd; + boolean_t pa_parsable; +} progress_arg_t; + typedef struct dataref { uint64_t ref_guid; uint64_t ref_object; @@ -771,88 +780,6 @@ gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, } /* - * Routines for dealing with the sorted snapshot functionality - */ -typedef struct zfs_node { - zfs_handle_t *zn_handle; - avl_node_t zn_avlnode; -} zfs_node_t; - -static int -zfs_sort_snaps(zfs_handle_t *zhp, void *data) -{ - avl_tree_t *avl = data; - zfs_node_t *node; - zfs_node_t search; - - search.zn_handle = zhp; - node = avl_find(avl, &search, NULL); - if (node) { - /* - * If this snapshot was renamed while we were creating the - * AVL tree, it's possible that we already inserted it under - * its old name. Remove the old handle before adding the new - * one. - */ - zfs_close(node->zn_handle); - avl_remove(avl, node); - free(node); - } - - node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t)); - node->zn_handle = zhp; - avl_add(avl, node); - - return (0); -} - -static int -zfs_snapshot_compare(const void *larg, const void *rarg) -{ - zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; - zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; - uint64_t lcreate, rcreate; - - /* - * Sort them according to creation time. We use the hidden - * CREATETXG property to get an absolute ordering of snapshots. 
- */ - lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); - rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - - if (lcreate < rcreate) - return (-1); - else if (lcreate > rcreate) - return (+1); - else - return (0); -} - -int -zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data) -{ - int ret = 0; - zfs_node_t *node; - avl_tree_t avl; - void *cookie = NULL; - - avl_create(&avl, zfs_snapshot_compare, - sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode)); - - ret = zfs_iter_snapshots(zhp, zfs_sort_snaps, &avl); - - for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node)) - ret |= callback(node->zn_handle, data); - - while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL) - free(node); - - avl_destroy(&avl); - - return (ret); -} - -/* * Routines specific to "zfs send" */ typedef struct send_dump_data { @@ -862,7 +789,7 @@ typedef struct send_dump_data { char prevsnap[ZFS_MAXNAMELEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; - boolean_t verbose; + boolean_t verbose, dryrun, parsable, progress; int outfd; boolean_t err; nvlist_t *fss; @@ -872,8 +799,69 @@ typedef struct send_dump_data { nvlist_t *debugnv; char holdtag[ZFS_MAXNAMELEN]; int cleanup_fd; + uint64_t size; } send_dump_data_t; +static int +estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, + boolean_t fromorigin, uint64_t *sizep) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + assert(fromsnap_obj == 0 || !fromorigin); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_obj = fromorigin; + zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + zc.zc_fromobj = fromsnap_obj; + zc.zc_guid = 1; /* estimate flag */ + + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot estimate space for '%s'"), zhp->zfs_name); + + switch 
(errno) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + + case ENOENT: + if (zfs_dataset_exists(hdl, zc.zc_name, + ZFS_TYPE_SNAPSHOT)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source (@%s) does not exist"), + zc.zc_value); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: + case ENOSTR: + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + + *sizep = zc.zc_objset_type; + + return (0); +} + /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. @@ -901,7 +889,7 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, "fromsnap", fromsnap)); } - if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) { + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); @@ -914,7 +902,6 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, nvlist_free(thisdbg); switch (errno) { - case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); @@ -964,6 +951,9 @@ hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd) assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + if (sdd->dryrun) + return (0); + /* * zfs_send() only opens a cleanup_fd for sends that need it, * e.g. replication and doall. 
@@ -991,13 +981,63 @@ hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd) return (error); } +static void * +send_progress_thread(void *arg) +{ + progress_arg_t *pa = arg; + + zfs_cmd_t zc = { 0 }; + zfs_handle_t *zhp = pa->pa_zhp; + libzfs_handle_t *hdl = zhp->zfs_hdl; + unsigned long long bytes; + char buf[16]; + + time_t t; + struct tm *tm; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (!pa->pa_parsable) + (void) fprintf(stderr, "TIME SENT SNAPSHOT\n"); + + /* + * Print the progress from ZFS_IOC_SEND_PROGRESS every second. + */ + for (;;) { + (void) sleep(1); + + zc.zc_cookie = pa->pa_fd; + if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) + return ((void *)-1); + + (void) time(&t); + tm = localtime(&t); + bytes = zc.zc_cookie; + + if (pa->pa_parsable) { + (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + bytes, zhp->zfs_name); + } else { + zfs_nicenum(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + buf, zhp->zfs_name); + } + } +} + static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; + progress_arg_t pa = { 0 }; + pthread_t tid; + char *thissnap; int err; - boolean_t isfromsnap, istosnap; + boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; thissnap = strchr(zhp->zfs_name, '@') + 1; @@ -1074,15 +1114,68 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) return (err); } - /* send it */ + fromorigin = sdd->prevsnap[0] == '\0' && + (sdd->fromorigin || sdd->replicate); + if (sdd->verbose) { - (void) fprintf(stderr, "sending from @%s to %s\n", - sdd->prevsnap, zhp->zfs_name); + uint64_t size; + err = estimate_ioctl(zhp, sdd->prevsnap_obj, + fromorigin, &size); + + if (sdd->parsable) { + if (sdd->prevsnap[0] != '\0') { + (void) fprintf(stderr, "incremental\t%s\t%s", + sdd->prevsnap, zhp->zfs_name); + } else { + (void) 
fprintf(stderr, "full\t%s", + zhp->zfs_name); + } + } else { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "send from @%s to %s"), + sdd->prevsnap, zhp->zfs_name); + } + if (err == 0) { + if (sdd->parsable) { + (void) fprintf(stderr, "\t%llu\n", + (longlong_t)size); + } else { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + " estimated size is %s\n"), buf); + } + sdd->size += size; + } else { + (void) fprintf(stderr, "\n"); + } } - err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, - sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate), - sdd->outfd, sdd->debugnv); + if (!sdd->dryrun) { + /* + * If progress reporting is requested, spawn a new thread to + * poll ZFS_IOC_SEND_PROGRESS at a regular interval. + */ + if (sdd->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = sdd->outfd; + pa.pa_parsable = sdd->parsable; + + if (err = pthread_create(&tid, NULL, + send_progress_thread, &pa)) { + zfs_close(zhp); + return (err); + } + } + + err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, + fromorigin, sdd->outfd, sdd->debugnv); + + if (sdd->progress) { + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + } + } (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); @@ -1101,8 +1194,8 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { - (void) fprintf(stderr, "WARNING: " - "could not send %s@%s: does not exist\n", + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); @@ -1131,23 +1224,24 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg); if (!sdd->seenfrom) { - (void) fprintf(stderr, + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, 
"WARNING: could not send %s@%s:\n" - "incremental source (%s@%s) does not exist\n", + "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { - (void) fprintf(stderr, + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " - "is not earlier than it\n", + "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { - (void) fprintf(stderr, "WARNING: " - "could not send %s@%s: does not exist\n", + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: " + "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; @@ -1193,11 +1287,12 @@ again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { - nvlist_t *fslist; + nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; + uint64_t parent_guid = 0; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); if (nvlist_lookup_boolean(fslist, "sent") == 0) @@ -1205,13 +1300,23 @@ again: VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); + (void) nvlist_lookup_uint64(fslist, "parentfromsnap", + &parent_guid); + + if (parent_guid != 0) { + parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); + if (!nvlist_exists(parent_nv, "sent")) { + /* parent has not been sent; skip this one */ + needagain = B_TRUE; + continue; + } + } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && - nvlist_lookup_boolean(origin_nv, - "sent") == ENOENT) { + !nvlist_exists(origin_nv, "sent")) { /* * origin has not been sent yet; * skip this clone. 
@@ -1235,6 +1340,16 @@ again: assert(progress); goto again; } + + /* clean out the sent flags in case we reuse this fss */ + for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; + fspair = nvlist_next_nvpair(sdd->fss, fspair)) { + nvlist_t *fslist; + + VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); + (void) nvlist_remove_all(fslist, "sent"); + } + return (0); } @@ -1256,12 +1371,12 @@ again: */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - sendflags_t flags, int outfd, snapfilter_cb_t filter_func, + sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; - int err; + int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; @@ -1289,12 +1404,12 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } } - if (zfs_spa_version(zhp, &spa_version) == 0 && + if (!flags->dryrun && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && - (flags.doall || flags.replicate)) + (flags->doall || flags->replicate)) holdsnaps = B_TRUE; - if (flags.dedup) { + if (flags->dedup && !flags->dryrun) { featureflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); if (err = pipe(pipefd)) { @@ -1314,13 +1429,13 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } } - if (flags.replicate || flags.doall || flags.props) { + if (flags->replicate || flags->doall || flags->props) { dmu_replay_record_t drr = { 0 }; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { 0 }; - if (flags.replicate || flags.props) { + if (flags->replicate || flags->props) { nvlist_t *hdrnv; VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); @@ -1329,13 +1444,13 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, "fromsnap", fromsnap)); } VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap)); - if (!flags.replicate) { + if (!flags->replicate) { VERIFY(0 
== nvlist_add_boolean(hdrnv, "not_recursive")); } err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, - fromsnap, tosnap, flags.replicate, &fss, &fsavl); + fromsnap, tosnap, flags->replicate, &fss, &fsavl); if (err) goto err_out; VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); @@ -1352,33 +1467,34 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } } - /* write first begin record */ - drr.drr_type = DRR_BEGIN; - drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo, - DMU_COMPOUNDSTREAM); - DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo, - featureflags); - (void) snprintf(drr.drr_u.drr_begin.drr_toname, - sizeof (drr.drr_u.drr_begin.drr_toname), - "%s@%s", zhp->zfs_name, tosnap); - drr.drr_payloadlen = buflen; - err = cksum_and_write(&drr, sizeof (drr), &zc, outfd); - - /* write header nvlist */ - if (err != -1 && packbuf != NULL) { - err = cksum_and_write(packbuf, buflen, &zc, outfd); - } - free(packbuf); - if (err == -1) { - fsavl_destroy(fsavl); - nvlist_free(fss); - err = errno; - goto stderr_out; - } + if (!flags->dryrun) { + /* write first begin record */ + drr.drr_type = DRR_BEGIN; + drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. + drr_versioninfo, DMU_COMPOUNDSTREAM); + DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. 
+ drr_versioninfo, featureflags); + (void) snprintf(drr.drr_u.drr_begin.drr_toname, + sizeof (drr.drr_u.drr_begin.drr_toname), + "%s@%s", zhp->zfs_name, tosnap); + drr.drr_payloadlen = buflen; + err = cksum_and_write(&drr, sizeof (drr), &zc, outfd); + + /* write header nvlist */ + if (err != -1 && packbuf != NULL) { + err = cksum_and_write(packbuf, buflen, &zc, + outfd); + } + free(packbuf); + if (err == -1) { + fsavl_destroy(fsavl); + nvlist_free(fss); + err = errno; + goto stderr_out; + } - /* write end record */ - if (err != -1) { + /* write end record */ bzero(&drr, sizeof (drr)); drr.drr_type = DRR_END; drr.drr_u.drr_end.drr_checksum = zc; @@ -1389,27 +1505,32 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, err = errno; goto stderr_out; } + + err = 0; } } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; - if (flags.dedup) + if (flags->dedup) sdd.outfd = pipefd[0]; else sdd.outfd = outfd; - sdd.replicate = flags.replicate; - sdd.doall = flags.doall; - sdd.fromorigin = flags.fromorigin; + sdd.replicate = flags->replicate; + sdd.doall = flags->doall; + sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; - sdd.verbose = flags.verbose; + sdd.verbose = flags->verbose; + sdd.parsable = flags->parsable; + sdd.progress = flags->progress; + sdd.dryrun = flags->dryrun; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; - if (holdsnaps) { + if (holdsnaps || flags->progress) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); @@ -1421,11 +1542,31 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } else { sdd.cleanup_fd = -1; } + if (flags->verbose) { + /* + * Do a verbose no-op dry run to get all the verbose output + * before generating any data. Then do a non-verbose real + * run to generate the streams. 
+ */ + sdd.dryrun = B_TRUE; + err = dump_filesystems(zhp, &sdd); + sdd.dryrun = flags->dryrun; + sdd.verbose = B_FALSE; + if (flags->parsable) { + (void) fprintf(stderr, "size\t%llu\n", + (longlong_t)sdd.size); + } else { + char buf[16]; + zfs_nicenum(sdd.size, buf, sizeof (buf)); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "total estimated size is %s\n"), buf); + } + } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); - if (flags.dedup) { + if (flags->dedup) { (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } @@ -1435,7 +1576,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.cleanup_fd = -1; } - if (flags.replicate || flags.doall || flags.props) { + if (!flags->dryrun && (flags->replicate || flags->doall || + flags->props)) { /* * write final end record. NB: want to do this even if * there was some error, because it might not be totally @@ -1456,7 +1598,7 @@ stderr_out: err_out: if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); - if (flags.dedup) { + if (flags->dedup) { (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); (void) close(pipefd[0]); @@ -1527,7 +1669,7 @@ recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, - int baselen, char *newname, recvflags_t flags) + int baselen, char *newname, recvflags_t *flags) { static int seq; zfs_cmd_t zc = { 0 }; @@ -1539,7 +1681,7 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - flags.force ? MS_FORCE : 0); + flags->force ? 
MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) return (-1); @@ -1555,7 +1697,7 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, (void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value)); - if (flags.verbose) { + if (flags->verbose) { (void) printf("attempting rename %s to %s\n", zc.zc_name, zc.zc_value); } @@ -1574,19 +1716,19 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, "recv-%u-%u", getpid(), seq); (void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value)); - if (flags.verbose) { + if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", zc.zc_name, zc.zc_value); } err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc); if (err == 0) changelist_rename(clp, name, newname); - if (err && flags.verbose) { + if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; - } else if (flags.verbose) { + } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else @@ -1601,7 +1743,7 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, - char *newname, recvflags_t flags) + char *newname, recvflags_t *flags) { zfs_cmd_t zc = { 0 }; int err = 0; @@ -1614,7 +1756,7 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - flags.force ? MS_FORCE : 0); + flags->force ? 
MS_FORCE : 0); if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) @@ -1630,11 +1772,11 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, zc.zc_defer_destroy = defer; (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); - if (flags.verbose) + if (flags->verbose) (void) printf("attempting destroy %s\n", zc.zc_name); err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc); if (err == 0) { - if (flags.verbose) + if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, zc.zc_name); } @@ -1657,6 +1799,7 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, typedef struct guid_to_name_data { uint64_t guid; char *name; + char *skip; } guid_to_name_data_t; static int @@ -1665,21 +1808,35 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) guid_to_name_data_t *gtnd = arg; int err; + if (gtnd->skip != NULL && + strcmp(zhp->zfs_name, gtnd->skip) == 0) { + return (0); + } + if (zhp->zfs_dmustats.dds_guid == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } + err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } +/* + * Attempt to find the local dataset associated with this guid. In the case of + * multiple matches, we attempt to find the "best" match by searching + * progressively larger portions of the hierarchy. This allows one to send a + * tree of datasets individually and guarantee that we will find the source + * guid within that hierarchy, even if there are multiple matches elsewhere. 
+ */ static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, char *name) { /* exhaustive search all local snapshots */ + char pname[ZFS_MAXNAMELEN]; guid_to_name_data_t gtnd; int err = 0; zfs_handle_t *zhp; @@ -1687,35 +1844,42 @@ guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, gtnd.guid = guid; gtnd.name = name; + gtnd.skip = NULL; - if (strchr(parent, '@') == NULL) { - zhp = make_dataset_handle(hdl, parent); - if (zhp != NULL) { - err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd); - zfs_close(zhp); - if (err == EEXIST) - return (0); - } - } + (void) strlcpy(pname, parent, sizeof (pname)); + + /* + * Search progressively larger portions of the hierarchy. This will + * select the "most local" version of the origin snapshot in the case + * that there are multiple matching snapshots in the system. + */ + while ((cp = strrchr(pname, '/')) != NULL) { - cp = strchr(parent, '/'); - if (cp) + /* Chop off the last component and open the parent */ *cp = '\0'; - zhp = make_dataset_handle(hdl, parent); - if (cp) - *cp = '/'; + zhp = make_dataset_handle(hdl, pname); + + if (zhp == NULL) + continue; - if (zhp) { err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd); zfs_close(zhp); - } + if (err == EEXIST) + return (0); - return (err == EEXIST ? 0 : ENOENT); + /* + * Remember the dataset that we already searched, so we + * skip it next time through. + */ + gtnd.skip = pname; + } + return (ENOENT); } /* - * Return true if dataset guid1 is created before guid2. + * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if + * guid1 is after guid2. 
*/ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, @@ -1725,7 +1889,8 @@ created_before(libzfs_handle_t *hdl, avl_tree_t *avl, char *fsname, *snapname; char buf[ZFS_MAXNAMELEN]; int rv; - zfs_node_t zn1, zn2; + zfs_handle_t *guid1hdl, *guid2hdl; + uint64_t create1, create2; if (guid2 == 0) return (0); @@ -1735,30 +1900,38 @@ created_before(libzfs_handle_t *hdl, avl_tree_t *avl, nvfs = fsavl_find(avl, guid1, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); - zn1.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); - if (zn1.zn_handle == NULL) + guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); + if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); - zn2.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); - if (zn2.zn_handle == NULL) { - zfs_close(zn2.zn_handle); + guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); + if (guid2hdl == NULL) { + zfs_close(guid1hdl); return (-1); } - rv = (zfs_snapshot_compare(&zn1, &zn2) == -1); + create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); + create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); - zfs_close(zn1.zn_handle); - zfs_close(zn2.zn_handle); + if (create1 < create2) + rv = -1; + else if (create1 > create2) + rv = +1; + else + rv = 0; + + zfs_close(guid1hdl); + zfs_close(guid2hdl); return (rv); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, - recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, + recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv; @@ -1775,7 +1948,7 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); - if (flags.dryrun) + if (flags->dryrun) return (0); again: @@ 
-1835,7 +2008,7 @@ again: nvlist_t *origin_nvfs; char *origin_fsname; - if (flags.verbose) + if (flags->verbose) (void) printf("promoting %s\n", fsname); origin_nvfs = fsavl_find(local_avl, originguid, @@ -1883,7 +2056,7 @@ again: if (found == NULL) { char name[ZFS_MAXNAMELEN]; - if (!flags.force) + if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", @@ -1941,7 +2114,7 @@ again: /* check for delete */ if (stream_nvfs == NULL) { - if (!flags.force) + if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, @@ -1954,7 +2127,7 @@ again: } if (fromguid == 0) { - if (flags.verbose) { + if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", @@ -1979,7 +2152,7 @@ again: if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || - ((flags.isprefix || strcmp(tofs, fsname) != 0) && + ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAXNAMELEN]; @@ -2002,7 +2175,7 @@ again: "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; - if (flags.verbose) { + if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } @@ -2030,7 +2203,7 @@ again: if (needagain && progress) { /* do another pass to fix up temporary names */ - if (flags.verbose) + if (flags->verbose) (void) printf("another pass:\n"); goto again; } @@ -2040,7 +2213,7 @@ again: static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, - recvflags_t flags, dmu_replay_record_t *drr, zio_cksum_t *zc, + recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, int cleanup_fd, uint64_t *action_handlep) { nvlist_t *stream_nv = NULL; @@ -2069,7 +2242,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, */ if (drr->drr_payloadlen 
!= 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, - &stream_nv, flags.byteswap, zc); + &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; @@ -2090,9 +2263,9 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, * Read in the end record and verify checksum. */ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), - flags.byteswap, NULL))) + flags->byteswap, NULL))) goto out; - if (flags.byteswap) { + if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); @@ -2133,11 +2306,11 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, ZFS_MAXNAMELEN); - if (flags.isprefix) { + if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; - if (flags.istail) { + if (flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", @@ -2155,7 +2328,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, *strchr(tofs, '@') = '\0'; } - if (recursive && !flags.dryrun && !flags.nomount) { + if (recursive && !flags->dryrun && !flags->nomount) { VERIFY(0 == nvlist_alloc(&renamed, NV_UNIQUE_NAME, 0)); } @@ -2329,7 +2502,7 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) */ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - recvflags_t flags, dmu_replay_record_t *drr, + recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep) @@ -2371,7 +2544,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, if (err) VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); - if (flags.canmountoff) { + if (flags->canmountoff) { VERIFY(0 == 
nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); } @@ -2398,7 +2571,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * If they specified a snapshot, chop the entire name stored in * the stream. */ - if (flags.istail) { + if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. @@ -2424,7 +2597,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } - } else if (flags.isprefix) { + } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path @@ -2478,7 +2651,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * Determine the name of the origin snapshot, store in zc_string. */ if (drrb->drr_flags & DRR_FLAG_CLONE) { - if (guid_to_name(hdl, tosnap, + if (guid_to_name(hdl, zc.zc_value, drrb->drr_fromguid, zc.zc_string) != 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -2486,7 +2659,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zc.zc_value); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } - if (flags.verbose) + if (flags->verbose) (void) printf("found clone origin %s\n", zc.zc_string); } @@ -2509,7 +2682,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAXNAMELEN]; (void) strcpy(suffix, strrchr(zc.zc_value, '/')); - if (guid_to_name(hdl, tosnap, parent_snapguid, + if (guid_to_name(hdl, zc.zc_name, parent_snapguid, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, suffix); @@ -2531,12 +2704,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * topmost path in the stream, then if the fs does not exist we * should look no further. 
*/ - if ((flags.isprefix || (*(chopprefix = drrb->drr_toname + + if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAXNAMELEN]; (void) strcpy(snap, strchr(zc.zc_value, '@')); - if (guid_to_name(hdl, tosnap, drrb->drr_fromguid, + if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, snap); @@ -2558,7 +2731,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * snapshots). */ if (stream_wantsnewfs) { - if (!flags.force) { + if (!flags->force) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" @@ -2594,7 +2767,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } - if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && + if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && stream_wantsnewfs) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); @@ -2633,7 +2806,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, */ *cp = '\0'; - if (flags.isprefix && !flags.istail && !flags.dryrun && + if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_BADRESTORE, errbuf)); @@ -2644,18 +2817,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zc.zc_begin_record = drr_noswap->drr_u.drr_begin; zc.zc_cookie = infd; - zc.zc_guid = flags.force; - if (flags.verbose) { + zc.zc_guid = flags->force; + if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", - flags.dryrun ? "would receive" : "receiving", + flags->dryrun ? "would receive" : "receiving", drrb->drr_fromguid ? 
"incremental" : "full", drrb->drr_toname, zc.zc_value); (void) fflush(stdout); } - if (flags.dryrun) { + if (flags->dryrun) { zcmd_free_nvlists(&zc); - return (recv_skip(hdl, infd, flags.byteswap)); + return (recv_skip(hdl, infd, flags->byteswap)); } zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; @@ -2736,12 +2909,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, nvlist_free(local_nv); if (fs != NULL) { - if (flags.verbose) { + if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", zc.zc_value); } err = ioctl_err = recv_skip(hdl, infd, - flags.byteswap); + flags->byteswap); } } *cp = '@'; @@ -2793,7 +2966,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded"), zc.zc_name); - (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); + (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); @@ -2851,7 +3024,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, *action_handlep = zc.zc_action_handle; - if (flags.verbose) { + if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = zc.zc_cookie; @@ -2869,7 +3042,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } static int -zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, +zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep) { @@ -2884,7 +3057,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); - if (flags.isprefix && + if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does 
not exist"), tosnap); @@ -2904,7 +3077,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; - flags.byteswap = B_FALSE; + flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in @@ -2912,7 +3085,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, */ bzero(&zcksum, sizeof (zio_cksum_t)); fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); - flags.byteswap = B_TRUE; + flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); @@ -2980,7 +3153,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, * (-1 will override -2). */ int -zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, +zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; @@ -2996,7 +3169,7 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, VERIFY(0 == close(cleanup_fd)); - if (err == 0 && !flags.nomount && top_zfs) { + if (err == 0 && !flags->nomount && top_zfs) { zfs_handle_t *zhp; prop_changelist_t *clp; diff --git a/lib/libzfs/common/libzfs_status.c b/lib/libzfs/common/libzfs_status.c index 24725ec..af0707a 100644 --- a/lib/libzfs/common/libzfs_status.c +++ b/lib/libzfs/common/libzfs_status.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -214,6 +216,20 @@ check_status(nvlist_t *config, boolean_t isimport) return (ZPOOL_STATUS_VERSION_NEWER); /* + * Unsupported feature(s). 
+ */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_UNSUP_FEAT) { + nvlist_t *nvinfo; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + &nvinfo) == 0); + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) + return (ZPOOL_STATUS_UNSUP_FEAT_WRITE); + return (ZPOOL_STATUS_UNSUP_FEAT_READ); + } + + /* * Check that the config is complete. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && @@ -300,7 +316,7 @@ check_status(nvlist_t *config, boolean_t isimport) /* * Outdated, but usable, version */ - if (version < SPA_VERSION) + if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) return (ZPOOL_STATUS_VERSION_OLDER); return (ZPOOL_STATUS_OK); diff --git a/lib/libzfs/common/libzfs_util.c b/lib/libzfs/common/libzfs_util.c index 01b7c87..41db2fd 100644 --- a/lib/libzfs/common/libzfs_util.c +++ b/lib/libzfs/common/libzfs_util.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -44,6 +46,7 @@ #include "libzfs_impl.h" #include "zfs_prop.h" +#include "zfeature_common.h" int libzfs_errno(libzfs_handle_t *hdl) @@ -111,7 +114,8 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_RESILVERING: return (dgettext(TEXT_DOMAIN, "currently resilvering")); case EZFS_BADVERSION: - return (dgettext(TEXT_DOMAIN, "unsupported version")); + return (dgettext(TEXT_DOMAIN, "unsupported version or " + "feature")); case EZFS_POOLUNAVAIL: return (dgettext(TEXT_DOMAIN, "pool is unavailable")); case EZFS_DEVOVERFLOW: @@ -344,6 +348,7 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
switch (error) { case ENXIO: case ENODEV: + case EPIPE: zfs_verror(hdl, EZFS_IO, fmt, ap); break; @@ -627,6 +632,7 @@ libzfs_init(void) zfs_prop_init(); zpool_prop_init(); + zpool_feature_init(); libzfs_mnttab_init(hdl); return (hdl); @@ -1280,8 +1286,11 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, * this is a pool property or if this isn't a user-defined * dataset property, */ - if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || - (!zfs_prop_user(propname) && !zfs_prop_userquota(propname)))) { + if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL && + !zpool_prop_feature(propname) && + !zpool_prop_unsupported(propname)) || + (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) && + !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, @@ -1293,7 +1302,8 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, entry->pl_prop = prop; if (prop == ZPROP_INVAL) { - if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) { + if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == + NULL) { free(entry); return (-1); } diff --git a/lib/libzpool/common/kernel.c b/lib/libzpool/common/kernel.c index f323bf6..8e1e7f7 100644 --- a/lib/libzpool/common/kernel.c +++ b/lib/libzpool/common/kernel.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -45,6 +46,7 @@ int aok; uint64_t physmem; vnode_t *rootdir = (vnode_t *)0xabcd1234; char hw_serial[HW_HOSTID_LEN]; +vmem_t *zio_arena = NULL; struct utsname utsname = { "userland", "libzpool", "1", "1", "na" @@ -424,7 +426,9 @@ vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, * To simulate partial disk writes, we split writes into two * system calls so that the process can be killed in between. */ - split = (len > 0 ? 
rand() % len : 0); + int sectors = len >> SPA_MINBLOCKSHIFT; + split = (sectors > 0 ? rand() % sectors : 0) << + SPA_MINBLOCKSHIFT; iolen = pwrite64(vp->v_fd, addr, split, offset); iolen += pwrite64(vp->v_fd, (char *)addr + split, len - split, offset + split); diff --git a/lib/libzpool/common/sys/zfs_context.h b/lib/libzpool/common/sys/zfs_context.h index 3b0390d..1f5e758 100644 --- a/lib/libzpool/common/sys/zfs_context.h +++ b/lib/libzpool/common/sys/zfs_context.h @@ -20,6 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H @@ -212,6 +215,7 @@ struct proc { }; extern struct proc p0; +#define curproc (&p0) #define PS_NONE -1 @@ -327,9 +331,12 @@ extern void kstat_delete(kstat_t *); #define kmem_debugging() 0 #define kmem_cache_reap_now(_c) /* nothing */ #define kmem_cache_set_move(_c, _cb) /* nothing */ +#define vmem_qcache_reap(_v) /* nothing */ #define POINTER_INVALIDATE(_pp) /* nothing */ #define POINTER_IS_VALID(_p) 0 +extern vmem_t *zio_arena; + typedef umem_cache_t kmem_cache_t; typedef enum kmem_cbrc { @@ -347,6 +354,16 @@ typedef struct taskq taskq_t; typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); +typedef struct taskq_ent { + struct taskq_ent *tqent_next; + struct taskq_ent *tqent_prev; + task_func_t *tqent_func; + void *tqent_arg; + uintptr_t tqent_flags; +} taskq_ent_t; + +#define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */ + #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ @@ -358,6 +375,7 @@ typedef void (task_func_t)(void *); #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_FRONT 0x08 /* Queue in front */ + extern taskq_t *system_taskq; extern taskq_t 
*taskq_create(const char *, int, pri_t, int, int, uint_t); @@ -366,6 +384,8 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ (taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern int taskq_member(taskq_t *, void *); diff --git a/lib/libzpool/common/taskq.c b/lib/libzpool/common/taskq.c index 8db5d11..2c5dfd8 100644 --- a/lib/libzpool/common/taskq.c +++ b/lib/libzpool/common/taskq.c @@ -22,19 +22,16 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 Garrett D'Amore . All rights reserved. + */ #include int taskq_now; taskq_t *system_taskq; -typedef struct task { - struct task *task_next; - struct task *task_prev; - task_func_t *task_func; - void *task_arg; -} task_t; - #define TASKQ_ACTIVE 0x00010000 struct taskq { @@ -51,18 +48,18 @@ struct taskq { int tq_maxalloc; kcondvar_t tq_maxalloc_cv; int tq_maxalloc_wait; - task_t *tq_freelist; - task_t tq_task; + taskq_ent_t *tq_freelist; + taskq_ent_t tq_task; }; -static task_t * +static taskq_ent_t * task_alloc(taskq_t *tq, int tqflags) { - task_t *t; + taskq_ent_t *t; int rv; again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { - tq->tq_freelist = t->task_next; + tq->tq_freelist = t->tqent_next; } else { if (tq->tq_nalloc >= tq->tq_maxalloc) { if (!(tqflags & KM_SLEEP)) @@ -87,7 +84,7 @@ again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { } mutex_exit(&tq->tq_lock); - t = kmem_alloc(sizeof (task_t), tqflags); + t = kmem_alloc(sizeof (taskq_ent_t), tqflags); mutex_enter(&tq->tq_lock); if (t != NULL) @@ -97,15 +94,15 @@ again: if 
((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { } static void -task_free(taskq_t *tq, task_t *t) +task_free(taskq_t *tq, taskq_ent_t *t) { if (tq->tq_nalloc <= tq->tq_minalloc) { - t->task_next = tq->tq_freelist; + t->tqent_next = tq->tq_freelist; tq->tq_freelist = t; } else { tq->tq_nalloc--; mutex_exit(&tq->tq_lock); - kmem_free(t, sizeof (task_t)); + kmem_free(t, sizeof (taskq_ent_t)); mutex_enter(&tq->tq_lock); } @@ -116,7 +113,7 @@ task_free(taskq_t *tq, task_t *t) taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) { - task_t *t; + taskq_ent_t *t; if (taskq_now) { func(arg); @@ -130,26 +127,59 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) return (0); } if (tqflags & TQ_FRONT) { - t->task_next = tq->tq_task.task_next; - t->task_prev = &tq->tq_task; + t->tqent_next = tq->tq_task.tqent_next; + t->tqent_prev = &tq->tq_task; } else { - t->task_next = &tq->tq_task; - t->task_prev = tq->tq_task.task_prev; + t->tqent_next = &tq->tq_task; + t->tqent_prev = tq->tq_task.tqent_prev; } - t->task_next->task_prev = t; - t->task_prev->task_next = t; - t->task_func = func; - t->task_arg = arg; + t->tqent_next->tqent_prev = t; + t->tqent_prev->tqent_next = t; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_flags = 0; cv_signal(&tq->tq_dispatch_cv); mutex_exit(&tq->tq_lock); return (1); } void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + ASSERT(func != NULL); + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + t->tqent_flags |= TQENT_FLAG_PREALLOC; + /* + * Enqueue the task to the underlying queue. 
+ */ + mutex_enter(&tq->tq_lock); + + if (flags & TQ_FRONT) { + t->tqent_next = tq->tq_task.tqent_next; + t->tqent_prev = &tq->tq_task; + } else { + t->tqent_next = &tq->tq_task; + t->tqent_prev = tq->tq_task.tqent_prev; + } + t->tqent_next->tqent_prev = t; + t->tqent_prev->tqent_next = t; + t->tqent_func = func; + t->tqent_arg = arg; + cv_signal(&tq->tq_dispatch_cv); + mutex_exit(&tq->tq_lock); +} + +void taskq_wait(taskq_t *tq) { mutex_enter(&tq->tq_lock); - while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0) + while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) cv_wait(&tq->tq_wait_cv, &tq->tq_lock); mutex_exit(&tq->tq_lock); } @@ -158,27 +188,32 @@ static void * taskq_thread(void *arg) { taskq_t *tq = arg; - task_t *t; + taskq_ent_t *t; + boolean_t prealloc; mutex_enter(&tq->tq_lock); while (tq->tq_flags & TASKQ_ACTIVE) { - if ((t = tq->tq_task.task_next) == &tq->tq_task) { + if ((t = tq->tq_task.tqent_next) == &tq->tq_task) { if (--tq->tq_active == 0) cv_broadcast(&tq->tq_wait_cv); cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); tq->tq_active++; continue; } - t->task_prev->task_next = t->task_next; - t->task_next->task_prev = t->task_prev; + t->tqent_prev->tqent_next = t->tqent_next; + t->tqent_next->tqent_prev = t->tqent_prev; + t->tqent_next = NULL; + t->tqent_prev = NULL; + prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC; mutex_exit(&tq->tq_lock); rw_enter(&tq->tq_threadlock, RW_READER); - t->task_func(t->task_arg); + t->tqent_func(t->tqent_arg); rw_exit(&tq->tq_threadlock); mutex_enter(&tq->tq_lock); - task_free(tq, t); + if (!prealloc) + task_free(tq, t); } tq->tq_nthreads--; cv_broadcast(&tq->tq_wait_cv); @@ -217,8 +252,8 @@ taskq_create(const char *name, int nthreads, pri_t pri, tq->tq_nthreads = nthreads; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; - tq->tq_task.task_next = &tq->tq_task; - tq->tq_task.task_prev = &tq->tq_task; + tq->tq_task.tqent_next = &tq->tq_task; + tq->tq_task.tqent_prev = &tq->tq_task; 
tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP); if (flags & TASKQ_PREPOPULATE) { diff --git a/man/man1m/zdb.1m b/man/man1m/zdb.1m index 661165d..106dcfb 100644 --- a/man/man1m/zdb.1m +++ b/man/man1m/zdb.1m @@ -1,87 +1,484 @@ -'\" te -.\" Copyright (c) 2004, Sun Microsystems, Inc. All Rights Reserved. -.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. -.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. -.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH zdb 1M "31 Oct 2005" "SunOS 5.11" "System Administration Commands" -.SH NAME -zdb \- ZFS debugger -.SH SYNOPSIS -.LP -.nf -\fBzdb\fR \fIpool\fR -.fi +'\" t +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2012, Richard Lowe. +.\" +.TH "ZDB" "1M" "February 15, 2012" "" "" + +.SH "NAME" +\fBzdb\fR - Display zpool debugging and consistency information + +.SH "SYNOPSIS" +\fBzdb\fR [-CumdibcsDvhLXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR] + \fIpoolname\fR [\fIobject\fR ...] 
+ +.P +\fBzdb\fR [-divPA] [-e [-p \fIpath\fR...]] \fIdataset\fR [\fIobject\fR ...] + +.P +\fBzdb\fR -m [-LXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] \fIpoolname\fR + [\fIvdev\fR [\fImetaslab\fR ...]] + +.P +\fBzdb\fR -R [-A] [-e [-p \fIpath\fR...]] \fIpoolname\fR + \fIvdev\fR:\fIoffset\fR:\fIsize\fR[:\fIflags\fR] + +.P +\fBzdb\fR -S [-AP] [-e [-p \fIpath\fR...]] \fIpoolname\fR + +.P +\fBzdb\fR -l [-uA] \fIdevice\fR + +.P +\fBzdb\fR -C [-A] [-U \fIcache\fR] + +.SH "DESCRIPTION" +The \fBzdb\fR utility displays information about a ZFS pool useful for +debugging and performs some amount of consistency checking. It is not a +general purpose tool and options (and facilities) may change. This is neither +a fsck(1M) nor an fsdb(1M) utility. + +.P +The output of this command in general reflects the on-disk structure of a ZFS +pool, and is inherently unstable. The precise output of most invocations is +not documented; a knowledge of ZFS internals is assumed. + +.P +When operating on an imported and active pool it is possible, though unlikely, +that zdb may interpret inconsistent pool data and behave erratically. + +.SH "OPTIONS" +Display options: -.SH DESCRIPTION .sp -.LP -The \fBzdb\fR command is used by support engineers to diagnose failures and -gather statistics. Since the \fBZFS\fR file system is always consistent on disk -and is self-repairing, \fBzdb\fR should only be run under the direction by a -support engineer. +.ne 2 +.na +\fB-b\fR +.ad +.sp .6 +.RS 4n +Display statistics regarding the number, size (logical, physical and +allocated) and deduplication of blocks. +.RE + .sp -.LP -If no arguments are specified, \fBzdb\fR, performs basic consistency checks on -the pool and associated datasets, and report any problems detected. +.ne 2 +.na +\fB-c\fR +.ad +.sp .6 +.RS 4n +Verify the checksum of all metadata blocks while printing block statistics +(see \fB-b\fR). .sp -.LP -Any options supported by this command are internal to Sun and subject to change -at any time.
-.SH EXIT STATUS +If specified multiple times, verify the checksums of all blocks. +.RE + .sp -.LP -The following exit values are returned: +.ne 2 +.na +\fB-C\fR +.ad +.sp .6 +.RS 4n +Display information about the configuration. If specified with no other +options, instead display information about the cache file +(\fB/etc/zfs/zpool.cache\fR). To specify the cache file to display, see +\fB-U\fR. +.P +If specified multiple times, and a pool name is also specified display both +the cached configuration and the on-disk configuration. If specified multiple +times with \fB-e\fR also display the configuration that would be used were the +pool to be imported. +.RE + +.sp +.ne 2 +.na +\fB-d\fR +.ad +.sp .6 +.RS 4n +Display information about datasets. Specified once, displays basic dataset +information: ID, create transaction, size, and object count. +.sp +If specified multiple times provides greater and greater verbosity. +.sp +If object IDs are specified, display information about those specific objects only. +.RE + +.sp +.ne 2 +.na +\fB-D\fR +.ad +.sp .6 +.RS 4n +Display deduplication statistics, including the deduplication ratio (dedup), +compression ratio (compress), inflation due to the zfs copies property +(copies), and an overall effective ratio (dedup * compress / copies). +.sp +If specified twice, display a histogram of deduplication statistics, showing +the allocated (physically present on disk) and referenced (logically +referenced in the pool) block counts and sizes by reference count. +.RE + +.sp +.ne 2 +.na +\fB-h\fR +.ad +.sp .6 +.RS 4n +Display pool history similar to \fBzpool history\fR, but include internal +changes, transaction, and dataset information. +.RE + +.sp +.ne 2 +.na +\fB-i\fR +.ad +.sp .6 +.RS 4n +Display information about intent log (ZIL) entries relating to each +dataset. If specified multiple times, display counts of each intent log +transaction type. 
+.RE + +.sp +.ne 2 +.na +\fB-l\fR \fIdevice\fR +.ad +.sp .6 +.RS 4n +Display the vdev labels from the specified device. If the \fB-u\fR option is +also specified, also display the uberblocks on this device. +.RE + .sp .ne 2 -.mk .na -\fB\fB0\fR\fR +\fB-L\fR .ad -.RS 5n -.rt -The pool is consistent. +.sp .6 +.RS 4n +Disable leak tracing and the loading of space maps. By default, \fBzdb\fR +verifies that all non-free blocks are referenced, which can be very expensive. .RE .sp .ne 2 -.mk .na -\fB\fB1\fR\fR +\fB-m\fR .ad -.RS 5n -.rt -An error was detected. +.sp .6 +.RS 4n +Display the offset, spacemap, and free space of each metaslab. +When specified twice, also display information about the maximum contiguous +free space and the percentage of free space in each space map. When specified +three times display every spacemap record. .RE .sp .ne 2 -.mk .na -\fB\fB2\fR\fR +\fB-R\fR \fIpoolname\fR \fIvdev\fR:\fIoffset\fR:\fIsize\fR[:\fIflags\fR] .ad -.RS 5n -.rt -Invalid command line options were specified. +.sp .6 +.RS 4n +Read and display a block from the specified device. By default the block is +displayed as a hex dump, but see the description of the \'r\' flag, below. +.sp +The block is specified in terms of a colon-separated tuple \fIvdev\fR (an +integer vdev identifier) \fIoffset\fR (the offset within the vdev) \fIsize\fR +(the size of the block to read) and, optionally, \fIflags\fR (a set of flags, +described below). 
+ +.sp +.ne 2 +.na +\fBb\fR \fIoffset\fR +.ad +.sp .6 +.RS 4n +Print block pointer .RE -.SH ATTRIBUTES .sp +.ne 2 +.na +\fBd\fR +.ad +.sp .6 +.RS 4n +Decompress the block +.RE + +.sp +.ne 2 +.na +\fBe\fR +.ad +.sp .6 +.RS 4n +Byte swap the block +.RE + +.sp +.ne 2 +.na +\fBg\fR +.ad +.sp .6 +.RS 4n +Dump gang block header +.RE + +.sp +.ne 2 +.na +\fBi\fR +.ad +.sp .6 +.RS 4n +Dump indirect block +.RE + +.sp +.ne 2 +.na +\fBr\fR +.ad +.sp .6 +.RS 4n +Dump raw uninterpreted block data +.RE +.RE + +.sp +.ne 2 +.na +\fB-s\fR +.ad +.sp .6 +.RS 4n +Report statistics on \fBzdb\fR\'s I/O. Display operation counts, bandwidth, +and error counts of I/O to the pool from \fBzdb\fR. +.RE + +.sp +.ne 2 +.na +\fB-S\fR +.ad +.sp .6 +.RS 4n +Simulate the effects of deduplication, constructing a DDT and then display +that DDT as with \fB-DD\fR. +.RE + +.sp +.ne 2 +.na +\fB-u\fR +.ad +.sp .6 +.RS 4n +Display the current uberblock. +.RE + +.P +Other options: + +.sp +.ne 2 +.na +\fB-A\fR +.ad +.sp .6 +.RS 4n +Do not abort should any assertion fail. +.RE + +.sp +.ne 2 +.na +\fB-AA\fR +.ad +.sp .6 +.RS 4n +Enable panic recovery, certain errors which would otherwise be fatal are +demoted to warnings. +.RE + +.sp +.ne 2 +.na +\fB-AAA\fR +.ad +.sp .6 +.RS 4n +Do not abort if asserts fail and also enable panic recovery. +.RE + +.sp +.ne 2 +.na +\fB-e\fR [-p \fIpath\fR]... +.ad +.sp .6 +.RS 4n +Operate on an exported pool, not present in \fB/etc/zfs/zpool.cache\fR. The +\fB-p\fR flag specifies the path under which devices are to be searched. +.RE + +.sp +.ne 2 +.na +\fB-F\fR +.ad +.sp .6 +.RS 4n +Attempt to make an unreadable pool readable by trying progressively older +transactions. +.RE + +.sp +.ne 2 +.na +\fB-P\fR +.ad +.sp .6 +.RS 4n +Print numbers in an unscaled form more amenable to parsing, eg. 1000000 rather +than 1M. +.RE + +.sp +.ne 2 +.na +\fB-t\fR \fItransaction\fR +.ad +.sp .6 +.RS 4n +Specify the highest transaction to use when searching for uberblocks. 
See also +the \fB-u\fR and \fB-l\fR options for a means to see the available uberblocks +and their associated transaction numbers. +.RE + +.sp +.ne 2 +.na +\fB-U\fR \fIcachefile\fR +.ad +.sp .6 +.RS 4n +Use a cache file other than \fB/etc/zfs/zpool.cache\fR. This option is only +valid with \fB-C\fR +.RE + +.sp +.ne 2 +.na +\fB-v\fR +.ad +.sp .6 +.RS 4n +Enable verbosity. Specify multiple times for increased verbosity. +.RE + +.sp +.ne 2 +.na +\fB-X\fR +.ad +.sp .6 +.RS 4n +Attempt \'extreme\' transaction rewind, that is attempt the same recovery as +\fB-F\fR but read transactions otherwise deemed too old. +.RE + +.P +Specifying a display option more than once enables verbosity for only that +option, with more occurrences enabling more verbosity. +.P +If no options are specified, all information about the named pool will be +displayed at default verbosity. + +.SH "EXAMPLES" .LP -See \fBattributes\fR(5) for descriptions of the following attributes: +\fBExample 1 \fRDisplay the configuration of imported pool 'rpool' .sp +.in +2 +.nf +# zdb -C rpool +MOS Configuration: + version: 28 + name: 'rpool' + ... +.fi +.in -2 .sp -.TS -tab() box; -cw(2.75i) |cw(2.75i) -lw(2.75i) |lw(2.75i) -. -ATTRIBUTE TYPEATTRIBUTE VALUE -_ -Interface StabilityUnstable -.TE -.SH SEE ALSO +.LP +\fBExample 2 \fRDisplay basic dataset information about 'rpool' +.sp +.in +2 +.nf +# zdb -d rpool +Dataset mos [META], ID 0, cr_txg 4, 26.9M, 1051 objects +Dataset rpool/swap [ZVOL], ID 59, cr_txg 356, 486M, 2 objects + ... 
+.fi +.in -2 .sp + .LP -\fBzfs\fR(1M), \fBzpool\fR(1M), \fBattributes\fR(5) +\fBExample 3 \fRDisplay basic information about object 0 in +'rpool/export/home' +.sp +.in +2 +.nf +# zdb -d rpool/export/home 0 +Dataset rpool/export/home [ZPL], ID 137, cr_txg 1546, 32K, 8 objects + + Object lvl iblk dblk dsize lsize %full type + 0 7 16K 16K 15.0K 16K 25.00 DMU dnode +.fi +.in -2 +.sp + +.LP +\fBExample 4 \fRDisplay the predicted effect of enabling deduplication on 'rpool' +.sp +.in +2 +.nf +# zdb -S rpool +Simulated DDT histogram: + +bucket allocated referenced +______ ______________________________ ______________________________ +refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE +------ ------ ----- ----- ----- ------ ----- ----- ----- + 1 694K 27.1G 15.0G 15.0G 694K 27.1G 15.0G 15.0G + 2 35.0K 1.33G 699M 699M 74.7K 2.79G 1.45G 1.45G + ... +dedup = 1.11, compress = 1.80, copies = 1.00, dedup * compress / copies = 2.00 +.fi +.in -2 +.sp + +.SH "SEE ALSO" +zfs(1M), zpool(1M) diff --git a/man/man1m/zfs.1m b/man/man1m/zfs.1m index 68244c7..e713566 100644 --- a/man/man1m/zfs.1m +++ b/man/man1m/zfs.1m @@ -1,12 +1,13 @@ '\" te .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Nexenta Systems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, Joyent, Inc. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this CDDL HEADER, with .\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. -.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with -.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH zfs 1M "24 Sep 2009" "SunOS 5.11" "System Administration Commands" +.\" Copyright 2011 Joshua M. Clulow +.TH ZFS 1M "28 Jul 2011" .SH NAME zfs \- configures ZFS file systems .SH SYNOPSIS @@ -27,17 +28,17 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBdestroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR +\fBzfs\fR \fBdestroy\fR [\fB-fnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR .fi .LP .nf -\fBzfs\fR \fBdestroy\fR [\fB-rRd\fR] \fIsnapshot\fR +\fBzfs\fR \fBdestroy\fR [\fB-dnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR@\fIsnap\fR[%\fIsnap\fR][,...] .fi .LP .nf -\fBzfs\fR \fBsnapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... +\fBzfs\fR \fBsnapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... 
\fIfilesystem@snapname\fR|\fIvolume@snapname\fR .fi @@ -58,13 +59,13 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBrename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR +\fBzfs\fR \fBrename\fR [\fB-f\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR .fi .LP .nf -\fBzfs\fR \fBrename\fR [\fB-p\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR +\fBzfs\fR \fBrename\fR [\fB-fp\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR .fi .LP @@ -85,8 +86,8 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBget\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIfield\fR[,...]] [\fB-s\fR \fIsource\fR[,...]] - "\fIall\fR" | \fIproperty\fR[,...] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ... +\fBzfs\fR \fBget\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIfield\fR[,...]] [\fB-t\fR \fItype\fR[,...]] + [\fB-s\fR \fIsource\fR[,...]] "\fIall\fR" | \fIproperty\fR[,...] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ... .fi .LP @@ -107,18 +108,18 @@ zfs \- configures ZFS file systems .LP .nf \fBzfs\fR \fBuserspace\fR [\fB-niHp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-sS\fR \fIfield\fR] ... - [\fB-t\fR \fItype\fR [,...]] \fIfilesystem\fR|\fIsnapshot\fR + [\fB-t\fR \fItype\fR[,...]] \fIfilesystem\fR|\fIsnapshot\fR .fi .LP .nf \fBzfs\fR \fBgroupspace\fR [\fB-niHp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-sS\fR \fIfield\fR] ... 
- [\fB-t\fR \fItype\fR [,...]] \fIfilesystem\fR|\fIsnapshot\fR + [\fB-t\fR \fItype\fR[,...]] \fIfilesystem\fR|\fIsnapshot\fR .fi .LP .nf -\fBzfs\fR \fBmount\fR +\fBzfs\fR \fBmount\fR .fi .LP @@ -143,7 +144,7 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBsend\fR [\fB-vR\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR +\fBzfs\fR \fBsend\fR [\fB-DnPpRrv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR .fi .LP @@ -153,7 +154,7 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] \fB-d\fR \fIfilesystem\fR +\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR .fi .LP @@ -163,7 +164,7 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBallow\fR [\fB-ldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] \fIperm\fR|\fI@setname\fR[,...] +\fBzfs\fR \fBallow\fR [\fB-ldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] \fIperm\fR|\fI@setname\fR[,...] \fIfilesystem\fR|\fIvolume\fR .fi @@ -184,7 +185,7 @@ zfs \- configures ZFS file systems .LP .nf -\fBzfs\fR \fBunallow\fR [\fB-rldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] [\fIperm\fR|@\fIsetname\fR[,... ]] +\fBzfs\fR \fBunallow\fR [\fB-rldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR .fi @@ -240,7 +241,6 @@ where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes). A dataset can be one of the following: .sp .ne 2 -.mk .na \fB\fIfile system\fR\fR .ad @@ -256,7 +256,6 @@ free space. .sp .ne 2 -.mk .na \fB\fIvolume\fR\fR .ad @@ -269,7 +268,6 @@ most environments. .sp .ne 2 -.mk .na \fB\fIsnapshot\fR\fR .ad @@ -441,7 +439,6 @@ dataset. These properties can be neither set, nor inherited. Native properties apply to all dataset types unless otherwise noted. 
.sp .ne 2 -.mk .na \fB\fBavailable\fR\fR .ad @@ -458,20 +455,22 @@ This property can also be referred to by its shortened column name, .sp .ne 2 -.mk .na \fB\fBcompressratio\fR\fR .ad .sp .6 .RS 4n -The compression ratio achieved for this dataset, expressed as a multiplier. +For non-snapshots, the compression ratio achieved for the \fBused\fR +space of this dataset, expressed as a multiplier. The \fBused\fR +property includes descendant datasets, and, for clones, does not include +the space shared with the origin snapshot. For snapshots, the +\fBcompressratio\fR is the same as the \fBrefcompressratio\fR property. Compression can be turned on by running: \fBzfs set compression=on \fIdataset\fR\fR. The default value is \fBoff\fR. .RE .sp .ne 2 -.mk .na \fB\fBcreation\fR\fR .ad @@ -482,7 +481,19 @@ The time this dataset was created. .sp .ne 2 -.mk +.na +\fB\fBclones\fR\fR +.ad +.sp .6 +.RS 4n +For snapshots, this property is a comma-separated list of filesystems or +volumes which are clones of this snapshot. The clones' \fBorigin\fR property +is this snapshot. If the \fBclones\fR property is not empty, then this +snapshot can not be destroyed (even with the \fB-r\fR or \fB-f\fR options). +.RE + +.sp +.ne 2 .na \fB\fBdefer_destroy\fR\fR .ad @@ -495,7 +506,6 @@ by using the \fBzfs destroy\fR \fB-d\fR command. Otherwise, the property is .sp .ne 2 -.mk .na \fB\fBmounted\fR\fR .ad @@ -507,20 +517,17 @@ property can be either \fByes\fR or \fBno\fR. .sp .ne 2 -.mk .na \fB\fBorigin\fR\fR .ad .sp .6 .RS 4n For cloned file systems or volumes, the snapshot from which the clone was -created. The origin cannot be destroyed (even with the \fB-r\fR or \fB-f\fR -options) so long as a clone exists. +created. See also the \fBclones\fR property. 
.RE .sp .ne 2 -.mk .na \fB\fBreferenced\fR\fR .ad @@ -537,7 +544,18 @@ This property can also be referred to by its shortened column name, .sp .ne 2 -.mk +.na +\fB\fBrefcompressratio\fR\fR +.ad +.sp .6 +.RS 4n +The compression ratio achieved for the \fBreferenced\fR space of this +dataset, expressed as a multiplier. See also the \fBcompressratio\fR +property. +.RE + +.sp +.ne 2 .na \fB\fBtype\fR\fR .ad @@ -548,7 +566,6 @@ The type of dataset: \fBfilesystem\fR, \fBvolume\fR, or \fBsnapshot\fR. .sp .ne 2 -.mk .na \fB\fBused\fR\fR .ad @@ -578,7 +595,6 @@ immediately. .sp .ne 2 -.mk .na \fB\fBusedby*\fR\fR .ad @@ -593,7 +609,6 @@ on \fBzpool\fR "version 13" pools. .sp .ne 2 -.mk .na \fB\fBusedbychildren\fR\fR .ad @@ -605,7 +620,6 @@ all the dataset's children were destroyed. .sp .ne 2 -.mk .na \fB\fBusedbydataset\fR\fR .ad @@ -618,7 +632,6 @@ destroying any necessary snapshots or descendents). .sp .ne 2 -.mk .na \fB\fBusedbyrefreservation\fR\fR .ad @@ -630,7 +643,6 @@ would be freed if the \fBrefreservation\fR was removed. .sp .ne 2 -.mk .na \fB\fBusedbysnapshots\fR\fR .ad @@ -644,7 +656,6 @@ properties because space can be shared by multiple snapshots. .sp .ne 2 -.mk .na \fB\fBuserused@\fR\fIuser\fR\fR .ad @@ -690,7 +701,6 @@ following forms: .sp .ne 2 -.mk .na \fB\fBuserrefs\fR\fR .ad @@ -702,7 +712,6 @@ are set by using the \fBzfs hold\fR command. .sp .ne 2 -.mk .na \fB\fBgroupused@\fR\fIgroup\fR\fR .ad @@ -719,7 +728,6 @@ allow\fR, can access all groups' usage. .sp .ne 2 -.mk .na \fB\fBvolblocksize\fR=\fIblocksize\fR\fR .ad @@ -735,12 +743,41 @@ This property can also be referred to by its shortened column name, .RE .sp +.ne 2 +.na +\fB\fBwritten\fR\fR +.ad +.sp .6 +.RS 4n +The amount of \fBreferenced\fR space written to this dataset since the +previous snapshot. +.RE + +.sp +.ne 2 +.na +\fB\fBwritten@\fR\fIsnapshot\fR\fR +.ad +.sp .6 +.RS 4n +The amount of \fBreferenced\fR space written to this dataset since the +specified snapshot. 
This is the space that is referenced by this dataset +but was not referenced by the specified snapshot. +.sp +The \fIsnapshot\fR may be specified as a short snapshot name (just the part +after the \fB@\fR), in which case it will be interpreted as a snapshot in +the same filesystem as this dataset. +The \fIsnapshot\fR may also be a full snapshot name (\fIfilesystem\fR@\fIsnapshot\fR), +which for clones may be a snapshot in the origin's filesystem (or the origin +of the origin's filesystem, etc). +.RE + +.sp .LP The following native properties can be used to change the behavior of a \fBZFS\fR dataset. .sp .ne 2 -.mk .na \fB\fBaclinherit\fR=\fBdiscard\fR | \fBnoallow\fR | \fBrestricted\fR | \fBpassthrough\fR | \fBpassthrough-x\fR\fR @@ -769,28 +806,24 @@ mode from the application. .sp .ne 2 -.mk .na \fB\fBaclmode\fR=\fBdiscard\fR | \fBgroupmask\fR | \fBpassthrough\fR\fR .ad .sp .6 .RS 4n Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with -an \fBaclmode\fR property of \fBdiscard\fR deletes all \fBACL\fR entries that -do not represent the mode of the file. An \fBaclmode\fR property of -\fBgroupmask\fR (the default) reduces user or group permissions. The -permissions are reduced, such that they are no greater than the group -permission bits, unless it is a user entry that has the same \fBUID\fR as the -owner of the file or directory. In this case, the \fBACL\fR permissions are -reduced so that they are no greater than owner permission bits. A file system -with an \fBaclmode\fR property of \fBpassthrough\fR indicates that no changes -are made to the \fBACL\fR other than generating the necessary \fBACL\fR entries -to represent the new mode of the file or directory. +an \fBaclmode\fR property of \fBdiscard\fR (the default) deletes all \fBACL\fR +entries that do not represent the mode of the file.
An \fBaclmode\fR property +of \fBgroupmask\fR reduces permissions granted in all \fBALLOW\fR entries found +in the \fBACL\fR such that they are no greater than the group permissions +specified by \fBchmod\fR. A file system with an \fBaclmode\fR property of +\fBpassthrough\fR indicates that no changes are made to the \fBACL\fR other +than creating or updating the necessary \fBACL\fR entries to +represent the new mode of the file or directory. .RE .sp .ne 2 -.mk .na \fB\fBatime\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -804,7 +837,6 @@ and other similar utilities. The default value is \fBon\fR. .sp .ne 2 -.mk .na \fB\fBcanmount\fR=\fBon\fR | \fBoff\fR | \fBnoauto\fR\fR .ad @@ -830,7 +862,6 @@ This property is not inherited. .sp .ne 2 -.mk .na \fB\fBchecksum\fR=\fBon\fR | \fBoff\fR | \fBfletcher2,\fR| \fBfletcher4\fR | \fBsha256\fR\fR @@ -848,10 +879,9 @@ Changing this property affects only newly-written data. .sp .ne 2 -.mk .na \fB\fBcompression\fR=\fBon\fR | \fBoff\fR | \fBlzjb\fR | \fBgzip\fR | -\fBgzip-\fR\fIN\fR\fR +\fBgzip-\fR\fIN\fR | \fBzle\fR\fR .ad .sp .6 .RS 4n @@ -862,7 +892,8 @@ algorithm. The \fBgzip\fR compression algorithm uses the same compression as the \fBgzip\fR(1) command. You can specify the \fBgzip\fR level by using the value \fBgzip-\fR\fIN\fR where \fIN\fR is an integer from 1 (fastest) to 9 (best compression ratio). Currently, \fBgzip\fR is equivalent to \fBgzip-6\fR -(which is also the default for \fBgzip\fR(1)). +(which is also the default for \fBgzip\fR(1)). The \fBzle\fR compression +algorithm compresses runs of zeros. .sp This property can also be referred to by its shortened column name \fBcompress\fR. Changing this property affects only newly-written data. 
@@ -870,7 +901,6 @@ This property can also be referred to by its shortened column name .sp .ne 2 -.mk .na \fB\fBcopies\fR=\fB1\fR | \fB2\fR | \fB3\fR\fR .ad @@ -889,7 +919,6 @@ property at file system creation time by using the \fB-o\fR .sp .ne 2 -.mk .na \fB\fBdevices\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -901,7 +930,6 @@ value is \fBon\fR. .sp .ne 2 -.mk .na \fB\fBexec\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -913,7 +941,6 @@ default value is \fBon\fR. .sp .ne 2 -.mk .na \fB\fBmountpoint\fR=\fIpath\fR | \fBnone\fR | \fBlegacy\fR\fR .ad @@ -933,7 +960,6 @@ new location. .sp .ne 2 -.mk .na \fB\fBnbmand\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -947,7 +973,6 @@ property only take effect when the file system is umounted and remounted. See .sp .ne 2 -.mk .na \fB\fBprimarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR .ad @@ -962,7 +987,6 @@ is set to \fBmetadata\fR, then only metadata is cached. The default value is .sp .ne 2 -.mk .na \fB\fBquota\fR=\fIsize\fR | \fBnone\fR\fR .ad @@ -980,7 +1004,6 @@ implicit quota. .sp .ne 2 -.mk .na \fB\fBuserquota@\fR\fIuser\fR=\fIsize\fR | \fBnone\fR\fR .ad @@ -1030,7 +1053,6 @@ displayed by \fBzfs get all\fR. The user's name must be appended after the .sp .ne 2 -.mk .na \fB\fBgroupquota@\fR\fIgroup\fR=\fIsize\fR | \fBnone\fR\fR .ad @@ -1046,7 +1068,6 @@ allow\fR, can get and set all groups' quotas. .sp .ne 2 -.mk .na \fB\fBreadonly\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -1060,7 +1081,6 @@ This property can also be referred to by its shortened column name, .sp .ne 2 -.mk .na \fB\fBrecordsize\fR=\fIsize\fR\fR .ad @@ -1089,7 +1109,6 @@ This property can also be referred to by its shortened column name, .sp .ne 2 -.mk .na \fB\fBrefquota\fR=\fIsize\fR | \fBnone\fR\fR .ad @@ -1102,7 +1121,6 @@ by descendents, including file systems and snapshots. 
.sp .ne 2 -.mk .na \fB\fBrefreservation\fR=\fIsize\fR | \fBnone\fR\fR .ad @@ -1125,7 +1143,6 @@ This property can also be referred to by its shortened column name, .sp .ne 2 -.mk .na \fB\fBreservation\fR=\fIsize\fR | \fBnone\fR\fR .ad @@ -1143,7 +1160,6 @@ This property can also be referred to by its shortened column name, .sp .ne 2 -.mk .na \fB\fBsecondarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR .ad @@ -1158,7 +1174,6 @@ value is \fBall\fR. .sp .ne 2 -.mk .na \fB\fBsetuid\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -1170,7 +1185,6 @@ default value is \fBon\fR. .sp .ne 2 -.mk .na \fB\fBshareiscsi\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -1189,7 +1203,6 @@ setting this property on a file system has no direct effect. .sp .ne 2 -.mk .na \fB\fBsharesmb\fR=\fBon\fR | \fBoff\fR | \fIopts\fR\fR .ad @@ -1228,7 +1241,6 @@ are unshared. .sp .ne 2 -.mk .na \fB\fBsharenfs\fR=\fBon\fR | \fBoff\fR | \fIopts\fR\fR .ad @@ -1252,7 +1264,6 @@ unshared. .sp .ne 2 -.mk .na \fB\fBlogbias\fR = \fBlatency\fR | \fBthroughput\fR\fR .ad @@ -1268,7 +1279,6 @@ efficient use of resources. .sp .ne 2 -.mk .na \fB\fBsnapdir\fR=\fBhidden\fR | \fBvisible\fR\fR .ad @@ -1281,7 +1291,26 @@ the file system as discussed in the "Snapshots" section. The default value is .sp .ne 2 -.mk +.na +\fB\fBsync\fR=\fBdefault\fR | \fBalways\fR | \fBdisabled\fR\fR +.ad +.sp .6 +.RS 4n +Controls the behavior of synchronous requests (e.g. fsync, O_DSYNC). +\fBdefault\fR is the POSIX specified behavior of ensuring all synchronous +requests are written to stable storage and all devices are flushed to ensure +data is not cached by device controllers (this is the default). \fBalways\fR +causes every file system transaction to be written and flushed before its +system call returns. This has a large performance penalty. \fBdisabled\fR +disables synchronous requests. File system transactions are only committed to +stable storage periodically. This option will give the highest performance. 
+However, it is very dangerous as ZFS would be ignoring the synchronous +transaction demands of applications such as databases or NFS. Administrators +should only use this option when the risks are understood. +.RE + +.sp +.ne 2 .na \fB\fBversion\fR=\fB1\fR | \fB2\fR | \fBcurrent\fR\fR .ad @@ -1294,7 +1323,6 @@ version. This property can only be set to later supported versions. See the .sp .ne 2 -.mk .na \fB\fBvolsize\fR=\fIsize\fR\fR .ad @@ -1325,7 +1353,6 @@ reflected in the reservation. .sp .ne 2 -.mk .na \fB\fBvscan\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -1339,7 +1366,6 @@ service must also be enabled for virus scanning to occur. The default value is .sp .ne 2 -.mk .na \fB\fBxattr\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -1351,7 +1377,6 @@ default value is \fBon\fR. .sp .ne 2 -.mk .na \fB\fBzoned\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -1372,7 +1397,6 @@ features being supported, the new file system will have the default values for these properties. .sp .ne 2 -.mk .na \fB\fBcasesensitivity\fR=\fBsensitive\fR | \fBinsensitive\fR | \fBmixed\fR\fR .ad @@ -1394,7 +1418,6 @@ product. For more information about the \fBmixed\fR value behavior, see the .sp .ne 2 -.mk .na \fB\fBnormalization\fR = \fBnone\fR | \fBformC\fR | \fBformD\fR | \fBformKC\fR | \fBformKD\fR\fR @@ -1413,7 +1436,6 @@ cannot be changed after the file system is created. .sp .ne 2 -.mk .na \fB\fButf8only\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -1517,7 +1539,6 @@ All subcommands that modify state are logged persistently to the pool in their original form. .sp .ne 2 -.mk .na \fB\fBzfs ?\fR\fR .ad @@ -1528,7 +1549,6 @@ Displays a help message. .sp .ne 2 -.mk .na \fB\fBzfs create\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem\fR\fR @@ -1539,7 +1559,6 @@ Creates a new \fBZFS\fR file system. The file system is automatically mounted according to the \fBmountpoint\fR property inherited from the parent. .sp .ne 2 -.mk .na \fB\fB-p\fR\fR .ad @@ -1554,7 +1573,6 @@ operation completes successfully. 
.sp .ne 2 -.mk .na \fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR .ad @@ -1571,7 +1589,6 @@ property is specified in multiple \fB-o\fR options. .sp .ne 2 -.mk .na \fB\fBzfs create\fR [\fB-ps\fR] [\fB-b\fR \fIblocksize\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fB-V\fR \fIsize\fR \fIvolume\fR\fR @@ -1587,7 +1604,6 @@ exported by the device. By default, a reservation of equal size is created. the volume has an integral number of blocks regardless of \fIblocksize\fR. .sp .ne 2 -.mk .na \fB\fB-p\fR\fR .ad @@ -1602,7 +1618,6 @@ operation completes successfully. .sp .ne 2 -.mk .na \fB\fB-s\fR\fR .ad @@ -1614,7 +1629,6 @@ Properties section for more information about sparse volumes. .sp .ne 2 -.mk .na \fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR .ad @@ -1629,7 +1643,6 @@ multiple \fB-o\fR options. .sp .ne 2 -.mk .na \fB\fB-b\fR \fIblocksize\fR\fR .ad @@ -1644,9 +1657,8 @@ behavior is undefined. .sp .ne 2 -.mk .na -\fB\fBzfs destroy\fR [\fB-rRf\fR] \fIfilesystem\fR|\fIvolume\fR\fR +\fBzfs destroy\fR [\fB-fnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR .ad .sp .6 .RS 4n @@ -1656,7 +1668,6 @@ mounted, and refuses to destroy a dataset that has active dependents (children or clones). .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -1667,7 +1678,6 @@ Recursively destroy all children. .sp .ne 2 -.mk .na \fB\fB-R\fR\fR .ad @@ -1679,7 +1689,6 @@ target hierarchy. .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad @@ -1689,30 +1698,72 @@ Force an unmount of any file systems using the \fBunmount -f\fR command. This option has no effect on non-file systems or unmounted file systems. .RE -Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR +.sp +.ne 2 +.na +\fB\fB-n\fR\fR +.ad +.sp .6 +.RS 4n +Do a dry-run ("No-op") deletion. No data will be deleted. This is +useful in conjunction with the \fB-v\fR or \fB-p\fR flags to determine what +data would be deleted. +.RE + +.sp +.ne 2 +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Print machine-parsable verbose information about the deleted data. 
+.RE + +.sp +.ne 2 +.na +\fB\fB-v\fR\fR +.ad +.sp .6 +.RS 4n +Print verbose information about the deleted data. +.RE +.sp +Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .RE .sp .ne 2 -.mk .na -\fB\fBzfs destroy\fR [\fB-rRd\fR] \fIsnapshot\fR\fR +\fBzfs destroy\fR [\fB-dnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR@\fIsnap\fR[%\fIsnap\fR][,...] .ad .sp .6 .RS 4n -The given snapshot is destroyed immediately if and only if the \fBzfs +The given snapshots are destroyed immediately if and only if the \fBzfs destroy\fR command without the \fB-d\fR option would have destroyed it. Such immediate destruction would occur, for example, if the snapshot had no clones and the user-initiated reference count were zero. .sp -If the snapshot does not qualify for immediate destruction, it is marked for +If a snapshot does not qualify for immediate destruction, it is marked for deferred deletion. In this state, it exists as a usable, visible snapshot until both of the preconditions listed above are met, at which point it is destroyed. .sp +An inclusive range of snapshots may be specified by separating the +first and last snapshots with a percent sign. +The first and/or last snapshots may be left blank, in which case the +filesystem's oldest or newest snapshot will be implied. +.sp +Multiple snapshots +(or ranges of snapshots) of the same filesystem or volume may be specified +in a comma-separated list of snapshots. +Only the snapshot's short name (the +part after the \fB@\fR) should be specified when using a range or +comma-separated list to identify multiple snapshots. +.sp .ne 2 -.mk .na \fB\fB-d\fR\fR .ad @@ -1723,7 +1774,6 @@ Defer snapshot deletion. .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -1735,7 +1785,6 @@ descendent file systems. .sp .ne 2 -.mk .na \fB\fB-R\fR\fR .ad @@ -1744,11 +1793,48 @@ descendent file systems. 
Recursively destroy all dependents. .RE +.sp +.ne 2 +.na +\fB\fB-n\fR\fR +.ad +.sp .6 +.RS 4n +Do a dry-run ("No-op") deletion. No data will be deleted. This is +useful in conjunction with the \fB-v\fR or \fB-p\fR flags to determine what +data would be deleted. +.RE + +.sp +.ne 2 +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Print machine-parsable verbose information about the deleted data. +.RE + +.sp +.ne 2 +.na +\fB\fB-v\fR\fR +.ad +.sp .6 +.RS 4n +Print verbose information about the deleted data. +.RE + +.sp +Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR +options, as they can destroy large portions of a pool and cause unexpected +behavior for mounted file systems in use. +.RE + .RE .sp .ne 2 -.mk .na \fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR @@ -1760,7 +1846,6 @@ successful system calls to the file system are part of the snapshot. See the "Snapshots" section for details. .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -1773,7 +1858,6 @@ time. .sp .ne 2 -.mk .na \fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR .ad @@ -1786,7 +1870,6 @@ Sets the specified property; see \fBzfs create\fR for details. .sp .ne 2 -.mk .na \fB\fBzfs rollback\fR [\fB-rRf\fR] \fIsnapshot\fR\fR .ad @@ -1805,7 +1888,6 @@ either of these options. To completely roll back a recursive snapshot, you must rollback the individual child snapshots. .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -1816,7 +1898,6 @@ Recursively destroy any snapshots more recent than the one specified. .sp .ne 2 -.mk .na \fB\fB-R\fR\fR .ad @@ -1828,7 +1909,6 @@ snapshots. .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad @@ -1842,7 +1922,6 @@ that are to be destroyed. .sp .ne 2 -.mk .na \fB\fBzfs clone\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ...
\fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR\fR @@ -1854,7 +1933,6 @@ The target dataset can be located anywhere in the \fBZFS\fR hierarchy, and is created as the same type as the original. .sp .ne 2 -.mk .na \fB\fB-p\fR\fR .ad @@ -1868,7 +1946,6 @@ operation completes successfully. .sp .ne 2 -.mk .na \fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR .ad @@ -1881,7 +1958,6 @@ Sets the specified property; see \fBzfs create\fR for details. .sp .ne 2 -.mk .na \fB\fBzfs promote\fR \fIclone-filesystem\fR\fR .ad @@ -1903,9 +1979,8 @@ any conflicting snapshots. .sp .ne 2 -.mk .na -\fB\fBzfs rename\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR +\fB\fBzfs rename\fR [\fB-f\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR .ad .br .na @@ -1913,7 +1988,7 @@ any conflicting snapshots. .ad .br .na -\fB\fBzfs rename\fR [\fB-p\fR] \fIfilesystem\fR|\fIvolume\fR +\fB\fBzfs rename\fR [\fB-fp\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR\fR .ad .sp .6 @@ -1926,7 +2001,6 @@ second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point. .sp .ne 2 -.mk .na \fB\fB-p\fR\fR .ad @@ -1937,11 +2011,20 @@ are automatically mounted according to the \fBmountpoint\fR property inherited from their parent. .RE +.sp +.ne 2 +.na +\fB\fB-f\fR\fR +.ad +.sp .6 +.RS 4n +Force unmount any filesystems that need to be unmounted in the process. +.RE + .RE .sp .ne 2 -.mk .na \fB\fBzfs rename\fR \fB-r\fR \fIsnapshot\fR \fIsnapshot\fR\fR .ad @@ -1953,7 +2036,6 @@ only dataset that can be renamed recursively. .sp .ne 2 -.mk .na \fB\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-H\fR] [\fB-o\fR \fIproperty\fR[,\fI\&...\fR]] [ \fB-t\fR \fItype\fR[,\fI\&...\fR]] [ \fB-s\fR @@ -1970,7 +2052,6 @@ default is \fBoff\fR) . The following fields are displayed, \fBname,used,available,referenced,mountpoint\fR. .sp .ne 2 -.mk .na \fB\fB-H\fR\fR .ad @@ -1982,7 +2063,6 @@ tab instead of arbitrary white space. 
.sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -1993,7 +2073,6 @@ Recursively display any children of the dataset on the command line. .sp .ne 2 -.mk .na \fB\fB-d\fR \fIdepth\fR\fR .ad @@ -2006,7 +2085,6 @@ children. .sp .ne 2 -.mk .na \fB\fB-o\fR \fIproperty\fR\fR .ad @@ -2044,7 +2122,6 @@ filesystem,volume\fR syntax. .sp .ne 2 -.mk .na \fB\fB-s\fR \fIproperty\fR\fR .ad @@ -2088,7 +2165,6 @@ preserved. .sp .ne 2 -.mk .na \fB\fB-S\fR \fIproperty\fR\fR .ad @@ -2099,7 +2175,6 @@ Same as the \fB-s\fR option, but sorts by property in descending order. .sp .ne 2 -.mk .na \fB\fB-t\fR \fItype\fR\fR .ad @@ -2114,7 +2189,6 @@ specifying \fB-t snapshot\fR displays only snapshots. .sp .ne 2 -.mk .na \fB\fBzfs set\fR \fIproperty\fR=\fIvalue\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR @@ -2133,10 +2207,9 @@ Properties" section. .sp .ne 2 -.mk .na \fB\fBzfs get\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR -\fIfield\fR[,...] [\fB-s\fR \fIsource\fR[,...] "\fIall\fR" | +\fIfield\fR[,...] [\fB-t\fR \fItype\fR[,...]] [\fB-s\fR \fIsource\fR[,...] "\fIall\fR" | \fIproperty\fR[,...] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR .ad .sp .6 @@ -2164,7 +2237,6 @@ The special value \fBall\fR can be used to display all properties that apply to the given dataset's type (filesystem, volume, or snapshot). .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -2175,7 +2247,6 @@ Recursively display properties for any children. .sp .ne 2 -.mk .na \fB\fB-d\fR \fIdepth\fR\fR .ad @@ -2188,7 +2259,6 @@ children. .sp .ne 2 -.mk .na \fB\fB-H\fR\fR .ad @@ -2201,7 +2271,6 @@ arbitrary amount of space. .sp .ne 2 -.mk .na \fB\fB-o\fR \fIfield\fR\fR .ad @@ -2213,7 +2282,6 @@ is the default value. .sp .ne 2 -.mk .na \fB\fB-s\fR \fIsource\fR\fR .ad @@ -2227,7 +2295,6 @@ is all sources. .sp .ne 2 -.mk .na \fB\fB-p\fR\fR .ad @@ -2240,7 +2307,6 @@ Display numbers in parseable (exact) values. 
.sp .ne 2 -.mk .na \fB\fBzfs inherit\fR [\fB-r\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR @@ -2253,7 +2319,6 @@ no ancestor has the property set, then the default value is used. See the properties can be inherited. .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -2266,7 +2331,6 @@ Recursively inherit the given property for all children. .sp .ne 2 -.mk .na \fB\fBzfs upgrade\fR [\fB-v\fR]\fR .ad @@ -2277,7 +2341,6 @@ Displays a list of file systems that are not the most recent version. .sp .ne 2 -.mk .na \fB\fBzfs upgrade\fR [\fB-r\fR] [\fB-V\fR \fIversion\fR] [\fB-a\fR | \fIfilesystem\fR]\fR @@ -2297,7 +2360,6 @@ and the pool version must be upgraded before the file system version can be upgraded. .sp .ne 2 -.mk .na \fB\fB-a\fR\fR .ad @@ -2308,7 +2370,6 @@ Upgrade all file systems on all imported pools. .sp .ne 2 -.mk .na \fB\fIfilesystem\fR\fR .ad @@ -2319,7 +2380,6 @@ Upgrade the specified file system. .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -2330,7 +2390,6 @@ Upgrade the specified file system and all descendent file systems .sp .ne 2 -.mk .na \fB\fB-V\fR \fIversion\fR\fR .ad @@ -2346,7 +2405,6 @@ supported by this software. .sp .ne 2 -.mk .na \fB\fBzfs userspace\fR [\fB-niHp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-sS\fR \fIfield\fR]... [\fB-t\fR \fItype\fR [,...]] \fIfilesystem\fR | @@ -2359,7 +2417,6 @@ filesystem or snapshot. This corresponds to the \fBuserused@\fR\fIuser\fR and \fBuserquota@\fR\fIuser\fR properties. .sp .ne 2 -.mk .na \fB\fB-n\fR\fR .ad @@ -2370,7 +2427,6 @@ Print numeric ID instead of user/group name. .sp .ne 2 -.mk .na \fB\fB-H\fR\fR .ad @@ -2381,7 +2437,6 @@ Do not print headers, use tab-delimited output. .sp .ne 2 -.mk .na \fB\fB-p\fR\fR .ad @@ -2392,7 +2447,6 @@ Use exact (parseable) numeric output. 
.sp .ne 2 -.mk .na \fB\fB-o\fR \fIfield\fR[,...]\fR .ad @@ -2404,7 +2458,6 @@ Display only the specified fields from the following set, .sp .ne 2 -.mk .na \fB\fB-s\fR \fIfield\fR\fR .ad @@ -2417,7 +2470,6 @@ multiple times to sort first by one field, then by another. The default is .sp .ne 2 -.mk .na \fB\fB-S\fR \fIfield\fR\fR .ad @@ -2428,7 +2480,6 @@ Sort by this field in reverse order. See \fB-s\fR. .sp .ne 2 -.mk .na \fB\fB-t\fR \fItype\fR[,...]\fR .ad @@ -2444,7 +2495,6 @@ The default can be changed to include group types. .sp .ne 2 -.mk .na \fB\fB-i\fR\fR .ad @@ -2464,7 +2514,6 @@ will report that the POSIX entity has the total usage and quota for both. .sp .ne 2 -.mk .na \fB\fBzfs groupspace\fR [\fB-niHp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-sS\fR \fIfield\fR]... [\fB-t\fR \fItype\fR [,...]] \fIfilesystem\fR | @@ -2487,7 +2536,6 @@ except that the default types to display are \fB-t posixgroup,smbgroup\fR. .sp .ne 2 -.mk .na \fB\fBzfs mount\fR\fR .ad @@ -2498,7 +2546,6 @@ Displays all \fBZFS\fR file systems currently mounted. .sp .ne 2 -.mk .na \fB\fBzfs mount\fR [\fB-vO\fR] [\fB-o\fR \fIoptions\fR] \fB-a\fR | \fIfilesystem\fR\fR @@ -2509,7 +2556,6 @@ Mounts \fBZFS\fR file systems. Invoked automatically as part of the boot process. .sp .ne 2 -.mk .na \fB\fB-o\fR \fIoptions\fR\fR .ad @@ -2522,7 +2568,6 @@ details. .sp .ne 2 -.mk .na \fB\fB-O\fR\fR .ad @@ -2533,7 +2578,6 @@ Perform an overlay mount. See \fBmount\fR(1M) for more information. .sp .ne 2 -.mk .na \fB\fB-v\fR\fR .ad @@ -2544,7 +2588,6 @@ Report mount progress. .sp .ne 2 -.mk .na \fB\fB-a\fR\fR .ad @@ -2556,7 +2599,6 @@ the boot process. .sp .ne 2 -.mk .na \fB\fIfilesystem\fR\fR .ad @@ -2569,7 +2611,6 @@ Mount the specified filesystem. .sp .ne 2 -.mk .na \fB\fBzfs unmount\fR [\fB-f\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR .ad @@ -2579,7 +2620,6 @@ Unmounts currently mounted \fBZFS\fR file systems. Invoked automatically as part of the shutdown process. 
.sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad @@ -2590,7 +2630,6 @@ Forcefully unmount the file system, even if it is currently in use. .sp .ne 2 -.mk .na \fB\fB-a\fR\fR .ad @@ -2602,7 +2641,6 @@ the boot process. .sp .ne 2 -.mk .na \fB\fIfilesystem\fR|\fImountpoint\fR\fR .ad @@ -2616,7 +2654,6 @@ Unmount the specified filesystem. The command can also be given a path to a .sp .ne 2 -.mk .na \fB\fBzfs share\fR \fB-a\fR | \fIfilesystem\fR\fR .ad @@ -2625,7 +2662,6 @@ Unmount the specified filesystem. The command can also be given a path to a Shares available \fBZFS\fR file systems. .sp .ne 2 -.mk .na \fB\fB-a\fR\fR .ad @@ -2637,7 +2673,6 @@ the boot process. .sp .ne 2 -.mk .na \fB\fIfilesystem\fR\fR .ad @@ -2652,7 +2687,6 @@ Share the specified filesystem according to the \fBsharenfs\fR and .sp .ne 2 -.mk .na \fB\fBzfs unshare\fR \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR .ad @@ -2662,7 +2696,6 @@ Unshares currently shared \fBZFS\fR file systems. This is invoked automatically as part of the shutdown process. .sp .ne 2 -.mk .na \fB\fB-a\fR\fR .ad @@ -2674,7 +2707,6 @@ the boot process. .sp .ne 2 -.mk .na \fB\fIfilesystem\fR|\fImountpoint\fR\fR .ad @@ -2688,10 +2720,8 @@ Unshare the specified filesystem. The command can also be given a path to a .sp .ne 2 -.mk .na -\fB\fBzfs send\fR [\fB-vR\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] -\fIsnapshot\fR\fR +\fBzfs send\fR [\fB-DnPpRrv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR .ad .sp .6 .RS 4n @@ -2701,7 +2731,6 @@ system (for example, using \fBssh\fR(1). By default, a full stream is generated. .sp .ne 2 -.mk .na \fB\fB-i\fR \fIsnapshot\fR\fR .ad @@ -2720,7 +2749,6 @@ must be fully specified (for example, \fBpool/fs@origin\fR, not just .sp .ne 2 -.mk .na \fB\fB-I\fR \fIsnapshot\fR\fR .ad @@ -2734,7 +2762,6 @@ be specified as with the \fB-i\fR option. .sp .ne 2 -.mk .na \fB\fB-R\fR\fR .ad @@ -2754,13 +2781,73 @@ snapshots and file systems that do not exist on the sending side are destroyed. 
.sp .ne 2 -.mk +.na +\fB\fB-D\fR\fR +.ad +.sp .6 +.RS 4n +Generate a deduplicated stream. Blocks which would have been sent multiple +times in the send stream will only be sent once. The receiving system must +also support this feature to receive a deduplicated stream. This flag can +be used regardless of the dataset's \fBdedup\fR property, but performance +will be much better if the filesystem uses a dedup-capable checksum (e.g. +\fBsha256\fR). +.RE + +.sp +.ne 2 +.na +\fB\fB-r\fR\fR +.ad +.sp .6 +.RS 4n +Recursively send all descendant snapshots. This is similar to the \fB-R\fR +flag, but information about deleted and renamed datasets is not included, and +property information is only included if the \fB-p\fR flag is specified. +.RE + +.sp +.ne 2 +.na +\fB\fB-p\fR\fR +.ad +.sp .6 +.RS 4n +Include the dataset's properties in the stream. This flag is implicit when +\fB-R\fR is specified. The receiving system must also support this feature. +.RE + +.sp +.ne 2 +.na +\fB\fB-n\fR\fR +.ad +.sp .6 +.RS 4n +Do a dry-run ("No-op") send. Do not generate any actual send data. This is +useful in conjunction with the \fB-v\fR or \fB-P\fR flags to determine what +data will be sent. +.RE + +.sp +.ne 2 +.na +\fB\fB-P\fR\fR +.ad +.sp .6 +.RS 4n +Print machine-parsable verbose information about the stream package generated. +.RE + +.sp +.ne 2 .na \fB\fB-v\fR\fR .ad .sp .6 .RS 4n -Print verbose information about the stream package generated. +Print verbose information about the stream package generated. This information +includes a per-second report of how much data has been sent. .RE The format of the stream is committed. You will be able to receive your streams @@ -2769,14 +2856,13 @@ on future versions of \fBZFS\fR.
.sp .ne 2 -.mk .na \fB\fBzfs receive\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR .ad .br .na -\fB\fBzfs receive\fR [\fB-vnFu\fR] \fB-d\fR \fIfilesystem\fR\fR +\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR\fR .ad .sp .6 .RS 4n @@ -2798,30 +2884,49 @@ on the sending location are destroyed by using the \fBzfs destroy\fR \fB-d\fR command. .sp The name of the snapshot (and file system, if a full stream is received) that -this subcommand creates depends on the argument type and the \fB-d\fR option. +this subcommand creates depends on the argument type and the use of the +\fB-d\fR or \fB-e\fR options. .sp If the argument is a snapshot name, the specified \fIsnapshot\fR is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified \fIfilesystem\fR or -\fIvolume\fR. If the \fB-d\fR option is specified, the snapshot name is -determined by appending the sent snapshot's name to the specified -\fIfilesystem\fR. If the \fB-d\fR option is specified, any required file -systems within the specified one are created. +\fIvolume\fR. If neither of the \fB-d\fR or \fB-e\fR options are specified, +the provided target snapshot name is used exactly as provided. +.sp +The \fB-d\fR and \fB-e\fR options cause the file system name of the target +snapshot to be determined by appending a portion of the sent snapshot's name to +the specified target \fIfilesystem\fR. If the \fB-d\fR option is specified, all +but the first element of the sent snapshot's file system path (usually the +pool name) is used and any required intermediate file systems within the +specified one are created. If the \fB-e\fR option is specified, then only the +last element of the sent snapshot's file system name (i.e. the name of the +source file system itself) is used as the target file system name. 
.sp .ne 2 -.mk .na \fB\fB-d\fR\fR .ad .sp .6 .RS 4n -Use the name of the sent snapshot to determine the name of the new snapshot as -described in the paragraph above. +Discard the first element of the sent snapshot's file system name, using +the remaining elements to determine the name of the target file system for +the new snapshot as described in the paragraph above. +.RE + +.sp +.ne 2 +.na +\fB\fB-e\fR\fR +.ad +.sp .6 +.RS 4n +Discard all but the last element of the sent snapshot's file system name, +using that element to determine the name of the target file system for +the new snapshot as described in the paragraph above. .RE .sp .ne 2 -.mk .na \fB\fB-u\fR\fR .ad @@ -2832,7 +2937,6 @@ File system that is associated with the received stream is not mounted. .sp .ne 2 -.mk .na \fB\fB-v\fR\fR .ad @@ -2844,7 +2948,6 @@ receive operation. .sp .ne 2 -.mk .na \fB\fB-n\fR\fR .ad @@ -2856,7 +2959,6 @@ Do not actually receive the stream. This can be useful in conjunction with the .sp .ne 2 -.mk .na \fB\fB-F\fR\fR .ad @@ -2872,7 +2974,6 @@ snapshots and file systems that do not exist on the sending side. .sp .ne 2 -.mk .na \fB\fBzfs allow\fR \fIfilesystem\fR | \fIvolume\fR\fR .ad @@ -2884,7 +2985,6 @@ volume. See the other forms of \fBzfs allow\fR for more information. .sp .ne 2 -.mk .na \fB\fBzfs allow\fR [\fB-ldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR| \fIvolume\fR\fR @@ -2900,7 +3000,6 @@ Delegates \fBZFS\fR administration permission for the file systems to non-privileged users. .sp .ne 2 -.mk .na \fB[\fB-ug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...]\fR .ad @@ -2916,7 +3015,6 @@ group with the same name as a user, use the \fB-g\fR options. .sp .ne 2 -.mk .na \fB[\fB-e\fR] \fIperm\fR|@\fIsetname\fR[,...]\fR .ad @@ -2931,7 +3029,6 @@ set names, which begin with an at sign (\fB@\fR) , may be specified. 
See the .sp .ne 2 -.mk .na \fB[\fB-ld\fR] \fIfilesystem\fR|\fIvolume\fR\fR .ad @@ -2968,7 +3065,7 @@ receive subcommand Must also have the 'mount' and 'create' ability rename subcommand Must also have the 'mount' and 'create' ability in the new parent rollback subcommand Must also have the 'mount' ability -send subcommand +send subcommand share subcommand Allows sharing file systems over NFS or SMB protocols snapshot subcommand Must also have the 'mount' ability @@ -2978,46 +3075,45 @@ userprop other Allows changing any user property userquota other Allows accessing any userquota@... property userused other Allows reading any userused@... property -aclinherit property -aclmode property -atime property -canmount property -casesensitivity property -checksum property -compression property -copies property -devices property -exec property -mountpoint property -nbmand property -normalization property -primarycache property -quota property -readonly property -recordsize property -refquota property -refreservation property -reservation property -secondarycache property -setuid property -shareiscsi property -sharenfs property -sharesmb property -snapdir property -utf8only property -version property -volblocksize property -volsize property -vscan property -xattr property -zoned property +aclinherit property +aclmode property +atime property +canmount property +casesensitivity property +checksum property +compression property +copies property +devices property +exec property +mountpoint property +nbmand property +normalization property +primarycache property +quota property +readonly property +recordsize property +refquota property +refreservation property +reservation property +secondarycache property +setuid property +shareiscsi property +sharenfs property +sharesmb property +snapdir property +utf8only property +version property +volblocksize property +volsize property +vscan property +xattr property +zoned property .fi .in -2 .sp .sp .ne 2 -.mk .na \fB\fBzfs allow\fR 
\fB-c\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR\fR @@ -3030,7 +3126,6 @@ creator of any newly-created descendent file system. .sp .ne 2 -.mk .na \fB\fBzfs allow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR\fR @@ -3047,7 +3142,6 @@ characters long. .sp .ne 2 -.mk .na \fB\fBzfs unallow\fR [\fB-rldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] @@ -3078,7 +3172,6 @@ not all permissions for every user and group. See the \fBzfs allow\fR command for a description of the \fB-ldugec\fR options. .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -3091,7 +3184,6 @@ Recursively remove the permissions from this file system and all descendents. .sp .ne 2 -.mk .na \fB\fBzfs unallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,...]]\fR @@ -3108,7 +3200,6 @@ then all permissions are removed, thus removing the set entirely. .sp .ne 2 -.mk .na \fB\fBzfs hold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR .ad @@ -3122,7 +3213,6 @@ If a hold exists on a snapshot, attempts to destroy that snapshot by using the \fBzfs destroy\fR command return \fBEBUSY\fR. .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -3136,7 +3226,6 @@ snapshots of all descendent file systems. .sp .ne 2 -.mk .na \fB\fBzfs holds\fR [\fB-r\fR] \fIsnapshot\fR...\fR .ad @@ -3145,7 +3234,6 @@ snapshots of all descendent file systems. Lists all existing user references for the given snapshot or snapshots. .sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -3159,7 +3247,6 @@ listing the holds on the named snapshot. .sp .ne 2 -.mk .na \fB\fBzfs release\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR .ad @@ -3172,7 +3259,6 @@ If a hold exists on a snapshot, attempts to destroy that snapshot by using the \fBzfs destroy\fR command return \fBEBUSY\fR. 
.sp .ne 2 -.mk .na \fB\fB-r\fR\fR .ad @@ -3323,7 +3409,7 @@ pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob zoned off default pool/home/bob snapdir hidden default -pool/home/bob aclmode groupmask default +pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob shareiscsi off default @@ -3517,7 +3603,7 @@ target. # \fBzfs set shareiscsi=on pool/volumes/vol1\fR # \fBiscsitadm list target\fR Target: pool/volumes/vol1 - iSCSI Name: + iSCSI Name: iqn.1986-03.com.sun:02:7b4b02a6-3277-eb1b-e686-a24762c52a8c Connections: 0 .fi @@ -3629,7 +3715,7 @@ Create time permissions on (tank/users) create,destroy Local+Descendent permissions on (tank/users) group staff create,mount -------------------------------------------------------------- +------------------------------------------------------------- .fi .in -2 .sp @@ -3680,7 +3766,7 @@ Local+Descendent permissions on (users/home) cindys% \fBzfs set quota=10G users/home/marks\fR cindys% \fBzfs get quota users/home/marks\fR NAME PROPERTY VALUE SOURCE -users/home/marks quota 10G local +users/home/marks quota 10G local .fi .in -2 .sp @@ -3705,7 +3791,7 @@ Create time permissions on (tank/users) create,destroy Local+Descendent permissions on (tank/users) group staff @pset,create,mount -------------------------------------------------------------- +------------------------------------------------------------- .fi .in -2 .sp @@ -3716,7 +3802,6 @@ Local+Descendent permissions on (tank/users) The following exit values are returned: .sp .ne 2 -.mk .na \fB\fB0\fR\fR .ad @@ -3727,7 +3812,6 @@ Successful completion. .sp .ne 2 -.mk .na \fB\fB1\fR\fR .ad @@ -3738,7 +3822,6 @@ An error occurred. .sp .ne 2 -.mk .na \fB\fB2\fR\fR .ad @@ -3755,13 +3838,12 @@ See \fBattributes\fR(5) for descriptions of the following attributes: .sp .TS -tab() box; -cw(2.75i) |cw(2.75i) -lw(2.75i) |lw(2.75i) -. 
-ATTRIBUTE TYPEATTRIBUTE VALUE +box; +c | c +l | l . +ATTRIBUTE TYPE ATTRIBUTE VALUE _ -Interface StabilityCommitted +Interface Stability Committed .TE .SH SEE ALSO diff --git a/man/man1m/zpool.1m b/man/man1m/zpool.1m index 7a67781..a2a9b72 100644 --- a/man/man1m/zpool.1m +++ b/man/man1m/zpool.1m @@ -1,9 +1,20 @@ '\" te .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. -.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. -.\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the -.\" fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH zpool 1M "21 Sep 2009" "SunOS 5.11" "System Administration Commands" +.\" Copyright 2011, Nexenta Systems, Inc. All Rights Reserved. +.\" Copyright (c) 2012 by Delphix. All rights reserved. +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License. When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.TH ZPOOL 1M "Mar 16, 2012" .SH NAME zpool \- configures ZFS storage pools .SH SYNOPSIS @@ -29,7 +40,7 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] +\fBzpool create\fR [\fB-fnd\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ... .fi @@ -65,7 +76,7 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] +\fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] [\fB-D\fR] [\fB-f\fR] [\fB-R\fR \fIroot\fR] \fB-a\fR .fi @@ -82,7 +93,7 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ... +\fBzpool list\fR [\fB-Hv\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ... .fi .LP @@ -97,6 +108,11 @@ zpool \- configures ZFS storage pools .LP .nf +\fBzpool reguid\fR \fIpool\fR +.fi + +.LP +.nf \fBzpool remove\fR \fIpool\fR \fIdevice\fR ... .fi @@ -122,7 +138,7 @@ zpool \- configures ZFS storage pools .LP .nf -\fBzpool upgrade\fR +\fBzpool upgrade\fR .fi .LP @@ -153,12 +169,10 @@ organized according to certain performance and fault characteristics. The following virtual devices are supported: .sp .ne 2 -.mk .na \fB\fBdisk\fR\fR .ad .RS 10n -.rt A block device, typically located under \fB/dev/dsk\fR. \fBZFS\fR can use individual slices or partitions, though the recommended mode of operation is to use whole disks. 
A disk can be specified by a full path, or it can be a @@ -170,12 +184,10 @@ disk, \fBZFS\fR automatically labels the disk, if necessary. .sp .ne 2 -.mk .na \fB\fBfile\fR\fR .ad .RS 10n -.rt A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system of which it is a part. A file must be @@ -184,12 +196,10 @@ specified by a full path. .sp .ne 2 -.mk .na \fB\fBmirror\fR\fR .ad .RS 10n -.rt A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with \fIN\fR disks of size \fIX\fR can hold \fIX\fR bytes and can withstand (\fIN-1\fR) devices failing before @@ -198,7 +208,6 @@ data integrity is compromised. .sp .ne 2 -.mk .na \fB\fBraidz\fR\fR .ad @@ -215,7 +224,6 @@ data integrity is compromised. \fB\fBraidz3\fR\fR .ad .RS 10n -.rt A variation on \fBRAID-5\fR that allows for better distribution of parity and eliminates the "\fBRAID-5\fR write hole" (in which data and parity become inconsistent after a power loss). Data and parity is striped across all disks @@ -238,24 +246,20 @@ disks. The recommended number is between 3 and 9 to help increase performance. .sp .ne 2 -.mk .na \fB\fBspare\fR\fR .ad .RS 10n -.rt A special pseudo-\fBvdev\fR which keeps track of available hot spares for a pool. For more information, see the "Hot Spares" section. .RE .sp .ne 2 -.mk .na \fB\fBlog\fR\fR .ad .RS 10n -.rt A separate-intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, \fBraidz\fR \fBvdev\fR types are not supported for the intent log. For more @@ -264,12 +268,10 @@ information, see the "Intent Log" section. .sp .ne 2 -.mk .na \fB\fBcache\fR\fR .ad .RS 10n -.rt A device used to cache storage pool data. A cache device cannot be cannot be configured as a mirror or \fBraidz\fR group. 
For more information, see the "Cache Devices" section. @@ -329,12 +331,10 @@ devices. A top-level vdev or component device is in one of the following states: .sp .ne 2 -.mk .na \fB\fBDEGRADED\fR\fR .ad .RS 12n -.rt One or more top-level vdevs is in the degraded state because one or more component devices are offline. Sufficient replicas exist to continue functioning. @@ -362,12 +362,10 @@ functioning. .sp .ne 2 -.mk .na \fB\fBFAULTED\fR\fR .ad .RS 12n -.rt One or more top-level vdevs is in the faulted state because one or more component devices are offline. Insufficient replicas exist to continue functioning. @@ -392,46 +390,38 @@ prevent further use of the device. .sp .ne 2 -.mk .na \fB\fBOFFLINE\fR\fR .ad .RS 12n -.rt The device was explicitly taken offline by the "\fBzpool offline\fR" command. .RE .sp .ne 2 -.mk .na \fB\fBONLINE\fR\fR .ad .RS 12n -.rt The device is online and functioning. .RE .sp .ne 2 -.mk .na \fB\fBREMOVED\fR\fR .ad .RS 12n -.rt The device was physically removed while the system was running. Device removal detection is hardware-dependent and may not be supported on all platforms. .RE .sp .ne 2 -.mk .na \fB\fBUNAVAIL\fR\fR .ad .RS 12n -.rt The device could not be opened. If a pool is imported when a device was unavailable, then the device will be identified by a unique identifier instead of its path since the path was never correct in the first place. @@ -545,76 +535,107 @@ read-only statistics while others are configurable and change the behavior of the pool. The following are read-only properties: .sp .ne 2 -.mk .na \fB\fBavailable\fR\fR .ad .RS 20n -.rt Amount of storage available within the pool. This property can also be referred to by its shortened column name, "avail". .RE .sp .ne 2 -.mk .na \fB\fBcapacity\fR\fR .ad .RS 20n -.rt Percentage of pool space used. This property can also be referred to by its shortened column name, "cap". 
.RE .sp .ne 2 -.mk +.na +\fB\fBexpandsize\fR\fR +.ad +.RS 20n +Amount of uninitialized space within the pool or device that can be used to +increase the total capacity of the pool. Uninitialized space consists of +any space on an EFI labeled vdev which has not been brought online +(i.e. zpool online -e). This space occurs when a LUN is dynamically expanded. +.RE + +.sp +.ne 2 +.na +\fB\fBfree\fR\fR +.ad +.RS 20n +The amount of free space available in the pool. +.RE + +.sp +.ne 2 +.na +\fB\fBfreeing\fR\fR +.ad +.RS 20n +After a file system or snapshot is destroyed, the space it was using is +returned to the pool asynchronously. \fB\fBfreeing\fR\fR is the amount of +space remaining to be reclaimed. Over time \fB\fBfreeing\fR\fR will decrease +while \fB\fBfree\fR\fR increases. +.RE + +.sp +.ne 2 .na \fB\fBhealth\fR\fR .ad .RS 20n -.rt The current health of the pool. Health can be "\fBONLINE\fR", "\fBDEGRADED\fR", "\fBFAULTED\fR", " \fBOFFLINE\fR", "\fBREMOVED\fR", or "\fBUNAVAIL\fR". .RE .sp .ne 2 -.mk .na \fB\fBguid\fR\fR .ad .RS 20n -.rt A unique identifier for the pool. .RE .sp .ne 2 -.mk .na \fB\fBsize\fR\fR .ad .RS 20n -.rt Total size of the storage pool. .RE .sp .ne 2 -.mk +.na +\fB\fBunsupported@\fR\fIfeature_guid\fR\fR +.ad +.RS 20n +Information about unsupported features that are enabled on the pool. See +\fBzpool-features\fR(5) for details. +.RE + +.sp +.ne 2 .na \fB\fBused\fR\fR .ad .RS 20n -.rt Amount of storage space used within the pool. .RE .sp .LP -These space usage properties report actual physical space available to the +The space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a \fBraidz\fR configuration depends on the characteristics of the data being @@ -628,7 +649,6 @@ these discrepancies may become more noticeable. 
The following property can be set at creation time and import time: .sp .ne 2 -.mk .na \fB\fBaltroot\fR\fR .ad @@ -639,7 +659,7 @@ points within the pool. This can be used when examining an unknown pool where the mount points cannot be trusted, or in an alternate boot environment, where the typical paths are not valid. \fBaltroot\fR is not a persistent property. It is valid only while the system is up. Setting \fBaltroot\fR defaults to using -\fBcachefile\fR=none, though this may be overridden using an explicit setting. +\fBcachefile\fR=none, though this may be overridden using an explicit setting. .RE .sp @@ -648,7 +668,6 @@ The following properties can be set at creation time and import time, and later changed with the \fBzpool set\fR command: .sp .ne 2 -.mk .na \fB\fBautoexpand\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -664,7 +683,6 @@ can also be referred to by its shortened column name, \fBexpand\fR. .sp .ne 2 -.mk .na \fB\fBautoreplace\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -680,7 +698,6 @@ property can also be referred to by its shortened column name, "replace". .sp .ne 2 -.mk .na \fB\fBbootfs\fR=\fIpool\fR/\fIdataset\fR\fR .ad @@ -692,7 +709,6 @@ expected to be set mainly by the installation and upgrade programs. .sp .ne 2 -.mk .na \fB\fBcachefile\fR=\fIpath\fR | \fBnone\fR\fR .ad @@ -717,7 +733,17 @@ exported or destroyed, the file is removed. .sp .ne 2 -.mk +.na +\fB\fBcomment\fR=\fB\fItext\fR\fR +.ad +.RS 4n +A text string consisting of printable ASCII characters that will be stored +such that it is available even if the pool becomes faulted. An administrator +can provide additional information about a pool using this property. +.RE + +.sp +.ne 2 .na \fB\fBdelegation\fR=\fBon\fR | \fBoff\fR\fR .ad @@ -730,7 +756,6 @@ permissions defined on the dataset. See \fBzfs\fR(1M) for more information on .sp .ne 2 -.mk .na \fB\fBfailmode\fR=\fBwait\fR | \fBcontinue\fR | \fBpanic\fR\fR .ad @@ -742,24 +767,20 @@ storage device(s) or a failure of all devices within the pool. 
The behavior of such an event is determined as follows: .sp .ne 2 -.mk .na \fB\fBwait\fR\fR .ad .RS 12n -.rt Blocks all \fBI/O\fR access until the device connectivity is recovered and the errors are cleared. This is the default behavior. .RE .sp .ne 2 -.mk .na \fB\fBcontinue\fR\fR .ad .RS 12n -.rt Returns \fBEIO\fR to any new write \fBI/O\fR requests but allows reads to any of the remaining healthy devices. Any write requests that have yet to be committed to disk would be blocked. @@ -767,12 +788,10 @@ committed to disk would be blocked. .sp .ne 2 -.mk .na \fB\fBpanic\fR\fR .ad .RS 12n -.rt Prints out a message to the console and generates a system crash dump. .RE @@ -780,7 +799,18 @@ Prints out a message to the console and generates a system crash dump. .sp .ne 2 -.mk +.na +\fB\fBfeature@\fR\fIfeature_name\fR=\fBenabled\fR\fR +.ad +.RS 4n +The value of this property is the current state of \fIfeature_name\fR. The +only valid value when setting this property is \fBenabled\fR which moves +\fIfeature_name\fR to the enabled state. See \fBzpool-features\fR(5) for +details on feature states. +.RE + +.sp +.ne 2 .na \fB\fBlistsnaps\fR=on | off\fR .ad @@ -793,7 +823,6 @@ value is "off". .sp .ne 2 -.mk .na \fB\fBversion\fR=\fIversion\fR\fR .ad @@ -802,8 +831,8 @@ value is "off". The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the "\fBzpool upgrade\fR" command, though this property can be used when a specific version -is needed for backwards compatibility. This property can be any number between -1 and the current version reported by "\fBzpool upgrade -v\fR". +is needed for backwards compatibility. Once feature flags are enabled on a +pool this property will no longer have a value. .RE .SS "Subcommands" @@ -818,7 +847,6 @@ pools, add capacity to storage pools, and provide information about the storage pools.
The following subcommands are supported: .sp .ne 2 -.mk .na \fB\fBzpool\fR \fB-?\fR\fR .ad @@ -829,7 +857,6 @@ Displays a help message. .sp .ne 2 -.mk .na \fB\fBzpool add\fR [\fB-fn\fR] \fIpool\fR \fIvdev\fR ...\fR .ad @@ -841,24 +868,20 @@ the \fB-f\fR option, and the device checks performed are described in the "zpool create" subcommand. .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad .RS 6n -.rt Forces use of \fBvdev\fRs, even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. .RE .sp .ne 2 -.mk .na \fB\fB-n\fR\fR .ad .RS 6n -.rt Displays the configuration that would be used without actually adding the \fBvdev\fRs. The actual pool creation can still fail due to insufficient privileges or device sharing. @@ -871,7 +894,6 @@ device. .sp .ne 2 -.mk .na \fB\fBzpool attach\fR [\fB-f\fR] \fIpool\fR \fIdevice\fR \fInew_device\fR\fR .ad @@ -886,12 +908,10 @@ three-way mirror, and so on. In either case, \fInew_device\fR begins to resilver immediately. .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad .RS 6n -.rt Forces use of \fInew_device\fR, even if its appears to be in use. Not all devices can be overridden in this manner. .RE @@ -900,7 +920,6 @@ devices can be overridden in this manner. .sp .ne 2 -.mk .na \fB\fBzpool clear\fR \fIpool\fR [\fIdevice\fR] ...\fR .ad @@ -913,9 +932,8 @@ those errors associated with the specified device or devices are cleared. .sp .ne 2 -.mk .na -\fB\fBzpool create\fR [\fB-fn\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR +\fB\fBzpool create\fR [\fB-fnd\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-O\fR \fIfile-system-property=value\fR] ... [\fB-m\fR \fImountpoint\fR] [\fB-R\fR \fIroot\fR] \fIpool\fR \fIvdev\fR ...\fR .ad @@ -945,8 +963,10 @@ Unless the \fB-R\fR option is specified, the default mount point is root dataset cannot be mounted. This can be overridden with the \fB-m\fR option. .sp +By default all supported features are enabled on the new pool unless the +\fB-d\fR option is specified. 
+.sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad @@ -958,7 +978,6 @@ replication level. Not all devices can be overridden in this manner. .sp .ne 2 -.mk .na \fB\fB-n\fR\fR .ad @@ -971,7 +990,18 @@ device sharing. .sp .ne 2 -.mk +.na +\fB\fB-d\fR\fR +.ad +.sp .6 +.RS 4n +Do not enable any features on the new pool. Individual features can be enabled +by setting their corresponding properties to \fBenabled\fR with the \fB-o\fR +option. See \fBzpool-features\fR(5) for details about feature properties. +.RE + +.sp +.ne 2 .na \fB\fB-o\fR \fIproperty=value\fR [\fB-o\fR \fIproperty=value\fR] ...\fR .ad @@ -983,7 +1013,6 @@ valid properties that can be set. .sp .ne 2 -.mk .na \fB\fB-O\fR \fIfile-system-property=value\fR\fR .ad @@ -1000,7 +1029,6 @@ can be set. .sp .ne 2 -.mk .na \fB\fB-R\fR \fIroot\fR\fR .ad @@ -1011,7 +1039,6 @@ Equivalent to "-o cachefile=none,altroot=\fIroot\fR" .sp .ne 2 -.mk .na \fB\fB-m\fR \fImountpoint\fR\fR .ad @@ -1027,7 +1054,6 @@ information on dataset mount points, see \fBzfs\fR(1M). .sp .ne 2 -.mk .na \fB\fBzpool destroy\fR [\fB-f\fR] \fIpool\fR\fR .ad @@ -1037,12 +1063,10 @@ Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool. .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad .RS 6n -.rt Forces any active datasets contained within the pool to be unmounted. .RE @@ -1050,7 +1074,6 @@ Forces any active datasets contained within the pool to be unmounted. .sp .ne 2 -.mk .na \fB\fBzpool detach\fR \fIpool\fR \fIdevice\fR\fR .ad @@ -1062,7 +1085,6 @@ other valid replicas of the data. .sp .ne 2 -.mk .na \fB\fBzpool export\fR [\fB-f\fR] \fIpool\fR ...\fR .ad @@ -1082,12 +1104,10 @@ labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks. .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad .RS 6n -.rt Forcefully unmount all datasets, using the "\fBunmount -f\fR" command. 
.sp This command will forcefully export the pool even if it has a shared spare that @@ -1098,7 +1118,6 @@ is currently being used. This may lead to potential data corruption. .sp .ne 2 -.mk .na \fB\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...\fR .ad @@ -1124,7 +1143,6 @@ properties. .sp .ne 2 -.mk .na \fB\fBzpool history\fR [\fB-il\fR] [\fIpool\fR] ...\fR .ad @@ -1134,24 +1152,20 @@ Displays the command history of the specified pools or all pools if no pool is specified. .sp .ne 2 -.mk .na \fB\fB-i\fR\fR .ad .RS 6n -.rt Displays internally logged \fBZFS\fR events in addition to user initiated events. .RE .sp .ne 2 -.mk .na \fB\fB-l\fR\fR .ad .RS 6n -.rt Displays log records in long format, which in addition to standard format includes, the user name, the hostname, and the zone in which the operation was performed. @@ -1161,7 +1175,6 @@ performed. .sp .ne 2 -.mk .na \fB\fBzpool import\fR [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] [\fB-D\fR]\fR @@ -1181,12 +1194,10 @@ The numeric identifier is unique, and can be used instead of the pool name when multiple exported pools of the same name are available. .sp .ne 2 -.mk .na \fB\fB-c\fR \fIcachefile\fR\fR .ad .RS 16n -.rt Reads configuration from the given \fBcachefile\fR that was created with the "\fBcachefile\fR" pool property. This \fBcachefile\fR is used instead of searching for devices. @@ -1194,24 +1205,20 @@ searching for devices. .sp .ne 2 -.mk .na \fB\fB-d\fR \fIdir\fR\fR .ad .RS 16n -.rt Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times. .RE .sp .ne 2 -.mk .na \fB\fB-D\fR\fR .ad .RS 16n -.rt Lists destroyed pools only. .RE @@ -1219,7 +1226,6 @@ Lists destroyed pools only. .sp .ne 2 -.mk .na \fB\fBzpool import\fR [\fB-o\fR \fImntopts\fR] [ \fB-o\fR \fIproperty\fR=\fIvalue\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] @@ -1234,12 +1240,10 @@ are imported. Destroyed pools, pools that were previously destroyed with the is specified. 
.sp .ne 2 -.mk .na \fB\fB-o\fR \fImntopts\fR\fR .ad .RS 21n -.rt Comma-separated list of mount options to use when mounting datasets within the pool. See \fBzfs\fR(1M) for a description of dataset properties and mount options. @@ -1247,24 +1251,20 @@ options. .sp .ne 2 -.mk .na \fB\fB-o\fR \fIproperty=value\fR\fR .ad .RS 21n -.rt Sets the specified property on the imported pool. See the "Properties" section for more information on the available pool properties. .RE .sp .ne 2 -.mk .na \fB\fB-c\fR \fIcachefile\fR\fR .ad .RS 21n -.rt Reads configuration from the given \fBcachefile\fR that was created with the "\fBcachefile\fR" pool property. This \fBcachefile\fR is used instead of searching for devices. @@ -1272,57 +1272,47 @@ searching for devices. .sp .ne 2 -.mk .na \fB\fB-d\fR \fIdir\fR\fR .ad .RS 21n -.rt Searches for devices or files in \fIdir\fR. The \fB-d\fR option can be specified multiple times. This option is incompatible with the \fB-c\fR option. .RE .sp .ne 2 -.mk .na \fB\fB-D\fR\fR .ad .RS 21n -.rt Imports destroyed pools only. The \fB-f\fR option is also required. .RE .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad .RS 21n -.rt Forces import, even if the pool appears to be potentially active. .RE .sp .ne 2 -.mk .na \fB\fB-a\fR\fR .ad .RS 21n -.rt Searches for and imports all pools found. .RE .sp .ne 2 -.mk .na \fB\fB-R\fR \fIroot\fR\fR .ad .RS 21n -.rt Sets the "\fBcachefile\fR" property to "\fBnone\fR" and the "\fIaltroot\fR" property to "\fIroot\fR". .RE @@ -1331,7 +1321,6 @@ property to "\fIroot\fR". .sp .ne 2 -.mk .na \fB\fBzpool import\fR [\fB-o\fR \fImntopts\fR] [ \fB-o\fR \fIproperty\fR=\fIvalue\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] @@ -1351,7 +1340,6 @@ this was a failed export, or whether the device is really in use from another host. To import a pool in this state, the \fB-f\fR option is required. .sp .ne 2 -.mk .na \fB\fB-o\fR \fImntopts\fR\fR .ad @@ -1364,7 +1352,6 @@ options. 
.sp .ne 2 -.mk .na \fB\fB-o\fR \fIproperty=value\fR\fR .ad @@ -1376,7 +1363,6 @@ for more information on the available pool properties. .sp .ne 2 -.mk .na \fB\fB-c\fR \fIcachefile\fR\fR .ad @@ -1389,7 +1375,6 @@ searching for devices. .sp .ne 2 -.mk .na \fB\fB-d\fR \fIdir\fR\fR .ad @@ -1401,7 +1386,6 @@ specified multiple times. This option is incompatible with the \fB-c\fR option. .sp .ne 2 -.mk .na \fB\fB-D\fR\fR .ad @@ -1412,7 +1396,6 @@ Imports destroyed pool. The \fB-f\fR option is also required. .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad @@ -1423,7 +1406,6 @@ Forces import, even if the pool appears to be potentially active. .sp .ne 2 -.mk .na \fB\fB-R\fR \fIroot\fR\fR .ad @@ -1437,7 +1419,6 @@ property to "\fIroot\fR". .sp .ne 2 -.mk .na \fB\fBzpool iostat\fR [\fB-T\fR \fBu\fR | \fBd\fR] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR @@ -1451,12 +1432,10 @@ system is shown. If \fIcount\fR is specified, the command exits after \fIcount\fR reports are printed. .sp .ne 2 -.mk .na \fB\fB-T\fR \fBu\fR | \fBd\fR\fR .ad .RS 12n -.rt Display a time stamp. .sp Specify \fBu\fR for a printed representation of the internal representation of @@ -1466,12 +1445,10 @@ time. See \fBtime\fR(2). Specify \fBd\fR for standard date format. See .sp .ne 2 -.mk .na \fB\fB-v\fR\fR .ad .RS 12n -.rt Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within the pool, in addition to the pool-wide statistics. .RE @@ -1480,9 +1457,8 @@ the pool, in addition to the pool-wide statistics. .sp .ne 2 -.mk .na -\fB\fBzpool list\fR [\fB-H\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ...\fR +\fB\fBzpool list\fR [\fB-Hv\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ...\fR .ad .sp .6 .RS 4n @@ -1490,34 +1466,39 @@ Lists the given pools along with a health status and space usage. When given no arguments, all pools in the system are listed. .sp .ne 2 -.mk .na \fB\fB-H\fR\fR .ad .RS 12n -.rt Scripted mode. 
Do not display headers, and separate fields by a single tab instead of arbitrary space. .RE .sp .ne 2 -.mk .na \fB\fB-o\fR \fIprops\fR\fR .ad .RS 12n -.rt Comma-separated list of properties to display. See the "Properties" section for a list of valid properties. The default list is "name, size, used, available, -capacity, health, altroot" +expandsize, capacity, dedupratio, health, altroot" +.RE + +.sp +.ne 2 +.na +\fB\fB-v\fR\fR +.ad +.RS 12n +Verbose statistics. Reports usage statistics for individual \fIvdevs\fR within +the pool, in addition to the pool-wide statistics. .RE .RE .sp .ne 2 -.mk .na \fB\fBzpool offline\fR [\fB-t\fR] \fIpool\fR \fIdevice\fR ...\fR .ad @@ -1529,12 +1510,10 @@ no attempt is made to read or write to the device. This command is not applicable to spares or cache devices. .sp .ne 2 -.mk .na \fB\fB-t\fR\fR .ad .RS 6n -.rt Temporary. Upon reboot, the specified physical device reverts to its previous state. .RE @@ -1543,7 +1522,6 @@ state. .sp .ne 2 -.mk .na \fB\fBzpool online\fR [\fB-e\fR] \fIpool\fR \fIdevice\fR...\fR .ad @@ -1554,12 +1532,10 @@ Brings the specified physical device online. This command is not applicable to spares or cache devices. .sp .ne 2 -.mk .na \fB\fB-e\fR\fR .ad .RS 6n -.rt Expand the device to use all available space. If the device is part of a mirror or \fBraidz\fR then all devices must be expanded before the new space will become available to the pool. @@ -1569,7 +1545,17 @@ become available to the pool. .sp .ne 2 -.mk +.na +\fB\fBzpool reguid\fR \fIpool\fR +.ad +.sp .6 +.RS 4n +Generates a new unique identifier for the pool. You must ensure that all devices in this pool are online and +healthy before performing this action. +.RE + +.sp +.ne 2 .na \fB\fBzpool remove\fR \fIpool\fR \fIdevice\fR ...\fR .ad @@ -1585,7 +1571,6 @@ a pool.
.sp .ne 2 -.mk .na \fB\fBzpool replace\fR [\fB-f\fR] \fIpool\fR \fIold_device\fR [\fInew_device\fR]\fR @@ -1606,12 +1591,10 @@ this case, the new disk may have the same \fB/dev/dsk\fR path as the old device, even though it is actually a different disk. \fBZFS\fR recognizes this. .sp .ne 2 -.mk .na \fB\fB-f\fR\fR .ad .RS 6n -.rt Forces use of \fInew_device\fR, even if its appears to be in use. Not all devices can be overridden in this manner. .RE @@ -1620,7 +1603,6 @@ devices can be overridden in this manner. .sp .ne 2 -.mk .na \fB\fBzpool scrub\fR [\fB-s\fR] \fIpool\fR ...\fR .ad @@ -1645,12 +1627,10 @@ progress, \fBZFS\fR does not allow a scrub to be started until the resilver completes. .sp .ne 2 -.mk .na \fB\fB-s\fR\fR .ad .RS 6n -.rt Stop scrubbing. .RE @@ -1658,7 +1638,6 @@ Stop scrubbing. .sp .ne 2 -.mk .na \fB\fBzpool set\fR \fIproperty\fR=\fIvalue\fR \fIpool\fR\fR .ad @@ -1670,7 +1649,6 @@ more information on what properties can be set and acceptable values. .sp .ne 2 -.mk .na \fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR .ad @@ -1687,24 +1665,20 @@ because the amount of data in the pool and the other workloads on the system can change. .sp .ne 2 -.mk .na \fB\fB-x\fR\fR .ad .RS 6n -.rt Only display status for pools that are exhibiting errors or are otherwise unavailable. .RE .sp .ne 2 -.mk .na \fB\fB-v\fR\fR .ad .RS 6n -.rt Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub. .RE @@ -1713,7 +1687,6 @@ data errors since the last complete pool scrub. .sp .ne 2 -.mk .na \fB\fBzpool upgrade\fR\fR .ad @@ -1728,7 +1701,6 @@ inaccessible on the system. .sp .ne 2 -.mk .na \fB\fBzpool upgrade\fR \fB-v\fR\fR .ad @@ -1741,7 +1713,6 @@ with an explanation of the features provided with each version. 
.sp .ne 2 -.mk .na \fB\fBzpool upgrade\fR [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIpool\fR ...\fR .ad @@ -1752,23 +1723,19 @@ pool will no longer be accessible on systems running older versions of the software. .sp .ne 2 -.mk .na \fB\fB-a\fR\fR .ad .RS 14n -.rt Upgrades all pools. .RE .sp .ne 2 -.mk .na \fB\fB-V\fR \fIversion\fR\fR .ad .RS 14n -.rt Upgrade to the specified version. If the \fB-V\fR flag is not specified, the pool is upgraded to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported @@ -1868,10 +1835,10 @@ The results from this command are similar to the following: .in +2 .nf # \fBzpool list\fR - NAME SIZE USED AVAIL CAP HEALTH ALTROOT - pool 67.5G 2.92M 67.5G 0% ONLINE - - tank 67.5G 2.92M 67.5G 0% ONLINE - - zion - - - 0% FAULTED - + NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT + rpool 19.9G 8.43G 11.4G - 42% 1.00x ONLINE - + tank 61.5G 20.0G 41.5G - 32% 1.00x ONLINE - + zion - - - - - - FAULTED - .fi .in -2 .sp @@ -2086,40 +2053,57 @@ The command to remove the mirrored log \fBmirror-2\fR is: .in -2 .sp +.LP +\fBExample 15 \fRDisplaying expanded space on a device +.sp +.LP +The following command displays the detailed information for the \fIdata\fR +pool. This pool is comprised of a single \fIraidz\fR vdev where one of its +devices increased its capacity by 1GB. In this example, the pool will not +be able to utilize this extra capacity until all the devices under the +\fIraidz\fR vdev have been expanded. + +.sp +.in +2 +.nf +# \fBzpool list -v data\fR + NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT + data 17.9G 174K 17.9G - 0% 1.00x ONLINE - + raidz1 17.9G 174K 17.9G - + c4t2d0 - - - 1G + c4t3d0 - - - - + c4t4d0 - - - - +.fi +.in -2 + .SH EXIT STATUS .sp .LP The following exit values are returned: .sp .ne 2 -.mk .na \fB\fB0\fR\fR .ad .RS 5n -.rt Successful completion. .RE .sp .ne 2 -.mk .na \fB\fB1\fR\fR .ad .RS 5n -.rt An error occurred.
.RE .sp .ne 2 -.mk .na \fB\fB2\fR\fR .ad .RS 5n -.rt Invalid command line options were specified. .RE @@ -2131,16 +2115,15 @@ See \fBattributes\fR(5) for descriptions of the following attributes: .sp .TS -tab() box; -cw(2.75i) |cw(2.75i) -lw(2.75i) |lw(2.75i) -. -ATTRIBUTE TYPEATTRIBUTE VALUE +box; +c | c +l | l . +ATTRIBUTE TYPE ATTRIBUTE VALUE _ -Interface StabilityEvolving +Interface Stability Evolving .TE .SH SEE ALSO .sp .LP -\fBzfs\fR(1M), \fBattributes\fR(5) +\fBzfs\fR(1M), \fBzpool-features\fR(5), \fBattributes\fR(5) diff --git a/man/man1m/zstreamdump.1m b/man/man1m/zstreamdump.1m index d8b5e94..70f6ee8 100644 --- a/man/man1m/zstreamdump.1m +++ b/man/man1m/zstreamdump.1m @@ -3,7 +3,7 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with .\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH zstreamdump 1M "21 Sep 2009" "SunOS 5.11" "System Administration Commands" +.TH ZSTREAMDUMP 1M "Sep 21, 2009" .SH NAME zstreamdump \- filter data in zfs send stream .SH SYNOPSIS @@ -24,7 +24,6 @@ command, then displays headers and some statistics from that output. See The following options are supported: .sp .ne 2 -.mk .na \fB\fB-C\fR\fR .ad @@ -35,7 +34,6 @@ Suppress the validation of checksums. 
.sp .ne 2 -.mk .na \fB\fB-v\fR\fR .ad @@ -52,13 +50,12 @@ See \fBattributes\fR(5) for descriptions of the following attributes: .sp .TS -tab() box; -cw(2.75i) |cw(2.75i) -lw(2.75i) |lw(2.75i) -. -ATTRIBUTE TYPEATTRIBUTE VALUE +box; +c | c +l | l . +ATTRIBUTE TYPE ATTRIBUTE VALUE _ -Interface StabilityUncommitted +Interface Stability Uncommitted .TE .SH SEE ALSO -- cgit v1.1 From 245df8fcab5edca9a826890c7a35416de0733d1d Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 18 Jul 2012 10:42:43 +0000 Subject: Add zhack command missing from vendor import illumos-gate revision 13742:b6bbdd77139c Obtained from: ssh://anonhg@hg.illumos.org/illumos-gate --- cmd/zhack/zhack.c | 533 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 533 insertions(+) create mode 100644 cmd/zhack/zhack.c diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c new file mode 100644 index 0000000..2618cea --- /dev/null +++ b/cmd/zhack/zhack.c @@ -0,0 +1,533 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* + * zhack is a debugging tool that can write changes to ZFS pool using libzpool + * for testing purposes. 
Altering pools with zhack is unsupported and may + * result in corrupted pools. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#undef ZFS_MAXNAMELEN +#undef verify +#include + +extern boolean_t zfeature_checks_disable; + +const char cmdname[] = "zhack"; +libzfs_handle_t *g_zfs; +static importargs_t g_importargs; +static char *g_pool; +static boolean_t g_readonly; + +static void +usage(void) +{ + (void) fprintf(stderr, + "Usage: %s [-c cachefile] [-d dir] ...\n" + "where is one of the following:\n" + "\n", cmdname); + + (void) fprintf(stderr, + " feature stat \n" + " print information about enabled features\n" + " feature enable [-d desc] \n" + " add a new enabled feature to the pool\n" + " -d sets the feature's description\n" + " feature ref [-md] \n" + " change the refcount on the given feature\n" + " -d decrease instead of increase the refcount\n" + " -m add the feature to the label if increasing refcount\n" + "\n" + " : should be a feature guid\n"); + exit(1); +} + + +static void +fatal(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + (void) fprintf(stderr, "%s: ", cmdname); + (void) vfprintf(stderr, fmt, ap); + va_end(ap); + (void) fprintf(stderr, "\n"); + + exit(1); +} + +/* ARGSUSED */ +static int +space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp) +{ + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (ENOENT); + (void) fprintf(stderr, "modifying object that needs user accounting"); + abort(); + /* NOTREACHED */ +} + +/* + * Target is the dataset whose pool we want to open. 
+ */ +static void +import_pool(const char *target, boolean_t readonly) +{ + nvlist_t *config; + nvlist_t *pools; + int error; + char *sepp; + spa_t *spa; + nvpair_t *elem; + nvlist_t *props; + const char *name; + + kernel_init(readonly ? FREAD : (FREAD | FWRITE)); + g_zfs = libzfs_init(); + ASSERT(g_zfs != NULL); + + dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb); + + g_readonly = readonly; + + /* + * If we only want readonly access, it's OK if we find + * a potentially-active (ie, imported into the kernel) pool from the + * default cachefile. + */ + if (readonly && spa_open(target, &spa, FTAG) == 0) { + spa_close(spa, FTAG); + return; + } + + g_importargs.unique = B_TRUE; + g_importargs.can_be_active = readonly; + g_pool = strdup(target); + if ((sepp = strpbrk(g_pool, "/@")) != NULL) + *sepp = '\0'; + g_importargs.poolname = g_pool; + pools = zpool_search_import(g_zfs, &g_importargs); + + if (pools == NULL || nvlist_next_nvpair(pools, NULL) == NULL) { + if (!g_importargs.can_be_active) { + g_importargs.can_be_active = B_TRUE; + if (zpool_search_import(g_zfs, &g_importargs) != NULL || + spa_open(target, &spa, FTAG) == 0) { + fatal("cannot import '%s': pool is active; run " + "\"zpool export %s\" first\n", + g_pool, g_pool); + } + } + + fatal("cannot import '%s': no such pool available\n", g_pool); + } + + elem = nvlist_next_nvpair(pools, NULL); + name = nvpair_name(elem); + verify(nvpair_value_nvlist(elem, &config) == 0); + + props = NULL; + if (readonly) { + verify(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + verify(nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0); + } + + zfeature_checks_disable = B_TRUE; + error = spa_import(name, config, props, ZFS_IMPORT_NORMAL); + zfeature_checks_disable = B_FALSE; + if (error == EEXIST) + error = 0; + + if (error) + fatal("can't import '%s': %s", name, strerror(error)); +} + +static void +zhack_spa_open(const char *target, boolean_t readonly, void *tag, spa_t **spa) +{ + int err; + + 
import_pool(target, readonly); + + zfeature_checks_disable = B_TRUE; + err = spa_open(target, spa, tag); + zfeature_checks_disable = B_FALSE; + + if (err != 0) + fatal("cannot open '%s': %s", target, strerror(err)); + if (spa_version(*spa) < SPA_VERSION_FEATURES) { + fatal("'%s' has version %d, features not enabled", target, + (int)spa_version(*spa)); + } +} + +static void +dump_obj(objset_t *os, uint64_t obj, const char *name) +{ + zap_cursor_t zc; + zap_attribute_t za; + + (void) printf("%s_obj:\n", name); + + for (zap_cursor_init(&zc, os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (za.za_integer_length == 8) { + ASSERT(za.za_num_integers == 1); + (void) printf("\t%s = %llu\n", + za.za_name, (u_longlong_t)za.za_first_integer); + } else { + ASSERT(za.za_integer_length == 1); + char val[1024]; + VERIFY(zap_lookup(os, obj, za.za_name, + 1, sizeof (val), val) == 0); + (void) printf("\t%s = %s\n", za.za_name, val); + } + } + zap_cursor_fini(&zc); +} + +static void +dump_mos(spa_t *spa) +{ + nvlist_t *nv = spa->spa_label_features; + + (void) printf("label config:\n"); + for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + pair != NULL; + pair = nvlist_next_nvpair(nv, pair)) { + (void) printf("\t%s\n", nvpair_name(pair)); + } +} + +static void +zhack_do_feature_stat(int argc, char **argv) +{ + spa_t *spa; + objset_t *os; + char *target; + + argc--; + argv++; + + if (argc < 1) { + (void) fprintf(stderr, "error: missing pool name\n"); + usage(); + } + target = argv[0]; + + zhack_spa_open(target, B_TRUE, FTAG, &spa); + os = spa->spa_meta_objset; + + dump_obj(os, spa->spa_feat_for_read_obj, "for_read"); + dump_obj(os, spa->spa_feat_for_write_obj, "for_write"); + dump_obj(os, spa->spa_feat_desc_obj, "descriptions"); + dump_mos(spa); + + spa_close(spa, FTAG); +} + +static void +feature_enable_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + zfeature_info_t *feature = arg2; + + spa_feature_enable(spa, feature, tx); +} + 
+static void +zhack_do_feature_enable(int argc, char **argv) +{ + char c; + char *desc, *target; + spa_t *spa; + objset_t *mos; + zfeature_info_t feature; + zfeature_info_t *nodeps[] = { NULL }; + + /* + * Features are not added to the pool's label until their refcounts + * are incremented, so fi_mos can just be left as false for now. + */ + desc = NULL; + feature.fi_uname = "zhack"; + feature.fi_mos = B_FALSE; + feature.fi_can_readonly = B_FALSE; + feature.fi_depends = nodeps; + + optind = 1; + while ((c = getopt(argc, argv, "rmd:")) != -1) { + switch (c) { + case 'r': + feature.fi_can_readonly = B_TRUE; + break; + case 'd': + desc = strdup(optarg); + break; + default: + usage(); + break; + } + } + + if (desc == NULL) + desc = strdup("zhack injected"); + feature.fi_desc = desc; + + argc -= optind; + argv += optind; + + if (argc < 2) { + (void) fprintf(stderr, "error: missing feature or pool name\n"); + usage(); + } + target = argv[0]; + feature.fi_guid = argv[1]; + + if (!zfeature_is_valid_guid(feature.fi_guid)) + fatal("invalid feature guid: %s", feature.fi_guid); + + zhack_spa_open(target, B_FALSE, FTAG, &spa); + mos = spa->spa_meta_objset; + + if (0 == zfeature_lookup_guid(feature.fi_guid, NULL)) + fatal("'%s' is a real feature, will not enable"); + if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid)) + fatal("feature already enabled: %s", feature.fi_guid); + + VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL, + feature_enable_sync, spa, &feature, 5)); + + spa_close(spa, FTAG); + + free(desc); +} + +static void +feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + zfeature_info_t *feature = arg2; + + spa_feature_incr(spa, feature, tx); +} + +static void +feature_decr_sync(void *arg1, void *arg2, dmu_tx_t *tx) +{ + spa_t *spa = arg1; + zfeature_info_t *feature = arg2; + + spa_feature_decr(spa, feature, tx); +} + +static void +zhack_do_feature_ref(int argc, char **argv) +{ + char c; + char *target; + boolean_t 
decr = B_FALSE; + spa_t *spa; + objset_t *mos; + zfeature_info_t feature; + zfeature_info_t *nodeps[] = { NULL }; + + /* + * fi_desc does not matter here because it was written to disk + * when the feature was enabled, but we need to properly set the + * feature for read or write based on the information we read off + * disk later. + */ + feature.fi_uname = "zhack"; + feature.fi_mos = B_FALSE; + feature.fi_desc = NULL; + feature.fi_depends = nodeps; + + optind = 1; + while ((c = getopt(argc, argv, "md")) != -1) { + switch (c) { + case 'm': + feature.fi_mos = B_TRUE; + break; + case 'd': + decr = B_TRUE; + break; + default: + usage(); + break; + } + } + argc -= optind; + argv += optind; + + if (argc < 2) { + (void) fprintf(stderr, "error: missing feature or pool name\n"); + usage(); + } + target = argv[0]; + feature.fi_guid = argv[1]; + + if (!zfeature_is_valid_guid(feature.fi_guid)) + fatal("invalid feature guid: %s", feature.fi_guid); + + zhack_spa_open(target, B_FALSE, FTAG, &spa); + mos = spa->spa_meta_objset; + + if (0 == zfeature_lookup_guid(feature.fi_guid, NULL)) + fatal("'%s' is a real feature, will not change refcount"); + + if (0 == zap_contains(mos, spa->spa_feat_for_read_obj, + feature.fi_guid)) { + feature.fi_can_readonly = B_FALSE; + } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj, + feature.fi_guid)) { + feature.fi_can_readonly = B_TRUE; + } else { + fatal("feature is not enabled: %s", feature.fi_guid); + } + + if (decr && !spa_feature_is_active(spa, &feature)) + fatal("feature refcount already 0: %s", feature.fi_guid); + + VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL, + decr ? 
feature_decr_sync : feature_incr_sync, spa, &feature, 5)); + + spa_close(spa, FTAG); +} + +static int +zhack_do_feature(int argc, char **argv) +{ + char *subcommand; + + argc--; + argv++; + if (argc == 0) { + (void) fprintf(stderr, + "error: no feature operation specified\n"); + usage(); + } + + subcommand = argv[0]; + if (strcmp(subcommand, "stat") == 0) { + zhack_do_feature_stat(argc, argv); + } else if (strcmp(subcommand, "enable") == 0) { + zhack_do_feature_enable(argc, argv); + } else if (strcmp(subcommand, "ref") == 0) { + zhack_do_feature_ref(argc, argv); + } else { + (void) fprintf(stderr, "error: unknown subcommand: %s\n", + subcommand); + usage(); + } + + return (0); +} + +#define MAX_NUM_PATHS 1024 + +int +main(int argc, char **argv) +{ + extern void zfs_prop_init(void); + + char *path[MAX_NUM_PATHS]; + const char *subcommand; + int rv = 0; + char c; + + g_importargs.path = path; + + dprintf_setup(&argc, argv); + zfs_prop_init(); + + while ((c = getopt(argc, argv, "c:d:")) != -1) { + switch (c) { + case 'c': + g_importargs.cachefile = optarg; + break; + case 'd': + assert(g_importargs.paths < MAX_NUM_PATHS); + g_importargs.path[g_importargs.paths++] = optarg; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + optind = 1; + + if (argc == 0) { + (void) fprintf(stderr, "error: no command specified\n"); + usage(); + } + + subcommand = argv[0]; + + if (strcmp(subcommand, "feature") == 0) { + rv = zhack_do_feature(argc, argv); + } else { + (void) fprintf(stderr, "error: unknown subcommand: %s\n", + subcommand); + usage(); + } + + if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_TRUE) != 0) { + fatal("pool export failed; " + "changes may not be committed to disk\n"); + } + + libzfs_fini(g_zfs); + kernel_fini(); + + return (rv); +} -- cgit v1.1 From d5063724619a16bd1592c39395c787e97be18693 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 18 Jul 2012 10:58:07 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate 
13752:9f5f6c52ba19 (zfs part) Obtained from: ssh://anonhg@hg.illumos.org/illumos-gate --- cmd/zdb/zdb.c | 41 ++- cmd/zfs/zfs_main.c | 103 ++++++-- cmd/zhack/zhack.c | 7 + cmd/zpool/zpool_main.c | 171 +++++++----- cmd/ztest/ztest.c | 27 +- lib/libzfs/common/libzfs.h | 23 +- lib/libzfs/common/libzfs_config.c | 48 ++++ lib/libzfs/common/libzfs_dataset.c | 291 +++++++++++---------- lib/libzfs/common/libzfs_impl.h | 10 +- lib/libzfs/common/libzfs_iter.c | 8 +- lib/libzfs/common/libzfs_pool.c | 64 +++-- lib/libzfs/common/libzfs_sendrecv.c | 21 +- lib/libzfs/common/libzfs_util.c | 24 +- lib/libzfs_core/common/libzfs_core.c | 477 ++++++++++++++++++++++++++++++++++ lib/libzfs_core/common/libzfs_core.h | 62 +++++ lib/libzpool/common/kernel.c | 6 + lib/libzpool/common/sys/zfs_context.h | 1 + man/man1m/zfs.1m | 15 +- 18 files changed, 1064 insertions(+), 335 deletions(-) create mode 100644 lib/libzfs_core/common/libzfs_core.c create mode 100644 lib/libzfs_core/common/libzfs_core.h diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index ea211bf..d462d3f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -57,6 +57,7 @@ #include #include #include +#include #undef ZFS_MAXNAMELEN #undef verify #include @@ -204,6 +205,27 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) nvlist_free(nv); } +/* ARGSUSED */ +static void +dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) +{ + spa_history_phys_t *shp = data; + + if (shp == NULL) + return; + + (void) printf("\t\tpool_create_len = %llu\n", + (u_longlong_t)shp->sh_pool_create_len); + (void) printf("\t\tphys_max_off = %llu\n", + (u_longlong_t)shp->sh_phys_max_off); + (void) printf("\t\tbof = %llu\n", + (u_longlong_t)shp->sh_bof); + (void) printf("\t\teof = %llu\n", + (u_longlong_t)shp->sh_eof); + (void) printf("\t\trecords_lost = %llu\n", + (u_longlong_t)shp->sh_records_lost); +} + static void zdb_nicenum(uint64_t num, char *buf) { @@ -853,21 +875,22 @@ dump_history(spa_t *spa) for (int i = 0; i 
< num; i++) { uint64_t time, txg, ievent; char *cmd, *intstr; + boolean_t printed = B_FALSE; if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, &time) != 0) - continue; + goto next; if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, &cmd) != 0) { if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT, &ievent) != 0) - continue; + goto next; verify(nvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG, &txg) == 0); verify(nvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR, &intstr) == 0); - if (ievent >= LOG_END) - continue; + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) + goto next; (void) snprintf(internalstr, sizeof (internalstr), @@ -880,6 +903,14 @@ dump_history(spa_t *spa) (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); (void) printf("%s %s\n", tbuf, cmd); + printed = B_TRUE; + +next: + if (dump_opt['h'] > 1) { + if (!printed) + (void) printf("unrecognized record:\n"); + dump_nvlist(events[i], 2); + } } } @@ -1456,7 +1487,7 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ - dump_uint64, /* SPA history offsets */ + dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index b64905f..1fd27c5 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -56,6 +56,7 @@ #include #include +#include #include #include #include @@ -70,6 +71,7 @@ libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; +static boolean_t log_history = B_TRUE; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); @@ -259,7 +261,7 @@ get_usage(zfs_help_t idx) return (gettext("\tshare <-a | filesystem>\n")); case HELP_SNAPSHOT: return (gettext("\tsnapshot [-r] [-o property=value] ... 
" - "\n")); + " ...\n")); case HELP_UNMOUNT: return (gettext("\tunmount [-f] " "<-a | filesystem|mountpoint>\n")); @@ -888,9 +890,9 @@ typedef struct destroy_cbdata { nvlist_t *cb_nvl; /* first snap in contiguous run */ - zfs_handle_t *cb_firstsnap; + char *cb_firstsnap; /* previous snap in contiguous run */ - zfs_handle_t *cb_prevsnap; + char *cb_prevsnap; int64_t cb_snapused; char *cb_snapspec; } destroy_cbdata_t; @@ -1004,11 +1006,13 @@ destroy_print_cb(zfs_handle_t *zhp, void *arg) if (nvlist_exists(cb->cb_nvl, name)) { if (cb->cb_firstsnap == NULL) - cb->cb_firstsnap = zfs_handle_dup(zhp); + cb->cb_firstsnap = strdup(name); if (cb->cb_prevsnap != NULL) - zfs_close(cb->cb_prevsnap); + free(cb->cb_prevsnap); /* this snap continues the current range */ - cb->cb_prevsnap = zfs_handle_dup(zhp); + cb->cb_prevsnap = strdup(name); + if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) + nomem(); if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); @@ -1023,12 +1027,12 @@ destroy_print_cb(zfs_handle_t *zhp, void *arg) } else if (cb->cb_firstsnap != NULL) { /* end of this range */ uint64_t used = 0; - err = zfs_get_snapused_int(cb->cb_firstsnap, + err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); cb->cb_snapused += used; - zfs_close(cb->cb_firstsnap); + free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; - zfs_close(cb->cb_prevsnap); + free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } zfs_close(zhp); @@ -1045,13 +1049,13 @@ destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) if (cb->cb_firstsnap != NULL) { uint64_t used = 0; if (err == 0) { - err = zfs_get_snapused_int(cb->cb_firstsnap, + err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); } cb->cb_snapused += used; - zfs_close(cb->cb_firstsnap); + free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; - zfs_close(cb->cb_prevsnap); + free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } return (err); @@ -1064,7 +1068,7 @@ 
snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) int err = 0; /* Check for clones. */ - if (!cb->cb_doclones) { + if (!cb->cb_doclones && !cb->cb_defer_destroy) { cb->cb_target = zhp; cb->cb_first = B_TRUE; err = zfs_iter_dependents(zhp, B_TRUE, @@ -1904,9 +1908,11 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data) /* * If they did "zfs upgrade -a", then we could * be doing ioctls to different pools. We need - * to log this history once to each pool. + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). */ - verify(zpool_stage_history(g_zfs, history_str) == 0); + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; } if (zfs_prop_set(zhp, "version", verstr) == 0) cb->cb_numupgraded++; @@ -3424,6 +3430,32 @@ zfs_do_set(int argc, char **argv) return (ret); } +typedef struct snap_cbdata { + nvlist_t *sd_nvl; + boolean_t sd_recursive; + const char *sd_snapname; +} snap_cbdata_t; + +static int +zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) +{ + snap_cbdata_t *sd = arg; + char *name; + int rv = 0; + int error; + + error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); + if (error == -1) + nomem(); + fnvlist_add_boolean(sd->sd_nvl, name); + free(name); + + if (sd->sd_recursive) + rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); + zfs_close(zhp); + return (rv); +} + /* * zfs snapshot [-r] [-o prop=value] ... 
* @@ -3433,13 +3465,16 @@ zfs_do_set(int argc, char **argv) static int zfs_do_snapshot(int argc, char **argv) { - boolean_t recursive = B_FALSE; int ret = 0; char c; nvlist_t *props; + snap_cbdata_t sd = { 0 }; + boolean_t multiple_snaps = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); + if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); /* check options */ while ((c = getopt(argc, argv, "ro:")) != -1) { @@ -3449,7 +3484,8 @@ zfs_do_snapshot(int argc, char **argv) return (1); break; case 'r': - recursive = B_TRUE; + sd.sd_recursive = B_TRUE; + multiple_snaps = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -3466,18 +3502,35 @@ zfs_do_snapshot(int argc, char **argv) (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - goto usage; + + if (argc > 1) + multiple_snaps = B_TRUE; + for (; argc > 0; argc--, argv++) { + char *atp; + zfs_handle_t *zhp; + + atp = strchr(argv[0], '@'); + if (atp == NULL) + goto usage; + *atp = '\0'; + sd.sd_snapname = atp + 1; + zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + goto usage; + if (zfs_snapshot_cb(zhp, &sd) != 0) + goto usage; } - ret = zfs_snapshot(g_zfs, argv[0], recursive, props); + ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); + nvlist_free(sd.sd_nvl); nvlist_free(props); - if (ret && recursive) + if (ret != 0 && multiple_snaps) (void) fprintf(stderr, gettext("no snapshots were created\n")); return (ret != 0); usage: + nvlist_free(sd.sd_nvl); nvlist_free(props); usage(B_FALSE); return (-1); @@ -6479,8 +6532,7 @@ main(int argc, char **argv) return (1); } - zpool_set_history_str("zfs", argc, argv, history_str); - verify(zpool_stage_history(g_zfs, history_str) == 0); + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); @@ -6549,6 +6601,9 @@ main(int argc, 
char **argv) (void) fclose(mnttab_file); + if (ret == 0 && log_history) + (void) zpool_log_history(g_zfs, history_str); + libzfs_fini(g_zfs); /* diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c index 2618cea..f4be0b2 100644 --- a/cmd/zhack/zhack.c +++ b/cmd/zhack/zhack.c @@ -279,6 +279,9 @@ feature_enable_sync(void *arg1, void *arg2, dmu_tx_t *tx) zfeature_info_t *feature = arg2; spa_feature_enable(spa, feature, tx); + spa_history_log_internal(spa, "zhack enable feature", tx, + "name=%s can_readonly=%u", + feature->fi_guid, feature->fi_can_readonly); } static void @@ -356,6 +359,8 @@ feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx) zfeature_info_t *feature = arg2; spa_feature_incr(spa, feature, tx); + spa_history_log_internal(spa, "zhack feature incr", tx, + "name=%s", feature->fi_guid); } static void @@ -365,6 +370,8 @@ feature_decr_sync(void *arg1, void *arg2, dmu_tx_t *tx) zfeature_info_t *feature = arg2; spa_feature_decr(spa, feature, tx); + spa_history_log_internal(spa, "zhack feature decr", tx, + "name=%s", feature->fi_guid); } static void diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index bc300b3..3ecdbec 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -185,9 +185,9 @@ static zpool_command_t command_table[] = { #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) -zpool_command_t *current_command; +static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; - +static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * @@ -935,7 +935,10 @@ zpool_do_destroy(int argc, char **argv) return (1); } - ret = (zpool_destroy(zhp) != 0); + /* The history must be logged as part of the export */ + log_history = B_FALSE; + + ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); @@ -999,10 +1002,13 @@ zpool_do_export(int argc, char **argv) continue; } + /* The history must be logged as part of the export */ + log_history = B_FALSE; + if 
(hardforce) { - if (zpool_export_force(zhp) != 0) + if (zpool_export_force(zhp, history_str) != 0) ret = 1; - } else if (zpool_export(zhp, force) != 0) { + } else if (zpool_export(zhp, force, history_str) != 0) { ret = 1; } @@ -4269,6 +4275,14 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) (void) printf(gettext("Successfully upgraded " "'%s'\n\n"), zpool_get_name(zhp)); } + /* + * If they did "zpool upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). + */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; } } else if (cbp->cb_newer && !SPA_VERSION_IS_SUPPORTED(version)) { assert(!cbp->cb_all); @@ -4491,8 +4505,8 @@ zpool_do_upgrade(int argc, char **argv) typedef struct hist_cbdata { boolean_t first; - int longfmt; - int internal; + boolean_t longfmt; + boolean_t internal; } hist_cbdata_t; /* @@ -4504,21 +4518,8 @@ get_history_one(zpool_handle_t *zhp, void *data) nvlist_t *nvhis; nvlist_t **records; uint_t numrecords; - char *cmdstr; - char *pathstr; - uint64_t dst_time; - time_t tsec; - struct tm t; - char tbuf[30]; int ret, i; - uint64_t who; - struct passwd *pwd; - char *hostname; - char *zonename; - char internalstr[MAXPATHLEN]; hist_cbdata_t *cb = (hist_cbdata_t *)data; - uint64_t txg; - uint64_t ievent; cb->first = B_FALSE; @@ -4530,64 +4531,94 @@ get_history_one(zpool_handle_t *zhp, void *data) verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { - if (nvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME, - &dst_time) != 0) - continue; + nvlist_t *rec = records[i]; + char tbuf[30] = ""; - /* is it an internal event or a standard event? 
*/ - if (nvlist_lookup_string(records[i], ZPOOL_HIST_CMD, - &cmdstr) != 0) { - if (cb->internal == 0) - continue; + if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { + time_t tsec; + struct tm t; + + tsec = fnvlist_lookup_uint64(records[i], + ZPOOL_HIST_TIME); + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + } - if (nvlist_lookup_uint64(records[i], - ZPOOL_HIST_INT_EVENT, &ievent) != 0) + if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { + (void) printf("%s %s", tbuf, + fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); + } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { + int ievent = + fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); + if (!cb->internal) + continue; + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { + (void) printf("%s unrecognized record:\n", + tbuf); + dump_nvlist(rec, 4); + continue; + } + (void) printf("%s [internal %s txg:%lld] %s", tbuf, + zfs_history_event_names[ievent], + fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG), + fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { + if (!cb->internal) continue; - verify(nvlist_lookup_uint64(records[i], - ZPOOL_HIST_TXG, &txg) == 0); - verify(nvlist_lookup_string(records[i], - ZPOOL_HIST_INT_STR, &pathstr) == 0); - if (ievent >= LOG_END) + (void) printf("%s [txg:%lld] %s", tbuf, + fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG), + fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); + if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { + (void) printf(" %s (%llu)", + fnvlist_lookup_string(rec, + ZPOOL_HIST_DSNAME), + fnvlist_lookup_uint64(rec, + ZPOOL_HIST_DSID)); + } + (void) printf(" %s", fnvlist_lookup_string(rec, + ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { + if (!cb->internal) + continue; + (void) printf("%s ioctl %s\n", tbuf, + fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); + if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { + (void) printf(" input:\n"); + dump_nvlist(fnvlist_lookup_nvlist(rec, + 
ZPOOL_HIST_INPUT_NVL), 8); + } + if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { + (void) printf(" output:\n"); + dump_nvlist(fnvlist_lookup_nvlist(rec, + ZPOOL_HIST_OUTPUT_NVL), 8); + } + } else { + if (!cb->internal) continue; - (void) snprintf(internalstr, - sizeof (internalstr), - "[internal %s txg:%lld] %s", - zfs_history_event_names[ievent], txg, - pathstr); - cmdstr = internalstr; + (void) printf("%s unrecognized record:\n", tbuf); + dump_nvlist(rec, 4); } - tsec = dst_time; - (void) localtime_r(&tsec, &t); - (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); - (void) printf("%s %s", tbuf, cmdstr); if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); - if (nvlist_lookup_uint64(records[i], - ZPOOL_HIST_WHO, &who) == 0) { - pwd = getpwuid((uid_t)who); - if (pwd) - (void) printf("user %s on", - pwd->pw_name); - else - (void) printf("user %d on", - (int)who); - } else { - (void) printf(gettext("no info]\n")); - continue; + if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { + uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); + struct passwd *pwd = getpwuid(who); + (void) printf("user %d ", (int)who); + if (pwd != NULL) + (void) printf("(%s) ", pwd->pw_name); } - if (nvlist_lookup_string(records[i], - ZPOOL_HIST_HOST, &hostname) == 0) { - (void) printf(" %s", hostname); + if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { + (void) printf("on %s", + fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } - if (nvlist_lookup_string(records[i], - ZPOOL_HIST_ZONE, &zonename) == 0) { - (void) printf(":%s", zonename); + if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { + (void) printf(":%s", + fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } - (void) printf("]"); (void) printf("\n"); } @@ -4602,8 +4633,6 @@ get_history_one(zpool_handle_t *zhp, void *data) * * Displays the history of commands that modified pools. 
*/ - - int zpool_do_history(int argc, char **argv) { @@ -4616,10 +4645,10 @@ zpool_do_history(int argc, char **argv) while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': - cbdata.longfmt = 1; + cbdata.longfmt = B_TRUE; break; case 'i': - cbdata.internal = 1; + cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -4844,8 +4873,7 @@ main(int argc, char **argv) if (strcmp(cmdname, "-?") == 0) usage(B_TRUE); - zpool_set_history_str("zpool", argc, argv, history_str); - verify(zpool_stage_history(g_zfs, history_str) == 0); + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Run the appropriate command. @@ -4872,6 +4900,9 @@ main(int argc, char **argv) usage(B_FALSE); } + if (ret == 0 && log_history) + (void) zpool_log_history(g_zfs, history_str); + libzfs_fini(g_zfs); /* diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index d9bcb04..63acd0c 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -2252,7 +2252,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); VERIFY3U(ENOENT, ==, - spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); + spa_create("ztest_bad_file", nvroot, NULL, NULL)); nvlist_free(nvroot); /* @@ -2260,7 +2260,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); VERIFY3U(ENOENT, ==, - spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); + spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); nvlist_free(nvroot); /* @@ -2269,7 +2269,7 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) */ (void) rw_rdlock(&ztest_name_lock); nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); + VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); nvlist_free(nvroot); VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); 
VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); @@ -3056,8 +3056,7 @@ ztest_snapshot_create(char *osname, uint64_t id) (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, (u_longlong_t)id); - error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, - NULL, NULL, B_FALSE, B_FALSE, -1); + error = dmu_objset_snapshot_one(osname, strchr(snapname, '@') + 1); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (B_FALSE); @@ -3257,8 +3256,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); - error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, - NULL, NULL, B_FALSE, B_FALSE, -1); + error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -3281,8 +3279,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); } - error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1, - NULL, NULL, B_FALSE, B_FALSE, -1); + error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -3291,8 +3288,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); } - error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1, - NULL, NULL, B_FALSE, B_FALSE, -1); + error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -4480,8 +4476,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) * Create snapshot, clone it, mark snap for deferred destroy, * destroy clone, verify snap was also destroyed. 
*/ - error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, - FALSE, -1); + error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); @@ -4523,8 +4518,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) * destroy a held snapshot, mark for deferred destroy, * release hold, verify snapshot was destroyed. */ - error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, - FALSE, -1); + error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); @@ -5612,8 +5606,7 @@ ztest_init(ztest_shared_t *zs) spa_feature_table[i].fi_uname); VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); } - VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, - NULL, NULL)); + VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL)); nvlist_free(nvroot); VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); diff --git a/lib/libzfs/common/libzfs.h b/lib/libzfs/common/libzfs.h index 4dc039c..56ebf53 100644 --- a/lib/libzfs/common/libzfs.h +++ b/lib/libzfs/common/libzfs.h @@ -54,7 +54,8 @@ extern "C" { /* * libzfs errors */ -enum { +typedef enum zfs_error { + EZFS_SUCCESS = 0, /* no error -- success */ EZFS_NOMEM = 2000, /* out of memory */ EZFS_BADPROP, /* invalid property value */ EZFS_PROPREADONLY, /* cannot set readonly property */ @@ -126,7 +127,7 @@ enum { EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_UNKNOWN -}; +} zfs_error_t; /* * The following data structures are all part @@ -182,6 +183,9 @@ extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); +extern void zfs_save_arguments(int argc, char **, char *, int); +extern int zpool_log_history(libzfs_handle_t *, const char *); + extern int libzfs_errno(libzfs_handle_t *); extern const char *libzfs_error_action(libzfs_handle_t *); extern const 
char *libzfs_error_description(libzfs_handle_t *); @@ -216,7 +220,7 @@ extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); */ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); -extern int zpool_destroy(zpool_handle_t *); +extern int zpool_destroy(zpool_handle_t *, const char *); extern int zpool_add(zpool_handle_t *, nvlist_t *); typedef struct splitflags { @@ -338,8 +342,8 @@ extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); /* * Import and export functions */ -extern int zpool_export(zpool_handle_t *, boolean_t); -extern int zpool_export_force(zpool_handle_t *); +extern int zpool_export(zpool_handle_t *, boolean_t, const char *); +extern int zpool_export_force(zpool_handle_t *, const char *); extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, @@ -373,7 +377,7 @@ extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, */ struct zfs_cmd; -extern const char *zfs_history_event_names[LOG_END]; +extern const char *zfs_history_event_names[]; extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, boolean_t verbose); @@ -381,9 +385,6 @@ extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **); extern int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); -extern void zpool_set_history_str(const char *subcommand, int argc, - char **argv, char *history_str); -extern int zpool_stage_history(libzfs_handle_t *, const char *); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t len); extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); @@ -436,8 +437,6 @@ extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char 
*propname, char *buf, size_t len); -extern int zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, - uint64_t *usedp); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); extern const char *zfs_prop_values(zfs_prop_t); @@ -553,6 +552,8 @@ extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); extern int zfs_destroy_snaps_nvl(zfs_handle_t *, nvlist_t *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); +extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, + nvlist_t *props); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t); diff --git a/lib/libzfs/common/libzfs_config.c b/lib/libzfs/common/libzfs_config.c index f756da2..d5ba20f 100644 --- a/lib/libzfs/common/libzfs_config.c +++ b/lib/libzfs/common/libzfs_config.c @@ -337,6 +337,48 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) } /* + * If the __ZFS_POOL_RESTRICT environment variable is set we only iterate over + * pools it lists. + * + * This is an undocumented feature for use during testing only. + * + * This function returns B_TRUE if the pool should be skipped + * during iteration. + */ +static boolean_t +check_restricted(const char *poolname) +{ + static boolean_t initialized = B_FALSE; + static char *restricted = NULL; + + const char *cur, *end; + int len, namelen; + + if (!initialized) { + initialized = B_TRUE; + restricted = getenv("__ZFS_POOL_RESTRICT"); + } + + if (NULL == restricted) + return (B_FALSE); + + cur = restricted; + namelen = strlen(poolname); + do { + end = strchr(cur, ' '); + len = (NULL == end) ? 
strlen(cur) : (end - cur); + + if (len == namelen && 0 == strncmp(cur, poolname, len)) { + return (B_FALSE); + } + + cur += (len + 1); + } while (NULL != end); + + return (B_TRUE); +} + +/* * Iterate over all pools in the system. */ int @@ -359,6 +401,9 @@ zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data) for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL; cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) { + if (check_restricted(cn->cn_name)) + continue; + if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) { hdl->libzfs_pool_iter--; return (-1); @@ -394,6 +439,9 @@ zfs_iter_root(libzfs_handle_t *hdl, zfs_iter_f func, void *data) for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL; cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) { + if (check_restricted(cn->cn_name)) + continue; + if ((zhp = make_dataset_handle(hdl, cn->cn_name)) == NULL) continue; diff --git a/lib/libzfs/common/libzfs_dataset.c b/lib/libzfs/common/libzfs_dataset.c index c1767cb..cc2603c 100644 --- a/lib/libzfs/common/libzfs_dataset.c +++ b/lib/libzfs/common/libzfs_dataset.c @@ -1407,8 +1407,7 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl = NULL, *realprops; zfs_prop_t prop; - boolean_t do_prefix; - uint64_t idx; + boolean_t do_prefix = B_TRUE; int added_resv; (void) snprintf(errbuf, sizeof (errbuf), @@ -1447,12 +1446,17 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) } /* - * If the dataset's canmount property is being set to noauto, - * then we want to prevent unmounting & remounting it. + * We don't want to unmount & remount the dataset when changing + * its canmount property to 'on' or 'noauto'. We only use + * the changelist logic to unmount when setting canmount=off. 
*/ - do_prefix = !((prop == ZFS_PROP_CANMOUNT) && - (zprop_string_to_index(prop, propval, &idx, - ZFS_TYPE_DATASET) == 0) && (idx == ZFS_CANMOUNT_NOAUTO)); + if (prop == ZFS_PROP_CANMOUNT) { + uint64_t idx; + int err = zprop_string_to_index(prop, propval, &idx, + ZFS_TYPE_DATASET); + if (err == 0 && idx != ZFS_CANMOUNT_OFF) + do_prefix = B_FALSE; + } if (do_prefix && (ret = changelist_prefix(cl)) != 0) goto error; @@ -2641,25 +2645,6 @@ zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, return (0); } -int -zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, - uint64_t *usedp) -{ - int err; - zfs_cmd_t zc = { 0 }; - - (void) strlcpy(zc.zc_name, lastsnap->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, firstsnap->zfs_name, sizeof (zc.zc_value)); - - err = ioctl(lastsnap->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_SNAPS, &zc); - if (err) - return (err); - - *usedp = zc.zc_cookie; - - return (0); -} - /* * Returns the name of the given zfs handle. */ @@ -2860,7 +2845,6 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) */ for (cp = target + prefixlen + 1; cp = strchr(cp, '/'); *cp = '/', cp++) { - char *logstr; *cp = '\0'; @@ -2871,16 +2855,12 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) continue; } - logstr = hdl->libzfs_log_str; - hdl->libzfs_log_str = NULL; if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, NULL) != 0) { - hdl->libzfs_log_str = logstr; opname = dgettext(TEXT_DOMAIN, "create"); goto ancestorerr; } - hdl->libzfs_log_str = logstr; h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); if (h == NULL) { opname = dgettext(TEXT_DOMAIN, "open"); @@ -2938,12 +2918,12 @@ int zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, nvlist_t *props) { - zfs_cmd_t zc = { 0 }; int ret; uint64_t size = 0; uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); char errbuf[1024]; uint64_t zoned; + dmu_objset_type_t ost; (void) snprintf(errbuf, sizeof (errbuf), 
dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); @@ -2963,17 +2943,16 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, * will return ENOENT, not EEXIST. To prevent this from happening, we * first try to see if the dataset exists. */ - (void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name)); - if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { + if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (type == ZFS_TYPE_VOLUME) - zc.zc_objset_type = DMU_OST_ZVOL; + ost = DMU_OST_ZVOL; else - zc.zc_objset_type = DMU_OST_ZFS; + ost = DMU_OST_ZFS; if (props && (props = zfs_valid_proplist(hdl, type, props, zoned, NULL, errbuf)) == 0) @@ -3025,14 +3004,9 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, } } - if (props && zcmd_write_src_nvlist(hdl, &zc, props) != 0) - return (-1); - nvlist_free(props); - /* create the dataset */ - ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); - - zcmd_free_nvlists(&zc); + ret = lzc_create(path, ost, props); + nvlist_free(props); /* check for failure */ if (ret != 0) { @@ -3084,7 +3058,8 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, /* * Destroys the given dataset. The caller must make sure that the filesystem - * isn't mounted, and that there are no active dependents. + * isn't mounted, and that there are no active dependents. If the file system + * does not exist this function does nothing. 
*/ int zfs_destroy(zfs_handle_t *zhp, boolean_t defer) @@ -3100,7 +3075,8 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer) } zc.zc_defer_destroy = defer; - if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) { + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0 && + errno != ENOENT) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); @@ -3170,33 +3146,35 @@ int zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer) { int ret; - zfs_cmd_t zc = { 0 }; + nvlist_t *errlist; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - if (zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, snaps) != 0) - return (-1); - zc.zc_defer_destroy = defer; + ret = lzc_destroy_snaps(snaps, defer, &errlist); - ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS_NVL, &zc); if (ret != 0) { - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot destroy snapshots in %s"), zc.zc_name); - - switch (errno) { - case EEXIST: - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "snapshot is cloned")); - return (zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf)); + for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL); + pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), + nvpair_name(pair)); - default: - return (zfs_standard_error(zhp->zfs_hdl, errno, - errbuf)); + switch (fnvpair_value_int32(pair)) { + case EEXIST: + zfs_error_aux(zhp->zfs_hdl, + dgettext(TEXT_DOMAIN, + "snapshot is cloned")); + ret = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, + errbuf); + break; + default: + ret = zfs_standard_error(zhp->zfs_hdl, errno, + errbuf); + break; + } } } - return (0); + return (ret); } /* @@ -3205,12 +3183,10 @@ zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer) int zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t 
*props) { - zfs_cmd_t zc = { 0 }; char parent[ZFS_MAXNAMELEN]; int ret; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; - zfs_type_t type; uint64_t zoned; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); @@ -3229,32 +3205,21 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) (void) parent_name(target, parent, sizeof (parent)); /* do the clone */ - if (ZFS_IS_VOLUME(zhp)) { - zc.zc_objset_type = DMU_OST_ZVOL; - type = ZFS_TYPE_VOLUME; - } else { - zc.zc_objset_type = DMU_OST_ZFS; - type = ZFS_TYPE_FILESYSTEM; - } if (props) { + zfs_type_t type; + if (ZFS_IS_VOLUME(zhp)) { + type = ZFS_TYPE_VOLUME; + } else { + type = ZFS_TYPE_FILESYSTEM; + } if ((props = zfs_valid_proplist(hdl, type, props, zoned, zhp, errbuf)) == NULL) return (-1); - - if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { - nvlist_free(props); - return (-1); - } - - nvlist_free(props); } - (void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); - ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_CREATE, &zc); - - zcmd_free_nvlists(&zc); + ret = lzc_clone(target, zhp->zfs_name, props); + nvlist_free(props); if (ret != 0) { switch (errno) { @@ -3339,74 +3304,134 @@ zfs_promote(zfs_handle_t *zhp) return (ret); } +typedef struct snapdata { + nvlist_t *sd_nvl; + const char *sd_snapname; +} snapdata_t; + +static int +zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) +{ + snapdata_t *sd = arg; + char name[ZFS_MAXNAMELEN]; + int rv = 0; + + (void) snprintf(name, sizeof (name), + "%s@%s", zfs_get_name(zhp), sd->sd_snapname); + + fnvlist_add_boolean(sd->sd_nvl, name); + + rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); + zfs_close(zhp); + return (rv); +} + /* - * Takes a snapshot of the given dataset. + * Creates snapshots. The keys in the snaps nvlist are the snapshots to be + * created. 
*/ int -zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, - nvlist_t *props) +zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) { - const char *delim; - char parent[ZFS_MAXNAMELEN]; - zfs_handle_t *zhp; - zfs_cmd_t zc = { 0 }; int ret; char errbuf[1024]; + nvpair_t *elem; + nvlist_t *errors; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot snapshot '%s'"), path); + "cannot create snapshots ")); - /* validate the target name */ - if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + elem = NULL; + while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { + const char *snapname = nvpair_name(elem); - if (props) { - if ((props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, - props, B_FALSE, NULL, errbuf)) == NULL) - return (-1); + /* validate the target name */ + if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, + B_TRUE)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create snapshot '%s'"), snapname); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + } - if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { - nvlist_free(props); - return (-1); + if (props != NULL && + (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, + props, B_FALSE, NULL, errbuf)) == NULL) { + return (-1); + } + + ret = lzc_snapshot(snaps, props, &errors); + + if (ret != 0) { + boolean_t printed = B_FALSE; + for (elem = nvlist_next_nvpair(errors, NULL); + elem != NULL; + elem = nvlist_next_nvpair(errors, elem)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create snapshot '%s'"), nvpair_name(elem)); + (void) zfs_standard_error(hdl, + fnvpair_value_int32(elem), errbuf); + printed = B_TRUE; } + if (!printed) { + switch (ret) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "multiple snapshots of same " + "fs not allowed")); + (void) zfs_error(hdl, EZFS_EXISTS, errbuf); - 
nvlist_free(props); + break; + default: + (void) zfs_standard_error(hdl, ret, errbuf); + } + } } - /* make sure the parent exists and is of the appropriate type */ - delim = strchr(path, '@'); - (void) strncpy(parent, path, delim - path); - parent[delim - path] = '\0'; + nvlist_free(props); + nvlist_free(errors); + return (ret); +} + +int +zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, + nvlist_t *props) +{ + int ret; + snapdata_t sd = { 0 }; + char fsname[ZFS_MAXNAMELEN]; + char *cp; + zfs_handle_t *zhp; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot snapshot %s"), path); + + if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - if ((zhp = zfs_open(hdl, parent, ZFS_TYPE_FILESYSTEM | + (void) strlcpy(fsname, path, sizeof (fsname)); + cp = strchr(fsname, '@'); + *cp = '\0'; + sd.sd_snapname = cp + 1; + + if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { - zcmd_free_nvlists(&zc); return (-1); } - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, delim+1, sizeof (zc.zc_value)); - if (ZFS_IS_VOLUME(zhp)) - zc.zc_objset_type = DMU_OST_ZVOL; - else - zc.zc_objset_type = DMU_OST_ZFS; - zc.zc_cookie = recursive; - ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SNAPSHOT, &zc); - - zcmd_free_nvlists(&zc); - - /* - * if it was recursive, the one that actually failed will be in - * zc.zc_name. 
- */ - if (ret != 0) { - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); - (void) zfs_standard_error(hdl, errno, errbuf); + verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0); + if (recursive) { + (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); + } else { + fnvlist_add_boolean(sd.sd_nvl, path); } + ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); + nvlist_free(sd.sd_nvl); zfs_close(zhp); - return (ret); } @@ -3434,17 +3459,13 @@ rollback_destroy(zfs_handle_t *zhp, void *data) zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { - char *logstr; cbp->cb_dependent = B_TRUE; cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, rollback_destroy, cbp); cbp->cb_dependent = B_FALSE; - logstr = zhp->zfs_hdl->libzfs_log_str; - zhp->zfs_hdl->libzfs_log_str = NULL; cbp->cb_error |= zfs_destroy(zhp, B_FALSE); - zhp->zfs_hdl->libzfs_log_str = logstr; } } else { /* We must destroy this clone; first unmount it */ diff --git a/lib/libzfs/common/libzfs_impl.h b/lib/libzfs/common/libzfs_impl.h index b1eae47..576b2af 100644 --- a/lib/libzfs/common/libzfs_impl.h +++ b/lib/libzfs/common/libzfs_impl.h @@ -21,11 +21,11 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ -#ifndef _LIBFS_IMPL_H -#define _LIBFS_IMPL_H +#ifndef _LIBZFS_IMPL_H +#define _LIBZFS_IMPL_H #include #include @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -67,7 +68,6 @@ struct libzfs_handle { int libzfs_desc_active; char libzfs_action[1024]; char libzfs_desc[1024]; - char *libzfs_log_str; int libzfs_printerr; int libzfs_storeerr; /* stuff error messages into buffer */ void *libzfs_sharehdl; /* libshare handle */ @@ -213,4 +213,4 @@ extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); } #endif -#endif /* _LIBFS_IMPL_H */ +#endif /* _LIBZFS_IMPL_H */ diff --git a/lib/libzfs/common/libzfs_iter.c b/lib/libzfs/common/libzfs_iter.c index 212383d..be5767f 100644 --- a/lib/libzfs/common/libzfs_iter.c +++ b/lib/libzfs/common/libzfs_iter.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2010 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -301,12 +301,11 @@ int zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, zfs_iter_f func, void *arg) { - char buf[ZFS_MAXNAMELEN]; - char *comma_separated, *cp; + char *buf, *comma_separated, *cp; int err = 0; int ret = 0; - (void) strlcpy(buf, spec_orig, sizeof (buf)); + buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig); cp = buf; while ((comma_separated = strsep(&cp, ",")) != NULL) { @@ -364,6 +363,7 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, } } + free(buf); return (ret); } diff --git a/lib/libzfs/common/libzfs_pool.c b/lib/libzfs/common/libzfs_pool.c index df89a2b..1c6fb37 100644 --- a/lib/libzfs/common/libzfs_pool.c +++ b/lib/libzfs/common/libzfs_pool.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1205,7 +1206,7 @@ create_failed: * datasets left in the pool. 
*/ int -zpool_destroy(zpool_handle_t *zhp) +zpool_destroy(zpool_handle_t *zhp, const char *log_str) { zfs_cmd_t zc = { 0 }; zfs_handle_t *zfp = NULL; @@ -1217,6 +1218,7 @@ zpool_destroy(zpool_handle_t *zhp) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, @@ -1371,8 +1373,9 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) * Exports the pool from the system. The caller must ensure that there are no * mounted datasets in the pool. */ -int -zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) +static int +zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, + const char *log_str) { zfs_cmd_t zc = { 0 }; char msg[1024]; @@ -1383,6 +1386,7 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; zc.zc_guid = hardforce; + zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { @@ -1404,15 +1408,15 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) } int -zpool_export(zpool_handle_t *zhp, boolean_t force) +zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) { - return (zpool_export_common(zhp, force, B_FALSE)); + return (zpool_export_common(zhp, force, B_FALSE, log_str)); } int -zpool_export_force(zpool_handle_t *zhp) +zpool_export_force(zpool_handle_t *zhp, const char *log_str) { - return (zpool_export_common(zhp, B_TRUE, B_TRUE)); + return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); } static void @@ -3574,40 +3578,30 @@ zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) } void -zpool_set_history_str(const char *subcommand, int argc, char **argv, - char *history_str) 
+zfs_save_arguments(int argc, char **argv, char *string, int len) { - int i; - - (void) strlcpy(history_str, subcommand, HIS_MAX_RECORD_LEN); - for (i = 1; i < argc; i++) { - if (strlen(history_str) + 1 + strlen(argv[i]) > - HIS_MAX_RECORD_LEN) - break; - (void) strlcat(history_str, " ", HIS_MAX_RECORD_LEN); - (void) strlcat(history_str, argv[i], HIS_MAX_RECORD_LEN); + (void) strlcpy(string, basename(argv[0]), len); + for (int i = 1; i < argc; i++) { + (void) strlcat(string, " ", len); + (void) strlcat(string, argv[i], len); } } -/* - * Stage command history for logging. - */ int -zpool_stage_history(libzfs_handle_t *hdl, const char *history_str) +zpool_log_history(libzfs_handle_t *hdl, const char *message) { - if (history_str == NULL) - return (EINVAL); - - if (strlen(history_str) > HIS_MAX_RECORD_LEN) - return (EINVAL); - - if (hdl->libzfs_log_str != NULL) - free(hdl->libzfs_log_str); - - if ((hdl->libzfs_log_str = strdup(history_str)) == NULL) - return (no_memory(hdl)); - - return (0); + zfs_cmd_t zc = { 0 }; + nvlist_t *args; + int err; + + args = fnvlist_alloc(); + fnvlist_add_string(args, "message", message); + err = zcmd_write_src_nvlist(hdl, &zc, args); + if (err == 0) + err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc); + nvlist_free(args); + zcmd_free_nvlists(&zc); + return (err); } /* diff --git a/lib/libzfs/common/libzfs_sendrecv.c b/lib/libzfs/common/libzfs_sendrecv.c index a02a41b..ee6e643 100644 --- a/lib/libzfs/common/libzfs_sendrecv.c +++ b/lib/libzfs/common/libzfs_sendrecv.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ @@ -1381,7 +1381,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; - boolean_t holdsnaps = B_FALSE; pthread_t tid; int pipefd[2]; dedup_arg_t dda = { 0 }; @@ -1404,11 +1403,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } } - if (!flags->dryrun && zfs_spa_version(zhp, &spa_version) == 0 && - spa_version >= SPA_VERSION_USERREFS && - (flags->doall || flags->replicate)) - holdsnaps = B_TRUE; - if (flags->dedup && !flags->dryrun) { featureflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); @@ -1530,7 +1524,18 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; - if (holdsnaps || flags->progress) { + + /* + * Some flags require that we place user holds on the datasets that are + * being sent so they don't get destroyed during the send. We can skip + * this step if the pool is imported read-only since the datasets cannot + * be destroyed. 
+ */ + if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), + ZPOOL_PROP_READONLY, NULL) && + zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS && + (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); diff --git a/lib/libzfs/common/libzfs_util.c b/lib/libzfs/common/libzfs_util.c index 41db2fd..41e25e9 100644 --- a/lib/libzfs/common/libzfs_util.c +++ b/lib/libzfs/common/libzfs_util.c @@ -43,6 +43,7 @@ #include #include +#include #include "libzfs_impl.h" #include "zfs_prop.h" @@ -630,6 +631,14 @@ libzfs_init(void) hdl->libzfs_sharetab = fopen("/etc/dfs/sharetab", "r"); + if (libzfs_core_init() != 0) { + (void) close(hdl->libzfs_fd); + (void) fclose(hdl->libzfs_mnttab); + (void) fclose(hdl->libzfs_sharetab); + free(hdl); + return (NULL); + } + zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); @@ -647,12 +656,11 @@ libzfs_fini(libzfs_handle_t *hdl) if (hdl->libzfs_sharetab) (void) fclose(hdl->libzfs_sharetab); zfs_uninit_libshare(hdl); - if (hdl->libzfs_log_str) - (void) free(hdl->libzfs_log_str); zpool_free_handles(hdl); libzfs_fru_clear(hdl, B_TRUE); namespace_clear(hdl); libzfs_mnttab_fini(hdl); + libzfs_core_fini(); free(hdl); } @@ -814,17 +822,7 @@ zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) int zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) { - int error; - - zc->zc_history = (uint64_t)(uintptr_t)hdl->libzfs_log_str; - error = ioctl(hdl->libzfs_fd, request, zc); - if (hdl->libzfs_log_str) { - free(hdl->libzfs_log_str); - hdl->libzfs_log_str = NULL; - } - zc->zc_history = 0; - - return (error); + return (ioctl(hdl->libzfs_fd, request, zc)); } /* diff --git a/lib/libzfs_core/common/libzfs_core.c b/lib/libzfs_core/common/libzfs_core.c new file mode 100644 index 0000000..73afd50 --- /dev/null +++ b/lib/libzfs_core/common/libzfs_core.c @@ -0,0 +1,477 @@ +/* + * CDDL HEADER 
START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* + * LibZFS_Core (lzc) is intended to replace most functionality in libzfs. + * It has the following characteristics: + * + * - Thread Safe. libzfs_core is accessible concurrently from multiple + * threads. This is accomplished primarily by avoiding global data + * (e.g. caching). Since it's thread-safe, there is no reason for a + * process to have multiple libzfs "instances". Therefore, we store + * our few pieces of data (e.g. the file descriptor) in global + * variables. The fd is reference-counted so that the libzfs_core + * library can be "initialized" multiple times (e.g. by different + * consumers within the same process). + * + * - Committed Interface. The libzfs_core interface will be committed, + * therefore consumers can compile against it and be confident that + * their code will continue to work on future releases of this code. + * Currently, the interface is Evolving (not Committed), but we intend + * to commit to it once it is more complete and we determine that it + * meets the needs of all consumers. + * + * - Programmatic Error Handling.
libzfs_core communicates errors with + * defined error numbers, and doesn't print anything to stdout/stderr. + * + * - Thin Layer. libzfs_core is a thin layer, marshaling arguments + * to/from the kernel ioctls. There is generally a 1:1 correspondence + * between libzfs_core functions and ioctls to /dev/zfs. + * + * - Clear Atomicity. Because libzfs_core functions are generally 1:1 + * with kernel ioctls, and kernel ioctls are generally atomic, each + * libzfs_core function is atomic. For example, creating multiple + * snapshots with a single call to lzc_snapshot() is atomic -- it + * can't fail with only some of the requested snapshots created, even + * in the event of power loss or system crash. + * + * - Continued libzfs Support. Some higher-level operations (e.g. + * support for "zfs send -R") are too complicated to fit the scope of + * libzfs_core. This functionality will continue to live in libzfs. + * Where appropriate, libzfs will use the underlying atomic operations + * of libzfs_core. For example, libzfs may implement "zfs send -R | + * zfs receive" by using individual "send one snapshot", rename, + * destroy, and "receive one snapshot" operations in libzfs_core. + * /sbin/zfs and /sbin/zpool will link with both libzfs and + * libzfs_core. Other consumers should aim to use only libzfs_core, + * since that will be the supported, stable interface going forwards.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int g_fd; +static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; +static int g_refcount; + +int +libzfs_core_init(void) +{ + (void) pthread_mutex_lock(&g_lock); + if (g_refcount == 0) { + g_fd = open("/dev/zfs", O_RDWR); + if (g_fd < 0) { + (void) pthread_mutex_unlock(&g_lock); + return (errno); + } + } + g_refcount++; + (void) pthread_mutex_unlock(&g_lock); + return (0); +} + +void +libzfs_core_fini(void) +{ + (void) pthread_mutex_lock(&g_lock); + ASSERT3S(g_refcount, >, 0); + g_refcount--; + if (g_refcount == 0) + (void) close(g_fd); + (void) pthread_mutex_unlock(&g_lock); +} + +static int +lzc_ioctl(zfs_ioc_t ioc, const char *name, + nvlist_t *source, nvlist_t **resultp) +{ + zfs_cmd_t zc = { 0 }; + int error = 0; + char *packed; + size_t size; + + ASSERT3S(g_refcount, >, 0); + + (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); + + packed = fnvlist_pack(source, &size); + zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; + zc.zc_nvlist_src_size = size; + + if (resultp != NULL) { + zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); + zc.zc_nvlist_dst = (uint64_t)(uintptr_t) + malloc(zc.zc_nvlist_dst_size); + if (zc.zc_nvlist_dst == NULL) { + error = ENOMEM; + goto out; + } + } + + while (ioctl(g_fd, ioc, &zc) != 0) { + if (errno == ENOMEM && resultp != NULL) { + free((void *)(uintptr_t)zc.zc_nvlist_dst); + zc.zc_nvlist_dst_size *= 2; + zc.zc_nvlist_dst = (uint64_t)(uintptr_t) + malloc(zc.zc_nvlist_dst_size); + if (zc.zc_nvlist_dst == NULL) { + error = ENOMEM; + goto out; + } + } else { + error = errno; + break; + } + } + if (zc.zc_nvlist_dst_filled) { + *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, + zc.zc_nvlist_dst_size); + } else if (resultp != NULL) { + *resultp = NULL; + } + +out: + fnvlist_pack_free(packed, size); + free((void *)(uintptr_t)zc.zc_nvlist_dst); + return (error); +} + +int 
+lzc_create(const char *fsname, dmu_objset_type_t type, nvlist_t *props) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_int32(args, "type", type); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL); + nvlist_free(args); + return (error); +} + +int +lzc_clone(const char *fsname, const char *origin, + nvlist_t *props) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_string(args, "origin", origin); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL); + nvlist_free(args); + return (error); +} + +/* + * Creates snapshots. + * + * The keys in the snaps nvlist are the snapshots to be created. + * They must all be in the same pool. + * + * The props nvlist is properties to set. Currently only user properties + * are supported. { user:prop_name -> string value } + * + * The returned results nvlist will have an entry for each snapshot that failed. + * The value will be the (int32) error code. + * + * The return value will be 0 if all snapshots were created, otherwise it will + * be the errno of a (undetermined) snapshot that failed. + */ +int +lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist) +{ + nvpair_t *elem; + nvlist_t *args; + int error; + char pool[MAXNAMELEN]; + + *errlist = NULL; + + /* determine the pool name */ + elem = nvlist_next_nvpair(snaps, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_nvlist(args, "snaps", snaps); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + + error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist); + nvlist_free(args); + + return (error); +} + +/* + * Destroys snapshots. + * + * The keys in the snaps nvlist are the snapshots to be destroyed. + * They must all be in the same pool. 
+ * + * Snapshots that do not exist will be silently ignored. + * + * If 'defer' is not set, and a snapshot has user holds or clones, the + * destroy operation will fail and none of the snapshots will be + * destroyed. + * + * If 'defer' is set, and a snapshot has user holds or clones, it will be + * marked for deferred destruction, and will be destroyed when the last hold + * or clone is removed/destroyed. + * + * The return value will be 0 if all snapshots were destroyed (or marked for + * later destruction if 'defer' is set) or didn't exist to begin with. + * + * Otherwise the return value will be the errno of a (undetermined) snapshot + * that failed, no snapshots will be destroyed, and the errlist will have an + * entry for each snapshot that failed. The value in the errlist will be + * the (int32) error code. + */ +int +lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) +{ + nvpair_t *elem; + nvlist_t *args; + int error; + char pool[MAXNAMELEN]; + + /* determine the pool name */ + elem = nvlist_next_nvpair(snaps, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_nvlist(args, "snaps", snaps); + if (defer) + fnvlist_add_boolean(args, "defer"); + + error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist); + nvlist_free(args); + + return (error); + +} + +int +lzc_snaprange_space(const char *firstsnap, const char *lastsnap, + uint64_t *usedp) +{ + nvlist_t *args; + nvlist_t *result; + int err; + char fs[MAXNAMELEN]; + char *atp; + + /* determine the fs name */ + (void) strlcpy(fs, firstsnap, sizeof (fs)); + atp = strchr(fs, '@'); + if (atp == NULL) + return (EINVAL); + *atp = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_string(args, "firstsnap", firstsnap); + + err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result); + nvlist_free(args); + if (err == 0) + *usedp = fnvlist_lookup_uint64(result, "used"); + 
fnvlist_free(result); + + return (err); +} + +boolean_t +lzc_exists(const char *dataset) +{ + /* + * The objset_stats ioctl is still legacy, so we need to construct our + * own zfs_cmd_t rather than using zfsc_ioctl(). + */ + zfs_cmd_t zc = { 0 }; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); +} + +/* + * If fromsnap is NULL, a full (non-incremental) stream will be sent. + */ +int +lzc_send(const char *snapname, const char *fromsnap, int fd) +{ + nvlist_t *args; + int err; + + args = fnvlist_alloc(); + fnvlist_add_int32(args, "fd", fd); + if (fromsnap != NULL) + fnvlist_add_string(args, "fromsnap", fromsnap); + err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); + nvlist_free(args); + return (err); +} + +/* + * If fromsnap is NULL, a full (non-incremental) stream will be estimated. + */ +int +lzc_send_space(const char *snapname, const char *fromsnap, uint64_t *spacep) +{ + nvlist_t *args; + nvlist_t *result; + int err; + + args = fnvlist_alloc(); + if (fromsnap != NULL) + fnvlist_add_string(args, "fromsnap", fromsnap); + err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); + nvlist_free(args); + if (err == 0) + *spacep = fnvlist_lookup_uint64(result, "space"); + nvlist_free(result); + return (err); +} + +static int +recv_read(int fd, void *buf, int ilen) +{ + char *cp = buf; + int rv; + int len = ilen; + + do { + rv = read(fd, cp, len); + cp += rv; + len -= rv; + } while (rv > 0); + + if (rv < 0 || len != 0) + return (EIO); + + return (0); +} + +/* + * The simplest receive case: receive from the specified fd, creating the + * specified snapshot. Apply the specified properties a "received" properties + * (which can be overridden by locally-set properties). If the stream is a + * clone, its origin snapshot must be specified by 'origin'. The 'force' + * flag will cause the target filesystem to be rolled back or destroyed if + * necessary to receive. 
+ * + * Return 0 on success or an errno on failure. + * + * Note: this interface does not work on dedup'd streams + * (those with DMU_BACKUP_FEATURE_DEDUP). + */ +int +lzc_receive(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, int fd) +{ + /* + * The receive ioctl is still legacy, so we need to construct our own + * zfs_cmd_t rather than using zfsc_ioctl(). + */ + zfs_cmd_t zc = { 0 }; + char *atp; + char *packed = NULL; + size_t size; + dmu_replay_record_t drr; + int error; + + ASSERT3S(g_refcount, >, 0); + + /* zc_name is name of containing filesystem */ + (void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name)); + atp = strchr(zc.zc_name, '@'); + if (atp == NULL) + return (EINVAL); + *atp = '\0'; + + /* if the fs does not exist, try its parent. */ + if (!lzc_exists(zc.zc_name)) { + char *slashp = strrchr(zc.zc_name, '/'); + if (slashp == NULL) + return (ENOENT); + *slashp = '\0'; + + } + + /* zc_value is full name of the snapshot to create */ + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + + if (props != NULL) { + /* zc_nvlist_src is props to set */ + packed = fnvlist_pack(props, &size); + zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; + zc.zc_nvlist_src_size = size; + } + + /* zc_string is name of clone origin (if DRR_FLAG_CLONE) */ + if (origin != NULL) + (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); + + /* zc_begin_record is non-byteswapped BEGIN record */ + error = recv_read(fd, &drr, sizeof (drr)); + if (error != 0) + goto out; + zc.zc_begin_record = drr.drr_u.drr_begin; + + /* zc_cookie is fd to read from */ + zc.zc_cookie = fd; + + /* zc guid is force flag */ + zc.zc_guid = force; + + /* zc_cleanup_fd is unused */ + zc.zc_cleanup_fd = -1; + + error = ioctl(g_fd, ZFS_IOC_RECV, &zc); + if (error != 0) + error = errno; + +out: + if (packed != NULL) + fnvlist_pack_free(packed, size); + free((void*)(uintptr_t)zc.zc_nvlist_dst); + return (error); +} diff --git 
a/lib/libzfs_core/common/libzfs_core.h b/lib/libzfs_core/common/libzfs_core.h new file mode 100644 index 0000000..9edc884 --- /dev/null +++ b/lib/libzfs_core/common/libzfs_core.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + +#ifndef _LIBZFS_CORE_H +#define _LIBZFS_CORE_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int libzfs_core_init(void); +void libzfs_core_fini(void); + +int lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist); +int lzc_create(const char *fsname, dmu_objset_type_t type, nvlist_t *props); +int lzc_clone(const char *fsname, const char *origin, nvlist_t *props); +int lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist); + +int lzc_snaprange_space(const char *firstsnap, const char *lastsnap, + uint64_t *usedp); + +int lzc_send(const char *snapname, const char *fromsnap, int fd); +int lzc_receive(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, int fd); +int lzc_send_space(const char *snapname, const char *fromsnap, + uint64_t *result); + +boolean_t lzc_exists(const char *dataset); + + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_CORE_H */ diff --git a/lib/libzpool/common/kernel.c b/lib/libzpool/common/kernel.c index 8e1e7f7..04d5307 100644 --- a/lib/libzpool/common/kernel.c +++ b/lib/libzpool/common/kernel.c @@ -871,6 +871,12 @@ crgetuid(cred_t *cr) return (0); } +uid_t +crgetruid(cred_t *cr) +{ + return (0); +} + gid_t crgetgid(cred_t *cr) { diff --git a/lib/libzpool/common/sys/zfs_context.h b/lib/libzpool/common/sys/zfs_context.h index 1f5e758..39af927 100644 --- a/lib/libzpool/common/sys/zfs_context.h +++ b/lib/libzpool/common/sys/zfs_context.h @@ -286,6 +286,7 @@ extern void rw_exit(krwlock_t *rwlp); #define rw_downgrade(rwlp) do { } while (0) extern uid_t crgetuid(cred_t *cr); +extern uid_t crgetruid(cred_t *cr); extern gid_t crgetgid(cred_t *cr); extern int crgetngroups(cred_t *cr); extern gid_t *crgetgroups(cred_t *cr); diff --git a/man/man1m/zfs.1m b/man/man1m/zfs.1m index e713566..32b0cb2 100644 --- a/man/man1m/zfs.1m +++ b/man/man1m/zfs.1m @@ -39,7 +39,7 @@ zfs \- configures ZFS file systems .LP .nf \fBzfs\fR \fBsnapshot\fR [\fB-r\fR] 
[\fB-o\fR \fIproperty\fR=\fIvalue\fR]... - \fIfilesystem@snapname\fR|\fIvolume@snapname\fR + \fIfilesystem@snapname\fR|\fIvolume@snapname\fR... .fi .LP @@ -1837,13 +1837,14 @@ behavior for mounted file systems in use. .ne 2 .na \fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... -\fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR +\fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR... .ad .sp .6 .RS 4n -Creates a snapshot with the given name. All previous modifications by -successful system calls to the file system are part of the snapshot. See the -"Snapshots" section for details. +Creates snapshots with the given names. All previous modifications by +successful system calls to the file system are part of the snapshots. +Snapshots are taken atomically, so that all snapshots correspond to the same +moment in time. See the "Snapshots" section for details. .sp .ne 2 .na @@ -1851,9 +1852,7 @@ successful system calls to the file system are part of the snapshot. See the .ad .sp .6 .RS 4n -Recursively create snapshots of all descendent datasets. Snapshots are taken -atomically, so that all recursive snapshots correspond to the same moment in -time. +Recursively create snapshots of all descendent datasets .RE .sp -- cgit v1.1 From 046a9401928ebf661f30677bcf13cfa674f47204 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 6 Feb 2013 07:54:46 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13934:9e23a7f7b812 Illumos ZFS issues: 3422 zpool create/syseventd race yield non-importable pool 3425 first write to a new zvol can fail with EFBIG --- uts/common/fs/zfs/dmu_tx.c | 10 ++-------- uts/common/fs/zfs/vdev.c | 5 +++-- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/uts/common/fs/zfs/dmu_tx.c b/uts/common/fs/zfs/dmu_tx.c index e44786f..556ae6a 100644 --- a/uts/common/fs/zfs/dmu_tx.c +++ b/uts/common/fs/zfs/dmu_tx.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #include @@ -284,6 +284,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) delta = P2NPHASE(off, dn->dn_datablksz); } + min_ibs = max_ibs = dn->dn_indblkshift; if (dn->dn_maxblkid > 0) { /* * The blocksize can't change, @@ -291,13 +292,6 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) */ ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; - min_ibs = max_ibs = dn->dn_indblkshift; - } else if (dn->dn_indblkshift > max_ibs) { - /* - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on older pools. - */ - min_ibs = max_ibs = dn->dn_indblkshift; } /* diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index 18180ec..4f5c4e9 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -1327,7 +1327,8 @@ vdev_validate(vdev_t *vd, boolean_t strict) if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { uint64_t aux_guid = 0; nvlist_t *nvl; - uint64_t txg = strict ? spa->spa_config_txg : -1ULL; + uint64_t txg = spa_last_synced_txg(spa) != 0 ? 
+ spa_last_synced_txg(spa) : -1ULL; if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1511,7 +1512,7 @@ vdev_reopen(vdev_t *vd) !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { - (void) vdev_validate(vd, spa_last_synced_txg(spa)); + (void) vdev_validate(vd, B_TRUE); } /* -- cgit v1.1 From 3521d0bfab6065ce01749cbe55a920c76a91cf63 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 6 Feb 2013 08:14:58 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate version 13937:6b4f289e7094 Illumos ZFS issues: 3468 mdb enhancements for zfs development (not relevant for FreeBSD) --- uts/common/fs/zfs/sys/refcount.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/uts/common/fs/zfs/sys/refcount.h b/uts/common/fs/zfs/sys/refcount.h index 1752c64..1dcd467 100644 --- a/uts/common/fs/zfs/sys/refcount.h +++ b/uts/common/fs/zfs/sys/refcount.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_REFCOUNT_H @@ -52,8 +53,8 @@ typedef struct refcount { kmutex_t rc_mtx; list_t rc_list; list_t rc_removed; - int64_t rc_count; - int64_t rc_removed_count; + uint64_t rc_count; + uint64_t rc_removed_count; } refcount_t; /* Note: refcount_t must be initialized with refcount_create() */ -- cgit v1.1 From 870a44893b6d9fb9583e691bee2ecdb2b93ae2f5 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 6 Feb 2013 08:17:29 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13939:20e4d8d8da6d illumos dtrace issues: 3511 dtrace.c erroneously checks for memory alignment on amd64 --- uts/common/dtrace/dtrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uts/common/dtrace/dtrace.c b/uts/common/dtrace/dtrace.c index 5013661..7a5e67e 100644 --- a/uts/common/dtrace/dtrace.c +++ b/uts/common/dtrace/dtrace.c @@ -355,7 +355,7 @@ static kmutex_t dtrace_errlock; #define DTRACE_STORE(type, tomax, offset, what) \ *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what); -#ifndef __i386 +#ifndef __x86 #define DTRACE_ALIGNCHECK(addr, size, flags) \ if (addr & (size - 1)) { \ *flags |= CPU_DTRACE_BADALIGN; \ -- cgit v1.1 From 78c7f6de8720de1f612bdc626f6be3f27a6f2aae Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 6 Feb 2013 08:21:40 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13941:d48547176ab4 Illumos ZFS issues: 3498 panic in arc_read(): !refcount_is_zero(&pbuf->b_hdr->b_refcnt) --- uts/common/fs/zfs/arc.c | 52 ++---------------------- uts/common/fs/zfs/bptree.c | 2 +- uts/common/fs/zfs/dbuf.c | 28 ++----------- uts/common/fs/zfs/dmu_diff.c | 8 ++-- uts/common/fs/zfs/dmu_objset.c | 12 ++---- uts/common/fs/zfs/dmu_send.c | 20 ++++----- uts/common/fs/zfs/dmu_traverse.c | 79 ++++++++++++++++-------------------- uts/common/fs/zfs/dsl_dataset.c | 2 +- uts/common/fs/zfs/dsl_scan.c | 36 +++------------- uts/common/fs/zfs/spa.c | 2 +- uts/common/fs/zfs/sys/arc.h | 8 +--- 
uts/common/fs/zfs/sys/dmu_traverse.h | 3 +- uts/common/fs/zfs/sys/dsl_pool.h | 6 --- uts/common/fs/zfs/zil.c | 4 +- uts/common/fs/zfs/zio.c | 2 +- uts/common/fs/zfs/zvol.c | 2 +- 16 files changed, 73 insertions(+), 193 deletions(-) diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index 860a3b5..6625deb 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -823,7 +823,6 @@ buf_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -853,7 +852,6 @@ buf_dest(void *vbuf, void *unused) arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); - rw_destroy(&buf->b_data_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } @@ -2779,42 +2777,11 @@ arc_read_done(zio_t *zio) * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. - * - * Normal callers should use arc_read and pass the arc buffer and offset - * for the bp. But if you know you don't need locking, you can use - * arc_read_bp. */ int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) -{ - int err; - - if (pbuf == NULL) { - /* - * XXX This happens from traverse callback funcs, for - * the objset_phys_t block. 
- */ - return (arc_read_nolock(pio, spa, bp, done, private, priority, - zio_flags, arc_flags, zb)); - } - - ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); - ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); - rw_enter(&pbuf->b_data_lock, RW_READER); - - err = arc_read_nolock(pio, spa, bp, done, private, priority, - zio_flags, arc_flags, zb); - rw_exit(&pbuf->b_data_lock); - - return (err); -} - -int -arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, + void *private, int priority, int zio_flags, uint32_t *arc_flags, + const zbookmark_t *zb) { arc_buf_hdr_t *hdr; arc_buf_t *buf; @@ -3294,19 +3261,6 @@ arc_release(arc_buf_t *buf, void *tag) } } -/* - * Release this buffer. If it does not match the provided BP, fill it - * with that block's contents. - */ -/* ARGSUSED */ -int -arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, - zbookmark_t *zb) -{ - arc_release(buf, tag); - return (0); -} - int arc_released(arc_buf_t *buf) { diff --git a/uts/common/fs/zfs/bptree.c b/uts/common/fs/zfs/bptree.c index 1a009cf..73922db 100644 --- a/uts/common/fs/zfs/bptree.c +++ b/uts/common/fs/zfs/bptree.c @@ -135,7 +135,7 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, /* ARGSUSED */ static int -bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, +bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { int err; diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c index d56dbc9..8bf3d09 100644 --- a/uts/common/fs/zfs/dbuf.c +++ b/uts/common/fs/zfs/dbuf.c @@ -513,7 +513,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) spa_t *spa; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; - arc_buf_t *pbuf; 
DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -575,14 +574,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); - /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ - if (db->db_parent) - pbuf = db->db_parent->db_buf; - else - pbuf = db->db_objset->os_phys_buf; - - (void) dsl_read(zio, spa, db->db_blkptr, pbuf, + (void) arc_read(zio, spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -982,7 +975,6 @@ void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os; - zbookmark_t zb; DB_GET_OBJSET(&os, db); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); @@ -990,13 +982,7 @@ dbuf_release_bp(dmu_buf_impl_t *db) list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); - zb.zb_objset = os->os_dsl_dataset ? - os->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - (void) arc_release_bp(db->db_buf, db, - db->db_blkptr, os->os_spa, &zb); + (void) arc_release(db->db_buf, db); } dbuf_dirty_record_t * @@ -1831,7 +1817,6 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) if (bp && !BP_IS_HOLE(bp)) { int priority = dn->dn_type == DMU_OT_DDT_ZAP ? ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; - arc_buf_t *pbuf; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; @@ -1839,13 +1824,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, dn->dn_object, 0, blkid); - if (db) - pbuf = db->db_buf; - else - pbuf = dn->dn_objset->os_phys_buf; - - (void) dsl_read(NULL, dn->dn_objset->os_spa, - bp, pbuf, NULL, NULL, priority, + (void) arc_read(NULL, dn->dn_objset->os_spa, + bp, NULL, NULL, priority, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } diff --git a/uts/common/fs/zfs/dmu_diff.c b/uts/common/fs/zfs/dmu_diff.c index 22340eb..dc23778 100644 --- a/uts/common/fs/zfs/dmu_diff.c +++ b/uts/common/fs/zfs/dmu_diff.c @@ -105,7 +105,7 @@ report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) /* ARGSUSED */ static int -diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, +diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct diffarg *da = arg; @@ -132,9 +132,9 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, int blksz = BP_GET_LSIZE(bp); int i; - if (dsl_read(NULL, spa, bp, pbuf, - arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) return (EIO); blk = abuf->b_data; diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c index b840881..00cbe04 100644 --- a/uts/common/fs/zfs/dmu_objset.c +++ b/uts/common/fs/zfs/dmu_objset.c @@ -277,12 +277,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, aflags |= ARC_L2CACHE; dprintf_bp(os->os_rootbp, "reading %s", ""); - /* - * XXX when bprewrite scrub can change the bp, - * and this is called from dmu_objset_open_ds_os, the bp - * could change, and we'll need a lock. 
- */ - err = dsl_read_nolock(NULL, spa, os->os_rootbp, + err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { @@ -1173,8 +1168,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - VERIFY3U(0, ==, arc_release_bp(os->os_phys_buf, &os->os_phys_buf, - os->os_rootbp, os->os_spa, &zb)); + arc_release(os->os_phys_buf, &os->os_phys_buf); dmu_write_policy(os, NULL, 0, 0, &zp); @@ -1813,7 +1807,7 @@ dmu_objset_prefetch(const char *name, void *arg) SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - (void) dsl_read_nolock(NULL, dsl_dataset_get_spa(ds), + (void) arc_read(NULL, dsl_dataset_get_spa(ds), &ds->ds_phys->ds_bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c index e5644b5..c249335 100644 --- a/uts/common/fs/zfs/dmu_send.c +++ b/uts/common/fs/zfs/dmu_send.c @@ -301,7 +301,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) /* ARGSUSED */ static int -backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, +backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { dmu_sendarg_t *dsp = arg; @@ -330,9 +330,9 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; - if (dsl_read(NULL, spa, bp, pbuf, - arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) return (EIO); blk = abuf->b_data; @@ -349,9 +349,9 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, arc_buf_t 
*abuf; int blksz = BP_GET_LSIZE(bp); - if (arc_read_nolock(NULL, spa, bp, - arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) return (EIO); err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); @@ -361,9 +361,9 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); - if (dsl_read(NULL, spa, bp, pbuf, - arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_buf_alloc(spa, blksz, &abuf, diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c index 34f19cd..f3d5069 100644 --- a/uts/common/fs/zfs/dmu_traverse.c +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -62,9 +62,9 @@ typedef struct traverse_data { } traverse_data_t; static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - arc_buf_t *buf, uint64_t objset, uint64_t object); + uint64_t objset, uint64_t object); static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, - arc_buf_t *buf, uint64_t objset, uint64_t object); + uint64_t objset, uint64_t object); static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) @@ -81,7 +81,7 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); + (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); return (0); } @@ -105,7 +105,7 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 
SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, + (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); } return (0); @@ -182,7 +182,7 @@ traverse_pause(traverse_data_t *td, const zbookmark_t *zb) static void traverse_prefetch_metadata(traverse_data_t *td, - arc_buf_t *pbuf, const blkptr_t *bp, const zbookmark_t *zb) + const blkptr_t *bp, const zbookmark_t *zb) { uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; @@ -200,14 +200,13 @@ traverse_prefetch_metadata(traverse_data_t *td, if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return; - (void) arc_read(NULL, td->td_spa, bp, - pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &flags, zb); + (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); } static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - arc_buf_t *pbuf, const blkptr_t *bp, const zbookmark_t *zb) + const blkptr_t *bp, const zbookmark_t *zb) { zbookmark_t czb; int err = 0, lasterr = 0; @@ -228,8 +227,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (BP_IS_HOLE(bp)) { - err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, - td->td_arg); + err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); return (err); } @@ -249,7 +247,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (td->td_flags & TRAVERSE_PRE) { - err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); @@ -265,8 +263,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - err = dsl_read(NULL, td->td_spa, bp, pbuf, - arc_getbuf_func, &buf, + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, 
ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); @@ -276,7 +273,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); - traverse_prefetch_metadata(td, buf, &cbp[i], &czb); + traverse_prefetch_metadata(td, &cbp[i], &czb); } /* recursively visitbp() blocks below this */ @@ -284,7 +281,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); - err = traverse_visitbp(td, dnp, buf, &cbp[i], &czb); + err = traverse_visitbp(td, dnp, &cbp[i], &czb); if (err) { if (!hard) break; @@ -296,21 +293,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - err = dsl_read(NULL, td->td_spa, bp, pbuf, - arc_getbuf_func, &buf, + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); dnp = buf->b_data; for (i = 0; i < epb; i++) { - prefetch_dnode_metadata(td, &dnp[i], buf, zb->zb_objset, + prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { - err = traverse_dnode(td, &dnp[i], buf, zb->zb_objset, + err = traverse_dnode(td, &dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err) { if (!hard) @@ -323,24 +319,23 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, objset_phys_t *osp; dnode_phys_t *dnp; - err = dsl_read_nolock(NULL, td->td_spa, bp, - arc_getbuf_func, &buf, + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); osp = buf->b_data; dnp = &osp->os_meta_dnode; - prefetch_dnode_metadata(td, dnp, buf, zb->zb_objset, + prefetch_dnode_metadata(td, dnp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (arc_buf_size(buf) >= sizeof 
(objset_phys_t)) { prefetch_dnode_metadata(td, &osp->os_userused_dnode, - buf, zb->zb_objset, DMU_USERUSED_OBJECT); + zb->zb_objset, DMU_USERUSED_OBJECT); prefetch_dnode_metadata(td, &osp->os_groupused_dnode, - buf, zb->zb_objset, DMU_USERUSED_OBJECT); + zb->zb_objset, DMU_USERUSED_OBJECT); } - err = traverse_dnode(td, dnp, buf, zb->zb_objset, + err = traverse_dnode(td, dnp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err && hard) { lasterr = err; @@ -348,7 +343,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_userused_dnode; - err = traverse_dnode(td, dnp, buf, zb->zb_objset, + err = traverse_dnode(td, dnp, zb->zb_objset, DMU_USERUSED_OBJECT); } if (err && hard) { @@ -357,7 +352,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { dnp = &osp->os_groupused_dnode; - err = traverse_dnode(td, dnp, buf, zb->zb_objset, + err = traverse_dnode(td, dnp, zb->zb_objset, DMU_GROUPUSED_OBJECT); } } @@ -367,8 +362,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, post: if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { - err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, - td->td_arg); + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err == ERESTART) pause = B_TRUE; } @@ -384,25 +378,25 @@ post: static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, - arc_buf_t *buf, uint64_t objset, uint64_t object) + uint64_t objset, uint64_t object) { int j; zbookmark_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - traverse_prefetch_metadata(td, buf, &dnp->dn_blkptr[j], &czb); + traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - traverse_prefetch_metadata(td, buf, 
&dnp->dn_spill, &czb); + traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); } } static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - arc_buf_t *buf, uint64_t objset, uint64_t object) + uint64_t objset, uint64_t object) { int j, err = 0, lasterr = 0; zbookmark_t czb; @@ -410,7 +404,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, buf, &dnp->dn_blkptr[j], &czb); + err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); if (err) { if (!hard) break; @@ -420,7 +414,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - err = traverse_visitbp(td, dnp, buf, &dnp->dn_spill, &czb); + err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); if (err) { if (!hard) return (err); @@ -433,8 +427,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, - void *arg) + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; @@ -455,10 +448,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); - (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, zb); + (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); return (0); } @@ -476,7 +467,7 @@ traverse_prefetch_thread(void *arg) SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); + (void) 
traverse_visitbp(&td, NULL, td.td_rootbp, &czb); mutex_enter(&td_main->td_pfd->pd_mtx); td_main->td_pfd->pd_exited = B_TRUE; @@ -540,7 +531,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); + err = traverse_visitbp(&td, NULL, rootbp, &czb); mutex_enter(&pd.pd_mtx); pd.pd_cancel = B_TRUE; diff --git a/uts/common/fs/zfs/dsl_dataset.c b/uts/common/fs/zfs/dsl_dataset.c index 6625444..177c63e 100644 --- a/uts/common/fs/zfs/dsl_dataset.c +++ b/uts/common/fs/zfs/dsl_dataset.c @@ -1260,7 +1260,7 @@ struct killarg { /* ARGSUSED */ static int -kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c index 8f08f04..aec313a 100644 --- a/uts/common/fs/zfs/dsl_scan.c +++ b/uts/common/fs/zfs/dsl_scan.c @@ -366,24 +366,6 @@ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); } -int -dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) -{ - return (arc_read(pio, spa, bpp, pbuf, done, private, - priority, zio_flags, arc_flags, zb)); -} - -int -dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) -{ - return (arc_read_nolock(pio, spa, bpp, done, private, - priority, zio_flags, arc_flags, zb)); -} - static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { @@ -554,12 +536,8 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t 
*bp, SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); - /* - * XXX need to make sure all of these arc_read() prefetches are - * done before setting xlateall (similar to dsl_read()) - */ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, - buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); } @@ -617,8 +595,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - err = arc_read_nolock(NULL, dp->dp_spa, bp, - arc_getbuf_func, bufp, + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -640,8 +617,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) { uint32_t flags = ARC_WAIT; - err = arc_read_nolock(NULL, dp->dp_spa, bp, - arc_getbuf_func, bufp, + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -653,8 +629,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - err = arc_read_nolock(NULL, dp->dp_spa, bp, - arc_getbuf_func, bufp, + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -676,8 +651,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, uint32_t flags = ARC_WAIT; objset_phys_t *osp; - err = arc_read_nolock(NULL, dp->dp_spa, bp, - arc_getbuf_func, bufp, + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 
4988b50..333bbee 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -1732,7 +1732,7 @@ spa_load_verify_done(zio_t *zio) /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { if (bp != NULL) { zio_t *rio = arg; diff --git a/uts/common/fs/zfs/sys/arc.h b/uts/common/fs/zfs/sys/arc.h index b109dca..916d2ab 100644 --- a/uts/common/fs/zfs/sys/arc.h +++ b/uts/common/fs/zfs/sys/arc.h @@ -49,7 +49,6 @@ struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; kmutex_t b_evict_lock; - krwlock_t b_data_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; @@ -93,8 +92,6 @@ void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); -int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa, - zbookmark_t *zb); int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); @@ -104,10 +101,7 @@ boolean_t arc_buf_eviction_needed(arc_buf_t *buf); int arc_referenced(arc_buf_t *buf); #endif -int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb); -int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, +int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int flags, uint32_t *arc_flags, const zbookmark_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, diff --git a/uts/common/fs/zfs/sys/dmu_traverse.h b/uts/common/fs/zfs/sys/dmu_traverse.h index 3cbf42f..bc1590b 100644 --- a/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/uts/common/fs/zfs/sys/dmu_traverse.h @@ -40,8 +40,7 @@ struct zilog; struct 
arc_buf; typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp, - void *arg); + const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); #define TRAVERSE_PRE (1<<0) #define TRAVERSE_POST (1<<1) diff --git a/uts/common/fs/zfs/sys/dsl_pool.h b/uts/common/fs/zfs/sys/dsl_pool.h index f8c98ed..ab1229a 100644 --- a/uts/common/fs/zfs/sys/dsl_pool.h +++ b/uts/common/fs/zfs/sys/dsl_pool.h @@ -134,12 +134,6 @@ void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); -int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb); -int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb); void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c index 9697234..81d2bb5 100644 --- a/uts/common/fs/zfs/zil.c +++ b/uts/common/fs/zfs/zil.c @@ -190,7 +190,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { @@ -266,7 +266,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) SET_BOOKMARK(&zb, 
dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index 582c504..04b8ddf 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -1994,7 +1994,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) ddt_exit(ddt); - error = arc_read_nolock(NULL, spa, &blk, + error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c index 100fdd0..da1b0ef 100644 --- a/uts/common/fs/zfs/zvol.c +++ b/uts/common/fs/zfs/zvol.c @@ -236,7 +236,7 @@ struct maparg { /*ARGSUSED*/ static int -zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, +zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct maparg *ma = arg; -- cgit v1.1 From 7898655cb728e5ea7dd92f166ccd00463319ed1d Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 6 Feb 2013 08:29:00 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate version 13945:7a9c1d41dfbe Illumos ZFS issues: 3507 Tunable to allow block allocation even on degraded vdevs --- uts/common/fs/zfs/metaslab.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index 6449788..ceb7c6f 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
*/ #include @@ -91,6 +92,11 @@ int metaslab_prefetch_limit = SPA_DVAS_PER_BP; int metaslab_smo_bonus_pct = 150; /* + * Should we be willing to write data to degraded vdevs? + */ +boolean_t zfs_write_to_degraded = B_FALSE; + +/* * ========================================================================== * Metaslab classes * ========================================================================== @@ -1377,10 +1383,13 @@ top: /* * Avoid writing single-copy data to a failing vdev + * unless the user instructs us that it is okay. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && - d == 0 && dshift == 3) { + d == 0 && dshift == 3 && + !(zfs_write_to_degraded && vd->vdev_state == + VDEV_STATE_DEGRADED)) { all_zero = B_FALSE; goto next; } -- cgit v1.1 From b600e6e2296f4cafc3434ee74664407e7af76ed8 Mon Sep 17 00:00:00 2001 From: mm Date: Sun, 10 Feb 2013 19:41:19 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate version 13949:4f6a155f70fe Illumos ZFS issues: 3512 rounding discrepancy in sa_find_sizes() 3513 mismatch between SA header size and layout --- uts/common/fs/zfs/sa.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/uts/common/fs/zfs/sa.c b/uts/common/fs/zfs/sa.c index 06607d7..7cd4ab0 100644 --- a/uts/common/fs/zfs/sa.c +++ b/uts/common/fs/zfs/sa.c @@ -553,6 +553,7 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, { int var_size = 0; int i; + int j = -1; int full_space; int hdrsize; boolean_t done = B_FALSE; @@ -574,10 +575,12 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, sizeof (sa_hdr_phys_t); full_space = (buftype == SA_BONUS) ? 
DN_MAX_BONUSLEN : db->db_size; + ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { boolean_t is_var_sz; + *total = P2ROUNDUP(*total, 8); *total += attr_desc[i].sa_length; if (done) goto next; @@ -590,7 +593,14 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, if (is_var_sz && var_size > 1) { if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + *total < full_space) { + /* + * Account for header space used by array of + * optional sizes of variable-length attributes. + * Record the index in case this increase needs + * to be reversed due to spill-over. + */ hdrsize += sizeof (uint16_t); + j = i; } else { done = B_TRUE; *index = i; @@ -619,6 +629,14 @@ next: *will_spill = B_TRUE; } + /* + * j holds the index of the last variable-sized attribute for + * which hdrsize was increased. Reverse the increase if that + * attribute will be relocated to the spill block. + */ + if (*will_spill && j == *index) + hdrsize -= sizeof (uint16_t); + hdrsize = P2ROUNDUP(hdrsize, 8); return (hdrsize); } @@ -709,12 +727,15 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { uint16_t length; + ASSERT(IS_P2ALIGNED(data_start, 8)); + ASSERT(IS_P2ALIGNED(buf_space, 8)); attrs[i] = attr_desc[i].sa_attr; length = SA_REGISTERED_LEN(sa, attrs[i]); if (length == 0) length = attr_desc[i].sa_length; if (buf_space < length) { /* switch to spill buffer */ + VERIFY(spilling); VERIFY(bonustype == DMU_OT_SA); if (buftype == SA_BONUS && !sa->sa_force_spill) { sa_find_layout(hdl->sa_os, hash, attrs_start, -- cgit v1.1 From 852ad5f941bbecd32254ed4766c0dcdf6b503c45 Mon Sep 17 00:00:00 2001 From: mm Date: Mon, 11 Feb 2013 08:06:18 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate 13952:7a22d0770fc8 Illumos ZFS issues: 3522 zfs module should not allow uninitialized variables --- uts/common/fs/zfs/arc.c | 12 ++++++++---- uts/common/fs/zfs/dmu.c | 3 +-- 
uts/common/fs/zfs/dmu_objset.c | 3 ++- uts/common/fs/zfs/dsl_dataset.c | 9 +++------ uts/common/fs/zfs/dsl_scan.c | 3 ++- uts/common/fs/zfs/lzjb.c | 7 +++++-- uts/common/fs/zfs/refcount.c | 2 +- uts/common/fs/zfs/sa.c | 8 +++++--- uts/common/fs/zfs/spa.c | 5 +++-- uts/common/fs/zfs/vdev_raidz.c | 3 ++- uts/common/fs/zfs/zap_leaf.c | 2 +- uts/common/fs/zfs/zfs_byteswap.c | 2 +- uts/common/fs/zfs/zfs_fuid.c | 9 ++++++--- uts/common/fs/zfs/zfs_ioctl.c | 2 +- uts/common/fs/zfs/zfs_log.c | 3 +-- uts/common/fs/zfs/zfs_rlock.c | 2 +- uts/common/fs/zfs/zfs_vfsops.c | 18 ++++++++++++------ uts/common/fs/zfs/zfs_vnops.c | 9 +++++---- 18 files changed, 60 insertions(+), 42 deletions(-) diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index 6625deb..258a89f 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -2784,7 +2784,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; - arc_buf_t *buf; + arc_buf_t *buf = NULL; kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_load_guid(spa); @@ -2866,7 +2866,7 @@ top: uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; - uint64_t addr; + uint64_t addr = 0; boolean_t devw = B_FALSE; if (hdr == NULL) { @@ -2981,6 +2981,10 @@ top: cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; + ASSERT(addr >= VDEV_LABEL_START_SIZE && + addr + size < vd->vdev_psize - + VDEV_LABEL_END_SIZE); + /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). @@ -3172,8 +3176,8 @@ arc_release(arc_buf_t *buf, void *tag) if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); hdr->b_l2hdr = NULL; - buf_size = hdr->b_size; } + buf_size = hdr->b_size; /* * Do we have more than one buf? 
@@ -4172,7 +4176,7 @@ l2arc_read_done(zio_t *zio) static list_t * l2arc_list_locked(int list_num, kmutex_t **lock) { - list_t *list; + list_t *list = NULL; ASSERT(list_num >= 0 && list_num <= 3); diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c index 9aebb97..21cdd7c 100644 --- a/uts/common/fs/zfs/dmu.c +++ b/uts/common/fs/zfs/dmu.c @@ -405,8 +405,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; - if (dp && dsl_pool_sync_context(dp)) - start = gethrtime(); + start = gethrtime(); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, offset); for (i = 0; i < nblks; i++) { diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c index 00cbe04..74c1192 100644 --- a/uts/common/fs/zfs/dmu_objset.c +++ b/uts/common/fs/zfs/dmu_objset.c @@ -1372,7 +1372,8 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; - uint64_t *user, *group; + uint64_t *user = NULL; + uint64_t *group = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; diff --git a/uts/common/fs/zfs/dsl_dataset.c b/uts/common/fs/zfs/dsl_dataset.c index 177c63e..cf888d2 100644 --- a/uts/common/fs/zfs/dsl_dataset.c +++ b/uts/common/fs/zfs/dsl_dataset.c @@ -375,7 +375,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = dmu_buf_get_user(dbuf); if (ds == NULL) { - dsl_dataset_t *winner; + dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; @@ -460,11 +460,8 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_reserved = ds->ds_quota = 0; } - if (err == 0) { - winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, - dsl_dataset_evict); - } - if (err || winner) { + if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, 
ds, + &ds->ds_phys, dsl_dataset_evict)) != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c index aec313a..e171725 100644 --- a/uts/common/fs/zfs/dsl_scan.c +++ b/uts/common/fs/zfs/dsl_scan.c @@ -1627,7 +1627,8 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, zio_priority = ZIO_PRIORITY_SCRUB; needs_io = B_TRUE; scan_delay = zfs_scrub_delay; - } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + } else { + ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; zio_priority = ZIO_PRIORITY_RESILVER; needs_io = B_FALSE; diff --git a/uts/common/fs/zfs/lzjb.c b/uts/common/fs/zfs/lzjb.c index ab3de51..a938fee 100644 --- a/uts/common/fs/zfs/lzjb.c +++ b/uts/common/fs/zfs/lzjb.c @@ -37,6 +37,7 @@ */ #include +#include #define MATCH_BITS 6 #define MATCH_MIN 3 @@ -50,7 +51,8 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { uchar_t *src = s_start; uchar_t *dst = d_start; - uchar_t *cpy, *copymap; + uchar_t *cpy; + uchar_t *copymap = NULL; int copymask = 1 << (NBBY - 1); int mlen, offset, hash; uint16_t *hp; @@ -99,7 +101,8 @@ lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) uchar_t *src = s_start; uchar_t *dst = d_start; uchar_t *d_end = (uchar_t *)d_start + d_len; - uchar_t *cpy, copymap; + uchar_t *cpy; + uchar_t copymap = 0; int copymask = 1 << (NBBY - 1); while (dst < d_end) { diff --git a/uts/common/fs/zfs/refcount.c b/uts/common/fs/zfs/refcount.c index 600132f..3a8e144 100644 --- a/uts/common/fs/zfs/refcount.c +++ b/uts/common/fs/zfs/refcount.c @@ -110,7 +110,7 @@ refcount_count(refcount_t *rc) int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder) { - reference_t *ref; + reference_t *ref = NULL; int64_t count; if (reference_tracking_enable) { diff --git a/uts/common/fs/zfs/sa.c b/uts/common/fs/zfs/sa.c index 7cd4ab0..cd3a58b 
100644 --- a/uts/common/fs/zfs/sa.c +++ b/uts/common/fs/zfs/sa.c @@ -660,7 +660,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, int buf_space; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; - int hdrsize, spillhdrsize; + int hdrsize; + int spillhdrsize = 0; int used; dmu_object_type_t bonustype; sa_lot_t *lot; @@ -834,7 +835,7 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; - uint64_t sa_reg_count; + uint64_t sa_reg_count = 0; int error = 0; uint64_t attr_value; sa_attr_table_t *tb; @@ -1639,7 +1640,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; - int bonus_data_size, spill_data_size; + int bonus_data_size = 0; + int spill_data_size = 0; int spill_attr_count = 0; int error; uint16_t length; diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 333bbee..09a322b 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -374,7 +374,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; - uint64_t objnum; + uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; @@ -1342,6 +1342,7 @@ spa_load_l2cache(spa_t *spa) newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); } else { nl2cache = 0; + newvdevs = NULL; } oldvdevs = sav->sav_vdevs; @@ -4446,7 +4447,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; - uint64_t unspare_guid; + uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); diff --git a/uts/common/fs/zfs/vdev_raidz.c b/uts/common/fs/zfs/vdev_raidz.c index efae534..5c36753 100644 --- a/uts/common/fs/zfs/vdev_raidz.c +++ b/uts/common/fs/zfs/vdev_raidz.c @@ -1190,7 +1190,8 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, 
int nmissing, uint64_t ccount; uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; - uint8_t log, val; + uint8_t log = 0; + uint8_t val; int ll; uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; diff --git a/uts/common/fs/zfs/zap_leaf.c b/uts/common/fs/zfs/zap_leaf.c index 19a795d..b867ac4 100644 --- a/uts/common/fs/zfs/zap_leaf.c +++ b/uts/common/fs/zfs/zap_leaf.c @@ -220,7 +220,7 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, uint16_t chunk_head; uint16_t *chunkp = &chunk_head; int byten = 0; - uint64_t value; + uint64_t value = 0; int shift = (integer_size-1)*8; int len = num_integers; diff --git a/uts/common/fs/zfs/zfs_byteswap.c b/uts/common/fs/zfs/zfs_byteswap.c index acf632b..6048eb1 100644 --- a/uts/common/fs/zfs/zfs_byteswap.c +++ b/uts/common/fs/zfs/zfs_byteswap.c @@ -51,7 +51,7 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) { caddr_t end; caddr_t ptr; - zfs_ace_t *zacep; + zfs_ace_t *zacep = NULL; ace_t *acep; uint16_t entry_type; size_t entry_size; diff --git a/uts/common/fs/zfs/zfs_fuid.c b/uts/common/fs/zfs/zfs_fuid.c index a853f4d..6eb03c2 100644 --- a/uts/common/fs/zfs/zfs_fuid.c +++ b/uts/common/fs/zfs/zfs_fuid.c @@ -556,9 +556,9 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, uint32_t fuid_idx = FUID_INDEX(id); uint32_t rid; idmap_stat status; - uint64_t idx; + uint64_t idx = 0; zfs_fuid_t *zfuid = NULL; - zfs_fuid_info_t *fuidp; + zfs_fuid_info_t *fuidp = NULL; /* * If POSIX ID, or entry is already a FUID then @@ -583,6 +583,9 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, if (fuidp == NULL) return (UID_NOBODY); + VERIFY3U(type, >=, ZFS_OWNER); + VERIFY3U(type, <=, ZFS_ACE_GROUP); + switch (type) { case ZFS_ACE_USER: case ZFS_ACE_GROUP: @@ -599,7 +602,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, idx = FUID_INDEX(fuidp->z_fuid_group); break; }; - domain = fuidp->z_domain_table[idx -1]; + domain = fuidp->z_domain_table[idx - 1]; } else { if 
(type == ZFS_OWNER || type == ZFS_ACE_USER) status = kidmap_getsidbyuid(crgetzone(cr), id, diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c index d7c7104..090d6c9 100644 --- a/uts/common/fs/zfs/zfs_ioctl.c +++ b/uts/common/fs/zfs/zfs_ioctl.c @@ -975,7 +975,7 @@ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; - int error; + int error = 0; nvpair_t *pair; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) diff --git a/uts/common/fs/zfs/zfs_log.c b/uts/common/fs/zfs/zfs_log.c index 26ab782..de786bf 100644 --- a/uts/common/fs/zfs/zfs_log.c +++ b/uts/common/fs/zfs/zfs_log.c @@ -238,7 +238,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, itx_t *itx; lr_create_t *lr; lr_acl_create_t *lracl; - size_t aclsize; + size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0; size_t xvatsize = 0; size_t txsize; xvattr_t *xvap = (xvattr_t *)vap; @@ -268,7 +268,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; lrsize = sizeof (*lr); } else { - aclsize = (vsecp) ? 
vsecp->vsa_aclentsz : 0; txsize = sizeof (lr_acl_create_t) + namesize + fuidsz + ZIL_ACE_LENGTH(aclsize) + xvatsize; diff --git a/uts/common/fs/zfs/zfs_rlock.c b/uts/common/fs/zfs/zfs_rlock.c index 08f88b8..be56249 100644 --- a/uts/common/fs/zfs/zfs_rlock.c +++ b/uts/common/fs/zfs/zfs_rlock.c @@ -463,7 +463,7 @@ static void zfs_range_unlock_reader(znode_t *zp, rl_t *remove) { avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl, *next; + rl_t *rl, *next = NULL; uint64_t len; /* diff --git a/uts/common/fs/zfs/zfs_vfsops.c b/uts/common/fs/zfs/zfs_vfsops.c index 3278a77..2a25017 100644 --- a/uts/common/fs/zfs/zfs_vfsops.c +++ b/uts/common/fs/zfs/zfs_vfsops.c @@ -407,12 +407,18 @@ zfs_register_callbacks(vfs_t *vfsp) objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; - int readonly, do_readonly = B_FALSE; - int setuid, do_setuid = B_FALSE; - int exec, do_exec = B_FALSE; - int devices, do_devices = B_FALSE; - int xattr, do_xattr = B_FALSE; - int atime, do_atime = B_FALSE; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; + boolean_t devices = B_FALSE; + boolean_t do_devices = B_FALSE; + boolean_t xattr = B_FALSE; + boolean_t do_xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; int error = 0; ASSERT(vfsp); diff --git a/uts/common/fs/zfs/zfs_vnops.c b/uts/common/fs/zfs/zfs_vnops.c index de7812f..7d7168f 100644 --- a/uts/common/fs/zfs/zfs_vnops.c +++ b/uts/common/fs/zfs/zfs_vnops.c @@ -447,7 +447,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; ssize_t n, nbytes; - int error; + int error = 0; rl_t *rl; xuio_t *xuio = NULL; @@ -599,9 +599,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; - int error; + int error = 0; 
arc_buf_t *abuf; - iovec_t *aiov; + iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; @@ -2186,6 +2186,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; + outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; @@ -2627,7 +2628,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; - uint_t saved_mask; + uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; -- cgit v1.1 From 6d1e6e607d30506f6440d76f89f8932d8b0bb32f Mon Sep 17 00:00:00 2001 From: mm Date: Mon, 11 Feb 2013 08:07:56 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13953:0cc6917308f7 Illumos dtrace issues: 3529 iostat should display time used by dtrace --- uts/common/dtrace/dtrace.c | 9 +++++++-- uts/common/sys/cpuvar.h | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/uts/common/dtrace/dtrace.c b/uts/common/dtrace/dtrace.c index 7a5e67e..4bee438 100644 --- a/uts/common/dtrace/dtrace.c +++ b/uts/common/dtrace/dtrace.c @@ -5861,7 +5861,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, size_t size; int vtime, onintr; volatile uint16_t *flags; - hrtime_t now; + hrtime_t now, end; /* * Kick out immediately if this CPU is still being born (in which case @@ -5876,6 +5876,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, cpuid = CPU->cpu_id; onintr = CPU_ON_INTR(CPU); + CPU->cpu_dtrace_probes++; + if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE && probe->dtpr_predcache == curthread->t_predcache) { /* @@ -6455,8 +6457,11 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, buf->dtb_offset = offs + ecb->dte_size; } + end = dtrace_gethrtime(); if (vtime) - curthread->t_dtrace_start = dtrace_gethrtime(); + curthread->t_dtrace_start = end; + + 
CPU->cpu_dtrace_nsec += end - now; dtrace_interrupt_enable(cookie); } diff --git a/uts/common/sys/cpuvar.h b/uts/common/sys/cpuvar.h index d4075d5..6c07bcb 100644 --- a/uts/common/sys/cpuvar.h +++ b/uts/common/sys/cpuvar.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_CPUVAR_H @@ -187,6 +188,9 @@ typedef struct cpu { uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */ hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */ hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */ + uint64_t cpu_dtrace_probes; /* DTrace: total probes fired */ + hrtime_t cpu_dtrace_nsec; /* DTrace: ns in dtrace_probe */ + volatile uint16_t cpu_mstate; /* cpu microstate */ volatile uint16_t cpu_mstate_gen; /* generation counter */ volatile hrtime_t cpu_mstate_start; /* cpu microstate start time */ -- cgit v1.1 From 20e8d8825939368ddcaef368b97c925fb0ec0e93 Mon Sep 17 00:00:00 2001 From: mm Date: Mon, 18 Feb 2013 11:48:08 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13957:512faafc0eaf Illumos ZFS issues: 3537 want pool io kstats --- uts/common/fs/zfs/spa_misc.c | 12 +++++++++ uts/common/fs/zfs/sys/spa_impl.h | 2 ++ uts/common/fs/zfs/vdev_queue.c | 54 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index 0e2d0ef..405d93c 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -480,6 +480,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); 
cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); @@ -559,6 +560,13 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) KM_SLEEP) == 0); } + spa->spa_iokstat = kstat_create("zfs", 0, name, + "disk", KSTAT_TYPE_IO, 1, 0); + if (spa->spa_iokstat) { + spa->spa_iokstat->ks_lock = &spa->spa_iokstat_lock; + kstat_install(spa->spa_iokstat); + } + return (spa); } @@ -608,6 +616,9 @@ spa_remove(spa_t *spa) spa_config_lock_destroy(spa); + kstat_delete(spa->spa_iokstat); + spa->spa_iokstat = NULL; + for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); @@ -625,6 +636,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); + mutex_destroy(&spa->spa_iokstat_lock); kmem_free(spa, sizeof (spa_t)); } diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h index 42ce555..ffd676e 100644 --- a/uts/common/fs/zfs/sys/spa_impl.h +++ b/uts/common/fs/zfs/sys/spa_impl.h @@ -231,6 +231,8 @@ struct spa { uint64_t spa_deadman_calls; /* number of deadman calls */ uint64_t spa_sync_starttime; /* starting time fo spa_sync */ uint64_t spa_deadman_synctime; /* deadman expiration timer */ + kmutex_t spa_iokstat_lock; /* protects spa_iokstat_* */ + struct kstat *spa_iokstat; /* kstat of io to this pool */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. 
diff --git a/uts/common/fs/zfs/vdev_queue.c b/uts/common/fs/zfs/vdev_queue.c index 2b06040..a806e93 100644 --- a/uts/common/fs/zfs/vdev_queue.c +++ b/uts/common/fs/zfs/vdev_queue.c @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -142,15 +143,62 @@ vdev_queue_fini(vdev_t *vd) static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { + spa_t *spa = zio->io_spa; avl_add(&vq->vq_deadline_tree, zio); avl_add(zio->io_vdev_tree, zio); + + if (spa->spa_iokstat != NULL) { + mutex_enter(&spa->spa_iokstat_lock); + kstat_waitq_enter(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); + } } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { + spa_t *spa = zio->io_spa; avl_remove(&vq->vq_deadline_tree, zio); avl_remove(zio->io_vdev_tree, zio); + + if (spa->spa_iokstat != NULL) { + mutex_enter(&spa->spa_iokstat_lock); + kstat_waitq_exit(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); + } +} + +static void +vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + avl_add(&vq->vq_pending_tree, zio); + if (spa->spa_iokstat != NULL) { + mutex_enter(&spa->spa_iokstat_lock); + kstat_runq_enter(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); + } +} + +static void +vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + avl_remove(&vq->vq_pending_tree, zio); + if (spa->spa_iokstat != NULL) { + kstat_io_t *ksio = spa->spa_iokstat->ks_data; + + mutex_enter(&spa->spa_iokstat_lock); + kstat_runq_exit(spa->spa_iokstat->ks_data); + if (zio->io_type == ZIO_TYPE_READ) { + ksio->reads++; + ksio->nread += zio->io_size; + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ksio->writes++; + ksio->nwritten += zio->io_size; + } + mutex_exit(&spa->spa_iokstat_lock); + } } static void @@ -317,7 +365,7 @@ again: zio_execute(dio); } while (dio != lio); - avl_add(&vq->vq_pending_tree, aio); + vdev_queue_pending_add(vq, aio); return (aio); } @@ -339,7 +387,7 @@ 
again: goto again; } - avl_add(&vq->vq_pending_tree, fio); + vdev_queue_pending_add(vq, fio); return (fio); } @@ -395,7 +443,7 @@ vdev_queue_io_done(zio_t *zio) mutex_enter(&vq->vq_lock); - avl_remove(&vq->vq_pending_tree, zio); + vdev_queue_pending_remove(vq, zio); vq->vq_io_complete_ts = ddi_get_lbolt64(); vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; -- cgit v1.1 From a0588e8c68bbcf2637d9aa5ae66269dc9e381bea Mon Sep 17 00:00:00 2001 From: mm Date: Sat, 23 Feb 2013 08:57:47 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate 13958:1fd91513472c Illumos ZFS issues: 3561 arc_meta_limit should be exposed via kstats 3116 zpool reguid may log negative guids to internal SPA history --- uts/common/fs/zfs/arc.c | 18 ++++++++++++------ uts/common/fs/zfs/spa.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index 258a89f..ca3baea 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -294,6 +294,9 @@ typedef struct arc_stats { kstat_named_t arcstat_duplicate_buffers; kstat_named_t arcstat_duplicate_buffers_size; kstat_named_t arcstat_duplicate_reads; + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; } arc_stats_t; static arc_stats_t arc_stats = { @@ -352,7 +355,10 @@ static arc_stats_t arc_stats = { { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "duplicate_buffers", KSTAT_DATA_UINT64 }, { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, - { "duplicate_reads", KSTAT_DATA_UINT64 } + { "duplicate_reads", KSTAT_DATA_UINT64 }, + { "arc_meta_used", KSTAT_DATA_UINT64 }, + { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_meta_max", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -414,13 +420,13 @@ static arc_state_t *arc_l2c_only; #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max 
ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; -static uint64_t arc_meta_used; -static uint64_t arc_meta_limit; -static uint64_t arc_meta_max = 0; typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; @@ -1218,7 +1224,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) break; } - atomic_add_64(&arc_meta_used, space); + ARCSTAT_INCR(arcstat_meta_used, space); atomic_add_64(&arc_size, space); } @@ -1245,7 +1251,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; - atomic_add_64(&arc_meta_used, -space); + ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(arc_size >= space); atomic_add_64(&arc_size, -space); } diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 09a322b..fb68e91 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -715,7 +715,7 @@ spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); - spa_history_log_internal(spa, "guid change", tx, "old=%lld new=%lld", + spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", oldguid, *newguid); } -- cgit v1.1 From 60853ca1df453400cb3fa2fad2b1cfe5590d6ffc Mon Sep 17 00:00:00 2001 From: mm Date: Sat, 23 Feb 2013 09:00:35 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13959:e03e14ddfb4c Illumos ZFS issues: 3552 condensing one space map burns 3 seconds of CPU in spa_sync() thread 3564 spa_sync() spends 5-10% of its time in metaslab_sync() (when not condensing) --- uts/common/fs/zfs/metaslab.c | 348 
++++++++++++++++++++++++++-------- uts/common/fs/zfs/space_map.c | 33 +++- uts/common/fs/zfs/sys/metaslab_impl.h | 32 +++- uts/common/fs/zfs/sys/space_map.h | 7 +- uts/common/fs/zfs/vdev.c | 1 + 5 files changed, 318 insertions(+), 103 deletions(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index ceb7c6f..9c4f97a 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -48,6 +48,14 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space_map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. + */ +int zfs_condense_pct = 200; + +/* * This value defines the number of allowed allocation failures per vdev. * If a device reaches this threshold in a given txg then we consider skipping * allocations on that device. @@ -206,9 +214,9 @@ metaslab_compare(const void *x1, const void *x2) /* * If the weights are identical, use the offset to force uniqueness. */ - if (m1->ms_map.sm_start < m2->ms_map.sm_start) + if (m1->ms_map->sm_start < m2->ms_map->sm_start) return (-1); - if (m1->ms_map.sm_start > m2->ms_map.sm_start) + if (m1->ms_map->sm_start > m2->ms_map->sm_start) return (1); ASSERT3P(m1, ==, m2); @@ -723,14 +731,15 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. 
*/ - space_map_create(&msp->ms_map, start, size, + msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); + space_map_create(msp->ms_map, start, size, vd->vdev_ashift, &msp->ms_lock); metaslab_group_add(mg, msp); if (metaslab_debug && smo->smo_object != 0) { mutex_enter(&msp->ms_lock); - VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, + VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops, SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); mutex_exit(&msp->ms_lock); } @@ -758,22 +767,27 @@ metaslab_fini(metaslab_t *msp) metaslab_group_t *mg = msp->ms_group; vdev_space_update(mg->mg_vd, - -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); + -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); - space_map_destroy(&msp->ms_map); + space_map_unload(msp->ms_map); + space_map_destroy(msp->ms_map); + kmem_free(msp->ms_map, sizeof (*msp->ms_map)); for (int t = 0; t < TXG_SIZE; t++) { - space_map_destroy(&msp->ms_allocmap[t]); - space_map_destroy(&msp->ms_freemap[t]); + space_map_destroy(msp->ms_allocmap[t]); + space_map_destroy(msp->ms_freemap[t]); + kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t])); + kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t])); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_destroy(&msp->ms_defermap[t]); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_destroy(msp->ms_defermap[t]); + kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t])); + } ASSERT0(msp->ms_deferspace); @@ -792,7 +806,7 @@ static uint64_t metaslab_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; - space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo; vdev_t *vd = mg->mg_vd; uint64_t weight, space; @@ -852,7 +866,7 @@ metaslab_prefetch(metaslab_group_t *mg) * Prefetch the next potential metaslabs */ for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { 
- space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo; /* If we have reached our prefetch limit then we're done */ @@ -873,7 +887,7 @@ static int metaslab_activate(metaslab_t *msp, uint64_t activation_weight) { metaslab_group_t *mg = msp->ms_group; - space_map_t *sm = &msp->ms_map; + space_map_t *sm = msp->ms_map; space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -890,7 +904,7 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) return (error); } for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_walk(&msp->ms_defermap[t], + space_map_walk(msp->ms_defermap[t], space_map_claim, sm); } @@ -921,12 +935,158 @@ metaslab_passivate(metaslab_t *msp, uint64_t size) * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ - ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); + ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0); metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } /* + * Determine if the in-core space map representation can be condensed on-disk. + * We would like to use the following criteria to make our decision: + * + * 1. The size of the space map object should not dramatically increase as a + * result of writing out our in-core free map. + * + * 2. The on-disk space map object is at least zfs_condense_pct/100 times + * the size of its minimal on-disk representation (e.g. zfs_condense_pct = + * 110 and minimal = 1MB, on-disk object >= 1.1MB). + * + * Checking the first condition is tricky since we don't want to walk + * the entire AVL tree calculating the estimated on-disk size. Instead we + * use the size-ordered AVL tree in the space map and calculate the + * size required for the largest segment in our in-core free map.
If the + * size required to represent that segment on disk is larger than the space + * map object then we avoid condensing this map. + * + * To determine the second criterion we use a best-case estimate and assume + * each segment can be represented on-disk as a single 64-bit entry. We refer + * to this best-case estimate as the space map's minimal form. + */ +static boolean_t +metaslab_should_condense(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo_syncing; + space_seg_t *ss; + uint64_t size, entries, segsz; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(sm->sm_loaded); + + /* + * Use the sm_pp_root AVL tree, which is ordered by size, to obtain + * the largest segment in the in-core free map. If the tree is + * empty then we should condense the map. + */ + ss = avl_last(sm->sm_pp_root); + if (ss == NULL) + return (B_TRUE); + + /* + * Calculate the number of 64-bit entries this segment would + * require when written to disk. If this single segment would be + * larger on-disk than the entire current on-disk structure, then + * clearly condensing will increase the on-disk structure size. + */ + size = (ss->ss_end - ss->ss_start) >> sm->sm_shift; + entries = size / (MIN(size, SM_RUN_MAX)); + segsz = entries * sizeof (uint64_t); + + return (segsz <= smo->smo_objsize && + smo->smo_objsize >= (zfs_condense_pct * + sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100); +} + +/* + * Condense the on-disk space map representation to its minimized form. + * The minimized form consists of a small number of allocations followed by + * the in-core free map. 
+ */ +static void +metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK]; + space_map_t condense_map; + space_map_t *sm = msp->ms_map; + objset_t *mos = spa_meta_objset(spa); + space_map_obj_t *smo = &msp->ms_smo_syncing; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(sm->sm_loaded); + + spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " + "smo size %llu, segments %lu", txg, + (msp->ms_map->sm_start / msp->ms_map->sm_size), msp, + smo->smo_objsize, avl_numnodes(&sm->sm_root)); + + /* + * Create a map that is a 100% allocated map. We remove segments + * that have been freed in this txg, any deferred frees that exist, + * and any allocation in the future. Removing segments should be + * a relatively inexpensive operation since we expect these maps to + * have a small number of nodes. + */ + space_map_create(&condense_map, sm->sm_start, sm->sm_size, + sm->sm_shift, sm->sm_lock); + space_map_add(&condense_map, condense_map.sm_start, + condense_map.sm_size); + + /* + * Remove what's been freed in this txg from the condense_map. + * Since we're in sync_pass 1, we know that all the frees from + * this txg are in the freemap. + */ + space_map_walk(freemap, space_map_remove, &condense_map); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(msp->ms_defermap[t], + space_map_remove, &condense_map); + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) + space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK], + space_map_remove, &condense_map); + + /* + * We're about to drop the metaslab's lock thus allowing + * other consumers to change its content. Set the + * space_map's sm_condensing flag to ensure that + * allocations on this metaslab do not occur while we're + * in the middle of committing it to disk. This is only critical + * for the ms_map as all other space_maps use per txg + * views of their content.
+ */ + sm->sm_condensing = B_TRUE; + + mutex_exit(&msp->ms_lock); + space_map_truncate(smo, mos, tx); + mutex_enter(&msp->ms_lock); + + /* + * While we would ideally like to create a space_map representation + * that consists only of allocation records, doing so can be + * prohibitively expensive because the in-core free map can be + * large, and therefore computationally expensive to subtract + * from the condense_map. Instead we sync out two maps, a cheap + * allocation only map followed by the in-core free map. While not + * optimal, this is typically close to optimal, and much cheaper to + * compute. + */ + space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx); + space_map_vacate(&condense_map, NULL, NULL); + space_map_destroy(&condense_map); + + space_map_sync(sm, SM_FREE, smo, mos, tx); + sm->sm_condensing = B_FALSE; + + spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, " + "smo size %llu", txg, + (msp->ms_map->sm_start / msp->ms_map->sm_size), msp, + smo->smo_objsize); +} + +/* * Write a metaslab to disk in the context of the specified transaction group. */ void @@ -935,17 +1095,29 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); - space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; - space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *sm = &msp->ms_map; + space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK]; + space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK]; + space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *sm = msp->ms_map; space_map_obj_t *smo = &msp->ms_smo_syncing; dmu_buf_t *db; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); - if (allocmap->sm_space == 0 && freemap->sm_space == 0) + /* + * This metaslab has just been added so there's no work to do now. 
+ */ + if (*freemap == NULL) { + ASSERT3P(allocmap, ==, NULL); + return; + } + + ASSERT3P(allocmap, !=, NULL); + ASSERT3P(*freemap, !=, NULL); + ASSERT3P(*freed_map, !=, NULL); + + if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0) return; /* @@ -973,49 +1145,36 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); - space_map_walk(freemap, space_map_add, freed_map); - - if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= - 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { - /* - * The in-core space map representation is twice as compact - * as the on-disk one, so it's time to condense the latter - * by generating a pure allocmap from first principles. - * - * This metaslab is 100% allocated, - * minus the content of the in-core map (sm), - * minus what's been freed this txg (freed_map), - * minus deferred frees (ms_defermap[]), - * minus allocations from txgs in the future - * (because they haven't been committed yet). - */ - space_map_vacate(allocmap, NULL, NULL); - space_map_vacate(freemap, NULL, NULL); - - space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); - - space_map_walk(sm, space_map_remove, allocmap); - space_map_walk(freed_map, space_map_remove, allocmap); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_walk(&msp->ms_defermap[t], - space_map_remove, allocmap); + if (sm->sm_loaded && spa_sync_pass(spa) == 1 && + metaslab_should_condense(msp)) { + metaslab_condense(msp, txg, tx); + } else { + space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); + space_map_sync(*freemap, SM_FREE, smo, mos, tx); + } - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) - space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], - space_map_remove, allocmap); + space_map_vacate(allocmap, NULL, NULL); - mutex_exit(&msp->ms_lock); - space_map_truncate(smo, mos, tx); - mutex_enter(&msp->ms_lock); + /* + * For sync pass 1, we avoid walking the entire space map and + * instead will just swap the pointers for freemap 
and + * freed_map. We can safely do this since the freed_map is + * guaranteed to be empty on the initial pass. + */ + if (spa_sync_pass(spa) == 1) { + ASSERT0((*freed_map)->sm_space); + ASSERT0(avl_numnodes(&(*freed_map)->sm_root)); + space_map_swap(freemap, freed_map); + } else { + space_map_vacate(*freemap, space_map_add, *freed_map); } - space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); - space_map_sync(freemap, SM_FREE, smo, mos, tx); + ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space); + ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space); mutex_exit(&msp->ms_lock); - VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, sizeof (*smo)); bcopy(smo, db->db_data, sizeof (*smo)); @@ -1033,9 +1192,9 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) { space_map_obj_t *smo = &msp->ms_smo; space_map_obj_t *smosync = &msp->ms_smo_syncing; - space_map_t *sm = &msp->ms_map; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; + space_map_t *sm = msp->ms_map; + space_map_t *freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; int64_t alloc_delta, defer_delta; @@ -1046,19 +1205,30 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If this metaslab is just becoming available, initialize its - * allocmaps and freemaps and add its capacity to the vdev. + * allocmaps, freemaps, and defermap and add its capacity to the vdev. 
*/ - if (freed_map->sm_size == 0) { + if (freed_map == NULL) { + ASSERT(defer_map == NULL); for (int t = 0; t < TXG_SIZE; t++) { - space_map_create(&msp->ms_allocmap[t], sm->sm_start, + msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t), + KM_SLEEP); + space_map_create(msp->ms_allocmap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); - space_map_create(&msp->ms_freemap[t], sm->sm_start, + msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t), + KM_SLEEP); + space_map_create(msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_create(&msp->ms_defermap[t], sm->sm_start, + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t), + KM_SLEEP); + space_map_create(msp->ms_defermap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); + } + + freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; vdev_space_update(vd, 0, 0, sm->sm_size); } @@ -1068,8 +1238,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); - ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); - ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); + ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0); + ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0); /* * If there's a space_map_load() in progress, wait for it to complete @@ -1103,7 +1273,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) int evictable = 1; for (int t = 1; t < TXG_CONCURRENT_STATES; t++) - if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) + if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space) evictable = 0; if (evictable && !metaslab_debug) @@ -1128,7 +1298,7 @@ metaslab_sync_reassess(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp->ms_map.sm_start > mg->mg_bonus_area) + if 
(msp->ms_map->sm_start > mg->mg_bonus_area) break; mutex_enter(&msp->ms_lock); @@ -1149,7 +1319,7 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) { uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; - uint64_t start = msp->ms_map.sm_start >> ms_shift; + uint64_t start = msp->ms_map->sm_start >> ms_shift; if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (1ULL << 63); @@ -1237,6 +1407,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, mutex_enter(&msp->ms_lock); /* + * If this metaslab is currently condensing then pick again as + * we can't manipulate this metaslab until it's committed + * to disk. + */ + if (msp->ms_map->sm_condensing) { + mutex_exit(&msp->ms_lock); + continue; + } + + /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we @@ -1262,20 +1442,20 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, continue; } - if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) + if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL) break; atomic_inc_64(&mg->mg_alloc_failures); - metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); + metaslab_passivate(msp, space_map_maxsize(msp->ms_map)); mutex_exit(&msp->ms_lock); } - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); + space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize); mutex_exit(&msp->ms_lock); @@ -1507,13 +1687,13 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) mutex_enter(&msp->ms_lock); if (now) { - space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], + space_map_remove(msp->ms_allocmap[txg & TXG_MASK], offset, size); - 
space_map_free(&msp->ms_map, offset, size); + space_map_free(msp->ms_map, offset, size); } else { - if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); + space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); @@ -1548,10 +1728,10 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded) error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) + if (error == 0 && !space_map_contains(msp->ms_map, offset, size)) error = ENOENT; if (error || txg == 0) { /* txg == 0 indicates dry run */ @@ -1559,12 +1739,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) return (error); } - space_map_claim(&msp->ms_map, offset, size); + space_map_claim(msp->ms_map, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); diff --git a/uts/common/fs/zfs/space_map.c b/uts/common/fs/zfs/space_map.c index 17dd860..30a35c8 100644 --- a/uts/common/fs/zfs/space_map.c +++ b/uts/common/fs/zfs/space_map.c @@ -107,6 +107,7 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) int merge_before, merge_after; ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(!sm->sm_condensing); VERIFY(size != 0); VERIFY3U(start, >=, sm->sm_start); VERIFY3U(end, <=, sm->sm_start + sm->sm_size); @@ -175,6 +176,7 @@ space_map_remove(space_map_t *sm, 
uint64_t start, uint64_t size) int left_over, right_over; ASSERT(MUTEX_HELD(sm->sm_lock)); + VERIFY(!sm->sm_condensing); VERIFY(size != 0); VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); @@ -244,6 +246,20 @@ space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) } void +space_map_swap(space_map_t **msrc, space_map_t **mdst) +{ + space_map_t *sm; + + ASSERT(MUTEX_HELD((*msrc)->sm_lock)); + ASSERT0((*mdst)->sm_space); + ASSERT0(avl_numnodes(&(*mdst)->sm_root)); + + sm = *msrc; + *msrc = *mdst; + *mdst = sm; +} + +void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) { space_seg_t *ss; @@ -424,9 +440,9 @@ space_map_sync(space_map_t *sm, uint8_t maptype, space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); - void *cookie = NULL; + avl_tree_t *t = &sm->sm_root; space_seg_t *ss; - uint64_t bufsize, start, size, run_len, delta, sm_space; + uint64_t bufsize, start, size, run_len, total, sm_space, nodes; uint64_t *entry, *entry_map, *entry_map_end; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -455,13 +471,14 @@ space_map_sync(space_map_t *sm, uint8_t maptype, SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - delta = 0; + total = 0; + nodes = avl_numnodes(&sm->sm_root); sm_space = sm->sm_space; - while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { + for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { size = ss->ss_end - ss->ss_start; start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; - delta += size; + total += size; size >>= sm->sm_shift; while (size) { @@ -483,7 +500,6 @@ space_map_sync(space_map_t *sm, uint8_t maptype, start += run_len; size -= run_len; } - kmem_cache_free(space_seg_cache, ss); } if (entry != entry_map) { @@ -499,12 +515,11 @@ space_map_sync(space_map_t *sm, uint8_t maptype, * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it 
out. */ + VERIFY3U(nodes, ==, avl_numnodes(&sm->sm_root)); VERIFY3U(sm->sm_space, ==, sm_space); + VERIFY3U(sm->sm_space, ==, total); zio_buf_free(entry_map, bufsize); - - sm->sm_space -= delta; - VERIFY0(sm->sm_space); } void diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h index f1f1b38..138e14e 100644 --- a/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/uts/common/fs/zfs/sys/metaslab_impl.h @@ -66,20 +66,38 @@ struct metaslab_group { }; /* - * Each metaslab's free space is tracked in space map object in the MOS, - * which is only updated in syncing context. Each time we sync a txg, + * Each metaslab maintains an in-core free map (ms_map) that contains the + * current list of free segments. As blocks are allocated, the allocated + * segment is removed from the ms_map and added to a per txg allocation map. + * As blocks are freed, they are added to the per txg free map. These per + * txg maps allow us to process all allocations and frees in syncing context + * where it is safe to update the on-disk space maps. + * + * Each metaslab's free space is tracked in a space map object in the MOS, + * which is only updated in syncing context. Each time we sync a txg, * we append the allocs and frees from that txg to the space map object. * When the txg is done syncing, metaslab_sync_done() updates ms_smo - * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. + * to ms_smo_syncing. Everything in ms_smo is always safe to allocate. + * + * To load the in-core free map we read the space map object from disk. + * This object contains a series of alloc and free records that are + * combined to make up the list of all free segments in this metaslab. These + * segments are represented in-core by the ms_map and are stored in an + * AVL tree. + * + * As the space map object grows (as a result of the appends) it will + * eventually become space-inefficient. 
When the space map object is + * zfs_condense_pct/100 times the size of the minimal on-disk representation, + * we rewrite it in its minimized form. */ struct metaslab { kmutex_t ms_lock; /* metaslab lock */ space_map_obj_t ms_smo; /* synced space map object */ space_map_obj_t ms_smo_syncing; /* syncing space map object */ - space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ - space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ - space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ - space_map_t ms_map; /* in-core free space map */ + space_map_t *ms_allocmap[TXG_SIZE]; /* allocated this txg */ + space_map_t *ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t *ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ + space_map_t *ms_map; /* in-core free space map */ int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ metaslab_group_t *ms_group; /* metaslab group */ diff --git a/uts/common/fs/zfs/sys/space_map.h b/uts/common/fs/zfs/sys/space_map.h index 463b6bb..2da50fb 100644 --- a/uts/common/fs/zfs/sys/space_map.h +++ b/uts/common/fs/zfs/sys/space_map.h @@ -40,17 +40,17 @@ extern "C" { typedef struct space_map_ops space_map_ops_t; typedef struct space_map { - avl_tree_t sm_root; /* AVL tree of map segments */ + avl_tree_t sm_root; /* offset-ordered segment AVL tree */ uint64_t sm_space; /* sum of all segments in the map */ uint64_t sm_start; /* start of map */ uint64_t sm_size; /* size of map */ uint8_t sm_shift; /* unit shift */ - uint8_t sm_pad[3]; /* unused */ uint8_t sm_loaded; /* map loaded? */ uint8_t sm_loading; /* map loading? */ + uint8_t sm_condensing; /* map condensing? 
*/ kcondvar_t sm_load_cv; /* map load completion */ space_map_ops_t *sm_ops; /* space map block picker ops vector */ - avl_tree_t *sm_pp_root; /* picker-private AVL tree */ + avl_tree_t *sm_pp_root; /* size-ordered, picker-private tree */ void *sm_ppd; /* picker-private data */ kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; @@ -149,6 +149,7 @@ extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); extern boolean_t space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern void space_map_swap(space_map_t **msrc, space_map_t **mdest); extern void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); extern void space_map_walk(space_map_t *sm, diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c index 4f5c4e9..41f9a07 100644 --- a/uts/common/fs/zfs/vdev.c +++ b/uts/common/fs/zfs/vdev.c @@ -1836,6 +1836,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) space_map_truncate(smo, mos, tx); space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); + space_map_vacate(&smsync, NULL, NULL); space_map_destroy(&smsync); -- cgit v1.1 From a4c8901728e37e5500089746107c70b99379b5b5 Mon Sep 17 00:00:00 2001 From: mm Date: Sat, 23 Feb 2013 09:02:46 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate 13966:0e1d84ebb004 Illumos ZFS issues: 3578 transferring the freed map to the defer map should be constant time 3579 ztest trips assertion in metaslab_weight() --- uts/common/fs/zfs/metaslab.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index 9c4f97a..76c66a0 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -814,6 +814,16 @@ metaslab_weight(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); /* + * This vdev is in the process of being removed so there is nothing + * 
for us to do here. + */ + if (vd->vdev_removing) { + ASSERT0(smo->smo_alloc); + ASSERT0(vd->vdev_ms_shift); + return (0); + } + + /* * The baseline weight is the metaslab's free space. */ space = sm->sm_size - smo->smo_alloc; @@ -1193,8 +1203,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_obj_t *smo = &msp->ms_smo; space_map_obj_t *smosync = &msp->ms_smo_syncing; space_map_t *sm = msp->ms_map; - space_map_t *freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; + space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; int64_t alloc_delta, defer_delta; @@ -1207,8 +1217,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * If this metaslab is just becoming available, initialize its * allocmaps, freemaps, and defermap and add its capacity to the vdev. */ - if (freed_map == NULL) { - ASSERT(defer_map == NULL); + if (*freed_map == NULL) { + ASSERT(*defer_map == NULL); for (int t = 0; t < TXG_SIZE; t++) { msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); @@ -1227,14 +1237,14 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) sm->sm_size, sm->sm_shift, sm->sm_lock); } - freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; + freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; vdev_space_update(vd, 0, 0, sm->sm_size); } alloc_delta = smosync->smo_alloc - smo->smo_alloc; - defer_delta = freed_map->sm_space - defer_map->sm_space; + defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space; vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); @@ -1244,12 +1254,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If there's a space_map_load() in progress, wait for it to complete * so that 
we have a consistent view of the in-core space map. - * Then, add defer_map (oldest deferred frees) to this map and - * transfer freed_map (this txg's frees) to defer_map. */ space_map_load_wait(sm); - space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); - space_map_vacate(freed_map, space_map_add, defer_map); + + /* + * Move the frees from the defer_map to this map (if it's loaded). + * Swap the freed_map and the defer_map -- this is safe to do + * because we've just emptied out the defer_map. + */ + space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm); + ASSERT0((*defer_map)->sm_space); + ASSERT0(avl_numnodes(&(*defer_map)->sm_root)); + space_map_swap(freed_map, defer_map); *smo = *smosync; -- cgit v1.1 From 500d802e1ac38a0c6bbe582e052d8c437dd002bc Mon Sep 17 00:00:00 2001 From: mm Date: Sat, 23 Feb 2013 09:06:36 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13967:92bec6d87f59 Illumos ZFS issues: 3557 dumpvp_size is not updated correctly when a dump zvol's size is changed 3558 setting the volsize on a dump device does not return back ENOSPC 3559 setting a volsize larger than the space available sometimes succeeds --- uts/common/fs/zfs/sys/zvol.h | 2 +- uts/common/fs/zfs/zfs_ioctl.c | 3 +- uts/common/fs/zfs/zvol.c | 97 ++++++++++++++++++++++++------------------- 3 files changed, 56 insertions(+), 46 deletions(-) diff --git a/uts/common/fs/zfs/sys/zvol.h b/uts/common/fs/zfs/sys/zvol.h index 0059bf5..db1f05ae 100644 --- a/uts/common/fs/zfs/sys/zvol.h +++ b/uts/common/fs/zfs/sys/zvol.h @@ -43,7 +43,7 @@ extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); extern int zvol_create_minor(const char *); extern int zvol_remove_minor(const char *); extern void zvol_remove_minors(const char *); -extern int zvol_set_volsize(const char *, major_t, uint64_t); +extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t 
*cr); extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c index 090d6c9..38adc19 100644 --- a/uts/common/fs/zfs/zfs_ioctl.c +++ b/uts/common/fs/zfs/zfs_ioctl.c @@ -2368,8 +2368,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, err = dsl_dataset_set_reservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: - err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip), - intval); + err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VERSION: { diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c index da1b0ef..b413f5e 100644 --- a/uts/common/fs/zfs/zvol.c +++ b/uts/common/fs/zfs/zvol.c @@ -145,10 +145,11 @@ static int zvol_dump_fini(zvol_state_t *zv); static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); static void -zvol_size_changed(uint64_t volsize, major_t maj, minor_t min) +zvol_size_changed(zvol_state_t *zv, uint64_t volsize) { - dev_t dev = makedevice(maj, min); + dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor); + zv->zv_volsize = volsize; VERIFY(ddi_prop_update_int64(dev, zfs_dip, "Size", volsize) == DDI_SUCCESS); VERIFY(ddi_prop_update_int64(dev, zfs_dip, @@ -610,22 +611,22 @@ zvol_first_open(zvol_state_t *zv) if (error) return (error); + zv->zv_objset = os; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) { ASSERT(error == 0); dmu_objset_disown(os, zvol_tag); return (error); } - zv->zv_objset = os; + error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); if (error) { dmu_objset_disown(os, zvol_tag); return (error); } - zv->zv_volsize = volsize; + + zvol_size_changed(zv, volsize); zv->zv_zilog = zil_open(os, zvol_get_data); - zvol_size_changed(zv->zv_volsize, ddi_driver_major(zfs_dip), - zv->zv_minor); VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly, NULL) == 0); @@ -747,56 +748,37 @@ zvol_remove_minors(const char *name) 
mutex_exit(&zfsdev_state_lock); } -int -zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) +static int +zvol_set_volsize_impl(objset_t *os, zvol_state_t *zv, uint64_t volsize) { - zvol_state_t *zv = NULL; - objset_t *os; - int error; - dmu_object_info_t doi; uint64_t old_volsize = 0ULL; - uint64_t readonly; - - mutex_enter(&zfsdev_state_lock); - zv = zvol_minor_lookup(name); - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - mutex_exit(&zfsdev_state_lock); - return (error); - } - - if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || - (error = zvol_check_volsize(volsize, - doi.doi_data_block_size)) != 0) - goto out; - - VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, - NULL) == 0); - if (readonly) { - error = EROFS; - goto out; - } + int error; + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); error = zvol_update_volsize(os, volsize); + /* * Reinitialize the dump area to the new size. If we * failed to resize the dump area then restore it back to - * its original size. + * its original size. We must set the new volsize prior + * to calling dumpvp_resize() to ensure that the devices' + * size(9P) is not visible by the dump subsystem. */ if (zv && error == 0) { + old_volsize = zv->zv_volsize; + zvol_size_changed(zv, volsize); + if (zv->zv_flags & ZVOL_DUMPIFIED) { - old_volsize = zv->zv_volsize; - zv->zv_volsize = volsize; if ((error = zvol_dumpify(zv)) != 0 || (error = dumpvp_resize()) != 0) { + int dumpify_error; + (void) zvol_update_volsize(os, old_volsize); - zv->zv_volsize = old_volsize; - error = zvol_dumpify(zv); + zvol_size_changed(zv, old_volsize); + dumpify_error = zvol_dumpify(zv); + error = dumpify_error ? 
dumpify_error : error; } } - if (error == 0) { - zv->zv_volsize = volsize; - zvol_size_changed(volsize, maj, zv->zv_minor); - } } /* @@ -819,12 +801,41 @@ zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); } + return (error); +} +int +zvol_set_volsize(const char *name, uint64_t volsize) +{ + zvol_state_t *zv = NULL; + objset_t *os; + int error; + dmu_object_info_t doi; + uint64_t readonly; + + mutex_enter(&zfsdev_state_lock); + zv = zvol_minor_lookup(name); + if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { + mutex_exit(&zfsdev_state_lock); + return (error); + } + + if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || + (error = zvol_check_volsize(volsize, + doi.doi_data_block_size)) != 0) + goto out; + + VERIFY3U(dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL), ==, 0); + if (readonly) { + error = EROFS; + goto out; + } + + error = zvol_set_volsize_impl(os, zv, volsize); out: dmu_objset_rele(os, FTAG); - mutex_exit(&zfsdev_state_lock); - return (error); } -- cgit v1.1 From 5b41fbc0be411a475db185016e80731e433c0e9b Mon Sep 17 00:00:00 2001 From: mm Date: Tue, 26 Feb 2013 08:51:39 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate 13968:e4988c7d0403 Illumos ZFS issues: 3552 condensing one space map burns 3 seconds of CPU in spa_sync() thread (fix race condition) --- uts/common/fs/zfs/metaslab.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index 76c66a0..bf9889e 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -1383,6 +1383,13 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, mutex_exit(&mg->mg_lock); return (-1ULL); } + + /* + * If the selected metaslab is condensing, skip it. 
+ */ + if (msp->ms_map->sm_condensing) + continue; + was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -1423,16 +1430,6 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, mutex_enter(&msp->ms_lock); /* - * If this metaslab is currently condensing then pick again as - * we can't manipulate this metaslab until it's committed - * to disk. - */ - if (msp->ms_map->sm_condensing) { - mutex_exit(&msp->ms_lock); - continue; - } - - /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we @@ -1458,6 +1455,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, continue; } + /* + * If this metaslab is currently condensing then pick again as + * we can't manipulate this metaslab until it's committed + * to disk. + */ + if (msp->ms_map->sm_condensing) { + mutex_exit(&msp->ms_lock); + continue; + } + if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL) break; -- cgit v1.1 From 7fd8f89eb9b49ef334451ff59d66dcdf139dd4cb Mon Sep 17 00:00:00 2001 From: mm Date: Tue, 26 Feb 2013 08:53:33 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13969:b2c7608044b7 Illumos ZFS issues: 3588 provide zfs properties for logical (uncompressed) space used and referenced --- common/zfs/zfs_prop.c | 6 +++++- uts/common/fs/zfs/dsl_dataset.c | 2 ++ uts/common/fs/zfs/dsl_dir.c | 2 ++ uts/common/sys/fs/zfs.h | 2 ++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/common/zfs/zfs_prop.c b/common/zfs/zfs_prop.c index 9d83655..ebb2679 100644 --- a/common/zfs/zfs_prop.c +++ b/common/zfs/zfs_prop.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -350,6 +350,10 @@ zfs_prop_init(void) ZFS_TYPE_SNAPSHOT, "", "USERREFS"); zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "WRITTEN"); + zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, + PROP_READONLY, ZFS_TYPE_DATASET, "", "LUSED"); + zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", + 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); /* default number properties */ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, diff --git a/uts/common/fs/zfs/dsl_dataset.c b/uts/common/fs/zfs/dsl_dataset.c index cf888d2..0a5ef83 100644 --- a/uts/common/fs/zfs/dsl_dataset.c +++ b/uts/common/fs/zfs/dsl_dataset.c @@ -2247,6 +2247,8 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ds->ds_phys->ds_compressed_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, + ds->ds_phys->ds_uncompressed_bytes); if (dsl_dataset_is_snapshot(ds)) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); diff --git a/uts/common/fs/zfs/dsl_dir.c b/uts/common/fs/zfs/dsl_dir.c index df3f02b..5ccb686 100644 --- a/uts/common/fs/zfs/dsl_dir.c +++ b/uts/common/fs/zfs/dsl_dir.c @@ -530,6 +530,8 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) dd->dd_phys->dd_compressed_bytes == 0 ? 
100 : (dd->dd_phys->dd_uncompressed_bytes * 100 / dd->dd_phys->dd_compressed_bytes)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, + dd->dd_phys->dd_uncompressed_bytes); if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h index 8639a2b..286b35b 100644 --- a/uts/common/sys/fs/zfs.h +++ b/uts/common/sys/fs/zfs.h @@ -138,6 +138,8 @@ typedef enum { ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, + ZFS_PROP_LOGICALUSED, + ZFS_PROP_LOGICALREFERENCED, ZFS_NUM_PROPS } zfs_prop_t; -- cgit v1.1 From 266fe973b594664493844a0e5189884daa178f6d Mon Sep 17 00:00:00 2001 From: mm Date: Fri, 1 Mar 2013 20:48:07 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate 13970:c9a5683da38e Illumos ZFS issues: 3543 Feature flags causes assertion in spa.c to miss certain cases --- uts/common/fs/zfs/spa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index fb68e91..151b780 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -5747,7 +5747,7 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) */ ASSERT(tx->tx_txg != TXG_INITIAL); - ASSERT(version <= SPA_VERSION); + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; @@ -6276,7 +6276,7 @@ spa_upgrade(spa_t *spa, uint64_t version) * future version would result in an unopenable pool, this shouldn't be * possible. 
*/ - ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); + ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT(version >= spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; -- cgit v1.1 From 51890f5780799ca6a79d26d8fb147c525961510b Mon Sep 17 00:00:00 2001 From: mm Date: Fri, 1 Mar 2013 20:49:56 +0000 Subject: Update vendor-sys/illumos/dist to illumos-gate 13971:d1648d817bd6 Illumos ZFS issues: 3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock is piping hot --- uts/common/fs/zfs/spa.c | 200 +++++++++++++++++++++++++++------------ uts/common/fs/zfs/sys/spa_impl.h | 16 +++- uts/common/fs/zfs/vdev_file.c | 2 +- uts/common/fs/zfs/zio.c | 31 +++--- 4 files changed, 169 insertions(+), 80 deletions(-) diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index 151b780..fdc2870 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -77,23 +77,25 @@ #include "zfs_comutil.h" typedef enum zti_modes { - zti_mode_fixed, /* value is # of threads (min 1) */ - zti_mode_online_percent, /* value is % of online CPUs */ - zti_mode_batch, /* cpu-intensive; value is ignored */ - zti_mode_null, /* don't create a taskq */ - zti_nmodes + ZTI_MODE_FIXED, /* value is # of threads (min 1) */ + ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */ + ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ + ZTI_MODE_NULL, /* don't create a taskq */ + ZTI_NMODES } zti_modes_t; -#define ZTI_FIX(n) { zti_mode_fixed, (n) } -#define ZTI_PCT(n) { zti_mode_online_percent, (n) } -#define ZTI_BATCH { zti_mode_batch, 0 } -#define ZTI_NULL { zti_mode_null, 0 } +#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } +#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } +#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } +#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } -#define ZTI_ONE ZTI_FIX(1) +#define ZTI_N(n) ZTI_P(n, 1) +#define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { - enum zti_modes zti_mode; + zti_modes_t zti_mode; uint_t 
zti_value; + uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { @@ -101,17 +103,30 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { }; /* - * Define the taskq threads for the following I/O types: - * NULL, READ, WRITE, FREE, CLAIM, and IOCTL + * This table defines the taskq settings for each ZFS I/O type. When + * initializing a pool, we use this table to create an appropriately sized + * taskq. Some operations are low volume and therefore have a small, static + * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE + * macros. Other operations process a large amount of data; the ZTI_BATCH + * macro causes us to create a taskq oriented for throughput. Some operations + * are so high frequency and short-lived that the taskq itself can become a + * point of lock contention. The ZTI_P(#, #) macro indicates that we need an + * additional degree of parallelism specified by the number of threads per- + * taskq and the number of taskqs; when dispatching an event in this case, the + * particular taskq is chosen at random. + * + * The different taskq priorities are to handle the different contexts (issue + * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that + * need to be handled with minimum delay. 
*/ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, - { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, - { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ + { ZTI_N(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, /* READ */ + { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ + { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; static dsl_syncfunc_t spa_sync_version; @@ -794,48 +809,120 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) offsetof(spa_error_entry_t, se_avl)); } -static taskq_t * -spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, - uint_t value) +static void +spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; + uint_t count = ztip->zti_count; + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + char name[32]; uint_t flags = 0; boolean_t batch = B_FALSE; - switch (mode) { - case zti_mode_null: - return (NULL); /* no taskq needed */ + if (mode == ZTI_MODE_NULL) { + tqs->stqs_count = 0; + tqs->stqs_taskq = NULL; + return; + } - case zti_mode_fixed: - ASSERT3U(value, >=, 1); - value = MAX(value, 1); - break; + ASSERT3U(count, >, 0); - case zti_mode_batch: - batch = B_TRUE; - flags |= TASKQ_THREADS_CPU_PCT; - value = zio_taskq_batch_pct; - break; + tqs->stqs_count = count; + tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); - case zti_mode_online_percent: - flags |= TASKQ_THREADS_CPU_PCT; - break; + for (uint_t i = 0; i < count; i++) { + taskq_t *tq; - default: - 
panic("unrecognized mode for %s taskq (%u:%u) in " - "spa_activate()", - name, mode, value); - break; + switch (mode) { + case ZTI_MODE_FIXED: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + break; + + case ZTI_MODE_BATCH: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = zio_taskq_batch_pct; + break; + + case ZTI_MODE_ONLINE_PERCENT: + flags |= TASKQ_THREADS_CPU_PCT; + break; + + default: + panic("unrecognized mode for %s_%s taskq (%u:%u) in " + "spa_activate()", + zio_type_name[t], zio_taskq_types[q], mode, value); + break; + } + + if (count > 1) { + (void) snprintf(name, sizeof (name), "%s_%s_%u", + zio_type_name[t], zio_taskq_types[q], i); + } else { + (void) snprintf(name, sizeof (name), "%s_%s", + zio_type_name[t], zio_taskq_types[q]); + } + + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + tq = taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags); + } else { + tq = taskq_create_proc(name, value, maxclsyspri, 50, + INT_MAX, spa->spa_proc, flags); + } + + tqs->stqs_taskq[i] = tq; + } +} + +static void +spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + + if (tqs->stqs_taskq == NULL) { + ASSERT0(tqs->stqs_count); + return; + } + + for (uint_t i = 0; i < tqs->stqs_count; i++) { + ASSERT3P(tqs->stqs_taskq[i], !=, NULL); + taskq_destroy(tqs->stqs_taskq[i]); } - if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; + kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); + tqs->stqs_taskq = NULL; +} + +/* + * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. + * Note that a type may have multiple discrete taskqs to avoid lock contention + * on the taskq itself. In that case we choose which taskq at random by using + * the low bits of gethrtime(). 
+ */ +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + taskq_t *tq; - return (taskq_create_sysdc(name, value, 50, INT_MAX, - spa->spa_proc, zio_taskq_basedc, flags)); + ASSERT3P(tqs->stqs_taskq, !=, NULL); + ASSERT3U(tqs->stqs_count, !=, 0); + + if (tqs->stqs_count == 1) { + tq = tqs->stqs_taskq[0]; + } else { + tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; } - return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, - spa->spa_proc, flags)); + + taskq_dispatch_ent(tq, func, arg, flags, ent); } static void @@ -843,16 +930,7 @@ spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; - enum zti_modes mode = ztip->zti_mode; - uint_t value = ztip->zti_value; - char name[32]; - - (void) snprintf(name, sizeof (name), - "%s_%s", zio_type_name[t], zio_taskq_types[q]); - - spa->spa_zio_taskq[t][q] = - spa_taskq_create(spa, name, mode, value); + spa_taskqs_init(spa, t, q); } } } @@ -1009,9 +1087,7 @@ spa_deactivate(spa_t *spa) for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - if (spa->spa_zio_taskq[t][q] != NULL) - taskq_destroy(spa->spa_zio_taskq[t][q]); - spa->spa_zio_taskq[t][q] = NULL; + spa_taskqs_fini(spa, t, q); } } diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h index ffd676e..983103e 100644 --- a/uts/common/fs/zfs/sys/spa_impl.h +++ b/uts/common/fs/zfs/sys/spa_impl.h @@ -80,16 +80,16 @@ typedef struct spa_config_dirent { char *scd_path; } spa_config_dirent_t; -enum zio_taskq_type { +typedef enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES -}; +} zio_taskq_type_t; /* - * State machine for the zpool-pooname process. 
The states transitions + * State machine for the zpool-poolname process. The states transitions * are done as follows: * * From To Routine @@ -107,6 +107,11 @@ typedef enum spa_proc_state { SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ } spa_proc_state_t; +typedef struct spa_taskqs { + uint_t stqs_count; + taskq_t **stqs_taskq; +} spa_taskqs_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -125,7 +130,7 @@ struct spa { uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ uint64_t spa_import_flags; /* import specific flags */ - taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; + spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ metaslab_class_t *spa_normal_class; /* normal data class */ @@ -245,6 +250,9 @@ struct spa { extern const char *spa_config_path; +extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); + #ifdef __cplusplus } #endif diff --git a/uts/common/fs/zfs/vdev_file.c b/uts/common/fs/zfs/vdev_file.c index 1fbce5e..f7cfa9b 100644 --- a/uts/common/fs/zfs/vdev_file.c +++ b/uts/common/fs/zfs/vdev_file.c @@ -224,7 +224,7 @@ vdev_file_io_start(zio_t *zio) bp->b_private = vf->vf_vnode; bp->b_iodone = (int (*)())vdev_file_io_intr; - taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE], + spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, vdev_file_io_strategy, bp, 0, &zio->io_tqent); return (ZIO_PIPELINE_STOP); diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index 04b8ddf..432a992 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -1107,7 +1107,7 @@ zio_free_bp_init(zio_t *zio) */ static void -zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) +zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t 
cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; @@ -1128,10 +1128,11 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) t = ZIO_TYPE_NULL; /* - * If this is a high priority I/O, then use the high priority taskq. + * If this is a high priority I/O, then use the high priority taskq if + * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && - spa->spa_zio_taskq[t][q + 1] != NULL) + spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); @@ -1142,19 +1143,24 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) * to dispatch the zio to another taskq at the same time. */ ASSERT(zio->io_tqent.tqent_next == NULL); - taskq_dispatch_ent(spa->spa_zio_taskq[t][q], - (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); + spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, + flags, &zio->io_tqent); } static boolean_t -zio_taskq_member(zio_t *zio, enum zio_taskq_type q) +zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; - for (zio_type_t t = 0; t < ZIO_TYPES; t++) - if (taskq_member(spa->spa_zio_taskq[t][q], executor)) - return (B_TRUE); + for (zio_type_t t = 0; t < ZIO_TYPES; t++) { + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + uint_t i; + for (i = 0; i < tqs->stqs_count; i++) { + if (taskq_member(tqs->stqs_taskq[i], executor)) + return (B_TRUE); + } + } return (B_FALSE); } @@ -3017,10 +3023,9 @@ zio_done(zio_t *zio) * Hand it off to the otherwise-unused claim taskq. 
*/ ASSERT(zio->io_tqent.tqent_next == NULL); - taskq_dispatch_ent( - spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], - (task_func_t *)zio_reexecute, zio, 0, - &zio->io_tqent); + spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, + ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, + 0, &zio->io_tqent); } return (ZIO_PIPELINE_STOP); } -- cgit v1.1 From 09a831de87c71a9f94f38dbd36b73746467e3182 Mon Sep 17 00:00:00 2001 From: mm Date: Fri, 1 Mar 2013 21:01:45 +0000 Subject: Update vendor/illumos/dist and vendor-sys/illumos/dist to illumos-gate 13973:4972ab336f54 Illumos ZFS issues: 3464 zfs synctask code needs restructuring --- common/nvpair/fnvpair.c | 13 + uts/common/Makefile.files | 4 +- uts/common/fs/zfs/arc.c | 10 +- uts/common/fs/zfs/bplist.c | 8 + uts/common/fs/zfs/bpobj.c | 4 + uts/common/fs/zfs/dbuf.c | 85 +- uts/common/fs/zfs/dmu.c | 2 +- uts/common/fs/zfs/dmu_diff.c | 80 +- uts/common/fs/zfs/dmu_objset.c | 855 ++++--- uts/common/fs/zfs/dmu_send.c | 911 ++++---- uts/common/fs/zfs/dmu_traverse.c | 42 +- uts/common/fs/zfs/dmu_tx.c | 31 +- uts/common/fs/zfs/dnode.c | 6 +- uts/common/fs/zfs/dnode_sync.c | 1 + uts/common/fs/zfs/dsl_dataset.c | 4126 ++++++++++++---------------------- uts/common/fs/zfs/dsl_deleg.c | 138 +- uts/common/fs/zfs/dsl_destroy.c | 926 ++++++++ uts/common/fs/zfs/dsl_dir.c | 587 +++-- uts/common/fs/zfs/dsl_pool.c | 216 +- uts/common/fs/zfs/dsl_prop.c | 475 ++-- uts/common/fs/zfs/dsl_scan.c | 118 +- uts/common/fs/zfs/dsl_synctask.c | 243 +- uts/common/fs/zfs/dsl_userhold.c | 536 +++++ uts/common/fs/zfs/metaslab.c | 38 + uts/common/fs/zfs/refcount.c | 21 +- uts/common/fs/zfs/rrwlock.c | 41 +- uts/common/fs/zfs/sa.c | 6 +- uts/common/fs/zfs/spa.c | 81 +- uts/common/fs/zfs/spa_history.c | 26 +- uts/common/fs/zfs/spa_misc.c | 23 +- uts/common/fs/zfs/space_map.c | 43 +- uts/common/fs/zfs/sys/arc.h | 2 +- uts/common/fs/zfs/sys/dbuf.h | 9 +- uts/common/fs/zfs/sys/dmu.h | 52 +- uts/common/fs/zfs/sys/dmu_objset.h | 11 +- uts/common/fs/zfs/sys/dmu_send.h 
| 66 + uts/common/fs/zfs/sys/dmu_tx.h | 6 +- uts/common/fs/zfs/sys/dsl_dataset.h | 112 +- uts/common/fs/zfs/sys/dsl_destroy.h | 52 + uts/common/fs/zfs/sys/dsl_dir.h | 17 +- uts/common/fs/zfs/sys/dsl_pool.h | 16 +- uts/common/fs/zfs/sys/dsl_prop.h | 53 +- uts/common/fs/zfs/sys/dsl_synctask.h | 46 +- uts/common/fs/zfs/sys/dsl_userhold.h | 57 + uts/common/fs/zfs/sys/metaslab.h | 3 +- uts/common/fs/zfs/sys/refcount.h | 5 +- uts/common/fs/zfs/sys/rrwlock.h | 7 +- uts/common/fs/zfs/sys/space_map.h | 2 + uts/common/fs/zfs/sys/txg.h | 9 +- uts/common/fs/zfs/sys/zfeature.h | 17 +- uts/common/fs/zfs/sys/zfs_debug.h | 12 +- uts/common/fs/zfs/sys/zfs_ioctl.h | 4 +- uts/common/fs/zfs/sys/zfs_znode.h | 3 +- uts/common/fs/zfs/sys/zil.h | 4 +- uts/common/fs/zfs/txg.c | 37 +- uts/common/fs/zfs/zfs_ctldir.c | 33 +- uts/common/fs/zfs/zfs_ioctl.c | 1083 ++++----- uts/common/fs/zfs/zfs_vfsops.c | 74 +- uts/common/fs/zfs/zil.c | 137 +- uts/common/fs/zfs/zio.c | 8 +- uts/common/fs/zfs/zvol.c | 85 +- uts/common/sys/nvpair.h | 1 + 62 files changed, 5969 insertions(+), 5750 deletions(-) create mode 100644 uts/common/fs/zfs/dsl_destroy.c create mode 100644 uts/common/fs/zfs/dsl_userhold.c create mode 100644 uts/common/fs/zfs/sys/dmu_send.h create mode 100644 uts/common/fs/zfs/sys/dsl_destroy.h create mode 100644 uts/common/fs/zfs/sys/dsl_userhold.h diff --git a/common/nvpair/fnvpair.c b/common/nvpair/fnvpair.c index 8c5591c0..b3cf173 100644 --- a/common/nvpair/fnvpair.c +++ b/common/nvpair/fnvpair.c @@ -26,6 +26,7 @@ #include #include #include +#include #ifndef _KERNEL #include #endif @@ -114,6 +115,18 @@ fnvlist_merge(nvlist_t *dst, nvlist_t *src) VERIFY0(nvlist_merge(dst, src, KM_SLEEP)); } +size_t +fnvlist_num_pairs(nvlist_t *nvl) +{ + size_t count = 0; + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) + count++; + return (count); +} + void fnvlist_add_boolean(nvlist_t *nvl, const char *name) { diff --git 
a/uts/common/Makefile.files b/uts/common/Makefile.files index 1eb0e31..e0fa959 100644 --- a/uts/common/Makefile.files +++ b/uts/common/Makefile.files @@ -1346,8 +1346,10 @@ ZFS_COMMON_OBJS += \ dsl_dir.o \ dsl_dataset.o \ dsl_deadlist.o \ + dsl_destroy.o \ dsl_pool.o \ dsl_synctask.o \ + dsl_userhold.o \ dmu_zfetch.o \ dsl_deleg.o \ dsl_prop.o \ @@ -1358,6 +1360,7 @@ ZFS_COMMON_OBJS += \ lzjb.o \ metaslab.o \ refcount.o \ + rrwlock.o \ sa.o \ sha256.o \ spa.o \ @@ -1417,7 +1420,6 @@ ZFS_OBJS += \ zfs_onexit.o \ zfs_replay.o \ zfs_rlock.o \ - rrwlock.o \ zfs_vfsops.o \ zfs_vnops.o \ zvol.o diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c index ca3baea..9588e40 100644 --- a/uts/common/fs/zfs/arc.c +++ b/uts/common/fs/zfs/arc.c @@ -1633,12 +1633,12 @@ arc_buf_free(arc_buf_t *buf, void *tag) } } -int +boolean_t arc_buf_remove_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); - int no_callback = (buf->b_efunc == NULL); + boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { ASSERT(hdr->b_datacnt == 1); @@ -1843,7 +1843,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, ARCSTAT_INCR(arcstat_mutex_miss, missed); /* - * We have just evicted some date into the ghost state, make + * We have just evicted some data into the ghost state, make * sure we also adjust the ghost state size if necessary. 
*/ if (arc_no_grow && @@ -2622,7 +2622,7 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg) == 1); + VERIFY(arc_buf_remove_ref(buf, arg)); } /* a generic arc_done_func_t */ @@ -2631,7 +2631,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg) == 1); + VERIFY(arc_buf_remove_ref(buf, arg)); *bufp = NULL; } else { *bufp = buf; diff --git a/uts/common/fs/zfs/bplist.c b/uts/common/fs/zfs/bplist.c index 066ccc6..ee12db3 100644 --- a/uts/common/fs/zfs/bplist.c +++ b/uts/common/fs/zfs/bplist.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -52,6 +53,12 @@ bplist_append(bplist_t *bpl, const blkptr_t *bp) mutex_exit(&bpl->bpl_lock); } +/* + * To aid debugging, we keep the most recently removed entry. This way if + * we are in the callback, we can easily locate the entry. 
+ */ +static bplist_entry_t *bplist_iterate_last_removed; + void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) { @@ -59,6 +66,7 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) mutex_enter(&bpl->bpl_lock); while (bpe = list_head(&bpl->bpl_list)) { + bplist_iterate_last_removed = bpe; list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); func(arg, &bpe->bpe_blk, tx); diff --git a/uts/common/fs/zfs/bpobj.c b/uts/common/fs/zfs/bpobj.c index 1920da4..bcb5f33 100644 --- a/uts/common/fs/zfs/bpobj.c +++ b/uts/common/fs/zfs/bpobj.c @@ -392,6 +392,10 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); } + dmu_object_info_t doi; + ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); + mutex_enter(&bpo->bpo_lock); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c index 8bf3d09..57abfa1 100644 --- a/uts/common/fs/zfs/dbuf.c +++ b/uts/common/fs/zfs/dbuf.c @@ -39,7 +39,7 @@ #include static void dbuf_destroy(dmu_buf_impl_t *db); -static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); /* @@ -499,7 +499,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + VERIFY(arc_buf_remove_ref(buf, db)); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -828,10 +828,12 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) continue; /* found a level 0 buffer in the range */ - if (dbuf_undirty(db, tx)) + mutex_enter(&db->db_mtx); + if (dbuf_undirty(db, tx)) 
{ + /* mutex has been dropped and dbuf destroyed */ continue; + } - mutex_enter(&db->db_mtx); if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { @@ -958,7 +960,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db) == 1); + VERIFY(arc_buf_remove_ref(obuf, db)); db->db.db_size = size; if (db->db_level == 0) { @@ -1258,7 +1260,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr); } -static int +/* + * Return TRUE if this evicted the dbuf. + */ +static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; @@ -1267,18 +1272,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(txg != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT0(db->db_level); + ASSERT(MUTEX_HELD(&db->db_mtx)); - mutex_enter(&db->db_mtx); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; - if (dr == NULL || dr->dr_txg < txg) { - mutex_exit(&db->db_mtx); - return (0); - } + if (dr == NULL || dr->dr_txg < txg) + return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); @@ -1286,24 +1290,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dn = DB_DNODE(db); /* - * If this buffer is currently held, we cannot undirty - * it, since one of the current holders may be in the - * middle of an update. Note that users of dbuf_undirty() - * should not place a hold on the dbuf before the call. - * Also note: we can get here with a spill block, so - * test for that similar to how dbuf_dirty does. + * Note: This code will probably work even if there are concurrent + * holders, but it is untested in that scenario, as the ZPL and + * ztest have additional locking (the range locks) that prevents + * that type of concurrent access.
*/ - if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - /* Make sure we don't toss this buffer at sync phase */ - if (db->db_blkid != DMU_SPILL_BLKID) { - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); - } - DB_DNODE_EXIT(db); - return (0); - } + ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -1332,21 +1324,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } DB_DNODE_EXIT(db); - if (db->db_level == 0) { - if (db->db_state != DB_NOFILL) { - dbuf_unoverride(dr); + if (db->db_state != DB_NOFILL) { + dbuf_unoverride(dr); - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db) == 1); - } - } else { ASSERT(db->db_buf != NULL); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1358,13 +1342,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_set_data(db, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); - return (1); + return (B_TRUE); } - mutex_exit(&db->db_mtx); - return (0); + return (B_FALSE); } #pragma weak dmu_buf_will_dirty = dbuf_will_dirty @@ -1463,7 +1446,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + VERIFY(arc_buf_remove_ref(buf, db)); xuio_stat_wbuf_copied(); return; } @@ -1481,10 +1464,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, 
dmu_tx_t *tx) arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + VERIFY(arc_buf_remove_ref(db->db_buf, db)); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + VERIFY(arc_buf_remove_ref(db->db_buf, db)); } db->db_buf = NULL; } @@ -2067,10 +2050,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) * This dbuf has anonymous data associated with it. */ dbuf_set_data(db, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); + VERIFY(!arc_buf_remove_ref(db->db_buf, db)); /* * A dbuf will be eligible for eviction if either the @@ -2567,7 +2550,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db) == 1); + db)); else if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c index 21cdd7c..6ee37ac 100644 --- a/uts/common/fs/zfs/dmu.c +++ b/uts/common/fs/zfs/dmu.c @@ -1194,7 +1194,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); + VERIFY(arc_buf_remove_ref(buf, FTAG)); } /* diff --git a/uts/common/fs/zfs/dmu_diff.c b/uts/common/fs/zfs/dmu_diff.c index dc23778..2d1aaa4 100644 --- a/uts/common/fs/zfs/dmu_diff.c +++ b/uts/common/fs/zfs/dmu_diff.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include @@ -155,51 +156,49 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } int -dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) +dmu_diff(const char *tosnap_name, const char *fromsnap_name, + struct vnode *vp, offset_t *offp) { struct diffarg da; - dsl_dataset_t *ds = tosnap->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap->os_dsl_dataset; - dsl_dataset_t *findds; - dsl_dataset_t *relds; - int err = 0; - - /* make certain we are looking at snapshots */ - if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds)) + dsl_dataset_t *fromsnap; + dsl_dataset_t *tosnap; + dsl_pool_t *dp; + int error; + uint64_t fromtxg; + + if (strchr(tosnap_name, '@') == NULL || + strchr(fromsnap_name, '@') == NULL) return (EINVAL); - /* fromsnap must be earlier and from the same lineage as tosnap */ - if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg) - return (EXDEV); - - relds = NULL; - findds = ds; - - while (fromds->ds_dir != findds->ds_dir) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (!dsl_dir_is_clone(findds->ds_dir)) { - if (relds) - dsl_dataset_rele(relds, FTAG); - return (EXDEV); - } - - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, - findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds); - rw_exit(&dp->dp_config_rwlock); + error = dsl_pool_hold(tosnap_name, FTAG, &dp); + if (error != 0) + return (error); - if (relds) - dsl_dataset_rele(relds, FTAG); + error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } - if (err) - return (EXDEV); + error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } - relds = findds; + if (!dsl_dataset_is_before(tosnap, fromsnap)) { + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (EXDEV); } - if 
(relds) - dsl_dataset_rele(relds, FTAG); + fromtxg = fromsnap->ds_phys->ds_creation_txg; + dsl_dataset_rele(fromsnap, FTAG); + + dsl_dataset_long_hold(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); da.da_vp = vp; da.da_offp = offp; @@ -207,15 +206,18 @@ dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; da.da_err = 0; - err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg, + error = traverse_dataset(tosnap, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); - if (err) { - da.da_err = err; + if (error != 0) { + da.da_err = error; } else { /* we set the da.da_err we return as side-effect */ (void) write_record(&da); } + dsl_dataset_long_rele(tosnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + return (da.da_err); } diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c index 74c1192..a646f40 100644 --- a/uts/common/fs/zfs/dmu_objset.c +++ b/uts/common/fs/zfs/dmu_objset.c @@ -44,6 +44,7 @@ #include #include #include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -280,7 +281,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); - if (err) { + if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) @@ -320,34 +321,49 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * checksum/compression/copies. 
*/ if (ds) { - err = dsl_prop_register(ds, "primarycache", + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "secondarycache", + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); + } if (!dsl_dataset_is_snapshot(ds)) { - if (err == 0) - err = dsl_prop_register(ds, "checksum", + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "compression", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "copies", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "dedup", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "logbias", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "sync", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os); + } } - if (err) { + if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf) == 1); + &os->os_phys_buf)); kmem_free(os, sizeof (objset_t)); return (err); } @@ -425,44 +441,66 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) return (err); } -/* called from zpl */ +/* + * Holds the pool while the objset is held. Therefore only one objset + * can be held at a time. 
+ */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { + dsl_pool_t *dp; dsl_dataset_t *ds; int err; - err = dsl_dataset_hold(name, tag, &ds); - if (err) + err = dsl_pool_hold(name, tag, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, name, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, tag); return (err); + } err = dmu_objset_from_ds(ds, osp); - if (err) + if (err != 0) { dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); + } return (err); } -/* called from zpl */ +/* + * dsl_pool must not be held when this is called. + * Upon successful return, there will be a longhold on the dataset, + * and the dsl_pool will not be held. + */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { + dsl_pool_t *dp; dsl_dataset_t *ds; int err; - err = dsl_dataset_own(name, B_FALSE, tag, &ds); - if (err) + err = dsl_pool_hold(name, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_own(dp, name, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); return (err); + } err = dmu_objset_from_ds(ds, osp); - if (err) { + dsl_pool_rele(dp, FTAG); + if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { - dmu_objset_disown(*osp, tag); + dsl_dataset_disown(ds, tag); return (EINVAL); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { - dmu_objset_disown(*osp, tag); + dsl_dataset_disown(ds, tag); return (EROFS); } return (err); @@ -471,7 +509,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, void dmu_objset_rele(objset_t *os, void *tag) { + dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag); + dsl_pool_rele(dp, tag); } void @@ -480,7 +520,7 @@ dmu_objset_disown(objset_t *os, void *tag) dsl_dataset_disown(os->os_dsl_dataset, tag); } -int +void dmu_objset_evict_dbufs(objset_t *os) { dnode_t *dn; @@ -515,9 +555,7 @@ dmu_objset_evict_dbufs(objset_t *os) mutex_enter(&os->os_lock); dn = 
next_dn; } - dn = list_head(&os->os_dnodes); mutex_exit(&os->os_lock); - return (dn != DMU_META_DNODE(os)); } void @@ -530,33 +568,37 @@ dmu_objset_evict(objset_t *os) if (ds) { if (!dsl_dataset_is_snapshot(ds)) { - VERIFY(0 == dsl_prop_unregister(ds, "checksum", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "compression", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "copies", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "dedup", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "logbias", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "sync", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os)); } - VERIFY(0 == dsl_prop_unregister(ds, "primarycache", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", + VERIFY0(dsl_prop_unregister(ds, + zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os)); } if (os->os_sa) sa_tear_down(os); - /* - * We should need only a single pass over the dnode list, since - * nothing can be added to the list at this point. 
- */ - (void) dmu_objset_evict_dbufs(os); + dmu_objset_evict_dbufs(os); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { @@ -567,7 +609,7 @@ dmu_objset_evict(objset_t *os) ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* * This is a barrier to prevent the objset from going away in @@ -599,10 +641,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); + if (ds != NULL) - VERIFY(0 == dmu_objset_from_ds(ds, &os)); + VERIFY0(dmu_objset_from_ds(ds, &os)); else - VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); + VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); @@ -650,359 +693,181 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, return (os); } -struct oscarg { - void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); - void *userarg; - dsl_dataset_t *clone_origin; - const char *lastname; - dmu_objset_type_t type; - uint64_t flags; - cred_t *cr; -}; +typedef struct dmu_objset_create_arg { + const char *doca_name; + cred_t *doca_cred; + void (*doca_userfunc)(objset_t *os, void *arg, + cred_t *cr, dmu_tx_t *tx); + void *doca_userarg; + dmu_objset_type_t doca_type; + uint64_t doca_flags; +} dmu_objset_create_arg_t; /*ARGSUSED*/ static int -dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_objset_create_check(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct oscarg *oa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - int err; - uint64_t ddobj; - - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - oa->lastname, sizeof (uint64_t), 1, &ddobj); - if (err != ENOENT) - return (err ? 
err : EEXIST); + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + int error; - if (oa->clone_origin != NULL) { - /* You can't clone across pools. */ - if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) - return (EXDEV); + if (strchr(doca->doca_name, '@') != NULL) + return (EINVAL); - /* You can only clone snapshots, not the head datasets. */ - if (!dsl_dataset_is_snapshot(oa->clone_origin)) - return (EINVAL); + error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); + if (error != 0) + return (error); + if (tail == NULL) { + dsl_dir_rele(pdd, FTAG); + return (EEXIST); } + dsl_dir_rele(pdd, FTAG); return (0); } static void -dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - spa_t *spa = dd->dd_pool->dp_spa; - struct oscarg *oa = arg2; - uint64_t obj; + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; dsl_dataset_t *ds; + uint64_t obj; blkptr_t *bp; + objset_t *os; - ASSERT(dmu_tx_is_syncing(tx)); + VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); - obj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_origin, oa->flags, oa->cr, tx); + obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, + doca->doca_cred, tx); - VERIFY3U(0, ==, dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); - if (BP_IS_HOLE(bp)) { - objset_t *os = - dmu_objset_create_impl(spa, ds, bp, oa->type, tx); + os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, + ds, bp, doca->doca_type, tx); - if (oa->userfunc) - oa->userfunc(os, oa->userarg, oa->cr, tx); + if (doca->doca_userfunc != NULL) { + doca->doca_userfunc(os, doca->doca_userarg, + doca->doca_cred, tx); } - if (oa->clone_origin == NULL) { - spa_history_log_internal_ds(ds, "create", tx, ""); - } else { - 
char namebuf[MAXNAMELEN]; - dsl_dataset_name(oa->clone_origin, namebuf); - spa_history_log_internal_ds(ds, "clone", tx, - "origin=%s (%llu)", namebuf, oa->clone_origin->ds_object); - } + spa_history_log_internal_ds(ds, "create", tx, ""); dsl_dataset_rele(ds, FTAG); + dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { - dsl_dir_t *pdd; - const char *tail; - int err = 0; - struct oscarg oa = { 0 }; - - ASSERT(strchr(name, '@') == NULL); - err = dsl_dir_open(name, FTAG, &pdd, &tail); - if (err) - return (err); - if (tail == NULL) { - dsl_dir_close(pdd, FTAG); - return (EEXIST); - } + dmu_objset_create_arg_t doca; - oa.userfunc = func; - oa.userarg = arg; - oa.lastname = tail; - oa.type = type; - oa.flags = flags; - oa.cr = CRED(); + doca.doca_name = name; + doca.doca_cred = CRED(); + doca.doca_flags = flags; + doca.doca_userfunc = func; + doca.doca_userarg = arg; + doca.doca_type = type; - err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, - dmu_objset_create_sync, pdd, &oa, 5); - dsl_dir_close(pdd, FTAG); - return (err); + return (dsl_sync_task(name, + dmu_objset_create_check, dmu_objset_create_sync, &doca, 5)); } -int -dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) +typedef struct dmu_objset_clone_arg { + const char *doca_clone; + const char *doca_origin; + cred_t *doca_cred; +} dmu_objset_clone_arg_t; + +/*ARGSUSED*/ +static int +dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { + dmu_objset_clone_arg_t *doca = arg; dsl_dir_t *pdd; const char *tail; - int err = 0; - struct oscarg oa = { 0 }; + int error; + dsl_dataset_t *origin; + dsl_pool_t *dp = dmu_tx_pool(tx); - ASSERT(strchr(name, '@') == NULL); - err = dsl_dir_open(name, FTAG, &pdd, &tail); - if (err) - return (err); + if (strchr(doca->doca_clone, '@') != NULL) + return (EINVAL); + + error = dsl_dir_hold(dp, doca->doca_clone, FTAG, 
&pdd, &tail); + if (error != 0) + return (error); if (tail == NULL) { - dsl_dir_close(pdd, FTAG); + dsl_dir_rele(pdd, FTAG); return (EEXIST); } - - oa.lastname = tail; - oa.clone_origin = clone_origin; - oa.flags = flags; - oa.cr = CRED(); - - err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, - dmu_objset_create_sync, pdd, &oa, 5); - dsl_dir_close(pdd, FTAG); - return (err); -} - -int -dmu_objset_destroy(const char *name, boolean_t defer) -{ - dsl_dataset_t *ds; - int error; - - error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); - if (error == 0) { - error = dsl_dataset_destroy(ds, FTAG, defer); - /* dsl_dataset_destroy() closes the ds. */ + /* You can't clone across pools. */ + if (pdd->dd_pool != dp) { + dsl_dir_rele(pdd, FTAG); + return (EXDEV); } + dsl_dir_rele(pdd, FTAG); - return (error); -} - -typedef struct snapallarg { - dsl_sync_task_group_t *saa_dstg; - boolean_t saa_needsuspend; - nvlist_t *saa_props; - - /* the following are used only if 'temporary' is set: */ - boolean_t saa_temporary; - const char *saa_htag; - struct dsl_ds_holdarg *saa_ha; - dsl_dataset_t *saa_newds; -} snapallarg_t; - -typedef struct snaponearg { - const char *soa_longname; /* long snap name */ - const char *soa_snapname; /* short snap name */ - snapallarg_t *soa_saa; -} snaponearg_t; - -static int -snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - snaponearg_t *soa = arg2; - snapallarg_t *saa = soa->soa_saa; - int error; - - /* The props have already been checked by zfs_check_userprops(). */ - - error = dsl_dataset_snapshot_check(os->os_dsl_dataset, - soa->soa_snapname, tx); - if (error) + error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); + if (error != 0) return (error); - if (saa->saa_temporary) { - /* - * Ideally we would just call - * dsl_dataset_user_hold_check() and - * dsl_dataset_destroy_check() here. 
However the - * dataset we want to hold and destroy is the snapshot - * that we just confirmed we can create, but it won't - * exist until after these checks are run. Do any - * checks we can here and if more checks are added to - * those routines in the future, similar checks may be - * necessary here. - */ - if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) - return (ENOTSUP); - /* - * Not checking number of tags because the tag will be - * unique, as it will be the only tag. - */ - if (strlen(saa->saa_htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) - return (E2BIG); - - saa->saa_ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), - KM_SLEEP); - saa->saa_ha->temphold = B_TRUE; - saa->saa_ha->htag = saa->saa_htag; + /* You can't clone across pools. */ + if (origin->ds_dir->dd_pool != dp) { + dsl_dataset_rele(origin, FTAG); + return (EXDEV); } - return (error); -} -static void -snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - dsl_dataset_t *ds = os->os_dsl_dataset; - snaponearg_t *soa = arg2; - snapallarg_t *saa = soa->soa_saa; - - dsl_dataset_snapshot_sync(ds, soa->soa_snapname, tx); - - if (saa->saa_props != NULL) { - dsl_props_arg_t pa; - pa.pa_props = saa->saa_props; - pa.pa_source = ZPROP_SRC_LOCAL; - dsl_props_set_sync(ds->ds_prev, &pa, tx); + /* You can only clone snapshots, not the head datasets. 
*/ + if (!dsl_dataset_is_snapshot(origin)) { + dsl_dataset_rele(origin, FTAG); + return (EINVAL); } + dsl_dataset_rele(origin, FTAG); - if (saa->saa_temporary) { - struct dsl_ds_destroyarg da; - - dsl_dataset_user_hold_sync(ds->ds_prev, saa->saa_ha, tx); - kmem_free(saa->saa_ha, sizeof (struct dsl_ds_holdarg)); - saa->saa_ha = NULL; - saa->saa_newds = ds->ds_prev; - - da.ds = ds->ds_prev; - da.defer = B_TRUE; - dsl_dataset_destroy_sync(&da, FTAG, tx); - } + return (0); } -static int -snapshot_one_impl(const char *snapname, void *arg) +static void +dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) { - char fsname[MAXPATHLEN]; - snapallarg_t *saa = arg; - snaponearg_t *soa; - objset_t *os; - int err; - - (void) strlcpy(fsname, snapname, sizeof (fsname)); - strchr(fsname, '@')[0] = '\0'; - - err = dmu_objset_hold(fsname, saa, &os); - if (err != 0) - return (err); - - /* - * If the objset is in an inconsistent state (eg, in the process - * of being destroyed), don't snapshot it. - */ - if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { - dmu_objset_rele(os, saa); - return (EBUSY); - } - - if (saa->saa_needsuspend) { - err = zil_suspend(dmu_objset_zil(os)); - if (err) { - dmu_objset_rele(os, saa); - return (err); - } - } + dmu_objset_clone_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + dsl_dataset_t *origin, *ds; + uint64_t obj; + char namebuf[MAXNAMELEN]; - soa = kmem_zalloc(sizeof (*soa), KM_SLEEP); - soa->soa_saa = saa; - soa->soa_longname = snapname; - soa->soa_snapname = strchr(snapname, '@') + 1; + VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); + VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); - dsl_sync_task_create(saa->saa_dstg, snapshot_check, snapshot_sync, - os, soa, 3); + obj = dsl_dataset_create_sync(pdd, tail, origin, 0, + doca->doca_cred, tx); - return (0); + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); + dsl_dataset_name(origin, namebuf); + 
spa_history_log_internal_ds(ds, "clone", tx, + "origin=%s (%llu)", namebuf, origin->ds_object); + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); } -/* - * The snapshots must all be in the same pool. - */ int -dmu_objset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) +dmu_objset_clone(const char *clone, const char *origin) { - dsl_sync_task_t *dst; - snapallarg_t saa = { 0 }; - spa_t *spa; - int rv = 0; - int err; - nvpair_t *pair; - - pair = nvlist_next_nvpair(snaps, NULL); - if (pair == NULL) - return (0); - - err = spa_open(nvpair_name(pair), &spa, FTAG); - if (err) - return (err); - saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - saa.saa_props = props; - saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); - - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - err = snapshot_one_impl(nvpair_name(pair), &saa); - if (err != 0) { - if (errors != NULL) { - fnvlist_add_int32(errors, - nvpair_name(pair), err); - } - rv = err; - } - } + dmu_objset_clone_arg_t doca; - /* - * If any call to snapshot_one_impl() failed, don't execute the - * sync task. The error handling code below will clean up the - * snaponearg_t from any successful calls to - * snapshot_one_impl(). 
- */ - if (rv == 0) - err = dsl_sync_task_group_wait(saa.saa_dstg); - if (err != 0) - rv = err; - - for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst; - dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) { - objset_t *os = dst->dst_arg1; - snaponearg_t *soa = dst->dst_arg2; - if (dst->dst_err != 0) { - if (errors != NULL) { - fnvlist_add_int32(errors, - soa->soa_longname, dst->dst_err); - } - rv = dst->dst_err; - } - - if (saa.saa_needsuspend) - zil_resume(dmu_objset_zil(os)); - dmu_objset_rele(os, &saa); - kmem_free(soa, sizeof (*soa)); - } + doca.doca_clone = clone; + doca.doca_origin = origin; + doca.doca_cred = CRED(); - dsl_sync_task_group_destroy(saa.saa_dstg); - spa_close(spa, FTAG); - return (rv); + return (dsl_sync_task(clone, + dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5)); } int @@ -1013,59 +878,12 @@ dmu_objset_snapshot_one(const char *fsname, const char *snapname) nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, longsnap); - err = dmu_objset_snapshot(snaps, NULL, NULL); - fnvlist_free(snaps); strfree(longsnap); + err = dsl_dataset_snapshot(snaps, NULL, NULL); + fnvlist_free(snaps); return (err); } -int -dmu_objset_snapshot_tmp(const char *snapname, const char *tag, int cleanup_fd) -{ - dsl_sync_task_t *dst; - snapallarg_t saa = { 0 }; - spa_t *spa; - minor_t minor; - int err; - - err = spa_open(snapname, &spa, FTAG); - if (err) - return (err); - saa.saa_dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - saa.saa_htag = tag; - saa.saa_needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); - saa.saa_temporary = B_TRUE; - - if (cleanup_fd < 0) { - spa_close(spa, FTAG); - return (EINVAL); - } - if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { - spa_close(spa, FTAG); - return (err); - } - - err = snapshot_one_impl(snapname, &saa); - - if (err == 0) - err = dsl_sync_task_group_wait(saa.saa_dstg); - - for (dst = list_head(&saa.saa_dstg->dstg_tasks); dst; - dst = list_next(&saa.saa_dstg->dstg_tasks, dst)) { - objset_t 
*os = dst->dst_arg1; - dsl_register_onexit_hold_cleanup(saa.saa_newds, tag, minor); - if (saa.saa_needsuspend) - zil_resume(dmu_objset_zil(os)); - dmu_objset_rele(os, &saa); - } - - zfs_onexit_fd_rele(cleanup_fd); - dsl_sync_task_group_destroy(saa.saa_dstg); - spa_close(spa, FTAG); - return (err); -} - - static void dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { @@ -1101,9 +919,9 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - ASSERT(bp == os->os_rootbp); - ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); - ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT3P(bp, ==, os->os_rootbp); + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); + ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects @@ -1210,7 +1028,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { - ASSERT(dr->dr_dbuf->db_level == 0); + ASSERT0(dr->dr_dbuf->db_level); list_remove(list, dr); if (dr->dr_zio) zio_nowait(dr->dr_zio); @@ -1505,12 +1323,12 @@ dmu_objset_userspace_upgrade(objset_t *os) return (EINTR); objerr = dmu_bonus_hold(os, obj, FTAG, &db); - if (objerr) + if (objerr != 0) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); - if (objerr) { + if (objerr != 0) { dmu_tx_abort(tx); continue; } @@ -1593,6 +1411,8 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, zap_cursor_t cursor; zap_attribute_t attr; + ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); + if (ds->ds_phys->ds_snapnames_zapobj == 0) return (ENOENT); @@ -1659,42 +1479,122 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, return (0); } -struct findarg { - int (*func)(const char *, void *); - void *arg; -}; - -/* ARGSUSED */ -static int -findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - struct findarg *fa = arg; - 
return (fa->func(dsname, fa->arg)); -} - /* - * Find all objsets under name, and for each, call 'func(child_name, arg)'. - * Perhaps change all callers to use dmu_objset_find_spa()? + * Find objsets under and including ddobj, call func(ds) on each. */ int -dmu_objset_find(char *name, int func(const char *, void *), void *arg, - int flags) +dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, + int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { - struct findarg fa; - fa.func = func; - fa.arg = arg; - return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); + dsl_dir_t *dd; + dsl_dataset_t *ds; + zap_cursor_t zc; + zap_attribute_t *attr; + uint64_t thisobj; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); + if (err != 0) + return (err); + + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_rele(dd, FTAG); + return (0); + } + + thisobj = dd->dd_phys->dd_head_dataset_obj; + attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* + * Iterate over all children. + */ + if (flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dd->dd_phys->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + err = dmu_objset_find_dp(dp, attr->za_first_integer, + func, arg, flags); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + + if (err != 0) { + dsl_dir_rele(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + return (err); + } + } + + /* + * Iterate over all snapshots. 
+ */ + if (flags & DS_FIND_SNAPSHOTS) { + dsl_dataset_t *ds; + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + + if (err == 0) { + uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + err = dsl_dataset_hold_obj(dp, + attr->za_first_integer, FTAG, &ds); + if (err != 0) + break; + err = func(dp, ds, arg); + dsl_dataset_rele(ds, FTAG); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + } + } + + dsl_dir_rele(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + + if (err != 0) + return (err); + + /* + * Apply to self. + */ + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + if (err != 0) + return (err); + err = func(dp, ds, arg); + dsl_dataset_rele(ds, FTAG); + return (err); } /* - * Find all objsets under name, call func on each + * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * The dp_config_rwlock must not be held when this is called, and it + * will not be held when the callback is called. + * Therefore this function should only be used when the pool is not changing + * (e.g. in syncing context), or the callback can deal with the possible races. 
*/ -int -dmu_objset_find_spa(spa_t *spa, const char *name, - int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) +static int +dmu_objset_find_impl(spa_t *spa, const char *name, + int func(const char *, void *), void *arg, int flags) { dsl_dir_t *dd; - dsl_pool_t *dp; + dsl_pool_t *dp = spa_get_dsl(spa); dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; @@ -1702,21 +1602,23 @@ dmu_objset_find_spa(spa_t *spa, const char *name, uint64_t thisobj; int err; - if (name == NULL) - name = spa_name(spa); - err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); - if (err) + dsl_pool_config_enter(dp, FTAG); + + err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); return (err); + } /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); return (0); } thisobj = dd->dd_phys->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - dp = dd->dd_pool; /* * Iterate over all children. 
@@ -1726,19 +1628,24 @@ dmu_objset_find_spa(spa_t *spa, const char *name, dd->dd_phys->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s/%s", name, attr->za_name); - err = dmu_objset_find_spa(spa, child, func, arg, flags); + dsl_pool_config_exit(dp, FTAG); + err = dmu_objset_find_impl(spa, child, + func, arg, flags); + dsl_pool_config_enter(dp, FTAG); strfree(child); - if (err) + if (err != 0) break; } zap_cursor_fini(&zc); - if (err) { - dsl_dir_close(dd, FTAG); + if (err != 0) { + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } @@ -1748,11 +1655,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name, * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { - if (!dsl_pool_sync_context(dp)) - rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - if (!dsl_pool_sync_context(dp)) - rw_exit(&dp->dp_config_rwlock); if (err == 0) { uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; @@ -1761,64 +1664,50 @@ dmu_objset_find_spa(spa_t *spa, const char *name, for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == + ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s@%s", name, attr->za_name); - err = func(spa, attr->za_first_integer, - child, arg); + dsl_pool_config_exit(dp, FTAG); + err = func(child, arg); + dsl_pool_config_enter(dp, FTAG); strfree(child); - if (err) + if (err != 0) break; } zap_cursor_fini(&zc); } } - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); 
kmem_free(attr, sizeof (zap_attribute_t)); + dsl_pool_config_exit(dp, FTAG); - if (err) + if (err != 0) return (err); - /* - * Apply to self if appropriate. - */ - err = func(spa, thisobj, name, arg); - return (err); + /* Apply to self. */ + return (func(name, arg)); } -/* ARGSUSED */ +/* + * See comment above dmu_objset_find_impl(). + */ int -dmu_objset_prefetch(const char *name, void *arg) +dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags) { - dsl_dataset_t *ds; - - if (dsl_dataset_hold(name, FTAG, &ds)) - return (0); - - if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { - mutex_enter(&ds->ds_opening_lock); - if (ds->ds_objset == NULL) { - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - zbookmark_t zb; - - SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, - ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - - (void) arc_read(NULL, dsl_dataset_get_spa(ds), - &ds->ds_phys->ds_bp, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zb); - } - mutex_exit(&ds->ds_opening_lock); - } + spa_t *spa; + int error; - dsl_dataset_rele(ds, FTAG); - return (0); + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + error = dmu_objset_find_impl(spa, name, func, arg, flags); + spa_close(spa, FTAG); + return (error); } void @@ -1834,3 +1723,19 @@ dmu_objset_get_user(objset_t *os) ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); return (os->os_user_ptr); } + +/* + * Determine name of filesystem, given name of snapshot. 
+ * buf must be at least MAXNAMELEN bytes + */ +int +dmu_fsname(const char *snapname, char *buf) +{ + char *atp = strchr(snapname, '@'); + if (atp == NULL) + return (EINVAL); + if (atp - snapname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strlcpy(buf, snapname, atp - snapname + 1); + return (0); +} diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c index c249335..a2a3647 100644 --- a/uts/common/fs/zfs/dmu_send.c +++ b/uts/common/fs/zfs/dmu_send.c @@ -46,11 +46,14 @@ #include #include #include +#include +#include /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; static char *dmu_recv_tag = "dmu_recv_tag"; +static const char *recv_clone_name = "%recv"; static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) @@ -290,7 +293,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) return (EINTR); - if (dsp->dsa_err) + if (dsp->dsa_err != 0) return (EINTR); return (0); } @@ -340,7 +343,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = dump_dnode(dsp, dnobj, blk+i); - if (err) + if (err != 0) break; } (void) arc_buf_remove_ref(abuf, &abuf); @@ -388,65 +391,33 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } /* - * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. - * For example, they could both be snapshots of the same filesystem, and - * 'earlier' is before 'later'. Or 'earlier' could be the origin of - * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's - * filesystem. Or 'earlier' could be the origin's origin. + * Releases dp, ds, and fromds, using the specified tag. 
*/ -static boolean_t -is_before(dsl_dataset_t *later, dsl_dataset_t *earlier) -{ - dsl_pool_t *dp = later->ds_dir->dd_pool; - int error; - boolean_t ret; - dsl_dataset_t *origin; - - if (earlier->ds_phys->ds_creation_txg >= - later->ds_phys->ds_creation_txg) - return (B_FALSE); - - if (later->ds_dir == earlier->ds_dir) - return (B_TRUE); - if (!dsl_dir_is_clone(later->ds_dir)) - return (B_FALSE); - - rw_enter(&dp->dp_config_rwlock, RW_READER); - if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) { - rw_exit(&dp->dp_config_rwlock); - return (B_TRUE); - } - error = dsl_dataset_hold_obj(dp, - later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin); - rw_exit(&dp->dp_config_rwlock); - if (error != 0) - return (B_FALSE); - ret = is_before(origin, earlier); - dsl_dataset_rele(origin, FTAG); - return (ret); -} - -int -dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, - offset_t *off) +static int +dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, + dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off) { - dsl_dataset_t *ds = tosnap->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; + objset_t *os; dmu_replay_record_t *drr; dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; - /* tosnap must be a snapshot */ - if (ds->ds_phys->ds_next_snap_obj == 0) - return (EINVAL); - - /* - * fromsnap must be an earlier snapshot from the same fs as tosnap, - * or the origin's fs. 
- */ - if (fromds != NULL && !is_before(ds, fromds)) + if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) { + dsl_dataset_rele(fromds, tag); + dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); return (EXDEV); + } + + err = dmu_objset_from_ds(ds, &os); + if (err != 0) { + if (fromds != NULL) + dsl_dataset_rele(fromds, tag); + dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); + return (err); + } drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; @@ -455,13 +426,17 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, DMU_SUBSTREAM); #ifdef _KERNEL - if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { + if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; - if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) { + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); + if (fromds != NULL) + dsl_dataset_rele(fromds, tag); + dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); return (EINVAL); } - if (version == ZPL_VERSION_SA) { + if (version >= ZPL_VERSION_SA) { DMU_SET_FEATUREFLAGS( drr->drr_u.drr_begin.drr_versioninfo, DMU_BACKUP_FEATURE_SA_SPILL); @@ -471,19 +446,22 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; - drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; + drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (fromds != NULL && ds->ds_dir != fromds->ds_dir) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; - if (fromds) + if (fromds != NULL) drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (fromds) + if (fromds != NULL) { fromtxg = fromds->ds_phys->ds_creation_txg; + dsl_dataset_rele(fromds, tag); 
+ fromds = NULL; + } dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); @@ -491,7 +469,7 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, dsp->dsa_vp = vp; dsp->dsa_outfd = outfd; dsp->dsa_proc = curproc; - dsp->dsa_os = tosnap; + dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = ds->ds_phys->ds_guid; ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); @@ -506,6 +484,9 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, goto out; } + dsl_dataset_long_hold(ds, FTAG); + dsl_pool_rele(dp, tag); + err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, dsp); @@ -513,8 +494,8 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp, if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) err = EINTR; - if (err) { - if (err == EINTR && dsp->dsa_err) + if (err != 0) { + if (err == EINTR && dsp->dsa_err != 0) err = dsp->dsa_err; goto out; } @@ -537,27 +518,96 @@ out: kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); + dsl_dataset_long_rele(ds, FTAG); + dsl_dataset_rele(ds, tag); + return (err); } int -dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep) +dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + int outfd, vnode_t *vp, offset_t *off) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + dsl_dataset_t *fromds = NULL; + int err; + + err = dsl_pool_hold(pool, FTAG, &dp); + if (err != 0) + return (err); + + err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (fromsnap != 0) { + err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); + if (err != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + } + + return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off)); +} + +int +dmu_send(const char *tosnap, const char *fromsnap, + int outfd, vnode_t *vp, offset_t *off) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + 
dsl_dataset_t *fromds = NULL; + int err; + + if (strchr(tosnap, '@') == NULL) + return (EINVAL); + if (fromsnap != NULL && strchr(fromsnap, '@') == NULL) + return (EINVAL); + + err = dsl_pool_hold(tosnap, FTAG, &dp); + if (err != 0) + return (err); + + err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (fromsnap != NULL) { + err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); + if (err != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + } + return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off)); +} + +int +dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) { - dsl_dataset_t *ds = tosnap->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; dsl_pool_t *dp = ds->ds_dir->dd_pool; int err; uint64_t size; + ASSERT(dsl_pool_config_held(dp)); + /* tosnap must be a snapshot */ - if (ds->ds_phys->ds_next_snap_obj == 0) + if (!dsl_dataset_is_snapshot(ds)) return (EINVAL); /* * fromsnap must be an earlier snapshot from the same fs as tosnap, * or the origin's fs. */ - if (fromds != NULL && !is_before(ds, fromds)) + if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) return (EXDEV); /* Get uncompressed size estimate of changed data. */ @@ -567,7 +617,7 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep) uint64_t used, comp; err = dsl_dataset_space_written(fromds, ds, &used, &comp, &size); - if (err) + if (err != 0) return (err); } @@ -587,11 +637,8 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep) * block, which we observe in practice. 
*/ uint64_t recordsize; - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_prop_get_ds(ds, "recordsize", - sizeof (recordsize), 1, &recordsize, NULL); - rw_exit(&dp->dp_config_rwlock); - if (err) + err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); + if (err != 0) return (err); size -= size / recordsize * sizeof (blkptr_t); @@ -603,93 +650,40 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep) return (0); } -struct recvbeginsyncarg { - const char *tofs; - const char *tosnap; - dsl_dataset_t *origin; - uint64_t fromguid; - dmu_objset_type_t type; - void *tag; - boolean_t force; - uint64_t dsflags; - char clonelastname[MAXNAMELEN]; - dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ - cred_t *cr; -}; - -/* ARGSUSED */ -static int -recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct recvbeginsyncarg *rbsa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val; - int err; - - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); - - if (err != ENOENT) - return (err ? err : EEXIST); - - if (rbsa->origin) { - /* make sure it's a snap in the same pool */ - if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) - return (EXDEV); - if (!dsl_dataset_is_snapshot(rbsa->origin)) - return (EINVAL); - if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); - } - - return (0); -} - -static void -recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct recvbeginsyncarg *rbsa = arg2; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; - uint64_t dsobj; - - /* Create and open new dataset. 
*/ - dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, - rbsa->origin, flags, rbsa->cr, tx); - VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, - B_TRUE, dmu_recv_tag, &rbsa->ds)); - - if (rbsa->origin == NULL) { - (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, - rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); - } - - spa_history_log_internal_ds(rbsa->ds, "receive new", tx, ""); -} +typedef struct dmu_recv_begin_arg { + const char *drba_origin; + dmu_recv_cookie_t *drba_cookie; + cred_t *drba_cred; +} dmu_recv_begin_arg_t; -/* ARGSUSED */ static int -recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, + uint64_t fromguid) { - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - int err; uint64_t val; + int error; + dsl_pool_t *dp = ds->ds_dir->dd_pool; /* must not have any changes since most recent snapshot */ - if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) + if (!drba->drba_cookie->drc_force && + dsl_dataset_modified_since_lastsnap(ds)) return (ETXTBSY); + /* temporary clone name must not exist */ + error = zap_lookup(dp->dp_meta_objset, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name, + 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? EBUSY : error); + /* new snapshot name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); + error = zap_lookup(dp->dp_meta_objset, + ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, + 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? 
EEXIST : error); - if (rbsa->fromguid) { + if (fromguid != 0) { /* if incremental, most recent snapshot must match fromguid */ if (ds->ds_prev == NULL) return (ENODEV); @@ -698,20 +692,20 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) * most recent snapshot must match fromguid, or there are no * changes since the fromguid one */ - if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { + if (ds->ds_prev->ds_phys->ds_guid != fromguid) { uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; while (obj != 0) { dsl_dataset_t *snap; - err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - obj, FTAG, &snap); - if (err) + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) return (ENODEV); if (snap->ds_phys->ds_creation_txg < birth) { dsl_dataset_rele(snap, FTAG); return (ENODEV); } - if (snap->ds_phys->ds_guid == rbsa->fromguid) { + if (snap->ds_phys->ds_guid == fromguid) { dsl_dataset_rele(snap, FTAG); break; /* it's ok */ } @@ -727,58 +721,153 @@ recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) return (ENODEV); } - /* temporary clone name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_dir->dd_phys->dd_child_dir_zapobj, - rbsa->clonelastname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - return (0); + +} + +static int +dmu_recv_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + uint64_t fromguid = drrb->drr_fromguid; + int flags = drrb->drr_flags; + int error; + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) + 
return (EINVAL); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) { + return (ENOTSUP); + } + + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error == 0) { + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + error = recv_begin_check_existing_impl(drba, ds, fromguid); + dsl_dataset_rele(ds, FTAG); + } else if (error == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char buf[MAXNAMELEN]; + + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. + */ + if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) + return (ENOENT); + + /* Open the parent of tofs */ + ASSERT3U(strlen(tofs), <, MAXNAMELEN); + (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); + error = dsl_dataset_hold(dp, buf, FTAG, &ds); + if (error != 0) + return (error); + + if (drba->drba_origin != NULL) { + dsl_dataset_t *origin; + error = dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + if (!dsl_dataset_is_snapshot(origin)) { + dsl_dataset_rele(origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + if (origin->ds_phys->ds_guid != fromguid) { + dsl_dataset_rele(origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (ENODEV); + } + dsl_dataset_rele(origin, FTAG); + } + dsl_dataset_rele(ds, FTAG); + error = 0; + } + return (error); } -/* ARGSUSED */ static void -recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ohds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - dsl_pool_t *dp = ohds->ds_dir->dd_pool; - dsl_dataset_t *cds; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; + 
dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds, *newds; uint64_t dsobj; + int error; + uint64_t crflags; + + crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? + DS_FLAG_CI_DATASET : 0; - /* create and open the temporary clone */ - dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, - ohds->ds_prev, flags, rbsa->cr, tx); - VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error == 0) { + /* create temporary clone */ + dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, + ds->ds_prev, crflags, drba->drba_cred, tx); + dsl_dataset_rele(ds, FTAG); + } else { + dsl_dir_t *dd; + const char *tail; + dsl_dataset_t *origin = NULL; + + VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); + + if (drba->drba_origin != NULL) { + VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin)); + } + + /* Create new dataset. */ + dsobj = dsl_dataset_create_sync(dd, + strrchr(tofs, '/') + 1, + origin, crflags, drba->drba_cred, tx); + if (origin != NULL) + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(dd, FTAG); + drba->drba_cookie->drc_newfs = B_TRUE; + } + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + + dmu_buf_will_dirty(newds->ds_dbuf, tx); + newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; /* * If we actually created a non-clone, we need to create the * objset in our new dataset. 
*/ - if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { + if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { (void) dmu_objset_create_impl(dp->dp_spa, - cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); + newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } - rbsa->ds = cds; - - spa_history_log_internal_ds(cds, "receive over existing", tx, ""); -} - -static boolean_t -dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) -{ - int featureflags; - - featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + drba->drba_cookie->drc_ds = newds; - /* Verify pool version supports SA if SA_SPILL feature set */ - return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && - (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); + spa_history_log_internal_ds(newds, "receive", tx, ""); } /* @@ -786,132 +875,55 @@ dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) * succeeds; otherwise we will leak the holds on the datasets. */ int -dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, - boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, + boolean_t force, char *origin, dmu_recv_cookie_t *drc) { - int err = 0; - boolean_t byteswap; - struct recvbeginsyncarg rbsa = { 0 }; - uint64_t versioninfo; - int flags; - dsl_dataset_t *ds; - - if (drrb->drr_magic == DMU_BACKUP_MAGIC) - byteswap = FALSE; - else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) - byteswap = TRUE; - else - return (EINVAL); - - rbsa.tofs = tofs; - rbsa.tosnap = tosnap; - rbsa.origin = origin ? 
origin->os_dsl_dataset : NULL; - rbsa.fromguid = drrb->drr_fromguid; - rbsa.type = drrb->drr_type; - rbsa.tag = FTAG; - rbsa.dsflags = 0; - rbsa.cr = CRED(); - versioninfo = drrb->drr_versioninfo; - flags = drrb->drr_flags; - - if (byteswap) { - rbsa.type = BSWAP_32(rbsa.type); - rbsa.fromguid = BSWAP_64(rbsa.fromguid); - versioninfo = BSWAP_64(versioninfo); - flags = BSWAP_32(flags); - } - - if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || - rbsa.type >= DMU_OST_NUMTYPES || - ((flags & DRR_FLAG_CLONE) && origin == NULL)) - return (EINVAL); - - if (flags & DRR_FLAG_CI_DATA) - rbsa.dsflags = DS_FLAG_CI_DATASET; + dmu_recv_begin_arg_t drba = { 0 }; + dmu_replay_record_t *drr; bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drrb = drrb; drc->drc_tosnap = tosnap; - drc->drc_top_ds = top_ds; + drc->drc_tofs = tofs; drc->drc_force = force; - /* - * Process the begin in syncing context. - */ - - /* open the dataset we are logically receiving into */ - err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); - if (err == 0) { - if (dmu_recv_verify_features(ds, drrb)) { - dsl_dataset_rele(ds, dmu_recv_tag); - return (ENOTSUP); - } - /* target fs already exists; recv into temp clone */ - - /* Can't recv a clone into an existing fs */ - if (flags & DRR_FLAG_CLONE) { - dsl_dataset_rele(ds, dmu_recv_tag); - return (EINVAL); - } - - /* must not have an incremental recv already in progress */ - if (!mutex_tryenter(&ds->ds_recvlock)) { - dsl_dataset_rele(ds, dmu_recv_tag); - return (EBUSY); - } - - /* tmp clone name is: tofs/%tosnap" */ - (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), - "%%%s", tosnap); - rbsa.force = force; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_existing_check, recv_existing_sync, ds, &rbsa, 5); - if (err) { - mutex_exit(&ds->ds_recvlock); - dsl_dataset_rele(ds, dmu_recv_tag); - return (err); - } - drc->drc_logical_ds = ds; - drc->drc_real_ds = rbsa.ds; - } else if (err == ENOENT) { - /* target fs does not exist; must be a 
full backup or clone */ - char *cp; - - /* - * If it's a non-clone incremental, we are missing the - * target fs, so fail the recv. - */ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) - return (ENOENT); - - /* Open the parent of tofs */ - cp = strrchr(tofs, '/'); - *cp = '\0'; - err = dsl_dataset_hold(tofs, FTAG, &ds); - *cp = '/'; - if (err) - return (err); + if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) + drc->drc_byteswap = B_TRUE; + else if (drrb->drr_magic != DMU_BACKUP_MAGIC) + return (EINVAL); - if (dmu_recv_verify_features(ds, drrb)) { - dsl_dataset_rele(ds, FTAG); - return (ENOTSUP); - } + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); + drr->drr_type = DRR_BEGIN; + drr->drr_u.drr_begin = *drc->drc_drrb; + if (drc->drc_byteswap) { + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + } else { + fletcher_4_incremental_native(drr, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + } + kmem_free(drr, sizeof (dmu_replay_record_t)); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; - drc->drc_newfs = B_TRUE; + if (drc->drc_byteswap) { + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); + drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } - return (err); + drba.drba_origin = origin; + drba.drba_cookie = drc; + drba.drba_cred = CRED(); + + return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5)); } struct restorearg { int err; - int byteswap; + boolean_t byteswap; vnode_t *vp; char *buf; uint64_t voff; @@ -947,7 +959,7 @@ free_guid_map_onexit(void *arg) guid_map_entry_t *gmep; while ((gmep = avl_destroy_nodes(ca, 
&cookie)) != NULL) { - dsl_dataset_rele(gmep->gme_ds, ca); + dsl_dataset_long_rele(gmep->gme_ds, gmep); kmem_free(gmep, sizeof (guid_map_entry_t)); } avl_destroy(ca); @@ -975,7 +987,7 @@ restore_read(struct restorearg *ra, int len) ra->err = EINVAL; ra->voff += len - done - resid; done = len - resid; - if (ra->err) + if (ra->err != 0) return (NULL); } @@ -1094,7 +1106,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) if (drro->drr_bonuslen) { data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); - if (ra->err) + if (ra->err != 0) return (ra->err); } @@ -1103,7 +1115,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } @@ -1117,14 +1129,14 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen); } - if (err) { + if (err != 0) { return (EINVAL); } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, drro->drr_object); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } @@ -1172,7 +1184,7 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, continue; err = dmu_free_object(os, obj); - if (err) + if (err != 0) return (err); } return (0); @@ -1202,7 +1214,7 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } @@ -1264,7 +1276,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } @@ -1305,7 +1317,7 @@ restore_spill(struct restorearg *ra, 
objset_t *os, struct drr_spill *drrs) dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); @@ -1344,6 +1356,16 @@ restore_free(struct restorearg *ra, objset_t *os, return (err); } +/* used to destroy the drc_ds on error */ +static void +dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) +{ + char name[MAXNAMELEN]; + dsl_dataset_name(drc->drc_ds, name); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + (void) dsl_destroy_head(name); +} + /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ @@ -1357,52 +1379,24 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, zio_cksum_t pcksum; int featureflags; - if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) - ra.byteswap = TRUE; - - { - /* compute checksum of drr_begin record */ - dmu_replay_record_t *drr; - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin = *drc->drc_drrb; - if (ra.byteswap) { - fletcher_4_incremental_byteswap(drr, - sizeof (dmu_replay_record_t), &ra.cksum); - } else { - fletcher_4_incremental_native(drr, - sizeof (dmu_replay_record_t), &ra.cksum); - } - kmem_free(drr, sizeof (dmu_replay_record_t)); - } - - if (ra.byteswap) { - struct drr_begin *drrb = drc->drc_drrb; - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); - } - + ra.byteswap = drc->drc_byteswap; + ra.cksum = drc->drc_cksum; ra.vp = vp; ra.voff = *voffp; ra.bufsize = 1<<20; ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ - ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == + 
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); - ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); + ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ - VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); + VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); - ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); @@ -1415,7 +1409,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, goto out; } ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (ra.err) { + if (ra.err != 0) { cleanup_fd = -1; goto out; } @@ -1429,12 +1423,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, ra.err = zfs_onexit_add_cb(minor, free_guid_map_onexit, ra.guid_to_ds_map, action_handlep); - if (ra.err) + if (ra.err != 0) goto out; } else { ra.err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&ra.guid_to_ds_map); - if (ra.err) + if (ra.err != 0) goto out; } @@ -1528,14 +1522,7 @@ out: * destroy what we created, so we don't leave it in the * inconsistent restoring state. 
*/ - txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, - B_FALSE); - if (drc->drc_real_ds != drc->drc_logical_ds) { - mutex_exit(&drc->drc_logical_ds->ds_recvlock); - dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); - } + dmu_recv_cleanup_ds(drc); } kmem_free(ra.buf, ra.bufsize); @@ -1543,142 +1530,176 @@ out: return (ra.err); } -struct recvendsyncarg { - char *tosnap; - uint64_t creation_time; - uint64_t toguid; -}; - static int -recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_recv_end_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct recvendsyncarg *resa = arg2; + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int error; + + ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); - return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); + if (error != 0) + return (error); + error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, + origin_head, drc->drc_force); + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + error = dsl_dataset_snapshot_check_impl(origin_head, + drc->drc_tosnap, tx); + dsl_dataset_rele(origin_head, FTAG); + if (error != 0) + return (error); + + error = dsl_destroy_head_check_impl(drc->drc_ds, 1); + } else { + error = dsl_dataset_snapshot_check_impl(drc->drc_ds, + drc->drc_tosnap, tx); + } + return (error); } static void -recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct recvendsyncarg *resa = arg2; + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + + spa_history_log_internal_ds(drc->drc_ds, "finish receiving", + tx, "snap=%s", drc->drc_tosnap); + + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, + &origin_head)); 
+ dsl_dataset_clone_swap_sync_impl(drc->drc_ds, + origin_head, tx); + dsl_dataset_snapshot_sync_impl(origin_head, + drc->drc_tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); + origin_head->ds_prev->ds_phys->ds_creation_time = + drc->drc_drrb->drr_creation_time; + origin_head->ds_prev->ds_phys->ds_guid = + drc->drc_drrb->drr_toguid; + origin_head->ds_prev->ds_phys->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); + origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + + dsl_dataset_rele(origin_head, FTAG); + dsl_destroy_head_sync_impl(drc->drc_ds, tx); + } else { + dsl_dataset_t *ds = drc->drc_ds; - dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); + dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; - ds->ds_prev->ds_phys->ds_guid = resa->toguid; - ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ds->ds_prev->ds_phys->ds_creation_time = + drc->drc_drrb->drr_creation_time; + ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid; + ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; - spa_history_log_internal_ds(ds, "finished receiving", tx, ""); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + } + drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj; + /* + * Release the hold from dmu_recv_begin. This must be done before + * we return to open context, so that when we free the dataset's dnode, + * we can evict its bonus buffer. 
+ */ + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + drc->drc_ds = NULL; } static int -add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) +add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; + dsl_pool_t *dp; dsl_dataset_t *snapds; guid_map_entry_t *gmep; int err; ASSERT(guid_map != NULL); - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); + err = dsl_pool_hold(name, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snapds); if (err == 0) { gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); gmep->guid = snapds->ds_phys->ds_guid; gmep->gme_ds = snapds; avl_add(guid_map, gmep); + dsl_dataset_long_hold(snapds, gmep); + dsl_dataset_rele(snapds, FTAG); } - rw_exit(&dp->dp_config_rwlock); + dsl_pool_rele(dp, FTAG); return (err); } +static int dmu_recv_end_modified_blocks = 3; + static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { - struct recvendsyncarg resa; - dsl_dataset_t *ds = drc->drc_logical_ds; - int err, myerr; - - if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { - err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, - drc->drc_force); - if (err) - goto out; - } else { - mutex_exit(&ds->ds_recvlock); - dsl_dataset_rele(ds, dmu_recv_tag); - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, - B_FALSE); - return (EBUSY); - } + int error; + char name[MAXNAMELEN]; - resa.creation_time = drc->drc_drrb->drr_creation_time; - resa.toguid = drc->drc_drrb->drr_toguid; - resa.tosnap = drc->drc_tosnap; +#ifdef _KERNEL + /* + * We will be destroying the ds; make sure its origin is unmounted if + * necessary. 
+ */ + dsl_dataset_name(drc->drc_ds, name); + zfs_destroy_unmount_origin(name); +#endif - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_end_check, recv_end_sync, ds, &resa, 3); - if (err) { - /* swap back */ - (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); - } + error = dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks); -out: - mutex_exit(&ds->ds_recvlock); - if (err == 0 && drc->drc_guid_to_ds_map != NULL) - (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); - dsl_dataset_disown(ds, dmu_recv_tag); - myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); - ASSERT0(myerr); - return (err); + if (error != 0) + dmu_recv_cleanup_ds(drc); + return (error); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { - struct recvendsyncarg resa; - dsl_dataset_t *ds = drc->drc_logical_ds; - int err; - - /* - * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() - * expects it to have a ds_user_ptr (and zil), but clone_swap() - * can close it. 
- */ - txg_wait_synced(ds->ds_dir->dd_pool, 0); + int error; - resa.creation_time = drc->drc_drrb->drr_creation_time; - resa.toguid = drc->drc_drrb->drr_toguid; - resa.tosnap = drc->drc_tosnap; + error = dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_end_check, recv_end_sync, ds, &resa, 3); - if (err) { - /* clean up the fs we just recv'd into */ - (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); - } else { - if (drc->drc_guid_to_ds_map != NULL) - (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); - /* release the hold from dmu_recv_begin */ - dsl_dataset_disown(ds, dmu_recv_tag); + if (error != 0) { + dmu_recv_cleanup_ds(drc); + } else if (drc->drc_guid_to_ds_map != NULL) { + (void) add_ds_to_guidmap(drc->drc_tofs, + drc->drc_guid_to_ds_map, + drc->drc_newsnapobj); } - return (err); + return (error); } int dmu_recv_end(dmu_recv_cookie_t *drc) { - if (drc->drc_logical_ds != drc->drc_real_ds) - return (dmu_recv_existing_end(drc)); - else + if (drc->drc_newfs) return (dmu_recv_new_end(drc)); + else + return (dmu_recv_existing_end(drc)); } diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c index f3d5069..e976517 100644 --- a/uts/common/fs/zfs/dmu_traverse.c +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -265,7 +265,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) + if (err != 0) return (err); cbp = buf->b_data; @@ -282,7 +282,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, &cbp[i], &czb); - if (err) { + if (err != 0) { if (!hard) break; lasterr = err; @@ -295,7 +295,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, 
&buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) + if (err != 0) return (err); dnp = buf->b_data; @@ -308,7 +308,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, for (i = 0; i < epb; i++) { err = traverse_dnode(td, &dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); - if (err) { + if (err != 0) { if (!hard) break; lasterr = err; @@ -321,7 +321,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) + if (err != 0) return (err); osp = buf->b_data; @@ -405,7 +405,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); - if (err) { + if (err != 0) { if (!hard) break; lasterr = err; @@ -415,7 +415,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); - if (err) { + if (err != 0) { if (!hard) return (err); lasterr = err; @@ -514,14 +514,20 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ - if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { - objset_t *os; + if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { + uint32_t flags = ARC_WAIT; + objset_phys_t *osp; + arc_buf_t *buf; - err = dmu_objset_from_ds(ds, &os); - if (err) + err = arc_read(NULL, td.td_spa, rootbp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + if (err != 0) return (err); - traverse_zil(&td, &os->os_zil_header); + osp = buf->b_data; + traverse_zil(&td, &osp->os_zil_header); + (void) arc_buf_remove_ref(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || @@ -583,7 +589,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, /* visit the MOS */ err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), txg_start, NULL, flags, func, arg); - if (err) + if (err != 0) return (err); /* visit each dataset */ @@ -592,7 +598,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); - if (err) { + if (err != 0) { if (!hard) return (err); lasterr = err; @@ -603,10 +609,10 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, dsl_dataset_t *ds; uint64_t txg = txg_start; - rw_enter(&dp->dp_config_rwlock, RW_READER); + dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (err) { + dsl_pool_config_exit(dp, FTAG); + if (err != 0) { if (!hard) return (err); lasterr = err; @@ -616,7 +622,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, txg = ds->ds_phys->ds_prev_snap_txg; err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); - if (err) { + if (err != 0) { if (!hard) return (err); lasterr = err; diff --git a/uts/common/fs/zfs/dmu_tx.c b/uts/common/fs/zfs/dmu_tx.c index 556ae6a..8e6beec 100644 --- a/uts/common/fs/zfs/dmu_tx.c +++ b/uts/common/fs/zfs/dmu_tx.c @@ -898,7 +898,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) #endif static int 
-dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) +dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) { dmu_tx_hold_t *txh; spa_t *spa = tx->tx_pool->dp_spa; @@ -962,13 +962,6 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) } /* - * NB: This check must be after we've held the dnodes, so that - * the dmu_tx_unassign() logic will work properly - */ - if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) - return (ERESTART); - - /* * If a snapshot has been taken since we made our estimates, * assume that we won't be able to free or overwrite anything. */ @@ -1048,26 +1041,25 @@ dmu_tx_unassign(dmu_tx_t *tx) * * (1) TXG_WAIT. If the current open txg is full, waits until there's * a new one. This should be used when you're not holding locks. - * If will only fail if we're truly out of space (or over quota). + * It will only fail if we're truly out of space (or over quota). * * (2) TXG_NOWAIT. If we can't assign into the current open txg without * blocking, returns immediately with ERESTART. This should be used * whenever you're holding locks. On an ERESTART error, the caller * should drop locks, do a dmu_tx_wait(tx), and try again. - * - * (3) A specific txg. Use this if you need to ensure that multiple - * transactions all sync in the same txg. Like TXG_NOWAIT, it - * returns ERESTART if it can't assign you into the requested txg. */ int -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) { int err; ASSERT(tx->tx_txg == 0); - ASSERT(txg_how != 0); + ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + /* If we might wait, we must not hold the config lock. 
*/ + ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); @@ -1088,6 +1080,7 @@ dmu_tx_wait(dmu_tx_t *tx) spa_t *spa = tx->tx_pool->dp_spa; ASSERT(tx->tx_txg == 0); + ASSERT(!dsl_pool_config_held(tx->tx_pool)); /* * It's possible that the pool has become active after this thread @@ -1214,6 +1207,14 @@ dmu_tx_get_txg(dmu_tx_t *tx) return (tx->tx_txg); } +dsl_pool_t * +dmu_tx_pool(dmu_tx_t *tx) +{ + ASSERT(tx->tx_pool != NULL); + return (tx->tx_pool); +} + + void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { diff --git a/uts/common/fs/zfs/dnode.c b/uts/common/fs/zfs/dnode.c index 6838576..5b5ece4 100644 --- a/uts/common/fs/zfs/dnode.c +++ b/uts/common/fs/zfs/dnode.c @@ -72,7 +72,11 @@ dnode_cons(void *arg, void *unused, int kmflag) mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); - refcount_create(&dn->dn_holds); + /* + * Every dbuf has a reference, and dropping a tracked reference is + * O(number of references), so don't track dn_holds. + */ + refcount_create_untracked(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); diff --git a/uts/common/fs/zfs/dnode_sync.c b/uts/common/fs/zfs/dnode_sync.c index 38dab66..7d47ce0 100644 --- a/uts/common/fs/zfs/dnode_sync.c +++ b/uts/common/fs/zfs/dnode_sync.c @@ -477,6 +477,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); /* * XXX - It would be nice to assert this, but we may still diff --git a/uts/common/fs/zfs/dsl_dataset.c b/uts/common/fs/zfs/dsl_dataset.c index 0a5ef83..5e0446d 100644 --- a/uts/common/fs/zfs/dsl_dataset.c +++ b/uts/common/fs/zfs/dsl_dataset.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -45,12 +45,8 @@ #include #include #include - -static char *dsl_reaper = "the grim reaper"; - -static dsl_checkfunc_t dsl_dataset_destroy_begin_check; -static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; -static dsl_syncfunc_t dsl_dataset_set_reservation_sync; +#include +#include #define SWITCH64(x, y) \ { \ @@ -63,9 +59,6 @@ static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE -#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) - - /* * Figure out how much of this delta should be propogated to the dsl_dir * layer. If there's a refreservation, that space has already been @@ -252,7 +245,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) { dsl_dataset_t *ds = dsv; - ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); + ASSERT(ds->ds_owner == NULL); unique_remove(ds->ds_fsid_guid); @@ -260,32 +253,26 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { - dsl_dataset_drop_ref(ds->ds_prev, ds); + dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = NULL; } bplist_destroy(&ds->ds_pending_deadlist); - if (db != NULL) { + if (ds->ds_phys->ds_deadlist_obj != 0) dsl_deadlist_close(&ds->ds_deadlist); - } else { - ASSERT(ds->ds_deadlist.dl_dbuf == NULL); - ASSERT(!ds->ds_deadlist.dl_oldfmt); - } if (ds->ds_dir) - dsl_dir_close(ds->ds_dir, ds); + dsl_dir_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); + refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); } -static int +int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; @@ -301,7 
+288,7 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &headdbuf); - if (err) + if (err != 0) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, @@ -310,7 +297,7 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) return (err); } -static int +int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; @@ -330,8 +317,8 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) return (err); } -static int -dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) +int +dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; @@ -351,8 +338,8 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) return (err); } -static int -dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; @@ -361,11 +348,10 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, int err; dmu_object_info_t doi; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); + ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); - if (err) + if (err != 0) return (err); /* Make sure dsobj has the correct object type. 
*/ @@ -383,12 +369,9 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); - - rw_init(&ds->ds_rwlock, 0, 0, 0); - cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&ds->ds_longholds); bplist_create(&ds->ds_pending_deadlist); dsl_deadlist_open(&ds->ds_deadlist, @@ -398,15 +381,13 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, offsetof(dmu_sendarg_t, dsa_link)); if (err == 0) { - err = dsl_dir_open_obj(dp, + err = dsl_dir_hold_obj(dp, ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); } - if (err) { + if (err != 0) { mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); + refcount_destroy(&ds->ds_longholds); bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); @@ -416,8 +397,8 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; - if (ds->ds_phys->ds_prev_snap_obj) { - err = dsl_dataset_get_ref(dp, + if (ds->ds_phys->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev); } @@ -433,29 +414,14 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { - /* - * In sync context, we're called with either no lock - * or with the write lock. If we're not syncing, - * we're always called with the read lock held. 
- */ - boolean_t need_lock = - !RW_WRITE_HELD(&dp->dp_config_rwlock) && - dsl_pool_sync_context(dp); - - if (need_lock) - rw_enter(&dp->dp_config_rwlock, RW_READER); - - err = dsl_prop_get_ds(ds, - "refreservation", sizeof (uint64_t), 1, - &ds->ds_reserved, NULL); + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + &ds->ds_reserved); if (err == 0) { - err = dsl_prop_get_ds(ds, - "refquota", sizeof (uint64_t), 1, - &ds->ds_quota, NULL); + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + &ds->ds_quota); } - - if (need_lock) - rw_exit(&dp->dp_config_rwlock); } else { ds->ds_reserved = ds->ds_quota = 0; } @@ -465,15 +431,13 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) - dsl_dataset_drop_ref(ds->ds_prev, ds); - dsl_dir_close(ds->ds_dir, ds); + dsl_dataset_rele(ds->ds_prev, ds); + dsl_dir_rele(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); + refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); - if (err) { + if (err != 0) { dmu_buf_rele(dbuf, tag); return (err); } @@ -488,170 +452,118 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); - mutex_enter(&ds->ds_lock); - if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { - mutex_exit(&ds->ds_lock); - dmu_buf_rele(ds->ds_dbuf, tag); - return (ENOENT); - } - mutex_exit(&ds->ds_lock); *dsp = ds; return (0); } -static int -dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* - * In syncing context we don't want the rwlock lock: there - * may be an existing writer waiting for sync phase 
to - * finish. We don't need to worry about such writers, since - * sync phase is single-threaded, so the writer can't be - * doing anything while we are active. - */ - if (dsl_pool_sync_context(dp)) { - ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - return (0); - } - - /* - * Normal users will hold the ds_rwlock as a READER until they - * are finished (i.e., call dsl_dataset_rele()). "Owners" will - * drop their READER lock after they set the ds_owner field. - * - * If the dataset is being destroyed, the destroy thread will - * obtain a WRITER lock for exclusive access after it's done its - * open-context work and then change the ds_owner to - * dsl_reaper once destruction is assured. So threads - * may block here temporarily, until the "destructability" of - * the dataset is determined. - */ - ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); - mutex_enter(&ds->ds_lock); - while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { - rw_exit(&dp->dp_config_rwlock); - cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); - if (DSL_DATASET_IS_DESTROYED(ds)) { - mutex_exit(&ds->ds_lock); - dsl_dataset_drop_ref(ds, tag); - rw_enter(&dp->dp_config_rwlock, RW_READER); - return (ENOENT); - } - /* - * The dp_config_rwlock lives above the ds_lock. And - * we need to check DSL_DATASET_IS_DESTROYED() while - * holding the ds_lock, so we have to drop and reacquire - * the ds_lock here. 
- */ - mutex_exit(&ds->ds_lock); - rw_enter(&dp->dp_config_rwlock, RW_READER); - mutex_enter(&ds->ds_lock); - } - mutex_exit(&ds->ds_lock); - return (0); -} - -int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **dsp) -{ - int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); - - if (err) - return (err); - return (dsl_dataset_hold_ref(*dsp, tag)); -} - int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, +dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); - if (err) - return (err); - if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { - dsl_dataset_rele(*dsp, tag); - *dsp = NULL; - return (EBUSY); - } - return (0); -} - -int -dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) -{ dsl_dir_t *dd; - dsl_pool_t *dp; const char *snapname; uint64_t obj; int err = 0; - err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); - if (err) + err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); + if (err != 0) return (err); - dp = dd->dd_pool; + ASSERT(dsl_pool_config_held(dp)); obj = dd->dd_phys->dd_head_dataset_obj; - rw_enter(&dp->dp_config_rwlock, RW_READER); - if (obj) - err = dsl_dataset_get_ref(dp, obj, tag, dsp); + if (obj != 0) + err = dsl_dataset_hold_obj(dp, obj, tag, dsp); else err = ENOENT; - if (err) - goto out; - - err = dsl_dataset_hold_ref(*dsp, tag); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { - dsl_dataset_t *ds = NULL; + dsl_dataset_t *ds; if (*snapname++ != '@') { dsl_dataset_rele(*dsp, tag); - err = ENOENT; - goto out; + dsl_dir_rele(dd, FTAG); + return (ENOENT); } dprintf("looking for snapshot '%s'\n", snapname); err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); if (err == 0) - err = dsl_dataset_get_ref(dp, obj, tag, &ds); + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); dsl_dataset_rele(*dsp, tag); - ASSERT3U((err == 0), ==, (ds != NULL)); - - if (ds) { + 
if (err == 0) { mutex_enter(&ds->ds_lock); if (ds->ds_snapname[0] == 0) (void) strlcpy(ds->ds_snapname, snapname, sizeof (ds->ds_snapname)); mutex_exit(&ds->ds_lock); - err = dsl_dataset_hold_ref(ds, tag); - *dsp = err ? NULL : ds; + *dsp = ds; } } -out: - rw_exit(&dp->dp_config_rwlock); - dsl_dir_close(dd, FTAG); + + dsl_dir_rele(dd, FTAG); return (err); } int -dsl_dataset_own(const char *name, boolean_t inconsistentok, +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); + if (err != 0) + return (err); + if (!dsl_dataset_tryown(*dsp, tag)) { + dsl_dataset_rele(*dsp, tag); + *dsp = NULL; + return (EBUSY); + } + return (0); +} + +int +dsl_dataset_own(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold(name, tag, dsp); - if (err) + int err = dsl_dataset_hold(dp, name, tag, dsp); + if (err != 0) return (err); - if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + if (!dsl_dataset_tryown(*dsp, tag)) { dsl_dataset_rele(*dsp, tag); return (EBUSY); } return (0); } +/* + * See the comment above dsl_pool_hold() for details. In summary, a long + * hold is used to prevent destruction of a dataset while the pool hold + * is dropped, allowing other concurrent operations (e.g. spa_sync()). + * + * The dataset and pool must be held when this function is called. After it + * is called, the pool hold may be released while the dataset is still held + * and accessed. + */ +void +dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + (void) refcount_add(&ds->ds_longholds, tag); +} + +void +dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) +{ + (void) refcount_remove(&ds->ds_longholds, tag); +} + +/* Return B_TRUE if there are any long holds on this dataset. 
*/ +boolean_t +dsl_dataset_long_held(dsl_dataset_t *ds) +{ + return (!refcount_is_zero(&ds->ds_longholds)); +} + void dsl_dataset_name(dsl_dataset_t *ds, char *name) { @@ -659,7 +571,7 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) (void) strcpy(name, "mos"); } else { dsl_dir_name(ds->ds_dir, name); - VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); /* @@ -686,7 +598,7 @@ dsl_dataset_namelen(dsl_dataset_t *ds) result = 3; /* "mos" */ } else { result = dsl_dir_namelen(ds->ds_dir); - VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { ++result; /* adding one for the @-sign */ if (!MUTEX_HELD(&ds->ds_lock)) { @@ -703,64 +615,41 @@ dsl_dataset_namelen(dsl_dataset_t *ds) } void -dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) -{ - dmu_buf_rele(ds->ds_dbuf, tag); -} - -void dsl_dataset_rele(dsl_dataset_t *ds, void *tag) { - if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { - rw_exit(&ds->ds_rwlock); - } - dsl_dataset_drop_ref(ds, tag); + dmu_buf_rele(ds->ds_dbuf, tag); } void dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { - ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || - (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); + ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; - if (RW_WRITE_HELD(&ds->ds_rwlock)) { - rw_exit(&ds->ds_rwlock); - cv_broadcast(&ds->ds_exclusive_cv); - } mutex_exit(&ds->ds_lock); - if (ds->ds_dbuf) - dsl_dataset_drop_ref(ds, tag); + dsl_dataset_long_rele(ds, tag); + if (ds->ds_dbuf != NULL) + dsl_dataset_rele(ds, tag); else dsl_dataset_evict(NULL, ds); } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) +dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) { boolean_t gotit = FALSE; mutex_enter(&ds->ds_lock); - if (ds->ds_owner == NULL && - (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { + if (ds->ds_owner 
== NULL && !DS_IS_INCONSISTENT(ds)) { ds->ds_owner = tag; - if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) - rw_exit(&ds->ds_rwlock); + dsl_dataset_long_hold(ds, tag); gotit = TRUE; } mutex_exit(&ds->ds_lock); return (gotit); } -void -dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) -{ - ASSERT3P(owner, ==, ds->ds_owner); - if (!RW_WRITE_HELD(&ds->ds_rwlock)) - rw_enter(&ds->ds_rwlock, RW_WRITER); -} - uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx) @@ -781,7 +670,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); @@ -799,7 +688,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, if (origin == NULL) { dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); } else { - dsl_dataset_t *ohds; + dsl_dataset_t *ohds; /* head of the origin snapshot */ dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = @@ -816,7 +705,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dmu_buf_will_dirty(origin->ds_dbuf, tx); origin->ds_phys->ds_num_children++; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + VERIFY0(dsl_dataset_hold_obj(dp, origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); @@ -828,9 +717,8 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, zap_create(mos, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY(0 == zap_add_int(mos, - origin->ds_phys->ds_next_clones_obj, - dsobj, tx)); + VERIFY0(zap_add_int(mos, + origin->ds_phys->ds_next_clones_obj, dsobj, tx)); } 
dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -842,7 +730,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY3U(0, ==, zap_add_int(mos, + VERIFY0(zap_add_int(mos, origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); } } @@ -858,6 +746,16 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, return (dsobj); } +static void +dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + dsl_dataset_dirty(ds, tx); +} + uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) @@ -866,29 +764,28 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, uint64_t dsobj, ddobj; dsl_dir_t *dd; + ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); - VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); + VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); - dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); + dsobj = dsl_dataset_create_sync_dd(dd, origin, + flags & ~DS_CREATE_FLAG_NODIRTY, tx); dsl_deleg_set_create_perms(dd, tx, cr); - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); /* * If we are creating a clone, make sure we zero out any stale * data from the origin snapshots zil header. 
*/ - if (origin != NULL) { + if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { dsl_dataset_t *ds; - objset_t *os; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); - bzero(&os->os_zil_header, sizeof (os->os_zil_header)); - dsl_dataset_dirty(ds, tx); + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_zero_zil(ds, tx); dsl_dataset_rele(ds, FTAG); } @@ -896,316 +793,110 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, } /* - * The snapshots must all be in the same pool. + * The unique space in the head dataset can be calculated by subtracting + * the space used in the most recent snapshot, that is still being used + * in this file system, from the space currently in use. To figure out + * the space in the most recent snapshot still in use, we need to take + * the total space used in the snapshot and subtract out the space that + * has been freed up since the snapshot was taken. */ -int -dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, - nvlist_t *errlist) +void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) { - int err; - dsl_sync_task_t *dst; - spa_t *spa; - nvpair_t *pair; - dsl_sync_task_group_t *dstg; - - pair = nvlist_next_nvpair(snaps, NULL); - if (pair == NULL) - return (0); - - err = spa_open(nvpair_name(pair), &spa, FTAG); - if (err) - return (err); - dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - dsl_dataset_t *ds; - - err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); - if (err == 0) { - struct dsl_ds_destroyarg *dsda; - - dsl_dataset_make_exclusive(ds, dstg); - dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), - KM_SLEEP); - dsda->ds = ds; - dsda->defer = defer; - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, dsda, dstg, 0); - } else if (err == ENOENT) { - err = 0; - } else { - 
fnvlist_add_int32(errlist, nvpair_name(pair), err); - break; - } - } + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; - if (err == 0) - err = dsl_sync_task_group_wait(dstg); + ASSERT(!dsl_dataset_is_snapshot(ds)); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - struct dsl_ds_destroyarg *dsda = dst->dst_arg1; - dsl_dataset_t *ds = dsda->ds; + if (ds->ds_phys->ds_prev_snap_obj != 0) + mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; + else + mrs_used = 0; - /* - * Return the snapshots that triggered the error. - */ - if (dst->dst_err != 0) { - char name[ZFS_MAXNAMELEN]; - dsl_dataset_name(ds, name); - fnvlist_add_int32(errlist, name, dst->dst_err); - } - ASSERT3P(dsda->rm_origin, ==, NULL); - dsl_dataset_disown(ds, dstg); - kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); - } + dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); - dsl_sync_task_group_destroy(dstg); - spa_close(spa, FTAG); - return (err); + ASSERT3U(dlused, <=, mrs_used); + ds->ds_phys->ds_unique_bytes = + ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } -static boolean_t -dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) +void +dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx) { - boolean_t might_destroy = B_FALSE; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; + + ASSERT(ds->ds_phys->ds_num_children >= 2); + err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); + /* + * The err should not be ENOENT, but a bug in a previous version + * of the code could cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a missing entry. + * If we knew that the pool was created after + * SPA_VERSION_NEXT_CLONES, we could assert that it isn't + * ENOENT. 
However, at least we can check that we don't have + * too many entries in the next_clones_obj even after failing to + * remove this one. + */ + if (err != ENOENT) + VERIFY0(err); + ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, + &count)); + ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); +} - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && - DS_IS_DEFER_DESTROY(ds)) - might_destroy = B_TRUE; - mutex_exit(&ds->ds_lock); - return (might_destroy); +blkptr_t * +dsl_dataset_get_blkptr(dsl_dataset_t *ds) +{ + return (&ds->ds_phys->ds_bp); } -/* - * If we're removing a clone, and these three conditions are true: - * 1) the clone's origin has no other children - * 2) the clone's origin has no user references - * 3) the clone's origin has been marked for deferred destruction - * Then, prepare to remove the origin as part of this sync task group. - */ -static int -dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) +void +dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { - dsl_dataset_t *ds = dsda->ds; - dsl_dataset_t *origin = ds->ds_prev; - - if (dsl_dataset_might_destroy_origin(origin)) { - char *name; - int namelen; - int error; - - namelen = dsl_dataset_namelen(origin) + 1; - name = kmem_alloc(namelen, KM_SLEEP); - dsl_dataset_name(origin, name); -#ifdef _KERNEL - error = zfs_unmount_snap(name, NULL); - if (error) { - kmem_free(name, namelen); - return (error); - } -#endif - error = dsl_dataset_own(name, B_TRUE, tag, &origin); - kmem_free(name, namelen); - if (error) - return (error); - dsda->rm_origin = origin; - dsl_dataset_make_exclusive(origin, tag); + ASSERT(dmu_tx_is_syncing(tx)); + /* If it's the meta-objset, set dp_meta_rootbp */ + if (ds == NULL) { + tx->tx_pool->dp_meta_rootbp = *bp; + } else { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_bp = *bp; } +} - return (0); +spa_t * +dsl_dataset_get_spa(dsl_dataset_t *ds) +{ + return 
(ds->ds_dir->dd_pool->dp_spa); } -/* - * ds must be opened as OWNER. On return (whether successful or not), - * ds will be closed and caller can no longer dereference it. - */ -int -dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) +void +dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) { - int err; - dsl_sync_task_group_t *dstg; - objset_t *os; - dsl_dir_t *dd; - uint64_t obj; - struct dsl_ds_destroyarg dsda = { 0 }; + dsl_pool_t *dp; - dsda.ds = ds; + if (ds == NULL) /* this is the meta-objset */ + return; - if (dsl_dataset_is_snapshot(ds)) { - /* Destroying a snapshot is simpler */ - dsl_dataset_make_exclusive(ds, tag); - - dsda.defer = defer; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - &dsda, tag, 0); - ASSERT3P(dsda.rm_origin, ==, NULL); - goto out; - } else if (defer) { - err = EINVAL; - goto out; - } + ASSERT(ds->ds_objset != NULL); - dd = ds->ds_dir; + if (ds->ds_phys->ds_next_snap_obj != 0) + panic("dirtying snapshot!"); - if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { - /* - * Check for errors and mark this ds as inconsistent, in - * case we crash while freeing the objects. - */ - err = dsl_sync_task_do(dd->dd_pool, - dsl_dataset_destroy_begin_check, - dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) - goto out; + dp = ds->ds_dir->dd_pool; - err = dmu_objset_from_ds(ds, &os); - if (err) - goto out; - - /* - * Remove all objects while in the open context so that - * there is less work to do in the syncing context. - */ - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, - ds->ds_phys->ds_prev_snap_txg)) { - /* - * Ignore errors, if there is not enough disk space - * we will deal with it in dsl_dataset_destroy_sync(). - */ - (void) dmu_free_object(os, obj); - } - if (err != ESRCH) - goto out; - - /* - * Sync out all in-flight IO. 
- */ - txg_wait_synced(dd->dd_pool, 0); - - /* - * If we managed to free all the objects in open - * context, the user space accounting should be zero. - */ - if (ds->ds_phys->ds_bp.blk_fill == 0 && - dmu_objset_userused_enabled(os)) { - uint64_t count; - - ASSERT(zap_count(os, DMU_USERUSED_OBJECT, - &count) != 0 || count == 0); - ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, - &count) != 0 || count == 0); - } - } - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); - rw_exit(&dd->dd_pool->dp_config_rwlock); - - if (err) - goto out; - - /* - * Blow away the dsl_dir + head dataset. - */ - dsl_dataset_make_exclusive(ds, tag); - /* - * If we're removing a clone, we might also need to remove its - * origin. - */ - do { - dsda.need_prep = B_FALSE; - if (dsl_dir_is_clone(dd)) { - err = dsl_dataset_origin_rm_prep(&dsda, tag); - if (err) { - dsl_dir_close(dd, FTAG); - goto out; - } - } - - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, &dsda, tag, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, dd, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); - - /* - * We could be racing against 'zfs release' or 'zfs destroy -d' - * on the origin snap, in which case we can get EBUSY if we - * needed to destroy the origin snap but were not ready to - * do so. 
- */ - if (dsda.need_prep) { - ASSERT(err == EBUSY); - ASSERT(dsl_dir_is_clone(dd)); - ASSERT(dsda.rm_origin == NULL); - } - } while (dsda.need_prep); - - if (dsda.rm_origin != NULL) - dsl_dataset_disown(dsda.rm_origin, tag); - - /* if it is successful, dsl_dir_destroy_sync will close the dd */ - if (err) - dsl_dir_close(dd, FTAG); -out: - dsl_dataset_disown(ds, tag); - return (err); -} - -blkptr_t * -dsl_dataset_get_blkptr(dsl_dataset_t *ds) -{ - return (&ds->ds_phys->ds_bp); -} - -void -dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - /* If it's the meta-objset, set dp_meta_rootbp */ - if (ds == NULL) { - tx->tx_pool->dp_meta_rootbp = *bp; - } else { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_bp = *bp; - } -} - -spa_t * -dsl_dataset_get_spa(dsl_dataset_t *ds) -{ - return (ds->ds_dir->dd_pool->dp_spa); -} - -void -dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp; - - if (ds == NULL) /* this is the meta-objset */ - return; - - ASSERT(ds->ds_objset != NULL); - - if (ds->ds_phys->ds_next_snap_obj != 0) - panic("dirtying snapshot!"); - - dp = ds->ds_dir->dd_pool; - - if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { - /* up the hold count until we can be written out */ - dmu_buf_add_ref(ds->ds_dbuf, ds); - } -} + if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + } +} boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds) @@ -1218,850 +909,146 @@ dsl_dataset_is_dirty(dsl_dataset_t *ds) return (B_FALSE); } -/* - * The unique space in the head dataset can be calculated by subtracting - * the space used in the most recent snapshot, that is still being used - * in this file system, from the space currently in use. 
To figure out - * the space in the most recent snapshot still in use, we need to take - * the total space used in the snapshot and subtract out the space that - * has been freed up since the snapshot was taken. - */ -static void -dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) -{ - uint64_t mrs_used; - uint64_t dlused, dlcomp, dluncomp; - - ASSERT(!dsl_dataset_is_snapshot(ds)); - - if (ds->ds_phys->ds_prev_snap_obj != 0) - mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; - else - mrs_used = 0; - - dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); - - ASSERT3U(dlused, <=, mrs_used); - ds->ds_phys->ds_unique_bytes = - ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); - - if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_UNIQUE_ACCURATE) - ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; -} - -struct killarg { - dsl_dataset_t *ds; - dmu_tx_t *tx; -}; - -/* ARGSUSED */ static int -kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) { - struct killarg *ka = arg; - dmu_tx_t *tx = ka->tx; + uint64_t asize; - if (bp == NULL) + if (!dmu_tx_is_syncing(tx)) return (0); - if (zb->zb_level == ZB_ZIL_LEVEL) { - ASSERT(zilog != NULL); - /* - * It's a block in the intent log. It has no - * accounting, so just free it. - */ - dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); - } else { - ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); - } - - return (0); -} - -/* ARGSUSED */ -static int -dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t count; - int err; - /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) 
+ * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. */ - if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EBUSY); + ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); + asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); /* - * This is really a dsl_dir thing, but check it here so that - * we'll be less likely to leave this dataset inconsistent & - * nearly destroyed. + * Propagate any reserved space for this snapshot to other + * snapshot checks in this sync group. */ - err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); - if (err) - return (err); - if (count != 0) - return (EEXIST); + if (asize > 0) + dsl_dir_willuse_space(ds->ds_dir, asize, tx); return (0); } -/* ARGSUSED */ -static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - /* Mark it as inconsistent on-disk, in case we crash */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_log_internal_ds(ds, "destroy begin", tx, ""); -} - -static int -dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, - dmu_tx_t *tx) -{ - dsl_dataset_t *ds = dsda->ds; - dsl_dataset_t *ds_prev = ds->ds_prev; - - if (dsl_dataset_might_destroy_origin(ds_prev)) { - struct dsl_ds_destroyarg ndsda = {0}; - - /* - * If we're not prepared to remove the origin, don't remove - * the clone either. - */ - if (dsda->rm_origin == NULL) { - dsda->need_prep = B_TRUE; - return (EBUSY); - } - - ndsda.ds = ds_prev; - ndsda.is_origin_rm = B_TRUE; - return (dsl_dataset_destroy_check(&ndsda, tag, tx)); - } - - /* - * If we're not going to remove the origin after all, - * undo the open context setup. 
- */ - if (dsda->rm_origin != NULL) { - dsl_dataset_disown(dsda->rm_origin, tag); - dsda->rm_origin = NULL; - } - - return (0); -} +typedef struct dsl_dataset_snapshot_arg { + nvlist_t *ddsa_snaps; + nvlist_t *ddsa_props; + nvlist_t *ddsa_errors; +} dsl_dataset_snapshot_arg_t; -/* - * If you add new checks here, you may need to add - * additional checks to the "temporary" case in - * snapshot_check() in dmu_objset.c. - */ -/* ARGSUSED */ int -dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx) { - struct dsl_ds_destroyarg *dsda = arg1; - dsl_dataset_t *ds = dsda->ds; + int error; + uint64_t value; - /* we have an owner hold, so noone else can destroy us */ - ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); + ds->ds_trysnap_txg = tx->tx_txg; - /* - * Only allow deferred destroy on pools that support it. - * NOTE: deferred destroy is only supported on snapshots. - */ - if (dsda->defer) { - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < - SPA_VERSION_USERREFS) - return (ENOTSUP); - ASSERT(dsl_dataset_is_snapshot(ds)); + if (!dmu_tx_is_syncing(tx)) return (0); - } /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) + * We don't allow multiple snapshots of the same txg. If there + * is already one, try again. */ - if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EBUSY); + if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) + return (EAGAIN); /* - * If we made changes this txg, traverse_dsl_dataset won't find - * them. Try again. + * Check for conflicting snapshot name. */ - if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) - return (EAGAIN); - - if (dsl_dataset_is_snapshot(ds)) { - /* - * If this snapshot has an elevated user reference count, - * we can't destroy it yet. 
- */ - if (ds->ds_userrefs > 0 && !dsda->releasing) - return (EBUSY); + error = dsl_dataset_snap_lookup(ds, snapname, &value); + if (error == 0) + return (EEXIST); + if (error != ENOENT) + return (error); - mutex_enter(&ds->ds_lock); - /* - * Can't delete a branch point. However, if we're destroying - * a clone and removing its origin due to it having a user - * hold count of 0 and having been marked for deferred destroy, - * it's OK for the origin to have a single clone. - */ - if (ds->ds_phys->ds_num_children > - (dsda->is_origin_rm ? 2 : 1)) { - mutex_exit(&ds->ds_lock); - return (EEXIST); - } - mutex_exit(&ds->ds_lock); - } else if (dsl_dir_is_clone(ds->ds_dir)) { - return (dsl_dataset_origin_check(dsda, arg2, tx)); - } + error = dsl_dataset_snapshot_reserve_space(ds, tx); + if (error != 0) + return (error); - /* XXX we should do some i/o error checking... */ return (0); } -struct refsarg { - kmutex_t lock; - boolean_t gone; - kcondvar_t cv; -}; - -/* ARGSUSED */ -static void -dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) +static int +dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) { - struct refsarg *arg = argv; + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int rv = 0; + + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + int error = 0; + dsl_dataset_t *ds; + char *name, *atp; + char dsname[MAXNAMELEN]; + + name = nvpair_name(pair); + if (strlen(name) >= MAXNAMELEN) + error = ENAMETOOLONG; + if (error == 0) { + atp = strchr(name, '@'); + if (atp == NULL) + error = EINVAL; + if (error == 0) + (void) strlcpy(dsname, name, atp - name + 1); + } + if (error == 0) + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + error = dsl_dataset_snapshot_check_impl(ds, + atp + 1, tx); + dsl_dataset_rele(ds, FTAG); + } - mutex_enter(&arg->lock); - arg->gone = TRUE; - cv_signal(&arg->cv); - mutex_exit(&arg->lock); + if 
(error != 0) { + if (ddsa->ddsa_errors != NULL) { + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + } + rv = error; + } + } + return (rv); } -static void -dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) +void +dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx) { - struct refsarg arg; + static zil_header_t zero_zil; - mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); - arg.gone = FALSE; - (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, - dsl_dataset_refs_gone); - dmu_buf_rele(ds->ds_dbuf, tag); - mutex_enter(&arg.lock); - while (!arg.gone) - cv_wait(&arg.cv, &arg.lock); - ASSERT(arg.gone); - mutex_exit(&arg.lock); - ds->ds_dbuf = NULL; - ds->ds_phys = NULL; - mutex_destroy(&arg.lock); - cv_destroy(&arg.cv); -} + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj, crtxg; + objset_t *mos = dp->dp_meta_objset; + objset_t *os; -static void -remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t count; - int err; + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - ASSERT(ds->ds_phys->ds_num_children >= 2); - err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); /* - * The err should not be ENOENT, but a bug in a previous version - * of the code could cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a missing entry. - * If we knew that the pool was created after - * SPA_VERSION_NEXT_CLONES, we could assert that it isn't - * ENOENT. However, at least we can check that we don't have - * too many entries in the next_clones_obj even after failing to - * remove this one. + * If we are on an old pool, the zil must not be active, in which + * case it will be zeroed. Usually zil_suspend() accomplishes this. 
*/ - if (err != ENOENT) { - VERIFY0(err); - } - ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, - &count)); - ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); -} + ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || + dmu_objset_from_ds(ds, &os) != 0 || + bcmp(&os->os_phys->os_zil_header, &zero_zil, + sizeof (zero_zil)) == 0); -static void -dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; /* - * If it is the old version, dd_clones doesn't exist so we can't - * find the clones, but deadlist_remove_key() is a no-op so it - * doesn't matter. - */ - if (ds->ds_dir->dd_phys->dd_clones == 0) - return; - - for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *clone; - - VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone)); - if (clone->ds_dir->dd_origin_txg > mintxg) { - dsl_deadlist_remove_key(&clone->ds_deadlist, - mintxg, tx); - dsl_dataset_remove_clones_key(clone, mintxg, tx); - } - dsl_dataset_rele(clone, FTAG); - } - zap_cursor_fini(&zc); -} - -struct process_old_arg { - dsl_dataset_t *ds; - dsl_dataset_t *ds_prev; - boolean_t after_branch_point; - zio_t *pio; - uint64_t used, comp, uncomp; -}; - -static int -process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -{ - struct process_old_arg *poa = arg; - dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; - - if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { - dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); - if (poa->ds_prev && !poa->after_branch_point && - bp->blk_birth > - poa->ds_prev->ds_phys->ds_prev_snap_txg) { - poa->ds_prev->ds_phys->ds_unique_bytes += - bp_get_dsize_sync(dp->dp_spa, bp); - } - } else { - poa->used += bp_get_dsize_sync(dp->dp_spa, bp); - poa->comp += BP_GET_PSIZE(bp); - 
poa->uncomp += BP_GET_UCSIZE(bp); - dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); - } - return (0); -} - -static void -process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, - dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) -{ - struct process_old_arg poa = { 0 }; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - - ASSERT(ds->ds_deadlist.dl_oldfmt); - ASSERT(ds_next->ds_deadlist.dl_oldfmt); - - poa.ds = ds; - poa.ds_prev = ds_prev; - poa.after_branch_point = after_branch_point; - poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, - process_old_cb, &poa, tx)); - VERIFY0(zio_wait(poa.pio)); - ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); - - /* change snapused */ - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -poa.used, -poa.comp, -poa.uncomp, tx); - - /* swap next's deadlist to our deadlist */ - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_close(&ds_next->ds_deadlist); - SWITCH64(ds_next->ds_phys->ds_deadlist_obj, - ds->ds_phys->ds_deadlist_obj); - dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); - dsl_deadlist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj); -} - -static int -old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - int err; - struct killarg ka; - - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. 
- */ - ka.ds = ds; - ka.tx = tx; - err = traverse_dataset(ds, - ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, - kill_blkptr, &ka); - ASSERT0(err); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); - - return (err); -} - -void -dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) -{ - struct dsl_ds_destroyarg *dsda = arg1; - dsl_dataset_t *ds = dsda->ds; - int err; - int after_branch_point = FALSE; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - dsl_dataset_t *ds_prev = NULL; - boolean_t wont_destroy; - uint64_t obj; - - wont_destroy = (dsda->defer && - (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); - - ASSERT(ds->ds_owner || wont_destroy); - ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); - ASSERT(ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); - ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); - - if (wont_destroy) { - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; - spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); - return; - } - - /* We need to log before removing it from the namespace. 
*/ - spa_history_log_internal_ds(ds, "destroy", tx, ""); - - /* signal any waiters that this dataset is going away */ - mutex_enter(&ds->ds_lock); - ds->ds_owner = dsl_reaper; - cv_broadcast(&ds->ds_exclusive_cv); - mutex_exit(&ds->ds_lock); - - /* Remove our reservation */ - if (ds->ds_reserved != 0) { - dsl_prop_setarg_t psa; - uint64_t value = 0; - - dsl_prop_setarg_init_uint64(&psa, "refreservation", - (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), - &value); - psa.psa_effective_value = 0; /* predict default value */ - - dsl_dataset_set_reservation_sync(ds, &psa, tx); - ASSERT0(ds->ds_reserved); - } - - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - - dsl_scan_ds_destroyed(ds, tx); - - obj = ds->ds_object; - - if (ds->ds_phys->ds_prev_snap_obj != 0) { - if (ds->ds_prev) { - ds_prev = ds->ds_prev; - } else { - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); - } - after_branch_point = - (ds_prev->ds_phys->ds_next_snap_obj != obj); - - dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); - if (after_branch_point && - ds_prev->ds_phys->ds_next_clones_obj != 0) { - remove_from_next_clones(ds_prev, obj, tx); - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(0 == zap_add_int(mos, - ds_prev->ds_phys->ds_next_clones_obj, - ds->ds_phys->ds_next_snap_obj, tx)); - } - } - if (after_branch_point && - ds->ds_phys->ds_next_snap_obj == 0) { - /* This clone is toast. */ - ASSERT(ds_prev->ds_phys->ds_num_children > 1); - ds_prev->ds_phys->ds_num_children--; - - /* - * If the clone's origin has no other clones, no - * user holds, and has been marked for deferred - * deletion, then we should have done the necessary - * destroy setup for it. 
- */ - if (ds_prev->ds_phys->ds_num_children == 1 && - ds_prev->ds_userrefs == 0 && - DS_IS_DEFER_DESTROY(ds_prev)) { - ASSERT3P(dsda->rm_origin, !=, NULL); - } else { - ASSERT3P(dsda->rm_origin, ==, NULL); - } - } else if (!after_branch_point) { - ds_prev->ds_phys->ds_next_snap_obj = - ds->ds_phys->ds_next_snap_obj; - } - } - - if (dsl_dataset_is_snapshot(ds)) { - dsl_dataset_t *ds_next; - uint64_t old_unique; - uint64_t used = 0, comp = 0, uncomp = 0; - - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); - ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); - - old_unique = ds_next->ds_phys->ds_unique_bytes; - - dmu_buf_will_dirty(ds_next->ds_dbuf, tx); - ds_next->ds_phys->ds_prev_snap_obj = - ds->ds_phys->ds_prev_snap_obj; - ds_next->ds_phys->ds_prev_snap_txg = - ds->ds_phys->ds_prev_snap_txg; - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, - ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); - - - if (ds_next->ds_deadlist.dl_oldfmt) { - process_old_deadlist(ds, ds_prev, ds_next, - after_branch_point, tx); - } else { - /* Adjust prev's unique space. */ - if (ds_prev && !after_branch_point) { - dsl_deadlist_space_range(&ds_next->ds_deadlist, - ds_prev->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_prev_snap_txg, - &used, &comp, &uncomp); - ds_prev->ds_phys->ds_unique_bytes += used; - } - - /* Adjust snapused. */ - dsl_deadlist_space_range(&ds_next->ds_deadlist, - ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &used, &comp, &uncomp); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -used, -comp, -uncomp, tx); - - /* Move blocks to be freed to pool's free list. */ - dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, - &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, - tx); - dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, - DD_USED_HEAD, used, comp, uncomp, tx); - - /* Merge our deadlist into next's and free it. 
*/ - dsl_deadlist_merge(&ds_next->ds_deadlist, - ds->ds_phys->ds_deadlist_obj, tx); - } - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); - - /* Collapse range in clone heads */ - dsl_dataset_remove_clones_key(ds, - ds->ds_phys->ds_creation_txg, tx); - - if (dsl_dataset_is_snapshot(ds_next)) { - dsl_dataset_t *ds_nextnext; - - /* - * Update next's unique to include blocks which - * were previously shared by only this snapshot - * and it. Those blocks will be born after the - * prev snap and before this snap, and will have - * died after the next snap and before the one - * after that (ie. be on the snap after next's - * deadlist). - */ - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds_next->ds_phys->ds_next_snap_obj, - FTAG, &ds_nextnext)); - dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, - ds->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_creation_txg, - &used, &comp, &uncomp); - ds_next->ds_phys->ds_unique_bytes += used; - dsl_dataset_rele(ds_nextnext, FTAG); - ASSERT3P(ds_next->ds_prev, ==, NULL); - - /* Collapse range in this head. */ - dsl_dataset_t *hds; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, - FTAG, &hds)); - dsl_deadlist_remove_key(&hds->ds_deadlist, - ds->ds_phys->ds_creation_txg, tx); - dsl_dataset_rele(hds, FTAG); - - } else { - ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); - ds_next->ds_prev = NULL; - if (ds_prev) { - VERIFY(0 == dsl_dataset_get_ref(dp, - ds->ds_phys->ds_prev_snap_obj, - ds_next, &ds_next->ds_prev)); - } - - dsl_dataset_recalc_head_uniq(ds_next); - - /* - * Reduce the amount of our unconsmed refreservation - * being charged to our parent by the amount of - * new unique data we have gained. 
- */ - if (old_unique < ds_next->ds_reserved) { - int64_t mrsdelta; - uint64_t new_unique = - ds_next->ds_phys->ds_unique_bytes; - - ASSERT(old_unique <= new_unique); - mrsdelta = MIN(new_unique - old_unique, - ds_next->ds_reserved - old_unique); - dsl_dir_diduse_space(ds->ds_dir, - DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); - } - } - dsl_dataset_rele(ds_next, FTAG); - } else { - zfeature_info_t *async_destroy = - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; - objset_t *os; - - /* - * There's no next snapshot, so this is a head dataset. - * Destroy the deadlist. Unless it's a clone, the - * deadlist should be empty. (If it's a clone, it's - * safe to ignore the deadlist contents.) - */ - dsl_deadlist_close(&ds->ds_deadlist); - dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); - ds->ds_phys->ds_deadlist_obj = 0; - - VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); - - if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { - err = old_synchronous_dataset_destroy(ds, tx); - } else { - /* - * Move the bptree into the pool's list of trees to - * clean up and update space accounting information. 
- */ - uint64_t used, comp, uncomp; - - zil_destroy_sync(dmu_objset_zil(os), tx); - - if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { - spa_feature_incr(dp->dp_spa, async_destroy, tx); - dp->dp_bptree_obj = bptree_alloc(mos, tx); - VERIFY(zap_add(mos, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, - &dp->dp_bptree_obj, tx) == 0); - } - - used = ds->ds_dir->dd_phys->dd_used_bytes; - comp = ds->ds_dir->dd_phys->dd_compressed_bytes; - uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; - - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - ds->ds_phys->ds_unique_bytes == used); - - bptree_add(mos, dp->dp_bptree_obj, - &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, - used, comp, uncomp, tx); - dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, - -used, -comp, -uncomp, tx); - dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, - used, comp, uncomp, tx); - } - - if (ds->ds_prev != NULL) { - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - VERIFY3U(0, ==, zap_remove_int(mos, - ds->ds_prev->ds_dir->dd_phys->dd_clones, - ds->ds_object, tx)); - } - dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = ds_prev = NULL; - } - } - - /* - * This must be done after the dsl_traverse(), because it will - * re-open the objset. 
- */ - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { - /* Erase the link in the dir */ - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; - ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); - err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); - ASSERT(err == 0); - } else { - /* remove from snapshot namespace */ - dsl_dataset_t *ds_head; - ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); - VERIFY(0 == dsl_dataset_get_snapname(ds)); -#ifdef ZFS_DEBUG - { - uint64_t val; - - err = dsl_dataset_snap_lookup(ds_head, - ds->ds_snapname, &val); - ASSERT0(err); - ASSERT3U(val, ==, obj); - } -#endif - err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); - ASSERT(err == 0); - dsl_dataset_rele(ds_head, FTAG); - } - - if (ds_prev && ds->ds_prev != ds_prev) - dsl_dataset_rele(ds_prev, FTAG); - - spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - - if (ds->ds_phys->ds_next_clones_obj != 0) { - uint64_t count; - ASSERT(0 == zap_count(mos, - ds->ds_phys->ds_next_clones_obj, &count) && count == 0); - VERIFY(0 == dmu_object_free(mos, - ds->ds_phys->ds_next_clones_obj, tx)); - } - if (ds->ds_phys->ds_props_obj != 0) - VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); - if (ds->ds_phys->ds_userrefs_obj != 0) - VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); - dsl_dir_close(ds->ds_dir, ds); - ds->ds_dir = NULL; - dsl_dataset_drain_refs(ds, tag); - VERIFY(0 == dmu_object_free(mos, obj, tx)); - - if (dsda->rm_origin) { - /* - * Remove the origin of the clone we just destroyed. 
- */ - struct dsl_ds_destroyarg ndsda = {0}; - - ndsda.ds = dsda->rm_origin; - dsl_dataset_destroy_sync(&ndsda, tag, tx); - } -} - -static int -dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - uint64_t asize; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - /* - * If there's an fs-only reservation, any blocks that might become - * owned by the snapshot dataset must be accommodated by space - * outside of the reservation. - */ - ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); - asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - - /* - * Propagate any reserved space for this snapshot to other - * snapshot checks in this sync group. - */ - if (asize > 0) - dsl_dir_willuse_space(ds->ds_dir, asize, tx); - - return (0); -} - -int -dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx) -{ - int err; - uint64_t value; - - /* - * We don't allow multiple snapshots of the same txg. If there - * is already one, try again. - */ - if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) - return (EAGAIN); - - /* - * Check for conflicting snapshot name. - */ - err = dsl_dataset_snap_lookup(ds, snapname, &value); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - - /* - * Check that the dataset's name is not too long. 
Name consists - * of the dataset's length + 1 for the @-sign + snapshot name's length - */ - if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) - return (ENAMETOOLONG); - - err = dsl_dataset_snapshot_reserve_space(ds, tx); - if (err) - return (err); - - ds->ds_trysnap_txg = tx->tx_txg; - return (0); -} - -void -dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dmu_buf_t *dbuf; - dsl_dataset_phys_t *dsphys; - uint64_t dsobj, crtxg; - objset_t *mos = dp->dp_meta_objset; - int err; - - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - - /* - * The origin's ds_creation_txg has to be < TXG_INITIAL + * The origin's ds_creation_txg has to be < TXG_INITIAL */ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) crtxg = 1; @@ -2070,7 +1057,7 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); @@ -2105,9 +1092,9 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { - remove_from_next_clones(ds->ds_prev, + dsl_dataset_remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); - VERIFY3U(0, ==, zap_add_int(mos, + VERIFY0(zap_add_int(mos, next_clones_obj, dsobj, tx)); } } @@ -2126,9 +1113,6 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, } dmu_buf_will_dirty(ds->ds_dbuf, tx); - zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", - ds->ds_dir->dd_myname, snapname, dsobj, - ds->ds_phys->ds_prev_snap_txg); ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, 
ds->ds_phys->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); @@ -2143,13 +1127,12 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, - snapname, 8, 1, &dsobj, tx); - ASSERT(err == 0); + VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, + snapname, 8, 1, &dsobj, tx)); if (ds->ds_prev) - dsl_dataset_drop_ref(ds->ds_prev, ds); - VERIFY(0 == dsl_dataset_get_ref(dp, + dsl_dataset_rele(ds->ds_prev, ds); + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); dsl_scan_ds_snapshotted(ds, tx); @@ -2159,89 +1142,265 @@ dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); } -void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +static void +dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) { - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(ds->ds_objset != NULL); - ASSERT(ds->ds_phys->ds_next_snap_obj == 0); - - /* - * in case we had to change ds_fsid_guid when we opened it, - * sync it out now. 
- */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; - dmu_objset_sync(ds->ds_objset, zio, tx); + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + dsl_dataset_t *ds; + char *name, *atp; + char dsname[MAXNAMELEN]; + + name = nvpair_name(pair); + atp = strchr(name, '@'); + (void) strlcpy(dsname, name, atp - name + 1); + VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); + if (ddsa->ddsa_props != NULL) { + dsl_props_set_sync_impl(ds->ds_prev, + ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); + } + dsl_dataset_rele(ds, FTAG); + } } -static void -get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) +/* + * The snapshots must all be in the same pool. + * All-or-nothing: if there are any failures, nothing will be modified. + */ +int +dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) { - uint64_t count = 0; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - zap_cursor_t zc; - zap_attribute_t za; - nvlist_t *propval; - nvlist_t *val; + dsl_dataset_snapshot_arg_t ddsa; + nvpair_t *pair; + boolean_t needsuspend; + int error; + spa_t *spa; + char *firstname; + nvlist_t *suspended = NULL; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + firstname = nvpair_name(pair); + + error = spa_open(firstname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + suspended = fnvlist_alloc(); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + 
char fsname[MAXNAMELEN]; + char *snapname = nvpair_name(pair); + char *atp; + void *cookie; + + atp = strchr(snapname, '@'); + if (atp == NULL) { + error = EINVAL; + break; + } + (void) strlcpy(fsname, snapname, atp - snapname + 1); + + error = zil_suspend(fsname, &cookie); + if (error != 0) + break; + fnvlist_add_uint64(suspended, fsname, + (uintptr_t)cookie); + } + } + + ddsa.ddsa_snaps = snaps; + ddsa.ddsa_props = props; + ddsa.ddsa_errors = errors; + + if (error == 0) { + error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, &ddsa, + fnvlist_num_pairs(snaps) * 3); + } + + if (suspended != NULL) { + for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; + pair = nvlist_next_nvpair(suspended, pair)) { + zil_resume((void *)(uintptr_t) + fnvpair_value_uint64(pair)); + } + fnvlist_free(suspended); + } + + return (error); +} + +typedef struct dsl_dataset_snapshot_tmp_arg { + const char *ddsta_fsname; + const char *ddsta_snapname; + minor_t ddsta_cleanup_minor; + const char *ddsta_htag; +} dsl_dataset_snapshot_tmp_arg_t; + +static int +dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } + error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, + B_TRUE, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + 
dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); + dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, + ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); + dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); + + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, + minor_t cleanup_minor, const char *htag) +{ + dsl_dataset_snapshot_tmp_arg_t ddsta; + int error; + spa_t *spa; + boolean_t needsuspend; + void *cookie; + + ddsta.ddsta_fsname = fsname; + ddsta.ddsta_snapname = snapname; + ddsta.ddsta_cleanup_minor = cleanup_minor; + ddsta.ddsta_htag = htag; + + error = spa_open(fsname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + error = zil_suspend(fsname, &cookie); + if (error != 0) + return (error); + } + + error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, + dsl_dataset_snapshot_tmp_sync, &ddsta, 3); + + if (needsuspend) + zil_resume(cookie); + return (error); +} + + +void +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(ds->ds_objset != NULL); + ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + + /* + * in case we had to change ds_fsid_guid when we opened it, + * sync it out now. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; + + dmu_objset_sync(ds->ds_objset, zio, tx); +} + +static void +get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) +{ + uint64_t count = 0; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *propval = fnvlist_alloc(); + nvlist_t *val = fnvlist_alloc(); + + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* - * There may me missing entries in ds_next_clones_obj + * There may be missing entries in ds_next_clones_obj * due to a bug in a previous version of the code. * Only trust it if it has the right number of entries. */ if (ds->ds_phys->ds_next_clones_obj != 0) { - ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count)); } - if (count != ds->ds_phys->ds_num_children - 1) { + if (count != ds->ds_phys->ds_num_children - 1) goto fail; - } for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAXNAMELEN]; - /* - * Even though we hold the dp_config_rwlock, the dataset - * may fail to open, returning ENOENT. If there is a - * thread concurrently attempting to destroy this - * dataset, it will have the ds_rwlock held for - * RW_WRITER. Our call to dsl_dataset_hold_obj() -> - * dsl_dataset_hold_ref() will fail its - * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the - * dp_config_rwlock, and wait for the destroy progress - * and signal ds_exclusive_cv. If the destroy was - * successful, we will see that - * DSL_DATASET_IS_DESTROYED(), and return ENOENT. 
- */ - if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - za.za_first_integer, FTAG, &clone) != 0) - continue; + VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); dsl_dir_name(clone->ds_dir, buf); - VERIFY(nvlist_add_boolean(val, buf) == 0); + fnvlist_add_boolean(val, buf); dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); - VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), - propval) == 0); + fnvlist_add_nvlist(propval, ZPROP_VALUE, val); + fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval); fail: nvlist_free(val); nvlist_free(propval); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; uint64_t refd, avail, uobjs, aobjs, ratio; + ASSERT(dsl_pool_config_held(dp)); + ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : (ds->ds_phys->ds_uncompressed_bytes * 100 / ds->ds_phys->ds_compressed_bytes); @@ -2287,10 +1446,8 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_dataset_t *prev; - rw_enter(&dp->dp_config_rwlock, RW_READER); int err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - rw_exit(&dp->dp_config_rwlock); if (err == 0) { err = dsl_dataset_space_written(prev, ds, &written, &comp, &uncomp); @@ -2306,6 +1463,9 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + ASSERT(dsl_pool_config_held(dp)); + stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; stat->dds_guid = ds->ds_phys->ds_guid; @@ -2317,16 +1477,14 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) stat->dds_is_snapshot = B_FALSE; stat->dds_num_clones = 0; - 
rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_t *ods; - VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); - dsl_dataset_drop_ref(ods, FTAG); + dsl_dataset_rele(ods, FTAG); } - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } } @@ -2364,8 +1522,7 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) { dsl_pool_t *dp = ds->ds_dir->dd_pool; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); + ASSERT(dsl_pool_config_held(dp)); if (ds->ds_prev == NULL) return (B_FALSE); if (ds->ds_phys->ds_bp.blk_birth > @@ -2387,237 +1544,225 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) return (B_FALSE); } +typedef struct dsl_dataset_rename_snapshot_arg { + const char *ddrsa_fsname; + const char *ddrsa_oldsnapname; + const char *ddrsa_newsnapname; + boolean_t ddrsa_recursive; + dmu_tx_t *ddrsa_tx; +} dsl_dataset_rename_snapshot_arg_t; + /* ARGSUSED */ static int -dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) { - dsl_dataset_t *ds = arg1; - char *newsnapname = arg2; - dsl_dir_t *dd = ds->ds_dir; - dsl_dataset_t *hds; + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + int error; uint64_t val; - int err; - - err = dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); - if (err) - return (err); - /* new name better not be in use */ - err = dsl_dataset_snap_lookup(hds, newsnapname, &val); - dsl_dataset_rele(hds, FTAG); + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + if (error != 0) { + /* ignore nonexistent snapshots */ + return (error == ENOENT ? 
0 : error); + } - if (err == 0) - err = EEXIST; - else if (err == ENOENT) - err = 0; + /* new name should not exist */ + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; /* dataset name + 1 for the "@" + the new snapshot name must fit */ - if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) - err = ENAMETOOLONG; + if (dsl_dir_namelen(hds->ds_dir) + 1 + + strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN) + error = ENAMETOOLONG; - return (err); + return (error); } -static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +static int +dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - const char *newsnapname = arg2; - dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; - int err; - - ASSERT(ds->ds_phys->ds_next_snap_obj != 0); - - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); + int error; - VERIFY(0 == dsl_dataset_get_snapname(ds)); - err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); - ASSERT0(err); - mutex_enter(&ds->ds_lock); - (void) strcpy(ds->ds_snapname, newsnapname); - mutex_exit(&ds->ds_lock); - err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &ds->ds_object, tx); - ASSERT0(err); + error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); + if (error != 0) + return (error); - spa_history_log_internal_ds(ds, "rename", tx, - "-> @%s", newsnapname); + if (ddrsa->ddrsa_recursive) { + error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_check_impl, ddrsa, + DS_FIND_CHILDREN); + } else { + error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); + } dsl_dataset_rele(hds, FTAG); + return (error); } -struct renamesnaparg { - 
dsl_sync_task_group_t *dstg; - char failed[MAXPATHLEN]; - char *oldsnap; - char *newsnap; -}; - static int -dsl_snapshot_rename_one(const char *name, void *arg) +dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) { - struct renamesnaparg *ra = arg; - dsl_dataset_t *ds = NULL; - char *snapname; - int err; - - snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); - (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_dataset_t *ds; + uint64_t val; + dmu_tx_t *tx = ddrsa->ddrsa_tx; + int error; - /* - * For recursive snapshot renames the parent won't be changing - * so we just pass name for both the to/from argument. - */ - err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); - if (err != 0) { - strfree(snapname); - return (err == ENOENT ? 0 : err); + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) { + /* ignore nonexistent snapshots */ + return (0); } -#ifdef _KERNEL - /* - * For all filesystems undergoing rename, we'll need to unmount it. - */ - (void) zfs_unmount_snap(snapname, NULL); -#endif - err = dsl_dataset_hold(snapname, ra->dstg, &ds); - strfree(snapname); - if (err != 0) - return (err == ENOENT ? 
0 : err); + VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); + + /* log before we change the name */ + spa_history_log_internal_ds(ds, "rename", tx, + "-> @%s", ddrsa->ddrsa_newsnapname); - dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, - dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); + VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx)); + mutex_enter(&ds->ds_lock); + (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); + mutex_exit(&ds->ds_lock); + VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, + ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + dsl_dataset_rele(ds, FTAG); return (0); } -static int -dsl_recursive_rename(char *oldname, const char *newname) +static void +dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) { - int err; - struct renamesnaparg *ra; - dsl_sync_task_t *dst; - spa_t *spa; - char *cp, *fsname = spa_strdup(oldname); - int len = strlen(oldname) + 1; - - /* truncate the snapshot name to get the fsname */ - cp = strchr(fsname, '@'); - *cp = '\0'; - - err = spa_open(fsname, &spa, FTAG); - if (err) { - kmem_free(fsname, len); - return (err); - } - ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); - ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - - ra->oldsnap = strchr(oldname, '@') + 1; - ra->newsnap = strchr(newname, '@') + 1; - *ra->failed = '\0'; - - err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, - DS_FIND_CHILDREN); - kmem_free(fsname, len); - - if (err == 0) { - err = dsl_sync_task_group_wait(ra->dstg); - } + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; - for (dst = list_head(&ra->dstg->dstg_tasks); dst; - dst = list_next(&ra->dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - if (dst->dst_err) { - dsl_dir_name(ds->ds_dir, ra->failed); - (void) strlcat(ra->failed, "@", sizeof (ra->failed)); - (void) strlcat(ra->failed, ra->newsnap, - sizeof (ra->failed)); - } 
- dsl_dataset_rele(ds, ra->dstg); + VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); + ddrsa->ddrsa_tx = tx; + if (ddrsa->ddrsa_recursive) { + VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_sync_impl, ddrsa, + DS_FIND_CHILDREN)); + } else { + VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); } - - if (err) - (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); - - dsl_sync_task_group_destroy(ra->dstg); - kmem_free(ra, sizeof (struct renamesnaparg)); - spa_close(spa, FTAG); - return (err); + dsl_dataset_rele(hds, FTAG); } -static int -dsl_valid_rename(const char *oldname, void *arg) +int +dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive) { - int delta = *(int *)arg; + dsl_dataset_rename_snapshot_arg_t ddrsa; - if (strlen(oldname) + delta >= MAXNAMELEN) - return (ENAMETOOLONG); + ddrsa.ddrsa_fsname = fsname; + ddrsa.ddrsa_oldsnapname = oldsnapname; + ddrsa.ddrsa_newsnapname = newsnapname; + ddrsa.ddrsa_recursive = recursive; - return (0); + return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, + dsl_dataset_rename_snapshot_sync, &ddrsa, 1)); } -#pragma weak dmu_objset_rename = dsl_dataset_rename -int -dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) +static int +dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd; + const char *fsname = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; - const char *tail; - int err; + int64_t unused_refres_delta; + int error; - err = dsl_dir_open(oldname, FTAG, &dd, &tail); - if (err) - return (err); + error = dsl_dataset_hold(dp, fsname, FTAG, &ds); + if (error != 0) + return (error); - if (tail == NULL) { - int delta = strlen(newname) - strlen(oldname); + /* must not be a snapshot */ + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } - /* if we're growing, validate child name lengths */ - 
if (delta > 0) - err = dmu_objset_find(oldname, dsl_valid_rename, - &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + /* must have a most recent snapshot */ + if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } - if (err == 0) - err = dsl_dir_rename(dd, newname); - dsl_dir_close(dd, FTAG); - return (err); + if (dsl_dataset_long_held(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EBUSY); + } + + /* + * Check if the snap we are rolling back to uses more than + * the refquota. + */ + if (ds->ds_quota != 0 && + ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) { + dsl_dataset_rele(ds, FTAG); + return (EDQUOT); } - if (tail[0] != '@') { - /* the name ended in a nonexistent component */ - dsl_dir_close(dd, FTAG); - return (ENOENT); + /* + * When we do the clone swap, we will temporarily use more space + * due to the refreservation (the head will no longer have any + * unique space, so the entire amount of the refreservation will need + * to be free). We will immediately destroy the clone, freeing + * this space, but the freeing happens over many txg's. 
+ */ + unused_refres_delta = (int64_t)MIN(ds->ds_reserved, + ds->ds_phys->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { + dsl_dataset_rele(ds, FTAG); + return (ENOSPC); } - dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); + return (0); +} - /* new name must be snapshot in same filesystem */ - tail = strchr(newname, '@'); - if (tail == NULL) - return (EINVAL); - tail++; - if (strncmp(oldname, newname, tail - newname) != 0) - return (EXDEV); +static void +dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) +{ + const char *fsname = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds, *clone; + uint64_t cloneobj; - if (recursive) { - err = dsl_recursive_rename(oldname, newname); - } else { - err = dsl_dataset_hold(oldname, FTAG, &ds); - if (err) - return (err); + VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds)); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_snapshot_rename_check, - dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); + cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", + ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); - dsl_dataset_rele(ds, FTAG); - } + VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); - return (err); + dsl_dataset_clone_swap_sync_impl(clone, ds, tx); + dsl_dataset_zero_zil(ds, tx); + + dsl_destroy_head_sync_impl(clone, tx); + + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_rollback(const char *fsname) +{ + return (dsl_sync_task(fsname, dsl_dataset_rollback_check, + dsl_dataset_rollback_sync, (void *)fsname, 1)); } struct promotenode { @@ -2625,49 +1770,66 @@ struct promotenode { dsl_dataset_t *ds; }; -struct promotearg { +typedef struct dsl_dataset_promote_arg { + const char *ddpa_clonename; + dsl_dataset_t *ddpa_clone; list_t shared_snaps, origin_snaps, clone_snaps; - dsl_dataset_t *origin_origin; + dsl_dataset_t *origin_origin; /* origin of the origin */ 
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; -}; +} dsl_dataset_promote_arg_t; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); -static boolean_t snaplist_unstable(list_t *l); +static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, + void *tag); +static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); static int -dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *hds = arg1; - struct promotearg *pa = arg2; - struct promotenode *snap = list_head(&pa->shared_snaps); - dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; int err; uint64_t unused; - /* Check that it is a real clone */ - if (!dsl_dir_is_clone(hds->ds_dir)) - return (EINVAL); + err = promote_hold(ddpa, dp, FTAG); + if (err != 0) + return (err); - /* Since this is so expensive, don't do the preliminary check */ - if (!dmu_tx_is_syncing(tx)) - return (0); + hds = ddpa->ddpa_clone; - if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) + if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { + promote_rele(ddpa, FTAG); return (EXDEV); + } + + /* + * Compute and check the amount of space to transfer. Since this is + * so expensive, don't do the preliminary check. 
+ */ + if (!dmu_tx_is_syncing(tx)) { + promote_rele(ddpa, FTAG); + return (0); + } + + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; /* compute origin's new unique space */ - snap = list_tail(&pa->clone_snaps); + snap = list_tail(&ddpa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); dsl_deadlist_space_range(&snap->ds->ds_deadlist, origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &pa->unique, &unused, &unused); + &ddpa->unique, &unused, &unused); /* * Walk the snapshots that we are moving * * Compute space to transfer. Consider the incremental changes - * to used for each snapshot: + * to used by each snapshot: * (my used) = (prev's used) + (blocks born) - (blocks killed) * So each snapshot gave birth to: * (blocks born) = (my used) - (prev's used) + (blocks killed) @@ -2678,18 +1840,28 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ - pa->used = origin_ds->ds_phys->ds_referenced_bytes; - pa->comp = origin_ds->ds_phys->ds_compressed_bytes; - pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; - for (snap = list_head(&pa->shared_snaps); snap; - snap = list_next(&pa->shared_snaps, snap)) { + ddpa->used = origin_ds->ds_phys->ds_referenced_bytes; + ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes; + ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; + /* + * If there are long holds, we won't be able to evict + * the objset. 
+ */ + if (dsl_dataset_long_held(ds)) { + err = EBUSY; + goto out; + } + /* Check that the snapshot name does not conflict */ - VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); if (err == 0) { + (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname); err = EEXIST; goto out; } @@ -2702,26 +1874,27 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); - pa->used += dlused; - pa->comp += dlcomp; - pa->uncomp += dluncomp; + ddpa->used += dlused; + ddpa->comp += dlcomp; + ddpa->uncomp += dluncomp; } /* * If we are a clone of a clone then we never reached ORIGIN, * so we need to subtract out the clone origin's used space. */ - if (pa->origin_origin) { - pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; - pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; - pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; + if (ddpa->origin_origin) { + ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes; + ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes; + ddpa->uncomp -= + ddpa->origin_origin->ds_phys->ds_uncompressed_bytes; } /* Check that there is enough space here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, - pa->used); - if (err) - return (err); + ddpa->used); + if (err != 0) + goto out; /* * Compute the amounts of space that will be used by snapshots @@ -2739,68 +1912,75 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) * calls will be fast because they do not have to * iterate over all bps. 
*/ - snap = list_head(&pa->origin_snaps); - err = snaplist_space(&pa->shared_snaps, - snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); - if (err) - return (err); + snap = list_head(&ddpa->origin_snaps); + err = snaplist_space(&ddpa->shared_snaps, + snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); + if (err != 0) + goto out; - err = snaplist_space(&pa->clone_snaps, + err = snaplist_space(&ddpa->clone_snaps, snap->ds->ds_dir->dd_origin_txg, &space); - if (err) - return (err); - pa->cloneusedsnap += space; + if (err != 0) + goto out; + ddpa->cloneusedsnap += space; } if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { - err = snaplist_space(&pa->origin_snaps, - origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); - if (err) - return (err); + err = snaplist_space(&ddpa->origin_snaps, + origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap); + if (err != 0) + goto out; } - return (0); out: - pa->err_ds = snap->ds->ds_snapname; + promote_rele(ddpa, FTAG); return (err); } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *hds = arg1; - struct promotearg *pa = arg2; - struct promotenode *snap = list_head(&pa->shared_snaps); - dsl_dataset_t *origin_ds = snap->ds; - dsl_dataset_t *origin_head; - dsl_dir_t *dd = hds->ds_dir; - dsl_pool_t *dp = hds->ds_dir->dd_pool; + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; + dsl_dataset_t *origin_head; + dsl_dir_t *dd; dsl_dir_t *odd = NULL; uint64_t oldnext_obj; int64_t delta; - ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); + VERIFY0(promote_hold(ddpa, dp, FTAG)); + hds = ddpa->ddpa_clone; + + ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE); - snap = list_head(&pa->origin_snaps); + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; + dd = hds->ds_dir; + + snap = 
list_head(&ddpa->origin_snaps); origin_head = snap->ds; /* * We need to explicitly open odd, since origin_ds's dd will be * changing. */ - VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, + VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, NULL, FTAG, &odd)); /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; - snap = list_tail(&pa->clone_snaps); + snap = list_tail(&ddpa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ if (origin_ds->ds_phys->ds_next_clones_obj) { - remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + dsl_dataset_remove_from_next_clones(origin_ds, + snap->ds->ds_object, tx); + VERIFY0(zap_add_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, oldnext_obj, tx)); } @@ -2817,39 +1997,43 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) /* change dd_clone entries */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + VERIFY0(zap_remove_int(dp->dp_meta_objset, odd->dd_phys->dd_clones, hds->ds_object, tx)); - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, - pa->origin_origin->ds_dir->dd_phys->dd_clones, + VERIFY0(zap_add_int(dp->dp_meta_objset, + ddpa->origin_origin->ds_dir->dd_phys->dd_clones, hds->ds_object, tx)); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - pa->origin_origin->ds_dir->dd_phys->dd_clones, + VERIFY0(zap_remove_int(dp->dp_meta_objset, + ddpa->origin_origin->ds_dir->dd_phys->dd_clones, origin_head->ds_object, tx)); if (dd->dd_phys->dd_clones == 0) { dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + VERIFY0(zap_add_int(dp->dp_meta_objset, 
dd->dd_phys->dd_clones, origin_head->ds_object, tx)); - } /* move snapshots to this dir */ - for (snap = list_head(&pa->shared_snaps); snap; - snap = list_next(&pa->shared_snaps, snap)) { + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { dsl_dataset_t *ds = snap->ds; - /* unregister props as dsl_dir is changing */ + /* + * Property callbacks are registered to a particular + * dsl_dir. Since ours is changing, evict the objset + * so that they will be unregistered from the old dsl_dir. + */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } + /* move snap name entry */ - VERIFY(0 == dsl_dataset_get_snapname(ds)); - VERIFY(0 == dsl_dataset_snap_remove(origin_head, + VERIFY0(dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_snap_remove(origin_head, ds->ds_snapname, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, + VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); @@ -2858,8 +2042,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); ds->ds_phys->ds_dir_obj = dd->dd_object; ASSERT3P(ds->ds_dir, ==, odd); - dsl_dir_close(ds->ds_dir, ds); - VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, + dsl_dir_rele(ds->ds_dir, ds); + VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); /* move any clone references */ @@ -2868,1274 +2052,689 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) zap_cursor_t zc; zap_attribute_t za; - for (zap_cursor_init(&zc, dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - dsl_dataset_t *cnds; - uint64_t o; - - if (za.za_first_integer == oldnext_obj) { - /* - * We've already moved the - * origin's reference. 
- */ - continue; - } - - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &cnds)); - o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; - - VERIFY3U(zap_remove_int(dp->dp_meta_objset, - odd->dd_phys->dd_clones, o, tx), ==, 0); - VERIFY3U(zap_add_int(dp->dp_meta_objset, - dd->dd_phys->dd_clones, o, tx), ==, 0); - dsl_dataset_rele(cnds, FTAG); - } - zap_cursor_fini(&zc); - } - - ASSERT0(dsl_prop_numcb(ds)); - } - - /* - * Change space accounting. - * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either - * both be valid, or both be 0 (resulting in delta == 0). This - * is true for each of {clone,origin} independently. - */ - - delta = pa->cloneusedsnap - - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, >=, 0); - ASSERT3U(pa->used, >=, delta); - dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(dd, DD_USED_HEAD, - pa->used - delta, pa->comp, pa->uncomp, tx); - - delta = pa->originusedsnap - - odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, <=, 0); - ASSERT3U(pa->used, >=, -delta); - dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(odd, DD_USED_HEAD, - -pa->used - delta, -pa->comp, -pa->uncomp, tx); - - origin_ds->ds_phys->ds_unique_bytes = pa->unique; - - /* log history record */ - spa_history_log_internal_ds(hds, "promote", tx, ""); - - dsl_dir_close(odd, FTAG); -} - -static char *snaplist_tag = "snaplist"; -/* - * Make a list of dsl_dataset_t's for the snapshots between first_obj - * (exclusive) and last_obj (inclusive). The list will be in reverse - * order (last_obj will be the list_head()). If first_obj == 0, do all - * snapshots back to this dataset's origin. 
- */ -static int -snaplist_make(dsl_pool_t *dp, boolean_t own, - uint64_t first_obj, uint64_t last_obj, list_t *l) -{ - uint64_t obj = last_obj; - - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); - - list_create(l, sizeof (struct promotenode), - offsetof(struct promotenode, link)); - - while (obj != first_obj) { - dsl_dataset_t *ds; - struct promotenode *snap; - int err; - - if (own) { - err = dsl_dataset_own_obj(dp, obj, - 0, snaplist_tag, &ds); - if (err == 0) - dsl_dataset_make_exclusive(ds, snaplist_tag); - } else { - err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); - } - if (err == ENOENT) { - /* lost race with snapshot destroy */ - struct promotenode *last = list_tail(l); - ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); - obj = last->ds->ds_phys->ds_prev_snap_obj; - continue; - } else if (err) { - return (err); - } - - if (first_obj == 0) - first_obj = ds->ds_dir->dd_phys->dd_origin_obj; - - snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); - snap->ds = ds; - list_insert_tail(l, snap); - obj = ds->ds_phys->ds_prev_snap_obj; - } - - return (0); -} - -static int -snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) -{ - struct promotenode *snap; - - *spacep = 0; - for (snap = list_head(l); snap; snap = list_next(l, snap)) { - uint64_t used, comp, uncomp; - dsl_deadlist_space_range(&snap->ds->ds_deadlist, - mintxg, UINT64_MAX, &used, &comp, &uncomp); - *spacep += used; - } - return (0); -} - -static void -snaplist_destroy(list_t *l, boolean_t own) -{ - struct promotenode *snap; - - if (!l || !list_link_active(&l->list_head)) - return; - - while ((snap = list_tail(l)) != NULL) { - list_remove(l, snap); - if (own) - dsl_dataset_disown(snap->ds, snaplist_tag); - else - dsl_dataset_rele(snap->ds, snaplist_tag); - kmem_free(snap, sizeof (struct promotenode)); - } - list_destroy(l); -} - -/* - * Promote a clone. 
Nomenclature note: - * "clone" or "cds": the original clone which is being promoted - * "origin" or "ods": the snapshot which is originally clone's origin - * "origin head" or "ohds": the dataset which is the head - * (filesystem/volume) for the origin - * "origin origin": the origin of the origin's filesystem (typically - * NULL, indicating that the clone is not a clone of a clone). - */ -int -dsl_dataset_promote(const char *name, char *conflsnap) -{ - dsl_dataset_t *ds; - dsl_dir_t *dd; - dsl_pool_t *dp; - dmu_object_info_t doi; - struct promotearg pa = { 0 }; - struct promotenode *snap; - int err; - - err = dsl_dataset_hold(name, FTAG, &ds); - if (err) - return (err); - dd = ds->ds_dir; - dp = dd->dd_pool; - - err = dmu_object_info(dp->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, &doi); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { - dsl_dataset_rele(ds, FTAG); - return (EINVAL); - } - - /* - * We are going to inherit all the snapshots taken before our - * origin (i.e., our new origin will be our parent's origin). - * Take ownership of them so that we can rename them into our - * namespace. 
- */ - rw_enter(&dp->dp_config_rwlock, RW_READER); - - err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, - &pa.shared_snaps); - if (err != 0) - goto out; - - err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); - if (err != 0) - goto out; - - snap = list_head(&pa.shared_snaps); - ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); - err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, - snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); - if (err != 0) - goto out; - - if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { - err = dsl_dataset_hold_obj(dp, - snap->ds->ds_dir->dd_phys->dd_origin_obj, - FTAG, &pa.origin_origin); - if (err != 0) - goto out; - } - -out: - rw_exit(&dp->dp_config_rwlock); - - /* - * Add in 128x the snapnames zapobj size, since we will be moving - * a bunch of snapnames to the promoted ds, and dirtying their - * bonus buffers. - */ - if (err == 0) { - err = dsl_sync_task_do(dp, dsl_dataset_promote_check, - dsl_dataset_promote_sync, ds, &pa, - 2 + 2 * doi.doi_physical_blocks_512); - if (err && pa.err_ds && conflsnap) - (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); - } - - snaplist_destroy(&pa.shared_snaps, B_TRUE); - snaplist_destroy(&pa.clone_snaps, B_FALSE); - snaplist_destroy(&pa.origin_snaps, B_FALSE); - if (pa.origin_origin) - dsl_dataset_rele(pa.origin_origin, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); -} - -struct cloneswaparg { - dsl_dataset_t *cds; /* clone dataset */ - dsl_dataset_t *ohds; /* origin's head dataset */ - boolean_t force; - int64_t unused_refres_delta; /* change in unconsumed refreservation */ -}; - -/* ARGSUSED */ -static int -dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - struct cloneswaparg *csa = arg1; - - /* they should both be heads */ - if (dsl_dataset_is_snapshot(csa->cds) || - dsl_dataset_is_snapshot(csa->ohds)) - return (EINVAL); - - /* the branch point should be just before them */ - if (csa->cds->ds_prev != 
csa->ohds->ds_prev) - return (EINVAL); - - /* cds should be the clone (unless they are unrelated) */ - if (csa->cds->ds_prev != NULL && - csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && - csa->ohds->ds_object != - csa->cds->ds_prev->ds_phys->ds_next_snap_obj) - return (EINVAL); - - /* the clone should be a child of the origin */ - if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) - return (EINVAL); - - /* ohds shouldn't be modified unless 'force' */ - if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) - return (ETXTBSY); - - /* adjust amount of any unconsumed refreservation */ - csa->unused_refres_delta = - (int64_t)MIN(csa->ohds->ds_reserved, - csa->ohds->ds_phys->ds_unique_bytes) - - (int64_t)MIN(csa->ohds->ds_reserved, - csa->cds->ds_phys->ds_unique_bytes); - - if (csa->unused_refres_delta > 0 && - csa->unused_refres_delta > - dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - - if (csa->ohds->ds_quota != 0 && - csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) - return (EDQUOT); - - return (0); -} - -/* ARGSUSED */ -static void -dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - struct cloneswaparg *csa = arg1; - dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; - - ASSERT(csa->cds->ds_reserved == 0); - ASSERT(csa->ohds->ds_quota == 0 || - csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); - - dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); - dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); - - if (csa->cds->ds_objset != NULL) { - dmu_objset_evict(csa->cds->ds_objset); - csa->cds->ds_objset = NULL; - } - - if (csa->ohds->ds_objset != NULL) { - dmu_objset_evict(csa->ohds->ds_objset); - csa->ohds->ds_objset = NULL; - } - - /* - * Reset origin's unique bytes, if it exists. 
- */ - if (csa->cds->ds_prev) { - dsl_dataset_t *origin = csa->cds->ds_prev; - uint64_t comp, uncomp; - - dmu_buf_will_dirty(origin->ds_dbuf, tx); - dsl_deadlist_space_range(&csa->cds->ds_deadlist, - origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); - } - - /* swap blkptrs */ - { - blkptr_t tmp; - tmp = csa->ohds->ds_phys->ds_bp; - csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; - csa->cds->ds_phys->ds_bp = tmp; - } - - /* set dd_*_bytes */ - { - int64_t dused, dcomp, duncomp; - uint64_t cdl_used, cdl_comp, cdl_uncomp; - uint64_t odl_used, odl_comp, odl_uncomp; - - ASSERT3U(csa->cds->ds_dir->dd_phys-> - dd_used_breakdown[DD_USED_SNAP], ==, 0); - - dsl_deadlist_space(&csa->cds->ds_deadlist, - &cdl_used, &cdl_comp, &cdl_uncomp); - dsl_deadlist_space(&csa->ohds->ds_deadlist, - &odl_used, &odl_comp, &odl_uncomp); - - dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - - (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); - dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); - duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + - cdl_uncomp - - (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); - - dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, - dused, dcomp, duncomp, tx); - dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, - -dused, -dcomp, -duncomp, tx); - - /* - * The difference in the space used by snapshots is the - * difference in snapshot space due to the head's - * deadlist (since that's the only thing that's - * changing that affects the snapused). 
- */ - dsl_deadlist_space_range(&csa->cds->ds_deadlist, - csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, - &cdl_used, &cdl_comp, &cdl_uncomp); - dsl_deadlist_space_range(&csa->ohds->ds_deadlist, - csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, - &odl_used, &odl_comp, &odl_uncomp); - dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, - DD_USED_HEAD, DD_USED_SNAP, tx); - } - - /* swap ds_*_bytes */ - SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, - csa->cds->ds_phys->ds_referenced_bytes); - SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, - csa->cds->ds_phys->ds_compressed_bytes); - SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, - csa->cds->ds_phys->ds_uncompressed_bytes); - SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, - csa->cds->ds_phys->ds_unique_bytes); - - /* apply any parent delta for change in unconsumed refreservation */ - dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, - csa->unused_refres_delta, 0, 0, tx); - - /* - * Swap deadlists. - */ - dsl_deadlist_close(&csa->cds->ds_deadlist); - dsl_deadlist_close(&csa->ohds->ds_deadlist); - SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, - csa->cds->ds_phys->ds_deadlist_obj); - dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, - csa->cds->ds_phys->ds_deadlist_obj); - dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, - csa->ohds->ds_phys->ds_deadlist_obj); - - dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); - - spa_history_log_internal_ds(csa->cds, "clone swap", tx, - "parent=%s", csa->ohds->ds_dir->dd_myname); -} - -/* - * Swap 'clone' with its origin head datasets. Used at the end of "zfs - * recv" into an existing fs to swizzle the file system to the new - * version, and by "zfs rollback". Can also be used to swap two - * independent head datasets if neither has any snapshots. 
- */ -int -dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, - boolean_t force) -{ - struct cloneswaparg csa; - int error; - - ASSERT(clone->ds_owner); - ASSERT(origin_head->ds_owner); -retry: - /* - * Need exclusive access for the swap. If we're swapping these - * datasets back after an error, we already hold the locks. - */ - if (!RW_WRITE_HELD(&clone->ds_rwlock)) - rw_enter(&clone->ds_rwlock, RW_WRITER); - if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && - !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { - rw_exit(&clone->ds_rwlock); - rw_enter(&origin_head->ds_rwlock, RW_WRITER); - if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { - rw_exit(&origin_head->ds_rwlock); - goto retry; - } - } - csa.cds = clone; - csa.ohds = origin_head; - csa.force = force; - error = dsl_sync_task_do(clone->ds_dir->dd_pool, - dsl_dataset_clone_swap_check, - dsl_dataset_clone_swap_sync, &csa, NULL, 9); - return (error); -} - -/* - * Given a pool name and a dataset object number in that pool, - * return the name of that dataset. - */ -int -dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) -{ - spa_t *spa; - dsl_pool_t *dp; - dsl_dataset_t *ds; - int error; - - if ((error = spa_open(pname, &spa, FTAG)) != 0) - return (error); - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { - dsl_dataset_name(ds, buf); - dsl_dataset_rele(ds, FTAG); - } - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - - return (error); -} + for (zap_cursor_init(&zc, dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *cnds; + uint64_t o; -int -dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) -{ - int error = 0; + if (za.za_first_integer == oldnext_obj) { + /* + * We've already moved the + * origin's reference. 
+ */ + continue; + } - ASSERT3S(asize, >, 0); + VERIFY0(dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &cnds)); + o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; - /* - * *ref_rsrv is the portion of asize that will come from any - * unconsumed refreservation space. - */ - *ref_rsrv = 0; + VERIFY0(zap_remove_int(dp->dp_meta_objset, + odd->dd_phys->dd_clones, o, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dd->dd_phys->dd_clones, o, tx)); + dsl_dataset_rele(cnds, FTAG); + } + zap_cursor_fini(&zc); + } - mutex_enter(&ds->ds_lock); - /* - * Make a space adjustment for reserved bytes. - */ - if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { - ASSERT3U(*used, >=, - ds->ds_reserved - ds->ds_phys->ds_unique_bytes); - *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); - *ref_rsrv = - asize - MIN(asize, parent_delta(ds, asize + inflight)); + ASSERT(!dsl_prop_hascb(ds)); } - if (!check_quota || ds->ds_quota == 0) { - mutex_exit(&ds->ds_lock); - return (0); - } /* - * If they are requesting more space, and our current estimate - * is over quota, they get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). + * Change space accounting. + * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either + * both be valid, or both be 0 (resulting in delta == 0). This + * is true for each of {clone,origin} independently. 
*/ - if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { - if (inflight > 0 || - ds->ds_phys->ds_referenced_bytes < ds->ds_quota) - error = ERESTART; - else - error = EDQUOT; - } - mutex_exit(&ds->ds_lock); - return (error); + delta = ddpa->cloneusedsnap - + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, >=, 0); + ASSERT3U(ddpa->used, >=, delta); + dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); + + delta = ddpa->originusedsnap - + odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, <=, 0); + ASSERT3U(ddpa->used, >=, -delta); + dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(odd, DD_USED_HEAD, + -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); + + origin_ds->ds_phys->ds_unique_bytes = ddpa->unique; + + /* log history record */ + spa_history_log_internal_ds(hds, "promote", tx, ""); + + dsl_dir_rele(odd, FTAG); + promote_rele(ddpa, FTAG); } -/* ARGSUSED */ +/* + * Make a list of dsl_dataset_t's for the snapshots between first_obj + * (exclusive) and last_obj (inclusive). The list will be in reverse + * order (last_obj will be the list_head()). If first_obj == 0, do all + * snapshots back to this dataset's origin. 
+ */ static int -dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +snaplist_make(dsl_pool_t *dp, + uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - int err; + uint64_t obj = last_obj; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) - return (ENOTSUP); + list_create(l, sizeof (struct promotenode), + offsetof(struct promotenode, link)); - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + while (obj != first_obj) { + dsl_dataset_t *ds; + struct promotenode *snap; + int err; - if (psa->psa_effective_value == 0) - return (0); + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); + ASSERT(err != ENOENT); + if (err != 0) + return (err); - if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || - psa->psa_effective_value < ds->ds_reserved) - return (ENOSPC); + if (first_obj == 0) + first_obj = ds->ds_dir->dd_phys->dd_origin_obj; + + snap = kmem_alloc(sizeof (*snap), KM_SLEEP); + snap->ds = ds; + list_insert_tail(l, snap); + obj = ds->ds_phys->ds_prev_snap_obj; + } return (0); } -extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); - -void -dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) +static int +snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; - - dsl_prop_set_sync(ds, psa, tx); - DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); + struct promotenode *snap; - if (ds->ds_quota != effective_value) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_quota = effective_value; + *spacep = 0; + for (snap = list_head(l); snap; snap = list_next(l, snap)) { + uint64_t used, comp, uncomp; + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used, &comp, &uncomp); + *spacep += used; } + return (0); } -int -dsl_dataset_set_quota(const char *dsname, zprop_source_t source, 
uint64_t quota) +static void +snaplist_destroy(list_t *l, void *tag) { - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota); - - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect. - */ - txg_wait_open(ds->ds_dir->dd_pool, 0); + struct promotenode *snap; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, - ds, &psa, 0); + if (l == NULL || !list_link_active(&l->list_head)) + return; - dsl_dataset_rele(ds, FTAG); - return (err); + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); + dsl_dataset_rele(snap->ds, tag); + kmem_free(snap, sizeof (*snap)); + } + list_destroy(l); } static int -dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value; - uint64_t unique; - int err; + int error; + dsl_dir_t *dd; + struct promotenode *snap; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < - SPA_VERSION_REFRESERVATION) - return (ENOTSUP); + error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, + &ddpa->ddpa_clone); + if (error != 0) + return (error); + dd = ddpa->ddpa_clone->ds_dir; - if (dsl_dataset_is_snapshot(ds)) + if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) || + !dsl_dir_is_clone(dd)) { + dsl_dataset_rele(ddpa->ddpa_clone, tag); return (EINVAL); + } - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); - - effective_value = psa->psa_effective_value; - - /* - * If we are doing the preliminary check in open context, the - * space estimates may be inaccurate.
- */ - if (!dmu_tx_is_syncing(tx)) - return (0); + error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj, + &ddpa->shared_snaps, tag); + if (error != 0) + goto out; - mutex_enter(&ds->ds_lock); - if (!DS_UNIQUE_IS_ACCURATE(ds)) - dsl_dataset_recalc_head_uniq(ds); - unique = ds->ds_phys->ds_unique_bytes; - mutex_exit(&ds->ds_lock); + error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, + &ddpa->clone_snaps, tag); + if (error != 0) + goto out; - if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { - uint64_t delta = MAX(unique, effective_value) - - MAX(unique, ds->ds_reserved); + snap = list_head(&ddpa->shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); + error = snaplist_make(dp, dd->dd_phys->dd_origin_obj, + snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, + &ddpa->origin_snaps, tag); + if (error != 0) + goto out; - if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - if (ds->ds_quota > 0 && - effective_value > ds->ds_quota) - return (ENOSPC); + if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { + error = dsl_dataset_hold_obj(dp, + snap->ds->ds_dir->dd_phys->dd_origin_obj, + tag, &ddpa->origin_origin); + if (error != 0) + goto out; } - - return (0); +out: + if (error != 0) + promote_rele(ddpa, tag); + return (error); } static void -dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) +promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; - uint64_t unique; - int64_t delta; + snaplist_destroy(&ddpa->shared_snaps, tag); + snaplist_destroy(&ddpa->clone_snaps, tag); + snaplist_destroy(&ddpa->origin_snaps, tag); + if (ddpa->origin_origin != NULL) + dsl_dataset_rele(ddpa->origin_origin, tag); + dsl_dataset_rele(ddpa->ddpa_clone, tag); +} - dsl_prop_set_sync(ds, psa, tx); - DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); +/* + * Promote a clone. 
+ * + * If it fails due to a conflicting snapshot name, "conflsnap" will be filled + * in with the name. (It must be at least MAXNAMELEN bytes long.) + */ +int +dsl_dataset_promote(const char *name, char *conflsnap) +{ + dsl_dataset_promote_arg_t ddpa = { 0 }; + uint64_t numsnaps; + int error; + objset_t *os; - dmu_buf_will_dirty(ds->ds_dbuf, tx); + /* + * We will modify space proportional to the number of + * snapshots. Compute numsnaps. + */ + error = dmu_objset_hold(name, FTAG, &os); + if (error != 0) + return (error); + error = zap_count(dmu_objset_pool(os)->dp_meta_objset, + dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps); + dmu_objset_rele(os, FTAG); + if (error != 0) + return (error); - mutex_enter(&ds->ds_dir->dd_lock); - mutex_enter(&ds->ds_lock); - ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); - unique = ds->ds_phys->ds_unique_bytes; - delta = MAX(0, (int64_t)(effective_value - unique)) - - MAX(0, (int64_t)(ds->ds_reserved - unique)); - ds->ds_reserved = effective_value; - mutex_exit(&ds->ds_lock); + ddpa.ddpa_clonename = name; + ddpa.err_ds = conflsnap; - dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); - mutex_exit(&ds->ds_dir->dd_lock); + return (dsl_sync_task(name, dsl_dataset_promote_check, + dsl_dataset_promote_sync, &ddpa, 2 + numsnaps)); } int -dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, - uint64_t reservation) +dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, boolean_t force) { - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; + int64_t unused_refres_delta; - dsl_prop_setarg_init_uint64(&psa, "refreservation", source, - &reservation); + /* they should both be heads */ + if (dsl_dataset_is_snapshot(clone) || + dsl_dataset_is_snapshot(origin_head)) + return (EINVAL); - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); + /* the branch point should be just before them */ + if (clone->ds_prev != origin_head->ds_prev) + return (EINVAL); - err 
= dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_reservation_check, - dsl_dataset_set_reservation_sync, ds, &psa, 0); + /* clone should be the clone (unless they are unrelated) */ + if (clone->ds_prev != NULL && + clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && + origin_head->ds_object != + clone->ds_prev->ds_phys->ds_next_snap_obj) + return (EINVAL); - dsl_dataset_rele(ds, FTAG); - return (err); -} + /* the clone should be a child of the origin */ + if (clone->ds_dir->dd_parent != origin_head->ds_dir) + return (EINVAL); -typedef struct zfs_hold_cleanup_arg { - dsl_pool_t *dp; - uint64_t dsobj; - char htag[MAXNAMELEN]; -} zfs_hold_cleanup_arg_t; + /* origin_head shouldn't be modified unless 'force' */ + if (!force && dsl_dataset_modified_since_lastsnap(origin_head)) + return (ETXTBSY); -static void -dsl_dataset_user_release_onexit(void *arg) -{ - zfs_hold_cleanup_arg_t *ca = arg; + /* origin_head should have no long holds (e.g. is not mounted) */ + if (dsl_dataset_long_held(origin_head)) + return (EBUSY); + + /* check amount of any unconsumed refreservation */ + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + origin_head->ds_phys->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + clone->ds_phys->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + + /* clone can't be over the head's refquota */ + if (origin_head->ds_quota != 0 && + clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota) + return (EDQUOT); - (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, - B_TRUE); - kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + return (0); } void -dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, - minor_t minor) +dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, dmu_tx_t *tx) { - zfs_hold_cleanup_arg_t *ca; - - ca = kmem_alloc(sizeof 
(zfs_hold_cleanup_arg_t), KM_SLEEP); - ca->dp = ds->ds_dir->dd_pool; - ca->dsobj = ds->ds_object; - (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); - VERIFY3U(0, ==, zfs_onexit_add_cb(minor, - dsl_dataset_user_release_onexit, ca, NULL)); -} + dsl_pool_t *dp = dmu_tx_pool(tx); + int64_t unused_refres_delta; -/* - * If you add new checks here, you may need to add - * additional checks to the "temporary" case in - * snapshot_check() in dmu_objset.c. - */ -static int -dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct dsl_ds_holdarg *ha = arg2; - const char *htag = ha->htag; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - int error = 0; + ASSERT(clone->ds_reserved == 0); + ASSERT(origin_head->ds_quota == 0 || + clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota); - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) - return (ENOTSUP); + dmu_buf_will_dirty(clone->ds_dbuf, tx); + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - if (!dsl_dataset_is_snapshot(ds)) - return (EINVAL); + if (clone->ds_objset != NULL) { + dmu_objset_evict(clone->ds_objset); + clone->ds_objset = NULL; + } - /* tags must be unique */ - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_userrefs_obj) { - error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, - 8, 1, tx); - if (error == 0) - error = EEXIST; - else if (error == ENOENT) - error = 0; + if (origin_head->ds_objset != NULL) { + dmu_objset_evict(origin_head->ds_objset); + origin_head->ds_objset = NULL; } - mutex_exit(&ds->ds_lock); - if (error == 0 && ha->temphold && - strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) - error = E2BIG; + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + origin_head->ds_phys->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + clone->ds_phys->ds_unique_bytes); + + /* + * Reset origin's unique bytes, if it exists. 
+ */ + if (clone->ds_prev) { + dsl_dataset_t *origin = clone->ds_prev; + uint64_t comp, uncomp; + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + dsl_deadlist_space_range(&clone->ds_deadlist, + origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); + } + + /* swap blkptrs */ + { + blkptr_t tmp; + tmp = origin_head->ds_phys->ds_bp; + origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp; + clone->ds_phys->ds_bp = tmp; + } + + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; + + ASSERT3U(clone->ds_dir->dd_phys-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); + + dsl_deadlist_space(&clone->ds_deadlist, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space(&origin_head->ds_deadlist, + &odl_used, &odl_comp, &odl_uncomp); - return (error); -} + dused = clone->ds_phys->ds_referenced_bytes + cdl_used - + (origin_head->ds_phys->ds_referenced_bytes + odl_used); + dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp - + (origin_head->ds_phys->ds_compressed_bytes + odl_comp); + duncomp = clone->ds_phys->ds_uncompressed_bytes + + cdl_uncomp - + (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp); -void -dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct dsl_ds_holdarg *ha = arg2; - const char *htag = ha->htag; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t now = gethrestime_sec(); - uint64_t zapobj; + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, + dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, + -dused, -dcomp, -duncomp, tx); - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_userrefs_obj == 0) { /* - * This is the first user hold for this dataset. Create - * the userrefs zap object. 
+ * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - zapobj = ds->ds_phys->ds_userrefs_obj = - zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); - } else { - zapobj = ds->ds_phys->ds_userrefs_obj; + dsl_deadlist_space_range(&clone->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space_range(&origin_head->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, + &odl_used, &odl_comp, &odl_uncomp); + dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, tx); } - ds->ds_userrefs++; - mutex_exit(&ds->ds_lock); - VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + /* swap ds_*_bytes */ + SWITCH64(origin_head->ds_phys->ds_referenced_bytes, + clone->ds_phys->ds_referenced_bytes); + SWITCH64(origin_head->ds_phys->ds_compressed_bytes, + clone->ds_phys->ds_compressed_bytes); + SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes, + clone->ds_phys->ds_uncompressed_bytes); + SWITCH64(origin_head->ds_phys->ds_unique_bytes, + clone->ds_phys->ds_unique_bytes); - if (ha->temphold) { - VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, - htag, &now, tx)); - } + /* apply any parent delta for change in unconsumed refreservation */ + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, + unused_refres_delta, 0, 0, tx); - spa_history_log_internal_ds(ds, "hold", tx, - "tag = %s temp = %d holds now = %llu", - htag, (int)ha->temphold, ds->ds_userrefs); -} + /* + * Swap deadlists. 
+ */ + dsl_deadlist_close(&clone->ds_deadlist); + dsl_deadlist_close(&origin_head->ds_deadlist); + SWITCH64(origin_head->ds_phys->ds_deadlist_obj, + clone->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, + clone->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, + origin_head->ds_phys->ds_deadlist_obj); -static int -dsl_dataset_user_hold_one(const char *dsname, void *arg) -{ - struct dsl_ds_holdarg *ha = arg; - dsl_dataset_t *ds; - int error; - char *name; + dsl_scan_ds_clone_swapped(origin_head, clone, tx); - /* alloc a buffer to hold dsname@snapname plus terminating NULL */ - name = kmem_asprintf("%s@%s", dsname, ha->snapname); - error = dsl_dataset_hold(name, ha->dstg, &ds); - strfree(name); - if (error == 0) { - ha->gotone = B_TRUE; - dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, - dsl_dataset_user_hold_sync, ds, ha, 0); - } else if (error == ENOENT && ha->recursive) { - error = 0; - } else { - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); - } - return (error); + spa_history_log_internal_ds(clone, "clone swap", tx, + "parent=%s", origin_head->ds_dir->dd_myname); } +/* + * Given a pool name and a dataset object number in that pool, + * return the name of that dataset. 
+ */ int -dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, - boolean_t temphold) +dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { - struct dsl_ds_holdarg *ha; + dsl_pool_t *dp; + dsl_dataset_t *ds; int error; - ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); - ha->htag = htag; - ha->temphold = temphold; - error = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, - ds, ha, 0); - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + error = dsl_pool_hold(pname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + if (error == 0) { + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); + } + dsl_pool_rele(dp, FTAG); return (error); } int -dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, - boolean_t recursive, boolean_t temphold, int cleanup_fd) +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) { - struct dsl_ds_holdarg *ha; - dsl_sync_task_t *dst; - spa_t *spa; - int error; - minor_t minor = 0; - - if (cleanup_fd != -1) { - /* Currently we only support cleanup-on-exit of tempholds. */ - if (!temphold) - return (EINVAL); - error = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (error) - return (error); - } + int error = 0; - ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + ASSERT3S(asize, >, 0); - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + /* + * *ref_rsrv is the portion of asize that will come from any + * unconsumed refreservation space. + */ + *ref_rsrv = 0; - error = spa_open(dsname, &spa, FTAG); - if (error) { - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - if (cleanup_fd != -1) - zfs_onexit_fd_rele(cleanup_fd); - return (error); + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. 
+ */ + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *ref_rsrv = + asize - MIN(asize, parent_delta(ds, asize + inflight)); } - ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - ha->htag = htag; - ha->snapname = snapname; - ha->recursive = recursive; - ha->temphold = temphold; - - if (recursive) { - error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, - ha, DS_FIND_CHILDREN); - } else { - error = dsl_dataset_user_hold_one(dsname, ha); + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); } - if (error == 0) - error = dsl_sync_task_group_wait(ha->dstg); - - for (dst = list_head(&ha->dstg->dstg_tasks); dst; - dst = list_next(&ha->dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - - if (dst->dst_err) { - dsl_dataset_name(ds, ha->failed); - *strchr(ha->failed, '@') = '\0'; - } else if (error == 0 && minor != 0 && temphold) { - /* - * If this hold is to be released upon process exit, - * register that action now. - */ - dsl_register_onexit_hold_cleanup(ds, htag, minor); - } - dsl_dataset_rele(ds, ha->dstg); + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). 
+ */ + if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || + ds->ds_phys->ds_referenced_bytes < ds->ds_quota) + error = ERESTART; + else + error = EDQUOT; } + mutex_exit(&ds->ds_lock); - if (error == 0 && recursive && !ha->gotone) - error = ENOENT; - - if (error) - (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); - - dsl_sync_task_group_destroy(ha->dstg); - - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - spa_close(spa, FTAG); - if (cleanup_fd != -1) - zfs_onexit_fd_rele(cleanup_fd); return (error); } -struct dsl_ds_releasearg { - dsl_dataset_t *ds; - const char *htag; - boolean_t own; /* do we own or just hold ds? */ -}; +typedef struct dsl_dataset_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dataset_set_qr_arg_t; + +/* ARGSUSED */ static int -dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, - boolean_t *might_destroy) +dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) { - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj; - uint64_t tmp; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; int error; + uint64_t newval; - *might_destroy = B_FALSE; + if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) + return (ENOTSUP); - mutex_enter(&ds->ds_lock); - zapobj = ds->ds_phys->ds_userrefs_obj; - if (zapobj == 0) { - /* The tag can't possibly exist */ - mutex_exit(&ds->ds_lock); - return (ESRCH); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); } - /* Make sure the tag exists */ - error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); - if (error) { - mutex_exit(&ds->ds_lock); - if (error == ENOENT) - error = ESRCH; + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, 
&newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); return (error); } - if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)) - *might_destroy = B_TRUE; + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } - mutex_exit(&ds->ds_lock); + if (newval < ds->ds_phys->ds_referenced_bytes || + newval < ds->ds_reserved) { + dsl_dataset_rele(ds, FTAG); + return (ENOSPC); + } + + dsl_dataset_rele(ds, FTAG); return (0); } -static int -dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) +static void +dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) { - struct dsl_ds_releasearg *ra = arg1; - dsl_dataset_t *ds = ra->ds; - boolean_t might_destroy; - int error; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) - return (ENOTSUP); + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); - if (error) - return (error); + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - if (might_destroy) { - struct dsl_ds_destroyarg dsda = {0}; + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); - if (dmu_tx_is_syncing(tx)) { - /* - * If we're not prepared to remove the snapshot, - * we can't allow the release to happen right now. 
- */ - if (!ra->own) - return (EBUSY); - } - dsda.ds = ds; - dsda.releasing = B_TRUE; - return (dsl_dataset_destroy_check(&dsda, tag, tx)); + if (ds->ds_quota != newval) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_quota = newval; } - - return (0); + dsl_dataset_rele(ds, FTAG); } -static void -dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) +int +dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, + uint64_t refquota) { - struct dsl_ds_releasearg *ra = arg1; - dsl_dataset_t *ds = ra->ds; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj; - uint64_t refs; - int error; + dsl_dataset_set_qr_arg_t ddsqra; - mutex_enter(&ds->ds_lock); - ds->ds_userrefs--; - refs = ds->ds_userrefs; - mutex_exit(&ds->ds_lock); - error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); - VERIFY(error == 0 || error == ENOENT); - zapobj = ds->ds_phys->ds_userrefs_obj; - VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); - - spa_history_log_internal_ds(ds, "release", tx, - "tag = %s refs now = %lld", ra->htag, (longlong_t)refs); - - if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)) { - struct dsl_ds_destroyarg dsda = {0}; - - ASSERT(ra->own); - dsda.ds = ds; - dsda.releasing = B_TRUE; - /* We already did the destroy_check */ - dsl_dataset_destroy_sync(&dsda, tag, tx); - } + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refquota; + + return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, + dsl_dataset_set_refquota_sync, &ddsqra, 0)); } static int -dsl_dataset_user_release_one(const char *dsname, void *arg) +dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) { - struct dsl_ds_holdarg *ha = arg; - struct dsl_ds_releasearg *ra; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; - void *dtag = ha->dstg; - char *name; - boolean_t own = B_FALSE; - 
boolean_t might_destroy; - - /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ - name = kmem_asprintf("%s@%s", dsname, ha->snapname); - error = dsl_dataset_hold(name, dtag, &ds); - strfree(name); - if (error == ENOENT && ha->recursive) - return (0); - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); - if (error) - return (error); - - ha->gotone = B_TRUE; + uint64_t newval, unique; - ASSERT(dsl_dataset_is_snapshot(ds)); + if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) + return (ENOTSUP); - error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); - if (error) { - dsl_dataset_rele(ds, dtag); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) return (error); - } - if (might_destroy) { -#ifdef _KERNEL - name = kmem_asprintf("%s@%s", dsname, ha->snapname); - error = zfs_unmount_snap(name, NULL); - strfree(name); - if (error) { - dsl_dataset_rele(ds, dtag); - return (error); - } -#endif - if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { - dsl_dataset_rele(ds, dtag); - return (EBUSY); - } else { - own = B_TRUE; - dsl_dataset_make_exclusive(ds, dtag); - } + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); } - ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); - ra->ds = ds; - ra->htag = ha->htag; - ra->own = own; - dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, ra, dtag, 0); - - return (0); -} - -int -dsl_dataset_user_release(char *dsname, char *snapname, char *htag, - boolean_t recursive) -{ - struct dsl_ds_holdarg *ha; - dsl_sync_task_t *dst; - spa_t *spa; - int error; - -top: - ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); - - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); - - error = spa_open(dsname, &spa, FTAG); - if (error) { - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + 
ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); return (error); } - ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - ha->htag = htag; - ha->snapname = snapname; - ha->recursive = recursive; - if (recursive) { - error = dmu_objset_find(dsname, dsl_dataset_user_release_one, - ha, DS_FIND_CHILDREN); - } else { - error = dsl_dataset_user_release_one(dsname, ha); + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); + return (0); } - if (error == 0) - error = dsl_sync_task_group_wait(ha->dstg); - - for (dst = list_head(&ha->dstg->dstg_tasks); dst; - dst = list_next(&ha->dstg->dstg_tasks, dst)) { - struct dsl_ds_releasearg *ra = dst->dst_arg1; - dsl_dataset_t *ds = ra->ds; - if (dst->dst_err) - dsl_dataset_name(ds, ha->failed); + mutex_enter(&ds->ds_lock); + if (!DS_UNIQUE_IS_ACCURATE(ds)) + dsl_dataset_recalc_head_uniq(ds); + unique = ds->ds_phys->ds_unique_bytes; + mutex_exit(&ds->ds_lock); - if (ra->own) - dsl_dataset_disown(ds, ha->dstg); - else - dsl_dataset_rele(ds, ha->dstg); + if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, newval) - + MAX(unique, ds->ds_reserved); - kmem_free(ra, sizeof (struct dsl_ds_releasearg)); + if (delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || + (ds->ds_quota > 0 && newval > ds->ds_quota)) { + dsl_dataset_rele(ds, FTAG); + return (ENOSPC); + } } - if (error == 0 && recursive && !ha->gotone) - error = ENOENT; - - if (error && error != EBUSY) - (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); - - dsl_sync_task_group_destroy(ha->dstg); - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - spa_close(spa, FTAG); - - /* - * We can get EBUSY if we were racing with deferred destroy and - * dsl_dataset_user_release_check() hadn't done the necessary - * open context setup. 
We can also get EBUSY if we're racing - * with destroy and that thread is the ds_owner. Either way - * the busy condition should be transient, and we should retry - * the release operation. - */ - if (error == EBUSY) - goto top; - - return (error); + dsl_dataset_rele(ds, FTAG); + return (0); } -/* - * Called at spa_load time (with retry == B_FALSE) to release a stale - * temporary user hold. Also called by the onexit code (with retry == B_TRUE). - */ -int -dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, - boolean_t retry) +void +dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, + zprop_source_t source, uint64_t value, dmu_tx_t *tx) { - dsl_dataset_t *ds; - char *snap; - char *name; - int namelen; - int error; + uint64_t newval; + uint64_t unique; + int64_t delta; - do { - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (error) - return (error); - namelen = dsl_dataset_namelen(ds)+1; - name = kmem_alloc(namelen, KM_SLEEP); - dsl_dataset_name(ds, name); - dsl_dataset_rele(ds, FTAG); + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + source, sizeof (value), 1, &value, tx); - snap = strchr(name, '@'); - *snap = '\0'; - ++snap; - error = dsl_dataset_user_release(name, snap, htag, B_FALSE); - kmem_free(name, namelen); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); - /* - * The object can't have been destroyed because we have a hold, - * but it might have been renamed, resulting in ENOENT. Retry - * if we've been requested to do so. - * - * It would be nice if we could use the dsobj all the way - * through and avoid ENOENT entirely. But we might need to - * unmount the snapshot, and there's currently no way to lookup - * a vfsp using a ZFS object id. 
- */ - } while ((error == ENOENT) && retry); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + unique = ds->ds_phys->ds_unique_bytes; + delta = MAX(0, (int64_t)(newval - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = newval; + mutex_exit(&ds->ds_lock); - return (error); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); + mutex_exit(&ds->ds_dir->dd_lock); } -int -dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) +static void +dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) { + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; - int err; - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); - if (ds->ds_phys->ds_userrefs_obj != 0) { - zap_attribute_t *za; - zap_cursor_t zc; - - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_userrefs_obj); - zap_cursor_retrieve(&zc, za) == 0; - zap_cursor_advance(&zc)) { - VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, - za->za_first_integer)); - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (zap_attribute_t)); - } + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + dsl_dataset_set_refreservation_sync_impl(ds, + ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); dsl_dataset_rele(ds, FTAG); - return (0); } -/* - * Note, this function is used as the callback for dmu_objset_find(). We - * always return 0 so that we will continue to find and process - * inconsistent datasets, even if we encounter an error trying to - * process one of them. 
- */ -/* ARGSUSED */ int -dsl_destroy_inconsistent(const char *dsname, void *arg) +dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, + uint64_t refreservation) { - dsl_dataset_t *ds; + dsl_dataset_set_qr_arg_t ddsqra; - if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { - if (DS_IS_INCONSISTENT(ds)) - (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); - else - dsl_dataset_disown(ds, FTAG); - } - return (0); + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refreservation; + + return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, + dsl_dataset_set_refreservation_sync, &ddsqra, 0)); } /* @@ -4163,6 +2762,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t snapobj; dsl_pool_t *dp = new->ds_dir->dd_pool; + ASSERT(dsl_pool_config_held(dp)); + *usedp = 0; *usedp += new->ds_phys->ds_referenced_bytes; *usedp -= oldsnap->ds_phys->ds_referenced_bytes; @@ -4175,7 +2776,6 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, *uncompp += new->ds_phys->ds_uncompressed_bytes; *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; - rw_enter(&dp->dp_config_rwlock, RW_READER); snapobj = new->ds_object; while (snapobj != oldsnap->ds_object) { dsl_dataset_t *snap; @@ -4224,7 +2824,6 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, } } - rw_exit(&dp->dp_config_rwlock); return (err); } @@ -4266,7 +2865,6 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, *usedp = *compp = *uncompp = 0; - rw_enter(&dp->dp_config_rwlock, RW_READER); snapobj = lastsnap->ds_phys->ds_next_snap_obj; while (snapobj != firstsnap->ds_object) { dsl_dataset_t *ds; @@ -4287,6 +2885,42 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, ASSERT3U(snapobj, !=, 0); dsl_dataset_rele(ds, FTAG); } - rw_exit(&dp->dp_config_rwlock); return (err); } + +/* + * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. 
+ * For example, they could both be snapshots of the same filesystem, and + * 'earlier' is before 'later'. Or 'earlier' could be the origin of + * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's + * filesystem. Or 'earlier' could be the origin's origin. + */ +boolean_t +dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier) +{ + dsl_pool_t *dp = later->ds_dir->dd_pool; + int error; + boolean_t ret; + + ASSERT(dsl_pool_config_held(dp)); + + if (earlier->ds_phys->ds_creation_txg >= + later->ds_phys->ds_creation_txg) + return (B_FALSE); + + if (later->ds_dir == earlier->ds_dir) + return (B_TRUE); + if (!dsl_dir_is_clone(later->ds_dir)) + return (B_FALSE); + + if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) + return (B_TRUE); + dsl_dataset_t *origin; + error = dsl_dataset_hold_obj(dp, + later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin); + if (error != 0) + return (B_FALSE); + ret = dsl_dataset_is_before(origin, earlier); + dsl_dataset_rele(origin, FTAG); + return (ret); +} diff --git a/uts/common/fs/zfs/dsl_deleg.c b/uts/common/fs/zfs/dsl_deleg.c index ba620bd..f09cb2f 100644 --- a/uts/common/fs/zfs/dsl_deleg.c +++ b/uts/common/fs/zfs/dsl_deleg.c @@ -147,28 +147,37 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) return (0); } +typedef struct dsl_deleg_arg { + const char *dda_name; + nvlist_t *dda_nvlist; +} dsl_deleg_arg_t; + static void -dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - nvlist_t *nvp = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; nvpair_t *whopair = NULL; - uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t zapobj; + + VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); + zapobj = dd->dd_phys->dd_deleg_zapobj; if (zapobj == 0) { 
dmu_buf_will_dirty(dd->dd_dbuf, tx); zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); } - while (whopair = nvlist_next_nvpair(nvp, whopair)) { + while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { const char *whokey = nvpair_name(whopair); nvlist_t *perms; nvpair_t *permpair = NULL; uint64_t jumpobj; - VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + perms = fnvpair_value_nvlist(whopair); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, @@ -185,21 +194,27 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) "%s %s", whokey, perm); } } + dsl_dir_rele(dd, FTAG); } static void -dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - nvlist_t *nvp = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; nvpair_t *whopair = NULL; - uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t zapobj; - if (zapobj == 0) + VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); + zapobj = dd->dd_phys->dd_deleg_zapobj; + if (zapobj == 0) { + dsl_dir_rele(dd, FTAG); return; + } - while (whopair = nvlist_next_nvpair(nvp, whopair)) { + while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { const char *whokey = nvpair_name(whopair); nvlist_t *perms; nvpair_t *permpair = NULL; @@ -234,35 +249,40 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) "%s %s", whokey, perm); } } + dsl_dir_rele(dd, FTAG); } -int -dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +static int +dsl_deleg_check(void *arg, dmu_tx_t *tx) { + dsl_deleg_arg_t *dda = arg; dsl_dir_t *dd; int error; - nvpair_t *whopair = NULL; - int blocks_modified = 0; - error = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (error) - return (error); - - if 
(spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < + if (spa_version(dmu_tx_pool(tx)->dp_spa) < SPA_VERSION_DELEGATED_PERMS) { - dsl_dir_close(dd, FTAG); return (ENOTSUP); } - while (whopair = nvlist_next_nvpair(nvp, whopair)) - blocks_modified++; + error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); + if (error == 0) + dsl_dir_rele(dd, FTAG); + return (error); +} - error = dsl_sync_task_do(dd->dd_pool, NULL, - unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, - dd, nvp, blocks_modified); - dsl_dir_close(dd, FTAG); +int +dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +{ + dsl_deleg_arg_t dda; - return (error); + /* nvp must already have been verified to be valid */ + + dda.dda_name = ddname; + dda.dda_nvlist = nvp; + + return (dsl_sync_task(ddname, dsl_deleg_check, + unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, + &dda, fnvlist_num_pairs(nvp))); } /* @@ -290,16 +310,21 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) int error; objset_t *mos; - error = dsl_dir_open(ddname, FTAG, &startdd, NULL); - if (error) + error = dsl_pool_hold(ddname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); + } dp = startdd->dd_pool; mos = dp->dp_meta_objset; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - rw_enter(&dp->dp_config_rwlock, RW_READER); for (dd = startdd; dd != NULL; dd = dd->dd_parent) { zap_cursor_t basezc; zap_attribute_t baseza; @@ -307,15 +332,12 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) uint64_t n; char source[MAXNAMELEN]; - if (dd->dd_phys->dd_deleg_zapobj && - (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, - &n) == 0) && n) { - VERIFY(nvlist_alloc(&sp_nvp, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - } else { + if (dd->dd_phys->dd_deleg_zapobj == 0 || + zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 || + n == 0) continue; - } + sp_nvp = fnvlist_alloc(); for 
(zap_cursor_init(&basezc, mos, dd->dd_phys->dd_deleg_zapobj); zap_cursor_retrieve(&basezc, &baseza) == 0; @@ -327,29 +349,26 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) ASSERT(baseza.za_integer_length == 8); ASSERT(baseza.za_num_integers == 1); - VERIFY(nvlist_alloc(&perms_nvp, - NV_UNIQUE_NAME, KM_SLEEP) == 0); + perms_nvp = fnvlist_alloc(); for (zap_cursor_init(&zc, mos, baseza.za_first_integer); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { - VERIFY(nvlist_add_boolean(perms_nvp, - za.za_name) == 0); + fnvlist_add_boolean(perms_nvp, za.za_name); } zap_cursor_fini(&zc); - VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name, - perms_nvp) == 0); - nvlist_free(perms_nvp); + fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp); + fnvlist_free(perms_nvp); } zap_cursor_fini(&basezc); dsl_dir_name(dd, source); - VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); + fnvlist_add_nvlist(*nvp, source, sp_nvp); nvlist_free(sp_nvp); } - rw_exit(&dp->dp_config_rwlock); - dsl_dir_close(startdd, FTAG); + dsl_dir_rele(startdd, FTAG); + dsl_pool_rele(dp, FTAG); return (0); } @@ -555,7 +574,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); - rw_enter(&dp->dp_config_rwlock, RW_READER); + ASSERT(dsl_pool_config_held(dp)); for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, checkflag = ZFS_DELEG_DESCENDENT) { uint64_t zapobj; @@ -616,7 +635,6 @@ again: } error = EPERM; success: - rw_exit(&dp->dp_config_rwlock); cookie = NULL; while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) @@ -628,15 +646,19 @@ success: int dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) { + dsl_pool_t *dp; dsl_dataset_t *ds; int error; - error = dsl_dataset_hold(dsname, FTAG, &ds); - if (error) + error = dsl_pool_hold(dsname, FTAG, &dp); + if (error != 0) return (error); - - error = dsl_deleg_access_impl(ds, perm, cr); - 
dsl_dataset_rele(ds, FTAG); + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + error = dsl_deleg_access_impl(ds, perm, cr); + dsl_dataset_rele(ds, FTAG); + } + dsl_pool_rele(dp, FTAG); return (error); } diff --git a/uts/common/fs/zfs/dsl_destroy.c b/uts/common/fs/zfs/dsl_destroy.c new file mode 100644 index 0000000..20d401f --- /dev/null +++ b/uts/common/fs/zfs/dsl_destroy.c @@ -0,0 +1,926 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct dmu_snapshots_destroy_arg { + nvlist_t *dsda_snaps; + nvlist_t *dsda_successful_snaps; + boolean_t dsda_defer; + nvlist_t *dsda_errlist; +} dmu_snapshots_destroy_arg_t; + +/* + * ds must be owned. 
+ */ +static int +dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) +{ + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + if (dsl_dataset_long_held(ds)) + return (EBUSY); + + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (ENOTSUP); + return (0); + } + + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0) + return (EBUSY); + + /* + * Can't delete a branch point. + */ + if (ds->ds_phys->ds_num_children > 1) + return (EEXIST); + + return (0); +} + +static int +dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) +{ + dmu_snapshots_destroy_arg_t *dsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int error = 0; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) { + dsl_dataset_t *ds; + + error = dsl_dataset_hold(dp, nvpair_name(pair), + FTAG, &ds); + + /* + * If the snapshot does not exist, silently ignore it + * (it's "already destroyed"). 
+ */ + if (error == ENOENT) + continue; + + if (error == 0) { + error = dsl_destroy_snapshot_check_impl(ds, + dsda->dsda_defer); + dsl_dataset_rele(ds, FTAG); + } + + if (error == 0) { + fnvlist_add_boolean(dsda->dsda_successful_snaps, + nvpair_name(pair)); + } else { + fnvlist_add_int32(dsda->dsda_errlist, + nvpair_name(pair), error); + } + } + + pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL); + if (pair != NULL) + return (fnvpair_value_int32(pair)); + return (0); +} + +struct process_old_arg { + dsl_dataset_t *ds; + dsl_dataset_t *ds_prev; + boolean_t after_branch_point; + zio_t *pio; + uint64_t used, comp, uncomp; +}; + +static int +process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct process_old_arg *poa = arg; + dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; + + if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + if (poa->ds_prev && !poa->after_branch_point && + bp->blk_birth > + poa->ds_prev->ds_phys->ds_prev_snap_txg) { + poa->ds_prev->ds_phys->ds_unique_bytes += + bp_get_dsize_sync(dp->dp_spa, bp); + } + } else { + poa->used += bp_get_dsize_sync(dp->dp_spa, bp); + poa->comp += BP_GET_PSIZE(bp); + poa->uncomp += BP_GET_UCSIZE(bp); + dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); + } + return (0); +} + +static void +process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, + dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) +{ + struct process_old_arg poa = { 0 }; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t deadlist_obj; + + ASSERT(ds->ds_deadlist.dl_oldfmt); + ASSERT(ds_next->ds_deadlist.dl_oldfmt); + + poa.ds = ds; + poa.ds_prev = ds_prev; + poa.after_branch_point = after_branch_point; + poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, + process_old_cb, &poa, tx)); + VERIFY0(zio_wait(poa.pio)); + ASSERT3U(poa.used, ==, 
ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -poa.used, -poa.comp, -poa.uncomp, tx); + + /* swap next's deadlist to our deadlist */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_close(&ds_next->ds_deadlist); + deadlist_obj = ds->ds_phys->ds_deadlist_obj; + ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj; + ds_next->ds_phys->ds_deadlist_obj = deadlist_obj; + dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj); +} + +static void +dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but dsl_deadlist_remove_key() is a no-op so it + * doesn't matter. + */ + if (ds->ds_dir->dd_phys->dd_clones == 0) + return; + + for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + dsl_dataset_remove_clones_key(clone, mintxg, tx); + } + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); +} + +void +dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) +{ + int err; + int after_branch_point = FALSE; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds_prev = NULL; + uint64_t obj; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT(refcount_is_zero(&ds->ds_longholds)); + + if (defer && + (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 
1)) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); + return; + } + + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, ""); + + dsl_scan_ds_destroyed(ds, tx); + + obj = ds->ds_object; + + if (ds->ds_phys->ds_prev_snap_obj != 0) { + ASSERT3P(ds->ds_prev, ==, NULL); + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); + after_branch_point = + (ds_prev->ds_phys->ds_next_snap_obj != obj); + + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + if (after_branch_point && + ds_prev->ds_phys->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); + if (ds->ds_phys->ds_next_snap_obj != 0) { + VERIFY0(zap_add_int(mos, + ds_prev->ds_phys->ds_next_clones_obj, + ds->ds_phys->ds_next_snap_obj, tx)); + } + } + if (!after_branch_point) { + ds_prev->ds_phys->ds_next_snap_obj = + ds->ds_phys->ds_next_snap_obj; + } + } + + dsl_dataset_t *ds_next; + uint64_t old_unique; + uint64_t used = 0, comp = 0, uncomp = 0; + + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); + ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + + old_unique = ds_next->ds_phys->ds_unique_bytes; + + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); + ds_next->ds_phys->ds_prev_snap_obj = + ds->ds_phys->ds_prev_snap_obj; + ds_next->ds_phys->ds_prev_snap_txg = + ds->ds_phys->ds_prev_snap_txg; + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, + ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); + + if (ds_next->ds_deadlist.dl_oldfmt) { + process_old_deadlist(ds, ds_prev, ds_next, + after_branch_point, tx); + } else { + /* Adjust prev's unique space. 
*/ + if (ds_prev && !after_branch_point) { + dsl_deadlist_space_range(&ds_next->ds_deadlist, + ds_prev->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_prev_snap_txg, + &used, &comp, &uncomp); + ds_prev->ds_phys->ds_unique_bytes += used; + } + + /* Adjust snapused. */ + dsl_deadlist_space_range(&ds_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -comp, -uncomp, tx); + + /* Move blocks to be freed to pool's free list. */ + dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, + &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, + tx); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, comp, uncomp, tx); + + /* Merge our deadlist into next's and free it. */ + dsl_deadlist_merge(&ds_next->ds_deadlist, + ds->ds_phys->ds_deadlist_obj, tx); + } + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_deadlist_obj = 0; + + /* Collapse range in clone heads */ + dsl_dataset_remove_clones_key(ds, + ds->ds_phys->ds_creation_txg, tx); + + if (dsl_dataset_is_snapshot(ds_next)) { + dsl_dataset_t *ds_nextnext; + + /* + * Update next's unique to include blocks which + * were previously shared by only this snapshot + * and it. Those blocks will be born after the + * prev snap and before this snap, and will have + * died after the next snap and before the one + * after that (ie. be on the snap after next's + * deadlist). + */ + VERIFY0(dsl_dataset_hold_obj(dp, + ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext)); + dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_creation_txg, + &used, &comp, &uncomp); + ds_next->ds_phys->ds_unique_bytes += used; + dsl_dataset_rele(ds_nextnext, FTAG); + ASSERT3P(ds_next->ds_prev, ==, NULL); + + /* Collapse range in this head. 
*/ + dsl_dataset_t *hds; + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds)); + dsl_deadlist_remove_key(&hds->ds_deadlist, + ds->ds_phys->ds_creation_txg, tx); + dsl_dataset_rele(hds, FTAG); + + } else { + ASSERT3P(ds_next->ds_prev, ==, ds); + dsl_dataset_rele(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; + if (ds_prev) { + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); + } + + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + ds_next->ds_phys->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. 
+ */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* remove from snapshot namespace */ + dsl_dataset_t *ds_head; + ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); + VERIFY0(dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); + VERIFY0(dsl_dataset_get_snapname(ds)); +#ifdef ZFS_DEBUG + { + uint64_t val; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); + ASSERT0(err); + ASSERT3U(val, ==, obj); + } +#endif + VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx)); + dsl_dataset_rele(ds_head, FTAG); + + if (ds_prev != NULL) + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + ASSERT0(zap_count(mos, + ds->ds_phys->ds_next_clones_obj, &count) && count == 0); + VERIFY0(dmu_object_free(mos, + ds->ds_phys->ds_next_clones_obj, tx)); + } + if (ds->ds_phys->ds_props_obj != 0) + VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + if (ds->ds_phys->ds_userrefs_obj != 0) + VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + VERIFY0(dmu_object_free(mos, obj, tx)); +} + +static void +dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dmu_snapshots_destroy_arg_t *dsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL); + pair != NULL; + pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + + dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx); + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The semantics of this function are described in the comment above + * lzc_destroy_snaps(). To summarize: + * + * The snapshots must all be in the same pool. 
+ * + * Snapshots that don't exist will be silently ignored (considered to be + * "already deleted"). + * + * On success, all snaps will be destroyed and this will return 0. + * On failure, no snaps will be destroyed, the errlist will be filled in, + * and this will return an errno. + */ +int +dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, + nvlist_t *errlist) +{ + dmu_snapshots_destroy_arg_t dsda; + int error; + nvpair_t *pair; + + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + + dsda.dsda_snaps = snaps; + dsda.dsda_successful_snaps = fnvlist_alloc(); + dsda.dsda_defer = defer; + dsda.dsda_errlist = errlist; + + error = dsl_sync_task(nvpair_name(pair), + dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync, + &dsda, 0); + fnvlist_free(dsda.dsda_successful_snaps); + + return (error); +} + +int +dsl_destroy_snapshot(const char *name, boolean_t defer) +{ + int error; + nvlist_t *nvl = fnvlist_alloc(); + nvlist_t *errlist = fnvlist_alloc(); + + fnvlist_add_boolean(nvl, name); + error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); + fnvlist_free(errlist); + fnvlist_free(nvl); + return (error); +} + +struct killarg { + dsl_dataset_t *ds; + dmu_tx_t *tx; +}; + +/* ARGSUSED */ +static int +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct killarg *ka = arg; + dmu_tx_t *tx = ka->tx; + + if (bp == NULL) + return (0); + + if (zb->zb_level == ZB_ZIL_LEVEL) { + ASSERT(zilog != NULL); + /* + * It's a block in the intent log. It has no + * accounting, so just free it. 
+ */ + dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); + } else { + ASSERT(zilog == NULL); + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); + } + + return (0); +} + +static void +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + struct killarg ka; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.tx = tx; + VERIFY0(traverse_dataset(ds, + ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, + kill_blkptr, &ka)); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); +} + +typedef struct dsl_destroy_head_arg { + const char *ddha_name; +} dsl_destroy_head_arg_t; + +int +dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) +{ + int error; + uint64_t count; + objset_t *mos; + + if (dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + if (refcount_count(&ds->ds_longholds) != expected_holds) + return (EBUSY); + + mos = ds->ds_dir->dd_pool->dp_meta_objset; + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) + return (EBUSY); + + /* + * Can't delete if there are children of this fs. + */ + error = zap_count(mos, + ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); + if (error != 0) + return (error); + if (count != 0) + return (EEXIST); + + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && + ds->ds_prev->ds_phys->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0) { + /* We need to remove the origin snapshot as well. 
*/ + if (!refcount_is_zero(&ds->ds_prev->ds_longholds)) + return (EBUSY); + } + return (0); +} + +static int +dsl_destroy_head_check(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_destroy_head_check_impl(ds, 0); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) +{ + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dd_used_t t; + + ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); + + VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); + + ASSERT0(dd->dd_phys->dd_head_dataset_obj); + + /* + * Remove our reservation. The impl() routine avoids setting the + * actual property, which would require the (already destroyed) ds. + */ + dsl_dir_set_reservation_sync_impl(dd, 0, tx); + + ASSERT0(dd->dd_phys->dd_used_bytes); + ASSERT0(dd->dd_phys->dd_reserved); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT0(dd->dd_phys->dd_used_breakdown[t]); + + VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); + VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); + VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); + VERIFY0(zap_remove(mos, + dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); + + dsl_dir_rele(dd, FTAG); + VERIFY0(dmu_object_free(mos, ddobj, tx)); +} + +void +dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + uint64_t obj, ddobj, prevobj = 0; + boolean_t rmorigin; + + ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT(ds->ds_prev == NULL || + ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); + ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* 
We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, ""); + + rmorigin = (dsl_dir_is_clone(ds->ds_dir) && + DS_IS_DEFER_DESTROY(ds->ds_prev) && + ds->ds_prev->ds_phys->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0); + + /* Remove our reservation */ + if (ds->ds_reserved != 0) { + dsl_dataset_set_refreservation_sync_impl(ds, + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + 0, tx); + ASSERT0(ds->ds_reserved); + } + + dsl_scan_ds_destroyed(ds, tx); + + obj = ds->ds_object; + + if (ds->ds_phys->ds_prev_snap_obj != 0) { + /* This is a clone */ + ASSERT(ds->ds_prev != NULL); + ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj); + ASSERT0(ds->ds_phys->ds_next_snap_obj); + + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds->ds_prev, + obj, tx); + } + + ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1); + ds->ds_prev->ds_phys->ds_num_children--; + } + + zfeature_info_t *async_destroy = + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; + objset_t *os; + + /* + * Destroy the deadlist. Unless it's a clone, the + * deadlist should be empty. (If it's a clone, it's + * safe to ignore the deadlist contents.) + */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_deadlist_obj = 0; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + + if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { + old_synchronous_dataset_destroy(ds, tx); + } else { + /* + * Move the bptree into the pool's list of trees to + * clean up and update space accounting information. 
+ */ + uint64_t used, comp, uncomp; + + zil_destroy_sync(dmu_objset_zil(os), tx); + + if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { + spa_feature_incr(dp->dp_spa, async_destroy, tx); + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY0(zap_add(mos, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx)); + } + + used = ds->ds_dir->dd_phys->dd_used_bytes; + comp = ds->ds_dir->dd_phys->dd_compressed_bytes; + uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + ds->ds_phys->ds_unique_bytes == used); + + bptree_add(mos, dp->dp_bptree_obj, + &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, + used, comp, uncomp, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + } + + if (ds->ds_prev != NULL) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY0(zap_remove_int(mos, + ds->ds_prev->ds_dir->dd_phys->dd_clones, + ds->ds_object, tx)); + } + prevobj = ds->ds_prev->ds_object; + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = NULL; + } + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. 
+ */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* Erase the link in the dir */ + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; + ddobj = ds->ds_dir->dd_object; + ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); + VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx)); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + ASSERT0(ds->ds_phys->ds_next_clones_obj); + ASSERT0(ds->ds_phys->ds_props_obj); + ASSERT0(ds->ds_phys->ds_userrefs_obj); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + VERIFY0(dmu_object_free(mos, obj, tx)); + + dsl_dir_destroy_sync(ddobj, tx); + + if (rmorigin) { + dsl_dataset_t *prev; + VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); + dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); + dsl_dataset_rele(prev, FTAG); + } +} + +static void +dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + dsl_destroy_head_sync_impl(ds, tx); + dsl_dataset_rele(ds, FTAG); +} + +static void +dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + + /* Mark it as inconsistent on-disk, in case we crash */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_log_internal_ds(ds, "destroy begin", tx, ""); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_destroy_head(const char *name) +{ + dsl_destroy_head_arg_t ddha; + int error; + spa_t *spa; + boolean_t isenabled; + +#ifdef _KERNEL + zfs_destroy_unmount_origin(name); +#endif + + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + isenabled = spa_feature_is_enabled(spa, + 
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]); + spa_close(spa, FTAG); + + ddha.ddha_name = name; + + if (!isenabled) { + objset_t *os; + + error = dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_begin_sync, &ddha, 0); + if (error != 0) + return (error); + + /* + * Head deletion is processed in one txg on old pools; + * remove the objects from open context so that the txg sync + * is not too long. + */ + error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); + if (error == 0) { + uint64_t prev_snap_txg = + dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg; + for (uint64_t obj = 0; error == 0; + error = dmu_object_next(os, &obj, FALSE, + prev_snap_txg)) + (void) dmu_free_object(os, obj); + /* sync out all frees */ + txg_wait_synced(dmu_objset_pool(os), 0); + dmu_objset_disown(os, FTAG); + } + } + + return (dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_sync, &ddha, 0)); +} + +/* + * Note, this function is used as the callback for dmu_objset_find(). We + * always return 0 so that we will continue to find and process + * inconsistent datasets, even if we encounter an error trying to + * process one of them. 
+ */ +/* ARGSUSED */ +int +dsl_destroy_inconsistent(const char *dsname, void *arg) +{ + objset_t *os; + + if (dmu_objset_hold(dsname, FTAG, &os) == 0) { + boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + dmu_objset_rele(os, FTAG); + if (inconsistent) + (void) dsl_destroy_head(dsname); + } + return (0); +} diff --git a/uts/common/fs/zfs/dsl_dir.c b/uts/common/fs/zfs/dsl_dir.c index 5ccb686..1e7ba6d 100644 --- a/uts/common/fs/zfs/dsl_dir.c +++ b/uts/common/fs/zfs/dsl_dir.c @@ -40,8 +40,6 @@ #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, - uint64_t value, dmu_tx_t *tx); /* ARGSUSED */ static void @@ -58,7 +56,7 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) } if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); spa_close(dd->dd_pool->dp_spa, dd); @@ -72,18 +70,17 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) } int -dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, +dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; int err; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); + ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); - if (err) + if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG @@ -110,9 +107,9 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_snap_cmtime_update(dd); if (dd->dd_phys->dd_parent_obj) { - err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, + err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); - if (err) + if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG @@ -129,7 +126,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_parent->dd_phys->dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } - if (err) + if (err != 0) goto errout; } else { (void) strcpy(dd->dd_myname, 
spa_name(dp->dp_spa)); @@ -146,7 +143,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, */ err = dmu_bonus_hold(dp->dp_meta_objset, dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); - if (err) + if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = @@ -158,7 +155,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_evict); if (winner) { if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; @@ -185,7 +182,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, errout: if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); @@ -193,7 +190,7 @@ errout: } void -dsl_dir_close(dsl_dir_t *dd, void *tag) +dsl_dir_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); @@ -250,6 +247,7 @@ static int getcomponent(const char *path, char *component, const char **nextp) { char *p; + if ((path == NULL) || (path[0] == '\0')) return (ENOENT); /* This would be a good place to reserve some namespace... 
*/ @@ -272,10 +270,10 @@ getcomponent(const char *path, char *component, const char **nextp) (void) strcpy(component, path); p = NULL; } else if (p[0] == '/') { - if (p-path >= MAXNAMELEN) + if (p - path >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(component, path, p - path); - component[p-path] = '\0'; + component[p - path] = '\0'; p++; } else if (p[0] == '@') { /* @@ -284,65 +282,54 @@ getcomponent(const char *path, char *component, const char **nextp) */ if (strchr(path, '/')) return (EINVAL); - if (p-path >= MAXNAMELEN) + if (p - path >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(component, path, p - path); - component[p-path] = '\0'; + component[p - path] = '\0'; } else { - ASSERT(!"invalid p"); + panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* - * same as dsl_open_dir, ignore the first component of name and use the - * spa instead + * Return the dsl_dir_t, and possibly the last component which couldn't + * be found in *tail. The name must be in the specified dsl_pool_t. This + * thread must hold the dp_config_rwlock for the pool. Returns NULL if the + * path is bogus, or if tail==NULL and we couldn't parse the whole name. + * (*tail)[0] == '@' means that the last component is a snapshot. 
*/ int -dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, +dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { char buf[MAXNAMELEN]; - const char *next, *nextnext = NULL; + const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; - dsl_pool_t *dp; uint64_t ddobj; - int openedspa = FALSE; - - dprintf("%s\n", name); err = getcomponent(name, buf, &next); - if (err) + if (err != 0) return (err); - if (spa == NULL) { - err = spa_open(buf, &spa, FTAG); - if (err) { - dprintf("spa_open(%s) failed\n", buf); - return (err); - } - openedspa = TRUE; - /* XXX this assertion belongs in spa_open */ - ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); - } + /* Make sure the name is in the specified pool. */ + spaname = spa_name(dp->dp_spa); + if (strcmp(buf, spaname) != 0) + return (EINVAL); - dp = spa_get_dsl(spa); + ASSERT(dsl_pool_config_held(dp)); - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); - if (err) { - rw_exit(&dp->dp_config_rwlock); - if (openedspa) - spa_close(spa, FTAG); + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err != 0) { return (err); } while (next != NULL) { dsl_dir_t *child_ds; err = getcomponent(next, buf, &nextnext); - if (err) + if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') @@ -353,25 +340,22 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, err = zap_lookup(dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); - if (err) { + if (err != 0) { if (err == ENOENT) err = 0; break; } - err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); - if (err) + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); + if (err != 0) break; - dsl_dir_close(dd, tag); + dsl_dir_rele(dd, tag); dd = child_ds; next = nextnext; } - rw_exit(&dp->dp_config_rwlock); - if (err) { - dsl_dir_close(dd, tag); - if (openedspa) - spa_close(spa, FTAG); + if (err != 0) { 
+ dsl_dir_rele(dd, tag); return (err); } @@ -382,30 +366,16 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ - dsl_dir_close(dd, tag); + dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = ENOENT; } - if (tailp) + if (tailp != NULL) *tailp = next; - if (openedspa) - spa_close(spa, FTAG); *ddp = dd; return (err); } -/* - * Return the dsl_dir_t, and possibly the last component which couldn't - * be found in *tail. Return NULL if the path is bogus, or if - * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' - * means that the last component is a snapshot. - */ -int -dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) -{ - return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); -} - uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) @@ -443,71 +413,6 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, return (ddobj); } -/* ARGSUSED */ -int -dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - uint64_t count; - - /* - * There should be exactly two holds, both from - * dsl_dataset_destroy: one on the dd directory, and one on its - * head ds. If there are more holds, then a concurrent thread is - * performing a lookup inside this dir while we're trying to destroy - * it. To minimize this possibility, we perform this check only - * in syncing context and fail the operation if we encounter - * additional holds. The dp_config_rwlock ensures that nobody else - * opens it after we check. 
- */ - if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2) - return (EBUSY); - - err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); - if (err) - return (err); - if (count != 0) - return (EEXIST); - - return (0); -} - -void -dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t obj; - dd_used_t t; - - ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); - ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); - - /* - * Remove our reservation. The impl() routine avoids setting the - * actual property, which would require the (already destroyed) ds. - */ - dsl_dir_set_reservation_sync_impl(dd, 0, tx); - - ASSERT0(dd->dd_phys->dd_used_bytes); - ASSERT0(dd->dd_phys->dd_reserved); - for (t = 0; t < DD_USED_NUM; t++) - ASSERT0(dd->dd_phys->dd_used_breakdown[t]); - - VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); - VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); - VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); - VERIFY(0 == zap_remove(mos, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); - - obj = dd->dd_object; - dsl_dir_close(dd, tag); - VERIFY(0 == dmu_object_free(mos, obj, tx)); -} - boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { @@ -545,18 +450,16 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) } mutex_exit(&dd->dd_lock); - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } - rw_exit(&dd->dd_pool->dp_config_rwlock); } void @@ -566,7 +469,7 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) ASSERT(dd->dd_phys); - if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { + if 
(txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } @@ -853,7 +756,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, FALSE, asize > usize, tr_list, tx, TRUE); } - if (err) + if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; @@ -1004,115 +907,123 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, mutex_exit(&dd->dd_lock); } +typedef struct dsl_dir_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dir_set_qr_arg_t; + static int -dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - int err; - uint64_t towrite; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t towrite, newval; - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_prop_predict(ds->ds_dir, "quota", + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } - if (psa->psa_effective_value == 0) + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); return (0); + } - mutex_enter(&dd->dd_lock); + mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. 
*/ - towrite = dsl_dir_space_towrite(dd); + towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (psa->psa_effective_value < dd->dd_phys->dd_reserved || - psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { - err = ENOSPC; + (newval < ds->ds_dir->dd_phys->dd_reserved || + newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) { + error = ENOSPC; } - mutex_exit(&dd->dd_lock); - return (err); + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); + return (error); } -extern dsl_syncfunc_t dsl_prop_set_sync; - static void -dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; - dsl_prop_set_sync(ds, psa, tx); - DSL_PROP_CHECK_PREDICTION(dd, psa); + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - mutex_enter(&dd->dd_lock); - dd->dd_phys->dd_quota = effective_value; - mutex_exit(&dd->dd_lock); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); + + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + ds->ds_dir->dd_phys->dd_quota = newval; + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { - dsl_dir_t *dd; - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "quota", source, "a); - - err = dsl_dataset_hold(ddname, FTAG, &ds); - if (err) - return (err); + dsl_dir_set_qr_arg_t ddsqra; - err 
= dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = quota; - ASSERT(ds->ds_dir == dd); - - /* - * If someone removes a file, then tries to set the quota, we want to - * make sure the file freeing takes effect. - */ - txg_wait_open(dd->dd_pool, 0); - - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, ds, &psa, 0); - - dsl_dir_close(dd, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); + return (dsl_sync_task(ddname, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, &ddsqra, 0)); } int -dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value; - uint64_t used, avail; - int err; - - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + uint64_t newval, used, avail; + int error; - effective_value = psa->psa_effective_value; + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. 
*/ - if (!dmu_tx_is_syncing(tx)) + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); return (0); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; @@ -1125,21 +1036,21 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { - uint64_t delta = MAX(used, effective_value) - + if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) { + uint64_t delta = MAX(used, newval) - MAX(used, dd->dd_phys->dd_reserved); - if (delta > avail) - return (ENOSPC); - if (dd->dd_phys->dd_quota > 0 && - effective_value > dd->dd_phys->dd_quota) - return (ENOSPC); + if (delta > avail || + (dd->dd_phys->dd_quota > 0 && + newval > dd->dd_phys->dd_quota)) + error = ENOSPC; } - return (0); + dsl_dataset_rele(ds, FTAG); + return (error); } -static void +void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; @@ -1162,48 +1073,38 @@ dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) static void -dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t value = psa->psa_effective_value; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - dsl_prop_set_sync(ds, psa, tx); - 
DSL_PROP_CHECK_PREDICTION(dd, psa); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); - dsl_dir_set_reservation_sync_impl(dd, value, tx); + dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); + dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { - dsl_dir_t *dd; - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); - - err = dsl_dataset_hold(ddname, FTAG, &ds); - if (err) - return (err); - - err = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } + dsl_dir_set_qr_arg_t ddsqra; - ASSERT(ds->ds_dir == dd); + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = reservation; - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, ds, &psa, 0); - - dsl_dir_close(dd, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); + return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, + dsl_dir_set_reservation_sync, &ddsqra, 0)); } static dsl_dir_t * @@ -1235,79 +1136,123 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) return (would_change(dd->dd_parent, delta, ancestor)); } -struct renamearg { - dsl_dir_t *newparent; - const char *mynewname; -}; +typedef struct dsl_dir_rename_arg { + const char *ddra_oldname; + const char *ddra_newname; +} dsl_dir_rename_arg_t; +/* ARGSUSED */ static int -dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { - dsl_dir_t *dd = arg1; - struct renamearg *ra = arg2; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - uint64_t val; + int *deltap = arg; + char namebuf[MAXNAMELEN]; - /* - * There should only be one reference, from dmu_objset_rename(). 
- * Fleeting holds are also possible (eg, from "zfs list" getting - * stats), but any that are present in open context will likely - * be gone by syncing context, so only fail from syncing - * context. - */ - if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1) - return (EBUSY); + dsl_dataset_name(ds, namebuf); + + if (strlen(namebuf) + *deltap >= MAXNAMELEN) + return (ENAMETOOLONG); + return (0); +} + +static int +dsl_dir_rename_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + const char *mynewname; + int error; + int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); - /* check for existing name */ - err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, - ra->mynewname, 8, 1, &val); - if (err == 0) + /* target dir should exist */ + error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); + if (error != 0) + return (error); + + /* new parent should exist */ + error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, + &newparent, &mynewname); + if (error != 0) { + dsl_dir_rele(dd, FTAG); + return (error); + } + + /* can't rename to different pool */ + if (dd->dd_pool != newparent->dd_pool) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (ENXIO); + } + + /* new name should not already exist */ + if (mynewname == NULL) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); return (EEXIST); - if (err != ENOENT) - return (err); + } + + /* if the name length is growing, validate child name lengths */ + if (delta > 0) { + error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, + &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + } - if (ra->newparent != dd->dd_parent) { + if (newparent != dd->dd_parent) { /* is there enough space? 
*/ uint64_t myspace = MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); /* no rename into our descendant */ - if (closest_common_ancestor(dd, ra->newparent) == dd) + if (closest_common_ancestor(dd, newparent) == dd) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); return (EINVAL); + } - if (err = dsl_dir_transfer_possible(dd->dd_parent, - ra->newparent, myspace)) - return (err); + error = dsl_dir_transfer_possible(dd->dd_parent, + newparent, myspace); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } } + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); return (0); } static void -dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct renamearg *ra = arg2; - dsl_pool_t *dp = dd->dd_pool; + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + const char *mynewname; + int error; objset_t *mos = dp->dp_meta_objset; - int err; - char namebuf[MAXNAMELEN]; - ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); + VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); + VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, + &mynewname)); /* Log this before we change the name. 
*/ - dsl_dir_name(ra->newparent, namebuf); spa_history_log_internal_dd(dd, "rename", tx, - "-> %s/%s", namebuf, ra->mynewname); + "-> %s", ddra->ddra_newname); - if (ra->newparent != dd->dd_parent) { + if (newparent != dd->dd_parent) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dd->dd_phys->dd_used_bytes, -dd->dd_phys->dd_compressed_bytes, -dd->dd_phys->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, + dsl_dir_diduse_space(newparent, DD_USED_CHILD, dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_compressed_bytes, dd->dd_phys->dd_uncompressed_bytes, tx); @@ -1318,7 +1263,7 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); - dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, + dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } @@ -1326,52 +1271,36 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ - err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, + error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx); - ASSERT0(err); + ASSERT0(error); - (void) strcpy(dd->dd_myname, ra->mynewname); - dsl_dir_close(dd->dd_parent, dd); - dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; - VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, - ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); + (void) strcpy(dd->dd_myname, mynewname); + dsl_dir_rele(dd->dd_parent, dd); + dd->dd_phys->dd_parent_obj = newparent->dd_object; + VERIFY0(dsl_dir_hold_obj(dp, + newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ - err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, - dd->dd_myname, 8, 1, &dd->dd_object, tx); - ASSERT0(err); + VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj, + dd->dd_myname, 8, 1, &dd->dd_object, tx)); + + dsl_prop_notify_all(dd); + 
dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); } int -dsl_dir_rename(dsl_dir_t *dd, const char *newname) +dsl_dir_rename(const char *oldname, const char *newname) { - struct renamearg ra; - int err; + dsl_dir_rename_arg_t ddra; - /* new parent should exist */ - err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); - if (err) - return (err); + ddra.ddra_oldname = oldname; + ddra.ddra_newname = newname; - /* can't rename to different pool */ - if (dd->dd_pool != ra.newparent->dd_pool) { - err = ENXIO; - goto out; - } - - /* new name should not already exist */ - if (ra.mynewname == NULL) { - err = EEXIST; - goto out; - } - - err = dsl_sync_task_do(dd->dd_pool, - dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); - -out: - dsl_dir_close(ra.newparent, FTAG); - return (err); + return (dsl_sync_task(oldname, + dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3)); } int diff --git a/uts/common/fs/zfs/dsl_pool.c b/uts/common/fs/zfs/dsl_pool.c index 38d656a..6af6316 100644 --- a/uts/common/fs/zfs/dsl_pool.c +++ b/uts/common/fs/zfs/dsl_pool.c @@ -43,6 +43,7 @@ #include #include #include +#include int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -69,7 +70,7 @@ dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) if (err) return (err); - return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); + return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * @@ -81,7 +82,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; - rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); + rrw_init(&dp->dp_config_rwlock, B_TRUE); dp->dp_write_limit = zfs_write_limit_min; txg_init(dp, txg); @@ -92,7 +93,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_dirty_dirs, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, - offsetof(dsl_sync_task_group_t, 
dstg_node)); + offsetof(dsl_sync_task_t, dst_node)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); @@ -126,14 +127,14 @@ dsl_pool_open(dsl_pool_t *dp) dsl_dataset_t *ds; uint64_t obj; - rw_enter(&dp->dp_config_rwlock, RW_WRITER); + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); if (err) goto out; - err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir); if (err) goto out; @@ -154,7 +155,7 @@ dsl_pool_open(dsl_pool_t *dp) &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } - dsl_dir_close(dd, dp); + dsl_dir_rele(dd, dp); if (err) goto out; } @@ -169,7 +170,7 @@ dsl_pool_open(dsl_pool_t *dp) DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err) goto out; - VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } @@ -202,7 +203,7 @@ dsl_pool_open(dsl_pool_t *dp) err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: - rw_exit(&dp->dp_config_rwlock); + rrw_exit(&dp->dp_config_rwlock, FTAG); return (err); } @@ -217,13 +218,13 @@ dsl_pool_close(dsl_pool_t *dp) * and not a hold, so just drop that here. 
*/ if (dp->dp_origin_snap) - dsl_dataset_drop_ref(dp->dp_origin_snap, dp); + dsl_dataset_rele(dp->dp_origin_snap, dp); if (dp->dp_mos_dir) - dsl_dir_close(dp->dp_mos_dir, dp); + dsl_dir_rele(dp->dp_mos_dir, dp); if (dp->dp_free_dir) - dsl_dir_close(dp->dp_free_dir, dp); + dsl_dir_rele(dp->dp_free_dir, dp); if (dp->dp_root_dir) - dsl_dir_close(dp->dp_root_dir, dp); + dsl_dir_rele(dp->dp_root_dir, dp); bpobj_close(&dp->dp_free_bpobj); @@ -239,7 +240,7 @@ dsl_pool_close(dsl_pool_t *dp) arc_flush(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); - rw_destroy(&dp->dp_config_rwlock); + rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) @@ -257,6 +258,8 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) dsl_dataset_t *ds; uint64_t obj; + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); @@ -267,30 +270,30 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) ASSERT0(err); /* Initialize scan structures */ - VERIFY3U(0, ==, dsl_scan_init(dp, txg)); + VERIFY0(dsl_scan_init(dp, txg)); /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); - VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_special_dir(dp, + VERIFY0(dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { /* create and open the free dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_special_dir(dp, + VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* create and open the free_bplist */ obj = 
bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); - VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } @@ -301,7 +304,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); /* create the root objset */ - VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); #ifdef _KERNEL @@ -311,6 +314,8 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) dmu_tx_commit(tx); + rrw_exit(&dp->dp_config_rwlock, FTAG); + return (dp); } @@ -333,10 +338,7 @@ static int deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; - dsl_pool_t *dp = dmu_objset_pool(dl->dl_os); - rw_enter(&dp->dp_config_rwlock, RW_READER); dsl_deadlist_insert(dl, bp, tx); - rw_exit(&dp->dp_config_rwlock); return (0); } @@ -358,7 +360,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) /* * We need to copy dp_space_towrite() before doing - * dsl_sync_task_group_sync(), because + * dsl_sync_task_sync(), because * dsl_dataset_snapshot_reserve_space() will increase * dp_space_towrite but not actually write anything. */ @@ -472,14 +474,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ DTRACE_PROBE(pool_sync__3task); if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { - dsl_sync_task_group_t *dstg; + dsl_sync_task_t *dst; /* * No more sync tasks should have been added while we * were syncing. 
*/ ASSERT(spa_sync_pass(dp->dp_spa) == 1); - while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) - dsl_sync_task_group_sync(dstg, tx); + while (dst = txg_list_remove(&dp->dp_sync_tasks, txg)) + dsl_sync_task_sync(dst, tx); } dmu_tx_commit(tx); @@ -654,14 +656,13 @@ dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) /* ARGSUSED */ static int -upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds, *prev = NULL; int err; - dsl_pool_t *dp = spa_get_dsl(spa); - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); @@ -687,7 +688,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) * The $ORIGIN can't have any data, or the accounting * will be wrong. */ - ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); + ASSERT0(prev->ds_phys->ds_bp.blk_birth); /* The origin doesn't get attached to itself */ if (ds->ds_object == prev->ds_object) { @@ -707,13 +708,13 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) if (ds->ds_phys->ds_next_snap_obj == 0) { ASSERT(ds->ds_prev == NULL); - VERIFY(0 == dsl_dataset_hold_obj(dp, + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); } } - ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); - ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); + ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object); + ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object); if (prev->ds_phys->ds_next_clones_obj == 0) { dmu_buf_will_dirty(prev->ds_dbuf, tx); @@ -721,7 +722,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY(0 == zap_add_int(dp->dp_meta_objset, + VERIFY0(zap_add_int(dp->dp_meta_objset, 
prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); @@ -736,25 +737,21 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); - VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, tx, DS_FIND_CHILDREN)); } /* ARGSUSED */ static int -upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dmu_tx_t *tx = arg; - dsl_dataset_t *ds; - dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - - if (ds->ds_dir->dd_phys->dd_origin_obj) { + if (ds->ds_dir->dd_phys->dd_origin_obj != 0) { dsl_dataset_t *origin; - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); if (origin->ds_dir->dd_phys->dd_clones == 0) { @@ -763,13 +760,11 @@ upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, - origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); dsl_dataset_rele(origin, FTAG); } - - dsl_dataset_rele(ds, FTAG); return (0); } @@ -780,7 +775,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) uint64_t obj; (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_special_dir(dp, + VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* @@ -790,12 +785,11 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) */ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); - VERIFY3U(0, ==, 
zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); - VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); - VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); } @@ -807,17 +801,16 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap == NULL); + ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); /* create the origin dir, ds, & snap-ds */ - rw_enter(&dp->dp_config_rwlock, RW_WRITER); dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, tx); - VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); - VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); + VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); - rw_exit(&dp->dp_config_rwlock); } taskq_t * @@ -852,7 +845,7 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) *htag = '\0'; ++htag; dsobj = strtonum(za.za_name, NULL); - (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); + dsl_dataset_user_release_tmp(dp, dsobj, htag); } zap_cursor_fini(&zc); } @@ -874,7 +867,7 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) + const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; @@ -899,7 +892,7 @@ 
dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) - error = zap_add(mos, zapobj, name, 8, 1, now, tx); + error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); strfree(name); @@ -912,7 +905,7 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, */ int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - uint64_t *now, dmu_tx_t *tx) + uint64_t now, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); } @@ -927,3 +920,106 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, tx, B_FALSE)); } + +/* + * DSL Pool Configuration Lock + * + * The dp_config_rwlock protects against changes to DSL state (e.g. dataset + * creation / destruction / rename / property setting). It must be held for + * read to hold a dataset or dsl_dir. I.e. you must call + * dsl_pool_config_enter() or dsl_pool_hold() before calling + * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock + * must be held continuously until all datasets and dsl_dirs are released. + * + * The only exception to this rule is that if a "long hold" is placed on + * a dataset, then the dp_config_rwlock may be dropped while the dataset + * is still held. The long hold will prevent the dataset from being + * destroyed -- the destroy will fail with EBUSY. A long hold can be + * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset + * (by calling dsl_{dataset,objset}_{try}own{_obj}). + * + * Legitimate long-holders (including owners) should be long-running, cancelable + * tasks that should cause "zfs destroy" to fail. This includes DMU + * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), + * "zfs send", and "zfs diff". There are several other long-holders whose + * uses are suboptimal (e.g. 
"zfs promote", and zil_suspend()). + * + * The usual formula for long-holding would be: + * dsl_pool_hold() + * dsl_dataset_hold() + * ... perform checks ... + * dsl_dataset_long_hold() + * dsl_pool_rele() + * ... perform long-running task ... + * dsl_dataset_long_rele() + * dsl_dataset_rele() + * + * Note that when the long hold is released, the dataset is still held but + * the pool is not held. The dataset may change arbitrarily during this time + * (e.g. it could be destroyed). Therefore you shouldn't do anything to the + * dataset except release it. + * + * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only + * or modifying operations. + * + * Modifying operations should generally use dsl_sync_task(). The synctask + * infrastructure enforces proper locking strategy with respect to the + * dp_config_rwlock. See the comment above dsl_sync_task() for details. + * + * Read-only operations will manually hold the pool, then the dataset, obtain + * information from the dataset, then release the pool and dataset. + * dmu_objset_{hold,rele}() are convenience routines that also do the pool + * hold/rele. + */ + +int +dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) +{ + spa_t *spa; + int error; + + error = spa_open(name, &spa, tag); + if (error == 0) { + *dp = spa_get_dsl(spa); + dsl_pool_config_enter(*dp, tag); + } + return (error); +} + +void +dsl_pool_rele(dsl_pool_t *dp, void *tag) +{ + dsl_pool_config_exit(dp, tag); + spa_close(dp->dp_spa, tag); +} + +void +dsl_pool_config_enter(dsl_pool_t *dp, void *tag) +{ + /* + * We use a "reentrant" reader-writer lock, but not reentrantly. + * + * The rrwlock can (with the track_all flag) track all reading threads, + * which is very useful for debugging which code path failed to release + * the lock, and for verifying that the *current* thread does hold + * the lock. 
+ * + * (Unlike a rwlock, which knows that N threads hold it for + * read, but not *which* threads, so rw_held(RW_READER) returns TRUE + * if any thread holds it for read, even if this thread doesn't). + */ + ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); + rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); +} + +void +dsl_pool_config_exit(dsl_pool_t *dp, void *tag) +{ + rrw_exit(&dp->dp_config_rwlock, tag); +} + +boolean_t +dsl_pool_config_held(dsl_pool_t *dp) +{ + return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); +} diff --git a/uts/common/fs/zfs/dsl_prop.c b/uts/common/fs/zfs/dsl_prop.c index 5bbe14f..cd7c3ec 100644 --- a/uts/common/fs/zfs/dsl_prop.c +++ b/uts/common/fs/zfs/dsl_prop.c @@ -82,7 +82,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, char *inheritstr; char *recvdstr; - ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); + ASSERT(dsl_pool_config_held(dd->dd_pool)); if (setpoint) setpoint[0] = '\0'; @@ -97,8 +97,6 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, * after this loop. */ for (; dd != NULL; dd = dd->dd_parent) { - ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); - if (dd != target || snapshot) { if (!inheritable) break; @@ -167,7 +165,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, boolean_t snapshot; uint64_t zapobj; - ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)); zapobj = (ds->ds_phys == NULL ? 
0 : ds->ds_phys->ds_props_obj); @@ -235,18 +233,12 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, uint64_t value; dsl_prop_cb_record_t *cbr; int err; - int need_rwlock; - need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); - if (need_rwlock) - rw_enter(&dp->dp_config_rwlock, RW_READER); + ASSERT(dsl_pool_config_held(dp)); - err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); - if (err != 0) { - if (need_rwlock) - rw_exit(&dp->dp_config_rwlock); + err = dsl_prop_get_int_ds(ds, propname, &value); + if (err != 0) return (err); - } cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); cbr->cbr_ds = ds; @@ -259,9 +251,6 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, mutex_exit(&dd->dd_lock); cbr->cbr_func(cbr->cbr_arg, value); - - if (need_rwlock) - rw_exit(&dp->dp_config_rwlock); return (0); } @@ -269,19 +258,18 @@ int dsl_prop_get(const char *dsname, const char *propname, int intsz, int numints, void *buf, char *setpoint) { - dsl_dataset_t *ds; - int err; + objset_t *os; + int error; - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + error = dsl_prop_get_ds(dmu_objset_ds(os), propname, + intsz, numints, buf, setpoint); - dsl_dataset_rele(ds, FTAG); - return (err); + dmu_objset_rele(os, FTAG); + return (error); } /* @@ -299,17 +287,11 @@ dsl_prop_get_integer(const char *ddname, const char *propname, return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); } -void -dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, - zprop_source_t source, uint64_t *value) +int +dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, + uint64_t *valuep) { - psa->psa_name = propname; - psa->psa_source = source; - psa->psa_intsz 
= 8; - psa->psa_numints = 1; - psa->psa_value = value; - - psa->psa_effective_value = -1ULL; + return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); } /* @@ -323,11 +305,10 @@ dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, * a property not handled by this function. */ int -dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +dsl_prop_predict(dsl_dir_t *dd, const char *propname, + zprop_source_t source, uint64_t value, uint64_t *newvalp) { - const char *propname = psa->psa_name; zfs_prop_t prop = zfs_name_to_prop(propname); - zprop_source_t source = psa->psa_source; objset_t *mos; uint64_t zapobj; uint64_t version; @@ -359,36 +340,33 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) switch (source) { case ZPROP_SRC_NONE: /* Revert to the received value, if any. */ - err = zap_lookup(mos, zapobj, recvdstr, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = 0; + *newvalp = 0; break; case ZPROP_SRC_LOCAL: - psa->psa_effective_value = *(uint64_t *)psa->psa_value; + *newvalp = value; break; case ZPROP_SRC_RECEIVED: /* * If there's no local setting, then the new received value will * be the effective value. */ - err = zap_lookup(mos, zapobj, propname, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = *(uint64_t *)psa->psa_value; + *newvalp = value; break; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * We're clearing the received value, so the local setting (if * it exists) remains the effective value. 
*/ - err = zap_lookup(mos, zapobj, propname, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = 0; + *newvalp = 0; break; default: - cmn_err(CE_PANIC, "unexpected property source: %d", source); + panic("unexpected property source: %d", source); } strfree(recvdstr); @@ -399,37 +377,6 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) return (err); } -#ifdef ZFS_DEBUG -void -dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) -{ - zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); - uint64_t intval; - char setpoint[MAXNAMELEN]; - uint64_t version = spa_version(dd->dd_pool->dp_spa); - int err; - - if (version < SPA_VERSION_RECVD_PROPS) { - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_RESERVATION: - return; - } - } - - err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, - setpoint, B_FALSE); - if (err == 0 && intval != psa->psa_effective_value) { - cmn_err(CE_PANIC, "%s property, source: %x, " - "predicted effective value: %llu, " - "actual effective value: %llu (setpoint: %s)", - psa->psa_name, psa->psa_source, - (unsigned long long)psa->psa_effective_value, - (unsigned long long)intval, setpoint); - } -} -#endif - /* * Unregister this callback. Return 0 on success, ENOENT if ddname is * invalid, ENOMSG if no matching callback registered. @@ -464,25 +411,57 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, return (0); } -/* - * Return the number of callbacks that are registered for this dataset. 
- */ -int -dsl_prop_numcb(dsl_dataset_t *ds) +boolean_t +dsl_prop_hascb(dsl_dataset_t *ds) { dsl_dir_t *dd = ds->ds_dir; + boolean_t rv = B_FALSE; dsl_prop_cb_record_t *cbr; - int num = 0; mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (cbr->cbr_ds == ds) - num++; + for (cbr = list_head(&dd->dd_prop_cbs); cbr; + cbr = list_next(&dd->dd_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds) { + rv = B_TRUE; + break; + } } mutex_exit(&dd->dd_lock); + return (rv); +} - return (num); +/* ARGSUSED */ +static int +dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_cb_record_t *cbr; + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&dd->dd_prop_cbs); cbr; + cbr = list_next(&dd->dd_prop_cbs, cbr)) { + uint64_t value; + + if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname, + sizeof (value), 1, &value, NULL) == 0) + cbr->cbr_func(cbr->cbr_arg, value); + } + mutex_exit(&dd->dd_lock); + + return (0); +} + +/* + * Update all property values for ddobj & its descendants. This is used + * when renaming the dir. 
+ */ +void +dsl_prop_notify_all(dsl_dir_t *dd) +{ + dsl_pool_t *dp = dd->dd_pool; + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, + NULL, DS_FIND_CHILDREN); } static void @@ -496,8 +475,8 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, zap_attribute_t *za; int err; - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err) return; @@ -508,7 +487,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, */ err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname); if (err == 0) { - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); return; } ASSERT3U(err, ==, ENOENT); @@ -543,26 +522,24 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, } kmem_free(za, sizeof (zap_attribute_t)); zap_cursor_fini(&zc); - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); } void -dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, + zprop_source_t source, int intsz, int numints, const void *value, + dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t zapobj, intval, dummy; int isint; char valbuf[32]; - char *valstr = NULL; + const char *valstr = NULL; char *inheritstr; char *recvdstr; char *tbuf = NULL; int err; uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); - const char *propname = psa->psa_name; - zprop_source_t source = psa->psa_source; isint = (dodefault(propname, 8, 1, &intval) == 0); @@ -612,8 +589,8 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) */ err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - VERIFY(0 == zap_update(mos, zapobj, propname, - psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); + 
VERIFY0(zap_update(mos, zapobj, propname, + intsz, numints, value, tx)); break; case ZPROP_SRC_INHERITED: /* @@ -624,12 +601,10 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); if (version >= SPA_VERSION_RECVD_PROPS && - dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, - NULL) == 0) { + dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { dummy = 0; - err = zap_update(mos, zapobj, inheritstr, - 8, 1, &dummy, tx); - ASSERT(err == 0); + VERIFY0(zap_update(mos, zapobj, inheritstr, + 8, 1, &dummy, tx)); } break; case ZPROP_SRC_RECEIVED: @@ -637,7 +612,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) * set propname$recvd -> value */ err = zap_update(mos, zapobj, recvdstr, - psa->psa_intsz, psa->psa_numints, psa->psa_value, tx); + intsz, numints, value, tx); ASSERT(err == 0); break; case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): @@ -667,7 +642,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) strfree(recvdstr); if (isint) { - VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); + VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { dsl_prop_cb_record_t *cbr; @@ -694,7 +669,7 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) valstr = valbuf; } else { if (source == ZPROP_SRC_LOCAL) { - valstr = (char *)psa->psa_value; + valstr = value; } else { tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); if (dsl_prop_get_ds(ds, propname, 1, @@ -711,118 +686,73 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) kmem_free(tbuf, ZAP_MAXVALUELEN); } -void -dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) +int +dsl_prop_set_int(const char *dsname, const char *propname, + zprop_source_t source, uint64_t value) { - dsl_dataset_t *ds = arg1; - dsl_props_arg_t *pa = arg2; - nvlist_t *props = pa->pa_props; - dsl_prop_setarg_t psa; - nvpair_t *elem = NULL; - - 
psa.psa_source = pa->pa_source; - - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - nvpair_t *pair = elem; - - psa.psa_name = nvpair_name(pair); + nvlist_t *nvl = fnvlist_alloc(); + int error; - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - /* - * dsl_prop_get_all_impl() returns properties in this - * format. - */ - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); - } - - if (nvpair_type(pair) == DATA_TYPE_STRING) { - VERIFY(nvpair_value_string(pair, - (char **)&psa.psa_value) == 0); - psa.psa_intsz = 1; - psa.psa_numints = strlen(psa.psa_value) + 1; - } else { - uint64_t intval; - VERIFY(nvpair_value_uint64(pair, &intval) == 0); - psa.psa_intsz = sizeof (intval); - psa.psa_numints = 1; - psa.psa_value = &intval; - } - dsl_prop_set_sync(ds, &psa, tx); - } + fnvlist_add_uint64(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); } int -dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, - int intsz, int numints, const void *buf) +dsl_prop_set_string(const char *dsname, const char *propname, + zprop_source_t source, const char *value) { - dsl_dataset_t *ds; - uint64_t version; - int err; - dsl_prop_setarg_t psa; - - /* - * We must do these checks before we get to the syncfunc, since - * it can't fail. - */ - if (strlen(propname) >= ZAP_MAXNAMELEN) - return (ENAMETOOLONG); - - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - version = spa_version(ds->ds_dir->dd_pool->dp_spa); - if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? 
- ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { - dsl_dataset_rele(ds, FTAG); - return (E2BIG); - } - if (dsl_dataset_is_snapshot(ds) && - version < SPA_VERSION_SNAP_PROPS) { - dsl_dataset_rele(ds, FTAG); - return (ENOTSUP); - } + nvlist_t *nvl = fnvlist_alloc(); + int error; - psa.psa_name = propname; - psa.psa_source = source; - psa.psa_intsz = intsz; - psa.psa_numints = numints; - psa.psa_value = buf; - psa.psa_effective_value = -1ULL; + fnvlist_add_string(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - NULL, dsl_prop_set_sync, ds, &psa, 2); +int +dsl_prop_inherit(const char *dsname, const char *propname, + zprop_source_t source) +{ + nvlist_t *nvl = fnvlist_alloc(); + int error; - dsl_dataset_rele(ds, FTAG); - return (err); + fnvlist_add_boolean(nvl, propname); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); } -int -dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) +typedef struct dsl_props_set_arg { + const char *dpsa_dsname; + zprop_source_t dpsa_source; + nvlist_t *dpsa_props; +} dsl_props_set_arg_t; + +static int +dsl_props_set_check(void *arg, dmu_tx_t *tx) { + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t version; nvpair_t *elem = NULL; - dsl_props_arg_t pa; int err; - if (err = dsl_dataset_hold(dsname, FTAG, &ds)) + err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); + if (err != 0) return (err); - /* - * Do these checks before the syncfunc, since it can't fail. 
- */ + version = spa_version(ds->ds_dir->dd_pool->dp_spa); - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { dsl_dataset_rele(ds, FTAG); return (ENAMETOOLONG); } if (nvpair_type(elem) == DATA_TYPE_STRING) { - char *valstr; - VERIFY(nvpair_value_string(elem, &valstr) == 0); + char *valstr = fnvpair_value_string(elem); if (strlen(valstr) >= (version < SPA_VERSION_STMF_PROP ? ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { @@ -832,20 +762,83 @@ dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) } } - if (dsl_dataset_is_snapshot(ds) && - version < SPA_VERSION_SNAP_PROPS) { + if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } + dsl_dataset_rele(ds, FTAG); + return (0); +} - pa.pa_props = props; - pa.pa_source = source; +void +dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, + nvlist_t *props, dmu_tx_t *tx) +{ + nvpair_t *elem = NULL; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - NULL, dsl_props_set_sync, ds, &pa, 2); + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + nvpair_t *pair = elem; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. 
+ */ + nvlist_t *attrs = fnvpair_value_nvlist(pair); + pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); + } + + if (nvpair_type(pair) == DATA_TYPE_STRING) { + const char *value = fnvpair_value_string(pair); + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, 1, strlen(value) + 1, value, tx); + } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { + uint64_t intval = fnvpair_value_uint64(pair); + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, sizeof (intval), 1, &intval, tx); + } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, 0, 0, NULL, tx); + } else { + panic("invalid nvpair type"); + } + } +} + +static void +dsl_props_set_sync(void *arg, dmu_tx_t *tx) +{ + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); + dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); dsl_dataset_rele(ds, FTAG); - return (err); +} + +/* + * All-or-nothing; if any prop can't be set, nothing will be modified. + */ +int +dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) +{ + dsl_props_set_arg_t dpsa; + int nblks = 0; + + dpsa.dpsa_dsname = dsname; + dpsa.dpsa_source = source; + dpsa.dpsa_props = props; + + /* + * If the source includes NONE, then we will only be removing entries + * from the ZAP object. In that case don't check for ENOSPC. 
+ */ + if ((source & ZPROP_SRC_NONE) == 0) + nblks = 2 * fnvlist_num_pairs(props); + + return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, + &dpsa, nblks)); } typedef enum dsl_prop_getflags { @@ -995,7 +988,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, if (dsl_dataset_is_snapshot(ds)) flags |= DSL_PROP_GET_SNAPSHOT; - rw_enter(&dp->dp_config_rwlock, RW_READER); + ASSERT(dsl_pool_config_held(dp)); if (ds->ds_phys->ds_props_obj != 0) { ASSERT(flags & DSL_PROP_GET_SNAPSHOT); @@ -1020,58 +1013,51 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, break; } out: - rw_exit(&dp->dp_config_rwlock); return (err); } boolean_t -dsl_prop_get_hasrecvd(objset_t *os) +dsl_prop_get_hasrecvd(const char *dsname) { - dsl_dataset_t *ds = os->os_dsl_dataset; - int rc; uint64_t dummy; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); - return (rc == 0); + return (0 == + dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); } -static void -dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) +static int +dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) { - dsl_dataset_t *ds = os->os_dsl_dataset; - uint64_t dummy = 0; - dsl_prop_setarg_t psa; - - if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) - return; + uint64_t version; + spa_t *spa; + int error = 0; - dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); + VERIFY0(spa_open(dsname, &spa, FTAG)); + version = spa_version(spa); + spa_close(spa, FTAG); - (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, - dsl_prop_set_sync, ds, &psa, 2); + if (version >= SPA_VERSION_RECVD_PROPS) + error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); + return (error); } /* * Call after successfully receiving properties to ensure that only the first * 
receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. */ -void -dsl_prop_set_hasrecvd(objset_t *os) +int +dsl_prop_set_hasrecvd(const char *dsname) { - if (dsl_prop_get_hasrecvd(os)) { - ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); - return; - } - dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); + int error = 0; + if (!dsl_prop_get_hasrecvd(dsname)) + error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); + return (error); } void -dsl_prop_unset_hasrecvd(objset_t *os) +dsl_prop_unset_hasrecvd(const char *dsname) { - dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); + VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); } int @@ -1081,16 +1067,25 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) } int -dsl_prop_get_received(objset_t *os, nvlist_t **nvp) +dsl_prop_get_received(const char *dsname, nvlist_t **nvp) { + objset_t *os; + int error; + /* * Received properties are not distinguishable from local properties * until the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. */ - dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? + dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? 
DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); - return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); + dmu_objset_rele(os, FTAG); + return (error); } void diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c index e171725..3de3c6e 100644 --- a/uts/common/fs/zfs/dsl_scan.c +++ b/uts/common/fs/zfs/dsl_scan.c @@ -55,7 +55,7 @@ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); static scan_cb_t dsl_scan_defrag_cb; static scan_cb_t dsl_scan_scrub_cb; static scan_cb_t dsl_scan_remove_cb; -static dsl_syncfunc_t dsl_scan_cancel_sync; +static void dsl_scan_cancel_sync(void *, dmu_tx_t *); static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ @@ -154,9 +154,9 @@ dsl_scan_fini(dsl_pool_t *dp) /* ARGSUSED */ static int -dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { - dsl_scan_t *scn = arg1; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state == DSS_SCANNING) return (EBUSY); @@ -164,12 +164,11 @@ dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void -dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { - dsl_scan_t *scn = arg1; - pool_scan_func_t *funcp = arg2; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; @@ -315,9 +314,9 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) /* ARGSUSED */ static int -dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { - dsl_scan_t *scn = arg1; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if 
(scn->scn_phys.scn_state != DSS_SCANNING) return (ENOENT); @@ -326,9 +325,9 @@ dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { - dsl_scan_t *scn = arg1; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx); @@ -337,12 +336,8 @@ dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) int dsl_scan_cancel(dsl_pool_t *dp) { - boolean_t complete = B_FALSE; - int err; - - err = dsl_sync_task_do(dp, dsl_scan_cancel_check, - dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); - return (err); + return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, + dsl_scan_cancel_sync, NULL, 3)); } static void dsl_scan_visitbp(blkptr_t *bp, @@ -378,7 +373,7 @@ dsl_scan_ds_maxtxg(dsl_dataset_t *ds) static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) { - VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); @@ -950,33 +945,33 @@ struct enqueue_clones_arg { /* ARGSUSED */ static int -enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { struct enqueue_clones_arg *eca = arg; dsl_dataset_t *ds; int err; - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj) + return (0); + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); - if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { - while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); + while 
(ds->ds_phys->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - ds = prev; - } - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, - ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; } + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } @@ -1065,17 +1060,17 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) } if (usenext) { - VERIFY(zap_join_key(dp->dp_meta_objset, + VERIFY0(zap_join_key(dp->dp_meta_objset, ds->ds_phys->ds_next_clones_obj, scn->scn_phys.scn_queue_obj, - ds->ds_phys->ds_creation_txg, tx) == 0); + ds->ds_phys->ds_creation_txg, tx)); } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; - (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, - NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); } } @@ -1085,15 +1080,14 @@ out: /* ARGSUSED */ static int -enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); @@ -1248,8 +1242,8 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) return; if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, - NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + enqueue_cb, tx, 
DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); @@ -1384,7 +1378,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", func, tx->tx_txg); - dsl_scan_setup_sync(scn, &func, tx); + dsl_scan_setup_sync(&func, tx); } if (!dsl_scan_active(scn) || @@ -1418,21 +1412,21 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); - VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); - if (err != 0) - return; - - /* disable async destroy feature */ - spa_feature_decr(spa, - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); - ASSERT(!spa_feature_is_active(spa, - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); - VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_BPTREE_OBJ, tx)); - VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, - dp->dp_bptree_obj, tx)); - dp->dp_bptree_obj = 0; + VERIFY0(zio_wait(scn->scn_zio_root)); + + if (err == 0) { + zfeature_info_t *feat = &spa_feature_table + [SPA_FEATURE_ASYNC_DESTROY]; + /* finished; deactivate async destroy feature */ + spa_feature_decr(spa, feat, tx); + ASSERT(!spa_feature_is_active(spa, feat)); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY0(bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " @@ -1479,7 +1473,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + dsl_pool_config_enter(dp, FTAG); dsl_scan_visit(scn, tx); + dsl_pool_config_exit(dp, FTAG); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; @@ -1714,6 +1710,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); - 
return (dsl_sync_task_do(dp, dsl_scan_setup_check, - dsl_scan_setup_sync, dp->dp_scan, &func, 0)); + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, + dsl_scan_setup_sync, &func, 0)); } diff --git a/uts/common/fs/zfs/dsl_synctask.c b/uts/common/fs/zfs/dsl_synctask.c index e248128..ecb45fb 100644 --- a/uts/common/fs/zfs/dsl_synctask.c +++ b/uts/common/fs/zfs/dsl_synctask.c @@ -34,136 +34,115 @@ /* ARGSUSED */ static int -dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_null_checkfunc(void *arg, dmu_tx_t *tx) { return (0); } -dsl_sync_task_group_t * -dsl_sync_task_group_create(dsl_pool_t *dp) -{ - dsl_sync_task_group_t *dstg; - - dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); - list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), - offsetof(dsl_sync_task_t, dst_node)); - dstg->dstg_pool = dp; - - return (dstg); -} - -void -dsl_sync_task_create(dsl_sync_task_group_t *dstg, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified) -{ - dsl_sync_task_t *dst; - - if (checkfunc == NULL) - checkfunc = dsl_null_checkfunc; - dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); - dst->dst_checkfunc = checkfunc; - dst->dst_syncfunc = syncfunc; - dst->dst_arg1 = arg1; - dst->dst_arg2 = arg2; - list_insert_tail(&dstg->dstg_tasks, dst); - - dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; -} - +/* + * Called from open context to perform a callback in syncing context. Waits + * for the operation to complete. + * + * The checkfunc will be called from open context as a preliminary check + * which can quickly fail. If it succeeds, it will be called again from + * syncing context. The checkfunc should generally be designed to work + * properly in either context, but if necessary it can check + * dmu_tx_is_syncing(tx). 
+ * + * The synctask infrastructure enforces proper locking strategy with respect + * to the dp_config_rwlock -- the lock will always be held when the callbacks + * are called. It will be held for read during the open-context (preliminary) + * call to the checkfunc, and then held for write from syncing context during + * the calls to the check and sync funcs. + * + * A dataset or pool name can be passed as the first argument. Typically, + * the check func will hold, check the return value of the hold, and then + * release the dataset. The sync func will VERIFY0(hold()) the dataset. + * This is safe because no changes can be made between the check and sync funcs, + * and the sync func will only be called if the check func successfully opened + * the dataset. + */ int -dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) +dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified) { + spa_t *spa; dmu_tx_t *tx; - uint64_t txg; - dsl_sync_task_t *dst; - -top: - tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - - txg = dmu_tx_get_txg(tx); + int err; + dsl_sync_task_t dst = { 0 }; + dsl_pool_t *dp; - /* Do a preliminary error check. */ - dstg->dstg_err = 0; -#ifdef ZFS_DEBUG - /* - * Only check half the time, otherwise, the sync-context - * check will almost never fail. 
- */ - if (spa_get_random(2) == 0) - goto skip; -#endif - rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_err = - dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); - if (dst->dst_err) - dstg->dstg_err = dst->dst_err; - } - rw_exit(&dstg->dstg_pool->dp_config_rwlock); + err = spa_open(pool, &spa, FTAG); + if (err != 0) + return (err); + dp = spa_get_dsl(spa); - if (dstg->dstg_err) { +top: + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + dst.dst_pool = dp; + dst.dst_txg = dmu_tx_get_txg(tx); + dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; + dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc; + dst.dst_syncfunc = syncfunc; + dst.dst_arg = arg; + dst.dst_error = 0; + dst.dst_nowaiter = B_FALSE; + + dsl_pool_config_enter(dp, FTAG); + err = dst.dst_checkfunc(arg, tx); + dsl_pool_config_exit(dp, FTAG); + + if (err != 0) { dmu_tx_commit(tx); - return (dstg->dstg_err); + spa_close(spa, FTAG); + return (err); } -skip: - /* - * We don't generally have many sync tasks, so pay the price of - * add_tail to get the tasks executed in the right order. 
- */ - VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, - dstg, txg)); + VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg)); dmu_tx_commit(tx); - txg_wait_synced(dstg->dstg_pool, txg); + txg_wait_synced(dp, dst.dst_txg); - if (dstg->dstg_err == EAGAIN) { - txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); + if (dst.dst_error == EAGAIN) { + txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); goto top; } - return (dstg->dstg_err); + spa_close(spa, FTAG); + return (dst.dst_error); } void -dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, dmu_tx_t *tx) { - uint64_t txg; + dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); - dstg->dstg_nowaiter = B_TRUE; - txg = dmu_tx_get_txg(tx); - /* - * We don't generally have many sync tasks, so pay the price of - * add_tail to get the tasks executed in the right order. - */ - VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, - dstg, txg)); -} - -void -dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) -{ - dsl_sync_task_t *dst; + dst->dst_pool = dp; + dst->dst_txg = dmu_tx_get_txg(tx); + dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; + dst->dst_checkfunc = dsl_null_checkfunc; + dst->dst_syncfunc = syncfunc; + dst->dst_arg = arg; + dst->dst_error = 0; + dst->dst_nowaiter = B_TRUE; - while (dst = list_head(&dstg->dstg_tasks)) { - list_remove(&dstg->dstg_tasks, dst); - kmem_free(dst, sizeof (dsl_sync_task_t)); - } - kmem_free(dstg, sizeof (dsl_sync_task_group_t)); + VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg)); } +/* + * Called in syncing context to execute the synctask. 
+ */ void -dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) { - dsl_sync_task_t *dst; - dsl_pool_t *dp = dstg->dstg_pool; + dsl_pool_t *dp = dst->dst_pool; uint64_t quota, used; - ASSERT0(dstg->dstg_err); + ASSERT0(dst->dst_error); /* * Check for sufficient space. We just check against what's @@ -175,63 +154,21 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); used = dp->dp_root_dir->dd_phys->dd_used_bytes; /* MOS space is triple-dittoed, so we multiply by 3. */ - if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) { - dstg->dstg_err = ENOSPC; + if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) { + dst->dst_error = ENOSPC; + if (dst->dst_nowaiter) + kmem_free(dst, sizeof (*dst)); return; } /* - * Check for errors by calling checkfuncs. + * Check for errors by calling checkfunc. */ - rw_enter(&dp->dp_config_rwlock, RW_WRITER); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_err = - dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); - if (dst->dst_err) - dstg->dstg_err = dst->dst_err; - } - - if (dstg->dstg_err == 0) { - /* - * Execute sync tasks. 
- */ - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); - } - } - rw_exit(&dp->dp_config_rwlock); - - if (dstg->dstg_nowaiter) - dsl_sync_task_group_destroy(dstg); -} - -int -dsl_sync_task_do(dsl_pool_t *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified) -{ - dsl_sync_task_group_t *dstg; - int err; - - ASSERT(spa_writeable(dp->dp_spa)); - - dstg = dsl_sync_task_group_create(dp); - dsl_sync_task_create(dstg, checkfunc, syncfunc, - arg1, arg2, blocks_modified); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); - return (err); -} - -void -dsl_sync_task_do_nowait(dsl_pool_t *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) -{ - dsl_sync_task_group_t *dstg = dsl_sync_task_group_create(dp); - dsl_sync_task_create(dstg, checkfunc, syncfunc, - arg1, arg2, blocks_modified); - dsl_sync_task_group_nowait(dstg, tx); + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); + if (dst->dst_error == 0) + dst->dst_syncfunc(dst->dst_arg, tx); + rrw_exit(&dp->dp_config_rwlock, FTAG); + if (dst->dst_nowaiter) + kmem_free(dst, sizeof (*dst)); } diff --git a/uts/common/fs/zfs/dsl_userhold.c b/uts/common/fs/zfs/dsl_userhold.c new file mode 100644 index 0000000..e30169e --- /dev/null +++ b/uts/common/fs/zfs/dsl_userhold.c @@ -0,0 +1,536 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct dsl_dataset_user_hold_arg { + nvlist_t *dduha_holds; + nvlist_t *dduha_errlist; + minor_t dduha_minor; +} dsl_dataset_user_hold_arg_t; + +/* + * If you add new checks here, you may need to add additional checks to the + * "temporary" case in snapshot_check() in dmu_objset.c. + */ +int +dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, + boolean_t temphold, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + int error = 0; + + if (strlen(htag) > MAXNAMELEN) + return (E2BIG); + /* Tempholds have a more restricted length */ + if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (E2BIG); + + /* tags must be unique (if ds already exists) */ + if (ds != NULL) { + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj != 0) { + uint64_t value; + error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, + htag, 8, 1, &value); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; + } + mutex_exit(&ds->ds_lock); + } + + return (error); +} + +static int +dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int rv = 0; + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + for (pair = 
nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + int error = 0; + dsl_dataset_t *ds; + char *htag; + + /* must be a snapshot */ + if (strchr(nvpair_name(pair), '@') == NULL) + error = EINVAL; + + if (error == 0) + error = nvpair_value_string(pair, &htag); + if (error == 0) { + error = dsl_dataset_hold(dp, + nvpair_name(pair), FTAG, &ds); + } + if (error == 0) { + error = dsl_dataset_user_hold_check_one(ds, htag, + dduha->dduha_minor != 0, tx); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + rv = error; + fnvlist_add_int32(dduha->dduha_errlist, + nvpair_name(pair), error); + } + } + return (rv); +} + +void +dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, + minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = ds->ds_phys->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = ds->ds_phys->ds_userrefs_obj; + } + ds->ds_userrefs++; + mutex_exit(&ds->ds_lock); + + VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + if (minor != 0) { + VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, + htag, now, tx)); + dsl_register_onexit_hold_cleanup(ds, htag, minor); + } + + spa_history_log_internal_ds(ds, "hold", tx, + "tag=%s temp=%d refs=%llu", + htag, minor != 0, ds->ds_userrefs); +} + +static void +dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + uint64_t now = gethrestime_sec(); + + for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + dsl_dataset_t *ds; + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + dsl_dataset_user_hold_sync_one(ds, fnvpair_value_string(pair), + dduha->dduha_minor, now, tx); + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * holds is nvl of snapname -> holdname + * errlist will be filled in with snapname -> error + * if cleanup_minor is not 0, the holds will be temporary, cleaned up + * when the process exits. + * + * if any fails, all will fail. 
+ */ +int +dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) +{ + dsl_dataset_user_hold_arg_t dduha; + nvpair_t *pair; + + pair = nvlist_next_nvpair(holds, NULL); + if (pair == NULL) + return (0); + + dduha.dduha_holds = holds; + dduha.dduha_errlist = errlist; + dduha.dduha_minor = cleanup_minor; + + return (dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds))); +} + +typedef struct dsl_dataset_user_release_arg { + nvlist_t *ddura_holds; + nvlist_t *ddura_todelete; + nvlist_t *ddura_errlist; +} dsl_dataset_user_release_arg_t; + +static int +dsl_dataset_user_release_check_one(dsl_dataset_t *ds, + nvlist_t *holds, boolean_t *todelete) +{ + uint64_t zapobj; + nvpair_t *pair; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int error; + int numholds = 0; + + *todelete = B_FALSE; + + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + zapobj = ds->ds_phys->ds_userrefs_obj; + if (zapobj == 0) + return (ESRCH); + + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + /* Make sure the hold exists */ + uint64_t tmp; + error = zap_lookup(mos, zapobj, nvpair_name(pair), 8, 1, &tmp); + if (error == ENOENT) + error = ESRCH; + if (error != 0) + return (error); + numholds++; + } + + if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 && + ds->ds_userrefs == numholds) { + /* we need to destroy the snapshot as well */ + + if (dsl_dataset_long_held(ds)) + return (EBUSY); + *todelete = B_TRUE; + } + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int rv = 0; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { + const char *name = 
nvpair_name(pair); + int error; + dsl_dataset_t *ds; + nvlist_t *holds; + + error = nvpair_value_nvlist(pair, &holds); + if (error != 0) + return (EINVAL); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error == 0) { + boolean_t deleteme; + error = dsl_dataset_user_release_check_one(ds, + holds, &deleteme); + if (error == 0 && deleteme) { + fnvlist_add_boolean(ddura->ddura_todelete, + name); + } + dsl_dataset_rele(ds, FTAG); + } + if (error != 0) { + if (ddura->ddura_errlist != NULL) { + fnvlist_add_int32(ddura->ddura_errlist, + name, error); + } + rv = error; + } + } + return (rv); +} + +static void +dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + int error; + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + ds->ds_userrefs--; + error = dsl_pool_user_release(dp, ds->ds_object, + nvpair_name(pair), tx); + VERIFY(error == 0 || error == ENOENT); + zapobj = ds->ds_phys->ds_userrefs_obj; + VERIFY0(zap_remove(mos, zapobj, nvpair_name(pair), tx)); + + spa_history_log_internal_ds(ds, "release", tx, + "tag=%s refs=%lld", nvpair_name(pair), + (longlong_t)ds->ds_userrefs); + } +} + +static void +dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + dsl_dataset_user_release_sync_one(ds, + fnvpair_value_nvlist(pair), tx); + if (nvlist_exists(ddura->ddura_todelete, + nvpair_name(pair))) { + ASSERT(ds->ds_userrefs == 0 && + ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)); + dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); 
+ } + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * holds is nvl of snapname -> { holdname, ... } + * errlist will be filled in with snapname -> error + * + * if any fails, all will fail. + */ +int +dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) +{ + dsl_dataset_user_release_arg_t ddura; + nvpair_t *pair; + int error; + + pair = nvlist_next_nvpair(holds, NULL); + if (pair == NULL) + return (0); + + ddura.ddura_holds = holds; + ddura.ddura_errlist = errlist; + ddura.ddura_todelete = fnvlist_alloc(); + + error = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, &ddura, fnvlist_num_pairs(holds)); + fnvlist_free(ddura.ddura_todelete); + return (error); +} + +typedef struct dsl_dataset_user_release_tmp_arg { + uint64_t ddurta_dsobj; + nvlist_t *ddurta_holds; + boolean_t ddurta_deleteme; +} dsl_dataset_user_release_tmp_arg_t; + +static int +dsl_dataset_user_release_tmp_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_tmp_arg_t *ddurta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + error = dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds); + if (error) + return (error); + + error = dsl_dataset_user_release_check_one(ds, + ddurta->ddurta_holds, &ddurta->ddurta_deleteme); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dataset_user_release_tmp_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_tmp_arg_t *ddurta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds)); + dsl_dataset_user_release_sync_one(ds, ddurta->ddurta_holds, tx); + if (ddurta->ddurta_deleteme) { + ASSERT(ds->ds_userrefs == 0 && + ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)); + dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); + } + dsl_dataset_rele(ds, FTAG); +} + +/* + * Called at spa_load time to release a stale 
temporary user hold. + * Also called by the onexit code. + */ +void +dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, const char *htag) +{ + dsl_dataset_user_release_tmp_arg_t ddurta; + dsl_dataset_t *ds; + int error; + +#ifdef _KERNEL + /* Make sure it is not mounted. */ + dsl_pool_config_enter(dp, FTAG); + error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (error == 0) { + char name[MAXNAMELEN]; + dsl_dataset_name(ds, name); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + zfs_unmount_snap(name); + } else { + dsl_pool_config_exit(dp, FTAG); + } +#endif + + ddurta.ddurta_dsobj = dsobj; + ddurta.ddurta_holds = fnvlist_alloc(); + fnvlist_add_boolean(ddurta.ddurta_holds, htag); + + (void) dsl_sync_task(spa_name(dp->dp_spa), + dsl_dataset_user_release_tmp_check, + dsl_dataset_user_release_tmp_sync, &ddurta, 1); + fnvlist_free(ddurta.ddurta_holds); +} + +typedef struct zfs_hold_cleanup_arg { + char zhca_spaname[MAXNAMELEN]; + uint64_t zhca_spa_load_guid; + uint64_t zhca_dsobj; + char zhca_htag[MAXNAMELEN]; +} zfs_hold_cleanup_arg_t; + +/* + * zfs_onexit callback registered by dsl_register_onexit_hold_cleanup(). + * Releases the temporary hold described by arg (a zfs_hold_cleanup_arg_t) + * if the named pool can still be opened and its load guid still matches. + * + * The callback owns arg: it was kmem_alloc()ed at registration time and + * nothing else frees it, so it must be released on every return path, + * including the early returns taken when the hold cannot be released. + */ +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + spa_t *spa; + int error; + + error = spa_open(ca->zhca_spaname, &spa, FTAG); + if (error != 0) { + zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s " + "because pool is no longer loaded", + ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag); + /* nothing to release, but the argument is still ours to free */ + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + return; + } + if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { + zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s " + "because pool is no longer loaded (guid doesn't match)", + ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag); + spa_close(spa, FTAG); + /* nothing to release, but the argument is still ours to free */ + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + return; + } + + dsl_dataset_user_release_tmp(spa_get_dsl(spa), + ca->zhca_dsobj, ca->zhca_htag); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + spa_close(spa, FTAG); +} + +void +dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, + minor_t minor) +{ + zfs_hold_cleanup_arg_t
*ca = kmem_alloc(sizeof (*ca), KM_SLEEP); + spa_t *spa = dsl_dataset_get_spa(ds); + (void) strlcpy(ca->zhca_spaname, spa_name(spa), + sizeof (ca->zhca_spaname)); + ca->zhca_spa_load_guid = spa_load_guid(spa); + ca->zhca_dsobj = ds->ds_object; + (void) strlcpy(ca->zhca_htag, htag, sizeof (ca->zhca_htag)); + VERIFY0(zfs_onexit_add_cb(minor, + dsl_dataset_user_release_onexit, ca, NULL)); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (ds->ds_phys->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + fnvlist_add_uint64(nvl, za->za_name, + za->za_first_integer); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (0); +} diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c index bf9889e..aae2ccd 100644 --- a/uts/common/fs/zfs/metaslab.c +++ b/uts/common/fs/zfs/metaslab.c @@ -1866,3 +1866,41 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } + +static void +checkmap(space_map_t *sm, uint64_t off, uint64_t size) +{ + space_seg_t *ss; + avl_index_t where; + + mutex_enter(sm->sm_lock); + ss = space_map_find(sm, off, size, &where); + if (ss != NULL) + panic("freeing free block; ss=%p", (void *)ss); + mutex_exit(sm->sm_lock); +} + +void +metaslab_check_free(spa_t *spa, const blkptr_t *bp) +{ + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int i = 0; i < BP_GET_NDVAS(bp); 
i++) { + uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]); + vdev_t *vd = vdev_lookup_top(spa, vdid); + uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]); + uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); + metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift]; + + if (ms->ms_map->sm_loaded) + checkmap(ms->ms_map, off, size); + + for (int j = 0; j < TXG_SIZE; j++) + checkmap(ms->ms_freemap[j], off, size); + for (int j = 0; j < TXG_DEFER_SIZE; j++) + checkmap(ms->ms_defermap[j], off, size); + } + spa_config_exit(spa, SCL_VDEV, FTAG); +} diff --git a/uts/common/fs/zfs/refcount.c b/uts/common/fs/zfs/refcount.c index 3a8e144..df0f256 100644 --- a/uts/common/fs/zfs/refcount.c +++ b/uts/common/fs/zfs/refcount.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include @@ -32,7 +33,7 @@ int reference_tracking_enable = FALSE; /* runs out of memory too easily */ #else int reference_tracking_enable = TRUE; #endif -int reference_history = 4; /* tunable */ +int reference_history = 3; /* tunable */ static kmem_cache_t *reference_cache; static kmem_cache_t *reference_history_cache; @@ -64,6 +65,14 @@ refcount_create(refcount_t *rc) offsetof(reference_t, ref_link)); rc->rc_count = 0; rc->rc_removed_count = 0; + rc->rc_tracked = reference_tracking_enable; +} + +void +refcount_create_untracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_FALSE; } void @@ -96,14 +105,12 @@ refcount_destroy(refcount_t *rc) int refcount_is_zero(refcount_t *rc) { - ASSERT(rc->rc_count >= 0); return (rc->rc_count == 0); } int64_t refcount_count(refcount_t *rc) { - ASSERT(rc->rc_count >= 0); return (rc->rc_count); } @@ -113,14 +120,14 @@ refcount_add_many(refcount_t *rc, uint64_t number, void *holder) reference_t *ref = NULL; int64_t count; - if (reference_tracking_enable) { + if (rc->rc_tracked) { ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; 
ref->ref_number = number; } mutex_enter(&rc->rc_mtx); ASSERT(rc->rc_count >= 0); - if (reference_tracking_enable) + if (rc->rc_tracked) list_insert_head(&rc->rc_list, ref); rc->rc_count += number; count = rc->rc_count; @@ -144,7 +151,7 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) mutex_enter(&rc->rc_mtx); ASSERT(rc->rc_count >= number); - if (!reference_tracking_enable) { + if (!rc->rc_tracked) { rc->rc_count -= number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); @@ -161,7 +168,7 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) KM_SLEEP); list_insert_head(&rc->rc_removed, ref); rc->rc_removed_count++; - if (rc->rc_removed_count >= reference_history) { + if (rc->rc_removed_count > reference_history) { ref = list_tail(&rc->rc_removed); list_remove(&rc->rc_removed, ref); kmem_cache_free(reference_history_cache, diff --git a/uts/common/fs/zfs/rrwlock.c b/uts/common/fs/zfs/rrwlock.c index 7f9290b..8e80166 100644 --- a/uts/common/fs/zfs/rrwlock.c +++ b/uts/common/fs/zfs/rrwlock.c @@ -75,8 +75,9 @@ uint_t rrw_tsd_key; typedef struct rrw_node { - struct rrw_node *rn_next; - rrwlock_t *rn_rrl; + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; + void *rn_tag; } rrw_node_t; static rrw_node_t * @@ -98,13 +99,14 @@ rrn_find(rrwlock_t *rrl) * Add a node to the head of the singly linked list. */ static void -rrn_add(rrwlock_t *rrl) +rrn_add(rrwlock_t *rrl, void *tag) { rrw_node_t *rn; rn = kmem_alloc(sizeof (*rn), KM_SLEEP); rn->rn_rrl = rrl; rn->rn_next = tsd_get(rrw_tsd_key); + rn->rn_tag = tag; VERIFY(tsd_set(rrw_tsd_key, rn) == 0); } @@ -113,7 +115,7 @@ rrn_add(rrwlock_t *rrl) * thread's list and return TRUE; otherwise return FALSE. 
*/ static boolean_t -rrn_find_and_remove(rrwlock_t *rrl) +rrn_find_and_remove(rrwlock_t *rrl, void *tag) { rrw_node_t *rn; rrw_node_t *prev = NULL; @@ -122,7 +124,7 @@ rrn_find_and_remove(rrwlock_t *rrl) return (B_FALSE); for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl) { + if (rn->rn_rrl == rrl && rn->rn_tag == tag) { if (prev) prev->rn_next = rn->rn_next; else @@ -136,7 +138,7 @@ rrn_find_and_remove(rrwlock_t *rrl) } void -rrw_init(rrwlock_t *rrl) +rrw_init(rrwlock_t *rrl, boolean_t track_all) { mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); @@ -144,6 +146,7 @@ rrw_init(rrwlock_t *rrl) refcount_create(&rrl->rr_anon_rcount); refcount_create(&rrl->rr_linked_rcount); rrl->rr_writer_wanted = B_FALSE; + rrl->rr_track_all = track_all; } void @@ -156,12 +159,13 @@ rrw_destroy(rrwlock_t *rrl) refcount_destroy(&rrl->rr_linked_rcount); } -static void +void rrw_enter_read(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); #if !defined(DEBUG) && defined(_KERNEL) - if (!rrl->rr_writer && !rrl->rr_writer_wanted) { + if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && + !rrl->rr_track_all) { rrl->rr_anon_rcount.rc_count++; mutex_exit(&rrl->rr_lock); return; @@ -171,14 +175,14 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) ASSERT(rrl->rr_writer != curthread); ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); - while (rrl->rr_writer || (rrl->rr_writer_wanted && + while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && refcount_is_zero(&rrl->rr_anon_rcount) && rrn_find(rrl) == NULL)) cv_wait(&rrl->rr_cv, &rrl->rr_lock); - if (rrl->rr_writer_wanted) { + if (rrl->rr_writer_wanted || rrl->rr_track_all) { /* may or may not be a re-entrant enter */ - rrn_add(rrl); + rrn_add(rrl, tag); (void) refcount_add(&rrl->rr_linked_rcount, tag); } else { (void) refcount_add(&rrl->rr_anon_rcount, tag); @@ -187,7 +191,7 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) mutex_exit(&rrl->rr_lock); } 
-static void +void rrw_enter_write(rrwlock_t *rrl) { mutex_enter(&rrl->rr_lock); @@ -233,10 +237,12 @@ rrw_exit(rrwlock_t *rrl, void *tag) if (rrl->rr_writer == NULL) { int64_t count; - if (rrn_find_and_remove(rrl)) + if (rrn_find_and_remove(rrl, tag)) { count = refcount_remove(&rrl->rr_linked_rcount, tag); - else + } else { + ASSERT(!rrl->rr_track_all); count = refcount_remove(&rrl->rr_anon_rcount, tag); + } if (count == 0) cv_broadcast(&rrl->rr_cv); } else { @@ -249,6 +255,11 @@ rrw_exit(rrwlock_t *rrl, void *tag) mutex_exit(&rrl->rr_lock); } +/* + * If the lock was created with track_all, rrw_held(RW_READER) will return + * B_TRUE iff the current thread has the lock for reader. Otherwise it may + * return B_TRUE if any thread has the lock for reader. + */ boolean_t rrw_held(rrwlock_t *rrl, krw_t rw) { @@ -259,7 +270,7 @@ rrw_held(rrwlock_t *rrl, krw_t rw) held = (rrl->rr_writer == curthread); } else { held = (!refcount_is_zero(&rrl->rr_anon_rcount) || - !refcount_is_zero(&rrl->rr_linked_rcount)); + rrn_find(rrl) != NULL); } mutex_exit(&rrl->rr_lock); diff --git a/uts/common/fs/zfs/sa.c b/uts/common/fs/zfs/sa.c index cd3a58b..05f329c 100644 --- a/uts/common/fs/zfs/sa.c +++ b/uts/common/fs/zfs/sa.c @@ -1001,10 +1001,10 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, sa_attr_type_t *tb; int error; - mutex_enter(&os->os_lock); + mutex_enter(&os->os_user_ptr_lock); if (os->os_sa) { mutex_enter(&os->os_sa->sa_lock); - mutex_exit(&os->os_lock); + mutex_exit(&os->os_user_ptr_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); *user_table = tb; @@ -1017,7 +1017,7 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, os->os_sa = sa; mutex_enter(&sa->sa_lock); - mutex_exit(&os->os_lock); + mutex_exit(&os->os_user_ptr_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, diff 
--git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c index fdc2870..544a040 100644 --- a/uts/common/fs/zfs/spa.c +++ b/uts/common/fs/zfs/spa.c @@ -63,6 +63,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -129,10 +130,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; -static dsl_syncfunc_t spa_sync_version; -static dsl_syncfunc_t spa_sync_props; -static dsl_checkfunc_t spa_change_guid_check; -static dsl_syncfunc_t spa_change_guid_sync; +static void spa_sync_version(void *arg, dmu_tx_t *tx); +static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, @@ -325,10 +324,10 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); + dsl_pool_config_enter(dp, FTAG); if (err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds)) { - rw_exit(&dp->dp_config_rwlock); + dsl_pool_config_exit(dp, FTAG); break; } @@ -337,7 +336,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); - rw_exit(&dp->dp_config_rwlock); + dsl_pool_config_exit(dp, FTAG); } else { strval = NULL; intval = za.za_first_integer; @@ -491,9 +490,10 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) if (dmu_objset_type(os) != DMU_OST_ZFS) { error = ENOTSUP; - } else if ((error = dsl_prop_get_integer(strval, + } else if ((error = + dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_COMPRESSION), - &compress, NULL)) == 0 && + &compress)) == 0 && !BOOTFS_COMPRESS_VALID(compress)) { error = ENOTSUP; } else { @@ -660,8 +660,8 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) * read object, the features for write object, or the * feature descriptions object. 
*/ - error = dsl_sync_task_do(spa_get_dsl(spa), NULL, - spa_sync_version, spa, &ver, 6); + error = dsl_sync_task(spa->spa_name, NULL, + spa_sync_version, &ver, 6); if (error) return (error); continue; @@ -672,8 +672,8 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) } if (need_sync) { - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 6)); + return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, + nvp, 6)); } return (0); @@ -695,10 +695,10 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) /*ARGSUSED*/ static int -spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) +spa_change_guid_check(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - uint64_t *newguid = arg2; + uint64_t *newguid = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; @@ -715,10 +715,10 @@ spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) +spa_change_guid_sync(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - uint64_t *newguid = arg2; + uint64_t *newguid = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; @@ -752,8 +752,8 @@ spa_change_guid(spa_t *spa) mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); - error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, - spa_change_guid_sync, spa, &guid, 5); + error = dsl_sync_task(spa->spa_name, spa_change_guid_check, + spa_change_guid_sync, &guid, 5); if (error == 0) { spa_config_sync(spa, B_FALSE, B_TRUE); @@ -1687,21 +1687,22 @@ spa_config_valid(spa_t *spa, nvlist_t *config) /* * Check for missing log devices */ -static int +static boolean_t spa_check_logs(spa_t *spa) { + boolean_t rv = B_FALSE; + switch (spa->spa_log_state) { case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: - if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, - DS_FIND_CHILDREN)) 
{ + rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, + NULL, DS_FIND_CHILDREN) != 0); + if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); - return (1); - } break; } - return (0); + return (rv); } static boolean_t @@ -1747,11 +1748,11 @@ spa_activate_log(spa_t *spa) int spa_offline_log(spa_t *spa) { - int error = 0; - - if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN)) == 0) { + int error; + error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN); + if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed @@ -3549,7 +3550,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(spa, props, tx); + spa_sync_props(props, tx); } dmu_tx_commit(tx); @@ -5813,10 +5814,11 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) } static void -spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) +spa_sync_version(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - uint64_t version = *(uint64_t *)arg2; + uint64_t *versionp = arg; + uint64_t version = *versionp; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. @@ -5835,11 +5837,11 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) * Set zpool properties. 
*/ static void -spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) +spa_sync_props(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; + nvlist_t *nvp = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; - nvlist_t *nvp = arg2; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); @@ -5990,6 +5992,8 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) ASSERT(spa->spa_sync_pass == 1); + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); @@ -6015,6 +6019,7 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } + rrw_exit(&dp->dp_config_rwlock, FTAG); } /* diff --git a/uts/common/fs/zfs/spa_history.c b/uts/common/fs/zfs/spa_history.c index 9ae2873..eef642a 100644 --- a/uts/common/fs/zfs/spa_history.c +++ b/uts/common/fs/zfs/spa_history.c @@ -195,10 +195,10 @@ spa_history_zone(void) */ /*ARGSUSED*/ static void -spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) +spa_history_log_sync(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - nvlist_t *nvl = arg2; + nvlist_t *nvl = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; spa_history_phys_t *shpp; @@ -220,7 +220,7 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) * Get the offset of where we need to write via the bonus buffer. * Update the offset when the write completes. */ - VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); shpp = dbp->db_data; dmu_buf_will_dirty(dbp, tx); @@ -321,8 +321,8 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); /* Kick this off asynchronously; errors are ignored. 
*/ - dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, - spa_history_log_sync, spa, nvarg, 0, tx); + dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, + nvarg, 0, tx); dmu_tx_commit(tx); /* spa_history_log_sync will free nvl */ @@ -455,10 +455,10 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); if (dmu_tx_is_syncing(tx)) { - spa_history_log_sync(spa, nvl, tx); + spa_history_log_sync(nvl, tx); } else { - dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, - spa_history_log_sync, spa, nvl, 0, tx); + dsl_sync_task_nowait(spa_get_dsl(spa), + spa_history_log_sync, nvl, 0, tx); } /* spa_history_log_sync() will free nvl */ } @@ -530,15 +530,9 @@ spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, void spa_history_log_version(spa_t *spa, const char *operation) { -#ifdef _KERNEL - uint64_t current_vers = spa_version(spa); - spa_history_log_internal(spa, operation, NULL, "pool version %llu; software version %llu/%d; uts %s %s %s %s", - (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, + (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, utsname.nodename, utsname.release, utsname.version, utsname.machine); - cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", operation, - (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); -#endif } diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c index 405d93c..733d260 100644 --- a/uts/common/fs/zfs/spa_misc.c +++ b/uts/common/fs/zfs/spa_misc.c @@ -237,8 +237,8 @@ kmem_cache_t *spa_buffer_pool; int spa_mode_global; #ifdef ZFS_DEBUG -/* Everything except dprintf is on by default in debug builds */ -int zfs_flags = ~ZFS_DEBUG_DPRINTF; +/* Everything except dprintf and spa is on by default in debug builds */ +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA); #else int zfs_flags = 0; #endif @@ -282,7 +282,7 @@ spa_config_lock_init(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; 
mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); - refcount_create(&scl->scl_count); + refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; } @@ -335,6 +335,8 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { int wlocks_held = 0; + ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); + for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) @@ -413,27 +415,22 @@ spa_lookup(const char *name) static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; - char c; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); + /* * If it's a full dataset name, figure out the pool name and * just use that. */ - cp = strpbrk(name, "/@"); - if (cp) { - c = *cp; + cp = strpbrk(search.spa_name, "/@"); + if (cp != NULL) *cp = '\0'; - } - (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); spa = avl_find(&spa_namespace_avl, &search, &where); - if (cp) - *cp = c; - return (spa); } @@ -567,6 +564,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) kstat_install(spa->spa_iokstat); } + spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); + return (spa); } diff --git a/uts/common/fs/zfs/space_map.c b/uts/common/fs/zfs/space_map.c index 30a35c8..fb30b34 100644 --- a/uts/common/fs/zfs/space_map.c +++ b/uts/common/fs/zfs/space_map.c @@ -102,7 +102,7 @@ void space_map_add(space_map_t *sm, uint64_t start, uint64_t size) { avl_index_t where; - space_seg_t ssearch, *ss_before, *ss_after, *ss; + space_seg_t *ss_before, *ss_after, *ss; uint64_t end = start + size; int merge_before, merge_after; @@ -115,11 +115,8 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - ssearch.ss_start 
= start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); - - if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { + ss = space_map_find(sm, start, size, &where); + if (ss != NULL) { zfs_panic_recover("zfs: allocating allocated segment" "(offset=%llu size=%llu)\n", (longlong_t)start, (longlong_t)size); @@ -171,19 +168,12 @@ void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) { avl_index_t where; - space_seg_t ssearch, *ss, *newseg; + space_seg_t *ss, *newseg; uint64_t end = start + size; int left_over, right_over; - ASSERT(MUTEX_HELD(sm->sm_lock)); VERIFY(!sm->sm_condensing); - VERIFY(size != 0); - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); + ss = space_map_find(sm, start, size, &where); /* Make sure we completely overlap with someone */ if (ss == NULL) { @@ -226,12 +216,11 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) sm->sm_space -= size; } -boolean_t -space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) +space_seg_t * +space_map_find(space_map_t *sm, uint64_t start, uint64_t size, + avl_index_t *wherep) { - avl_index_t where; space_seg_t ssearch, *ss; - uint64_t end = start + size; ASSERT(MUTEX_HELD(sm->sm_lock)); VERIFY(size != 0); @@ -239,10 +228,20 @@ space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); + ssearch.ss_end = start + size; + ss = avl_find(&sm->sm_root, &ssearch, wherep); + + if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size) + return (ss); + return (NULL); +} + +boolean_t +space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) +{ + avl_index_t where; - return (ss != NULL && ss->ss_start <= start && 
ss->ss_end >= end); + return (space_map_find(sm, start, size, &where) != 0); } void diff --git a/uts/common/fs/zfs/sys/arc.h b/uts/common/fs/zfs/sys/arc.h index 916d2ab..0e86290 100644 --- a/uts/common/fs/zfs/sys/arc.h +++ b/uts/common/fs/zfs/sys/arc.h @@ -89,7 +89,7 @@ arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); -int arc_buf_remove_ref(arc_buf_t *buf, void *tag); +boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); diff --git a/uts/common/fs/zfs/sys/dbuf.h b/uts/common/fs/zfs/sys/dbuf.h index 8591f15..a29f7b3 100644 --- a/uts/common/fs/zfs/sys/dbuf.h +++ b/uts/common/fs/zfs/sys/dbuf.h @@ -311,20 +311,17 @@ void dbuf_fini(void); boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); -#define DBUF_IS_METADATA(_db) \ - (dbuf_is_metadata(_db)) - #define DBUF_GET_BUFC_TYPE(_db) \ - (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) + (dbuf_is_metadata(_db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) #define DBUF_IS_CACHEABLE(_db) \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (DBUF_IS_METADATA(_db) && \ + (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) #define DBUF_IS_L2CACHEABLE(_db) \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (DBUF_IS_METADATA(_db) && \ + (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) #ifdef ZFS_DEBUG diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h index ef0a6a7..1366a99 100644 --- a/uts/common/fs/zfs/sys/dmu.h +++ b/uts/common/fs/zfs/sys/dmu.h @@ -217,6 +217,11 @@ typedef enum dmu_object_type { DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; +typedef enum txg_how { + TXG_WAIT = 1, + TXG_NOWAIT, +} txg_how_t; + void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); @@ -255,22 +260,19 @@ void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); -int dmu_objset_evict_dbufs(objset_t *os); +void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, - uint64_t flags); -int dmu_objset_destroy(const char *name, boolean_t defer); -int dmu_snapshots_destroy_nvl(struct nvlist *snaps, boolean_t defer, +int dmu_objset_clone(const char *name, const char *origin); +int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); -int dmu_objset_snapshot(struct nvlist *snaps, struct nvlist *, struct nvlist *); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_snapshot_tmp(const 
char *, const char *, int); -int dmu_objset_rename(const char *name, const char *newname, - boolean_t recursive); int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); +int dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ @@ -545,7 +547,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); @@ -788,36 +790,8 @@ typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); -int dmu_send(objset_t *tosnap, objset_t *fromsnap, - int outfd, struct vnode *vp, offset_t *off); -int dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep); - -typedef struct dmu_recv_cookie { - /* - * This structure is opaque! - * - * If logical and real are different, we are recving the stream - * into the "real" temporary clone, and then switching it with - * the "logical" target. 
- */ - struct dsl_dataset *drc_logical_ds; - struct dsl_dataset *drc_real_ds; - struct drr_begin *drc_drrb; - char *drc_tosnap; - char *drc_top_ds; - boolean_t drc_newfs; - boolean_t drc_force; - struct avl_tree *drc_guid_to_ds_map; -} dmu_recv_cookie_t; - -int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, - boolean_t force, objset_t *origin, dmu_recv_cookie_t *); -int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, - int cleanup_fd, uint64_t *action_handlep); -int dmu_recv_end(dmu_recv_cookie_t *drc); - -int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, - offset_t *off); +int dmu_diff(const char *tosnap_name, const char *fromsnap_name, + struct vnode *vp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ diff --git a/uts/common/fs/zfs/sys/dmu_objset.h b/uts/common/fs/zfs/sys/dmu_objset.h index 578b947..143e594 100644 --- a/uts/common/fs/zfs/sys/dmu_objset.h +++ b/uts/common/fs/zfs/sys/dmu_objset.h @@ -43,6 +43,7 @@ extern "C" { extern krwlock_t os_lock; +struct dsl_pool; struct dsl_dataset; struct dmu_tx; @@ -114,8 +115,6 @@ struct objset { /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; - - /* SA layout/attribute registration */ sa_os_t *os_sa; }; @@ -143,10 +142,11 @@ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dmu_objset_fsid_guid(objset_t *os); -int dmu_objset_find_spa(spa_t *spa, const char *name, - int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); +int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, + int func(struct dsl_pool *, struct dsl_dataset *, void *), + void *arg, int flags); int dmu_objset_prefetch(const char *name, void *arg); -int dmu_objset_evict_dbufs(objset_t *os); +void 
dmu_objset_evict_dbufs(objset_t *os); timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ @@ -162,6 +162,7 @@ void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); +int dmu_fsname(const char *snapname, char *buf); void dmu_objset_init(void); void dmu_objset_fini(void); diff --git a/uts/common/fs/zfs/sys/dmu_send.h b/uts/common/fs/zfs/sys/dmu_send.h new file mode 100644 index 0000000..ee0885a --- /dev/null +++ b/uts/common/fs/zfs/sys/dmu_send.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _DMU_SEND_H +#define _DMU_SEND_H + +#include +#include + +struct vnode; +struct dsl_dataset; +struct drr_begin; +struct avl_tree; + +int dmu_send(const char *tosnap, const char *fromsnap, int outfd, + struct vnode *vp, offset_t *off); +int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, + uint64_t *sizep); +int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + int outfd, struct vnode *vp, offset_t *off); + +typedef struct dmu_recv_cookie { + struct dsl_dataset *drc_ds; + struct drr_begin *drc_drrb; + const char *drc_tofs; + const char *drc_tosnap; + boolean_t drc_newfs; + boolean_t drc_byteswap; + boolean_t drc_force; + struct avl_tree *drc_guid_to_ds_map; + zio_cksum_t drc_cksum; + uint64_t drc_newsnapobj; +} dmu_recv_cookie_t; + +int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, + boolean_t force, char *origin, dmu_recv_cookie_t *drc); +int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, + int cleanup_fd, uint64_t *action_handlep); +int dmu_recv_end(dmu_recv_cookie_t *drc); + +#endif /* _DMU_SEND_H */ diff --git a/uts/common/fs/zfs/sys/dmu_tx.h b/uts/common/fs/zfs/sys/dmu_tx.h index c5ea50f..dbd2242 100644 --- a/uts/common/fs/zfs/sys/dmu_tx.h +++ b/uts/common/fs/zfs/sys/dmu_tx.h @@ -22,6 +22,9 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ #ifndef _SYS_DMU_TX_H #define _SYS_DMU_TX_H @@ -108,10 +111,11 @@ typedef struct dmu_tx_callback { * These routines are defined in dmu.h, and are called by the user. 
*/ dmu_tx_t *dmu_tx_create(objset_t *dd); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); +struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, diff --git a/uts/common/fs/zfs/sys/dsl_dataset.h b/uts/common/fs/zfs/sys/dsl_dataset.h index 272c3ec..6729f9f 100644 --- a/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/uts/common/fs/zfs/sys/dsl_dataset.h @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -48,10 +49,8 @@ struct dsl_pool; #define DS_IS_INCONSISTENT(ds) \ ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) /* - * NB: nopromote can not yet be set, but we want support for it in this - * on-disk version, so that we don't need to upgrade for it later. It - * will be needed when we implement 'zfs split' (where the split off - * clone should not be promoted). + * Note: nopromote can not yet be set, but we want support for it in this + * on-disk version, so that we don't need to upgrade for it later. 
*/ #define DS_FLAG_NOPROMOTE (1ULL<<1) @@ -76,6 +75,8 @@ struct dsl_pool; */ #define DS_FLAG_CI_DATASET (1ULL<<16) +#define DS_CREATE_FLAG_NODIRTY (1ULL<<24) + typedef struct dsl_dataset_phys { uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ @@ -125,9 +126,6 @@ typedef struct dsl_dataset { dsl_deadlist_t ds_deadlist; bplist_t ds_pending_deadlist; - /* to protect against multiple concurrent incremental recv */ - kmutex_t ds_recvlock; - /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; list_node_t ds_synced_link; @@ -139,13 +137,15 @@ typedef struct dsl_dataset { kmutex_t ds_lock; objset_t *ds_objset; uint64_t ds_userrefs; + void *ds_owner; /* - * ds_owner is protected by the ds_rwlock and the ds_lock + * Long holds prevent the ds from being destroyed; they allow the + * ds to remain held even after dropping the dp_config_rwlock. + * Owning counts as a long hold. See the comments above + * dsl_pool_hold() for details. */ - krwlock_t ds_rwlock; - kcondvar_t ds_exclusive_cv; - void *ds_owner; + refcount_t ds_longholds; /* no locking; only for making guesses */ uint64_t ds_trysnap_txg; @@ -163,76 +163,44 @@ typedef struct dsl_dataset { char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; -struct dsl_ds_destroyarg { - dsl_dataset_t *ds; /* ds to destroy */ - dsl_dataset_t *rm_origin; /* also remove our origin? */ - boolean_t is_origin_rm; /* set if removing origin snap */ - boolean_t defer; /* destroy -d requested? */ - boolean_t releasing; /* destroying due to release? */ - boolean_t need_prep; /* do we need to retry due to EBUSY? */ -}; - /* * The max length of a temporary tag prefix is the number of hex digits * required to express UINT64_MAX plus one for the hyphen. 
*/ #define MAX_TAG_PREFIX_LEN 17 -struct dsl_ds_holdarg { - dsl_sync_task_group_t *dstg; - const char *htag; - char *snapname; - boolean_t recursive; - boolean_t gotone; - boolean_t temphold; - char failed[MAXPATHLEN]; -}; - #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) #define DS_UNIQUE_IS_ACCURATE(ds) \ (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) -int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); -int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, - void *tag, dsl_dataset_t **); -int dsl_dataset_own(const char *name, boolean_t inconsistentok, +int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, + dsl_dataset_t **dsp); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +int dsl_dataset_own(struct dsl_pool *dp, const char *name, void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, - boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp); -void dsl_dataset_name(dsl_dataset_t *ds, char *name); -void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); + void *tag, dsl_dataset_t **dsp); void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); -void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); -boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, - void *tag); -void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); +void dsl_dataset_name(dsl_dataset_t *ds, char *name); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, minor_t minor); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx); -int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, 
boolean_t defer); -dsl_checkfunc_t dsl_dataset_destroy_check; -dsl_syncfunc_t dsl_dataset_destroy_sync; -dsl_syncfunc_t dsl_dataset_user_hold_sync; -int dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *, dmu_tx_t *tx); -void dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *, dmu_tx_t *tx); -int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); +int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); int dsl_dataset_promote(const char *name, char *conflsnap); int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); -int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, - boolean_t recursive, boolean_t temphold, int cleanup_fd); -int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, - boolean_t temphold); -int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, - boolean_t recursive); -int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, - char *htag, boolean_t retry); -int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); +int dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive); +int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, + minor_t cleanup_minor, const char *htag); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); @@ -269,13 +237,35 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv); -int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, +int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, uint64_t quota); -dsl_syncfunc_t dsl_dataset_set_quota_sync; -int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, +int 
dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, uint64_t reservation); -int dsl_destroy_inconsistent(const char *dsname, void *arg); +boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier); +void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); +void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); + +int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, boolean_t force); +void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, dmu_tx_t *tx); +int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx); +void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx); + +void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx); +void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); +int dsl_dataset_get_snapname(dsl_dataset_t *ds); +int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, + uint64_t *value); +int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx); +void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, + zprop_source_t source, uint64_t value, dmu_tx_t *tx); +int dsl_dataset_rollback(const char *fsname); #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ diff --git a/uts/common/fs/zfs/sys/dsl_destroy.h b/uts/common/fs/zfs/sys/dsl_destroy.h new file mode 100644 index 0000000..c5a70bb --- /dev/null +++ b/uts/common/fs/zfs/sys/dsl_destroy.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_DSL_DESTROY_H +#define _SYS_DSL_DESTROY_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct nvlist; +struct dsl_dataset; +struct dmu_tx; + +int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, + struct nvlist *errlist); +int dsl_destroy_snapshot(const char *name, boolean_t defer); +int dsl_destroy_head(const char *name); +int dsl_destroy_head_check_impl(struct dsl_dataset *ds, int expected_holds); +void dsl_destroy_head_sync_impl(struct dsl_dataset *ds, struct dmu_tx *tx); +int dsl_destroy_inconsistent(const char *dsname, void *arg); +void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *ds, + boolean_t defer, struct dmu_tx *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DESTROY_H */ diff --git a/uts/common/fs/zfs/sys/dsl_dir.h b/uts/common/fs/zfs/sys/dsl_dir.h index 2191635..641bcfc 100644 --- a/uts/common/fs/zfs/sys/dsl_dir.h +++ b/uts/common/fs/zfs/sys/dsl_dir.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DSL_DIR_H @@ -101,18 +102,15 @@ struct dsl_dir { char dd_myname[MAXNAMELEN]; }; -void dsl_dir_close(dsl_dir_t *dd, void *tag); -int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); -int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, - const char **tailp); -int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, +void dsl_dir_rele(dsl_dir_t *dd, void *tag); +int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, + dsl_dir_t **, const char **tail); +int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **); void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_namelen(dsl_dir_t *dd); uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx); -dsl_checkfunc_t dsl_dir_destroy_check; -dsl_syncfunc_t dsl_dir_destroy_sync; void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly); @@ -131,14 +129,15 @@ int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation); -int dsl_dir_rename(dsl_dir_t *dd, const char *newname); +int dsl_dir_rename(const char *oldname, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); -int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); +void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, + dmu_tx_t *tx); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" diff --git a/uts/common/fs/zfs/sys/dsl_pool.h b/uts/common/fs/zfs/sys/dsl_pool.h index 
ab1229a..b0160ed 100644 --- a/uts/common/fs/zfs/sys/dsl_pool.h +++ b/uts/common/fs/zfs/sys/dsl_pool.h @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -113,7 +114,7 @@ typedef struct dsl_pool { * syncing context does not need to ever have it for read, since * nobody else could possibly have it for write. */ - krwlock_t dp_config_rwlock; + rrwlock_t dp_config_rwlock; zfs_all_blkstats_t *dp_blkstats; } dsl_pool_t; @@ -139,15 +140,20 @@ void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_mos_diduse_space(dsl_pool_t *dp, int64_t used, int64_t comp, int64_t uncomp); +void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); +void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); +boolean_t dsl_pool_config_held(dsl_pool_t *dp); taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); -extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, uint64_t *now, dmu_tx_t *tx); -extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, +int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t now, dmu_tx_t *tx); +int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx); -extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); +void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); +int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); +void dsl_pool_rele(dsl_pool_t *dp, void *tag); #ifdef __cplusplus } diff --git a/uts/common/fs/zfs/sys/dsl_prop.h b/uts/common/fs/zfs/sys/dsl_prop.h index b0d9a52..5fe18d6 100644 --- a/uts/common/fs/zfs/sys/dsl_prop.h +++ b/uts/common/fs/zfs/sys/dsl_prop.h @@ -54,58 +54,47 @@ typedef struct dsl_props_arg { zprop_source_t pa_source; } dsl_props_arg_t; -typedef struct dsl_prop_set_arg { - const char *psa_name; - zprop_source_t psa_source; - int psa_intsz; - int 
psa_numints; - const void *psa_value; - - /* - * Used to handle the special requirements of the quota and reservation - * properties. - */ - uint64_t psa_effective_value; -} dsl_prop_setarg_t; - int dsl_prop_register(struct dsl_dataset *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg); int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg); -int dsl_prop_numcb(struct dsl_dataset *ds); +void dsl_prop_notify_all(struct dsl_dir *dd); +boolean_t dsl_prop_hascb(struct dsl_dataset *ds); int dsl_prop_get(const char *ddname, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint); int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); -int dsl_prop_get_received(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_received(const char *dsname, nvlist_t **nvp); int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname, + uint64_t *valuep); int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot); -dsl_syncfunc_t dsl_props_set_sync; -int dsl_prop_set(const char *ddname, const char *propname, - zprop_source_t source, int intsz, int numints, const void *buf); +void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source, + nvlist_t *props, dmu_tx_t *tx); +void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname, + zprop_source_t source, int intsz, int numints, const void *value, + dmu_tx_t *tx); int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); +int dsl_prop_set_int(const char *dsname, const char *propname, + zprop_source_t source, uint64_t value); +int dsl_prop_set_string(const char *dsname, const char *propname, + 
zprop_source_t source, const char *value); +int dsl_prop_inherit(const char *dsname, const char *propname, + zprop_source_t source); -void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, - zprop_source_t source, uint64_t *value); -int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa); -#ifdef ZFS_DEBUG -void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa); -#define DSL_PROP_CHECK_PREDICTION(dd, psa) \ - dsl_prop_check_prediction((dd), (psa)) -#else -#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */ -#endif +int dsl_prop_predict(dsl_dir_t *dd, const char *propname, + zprop_source_t source, uint64_t value, uint64_t *newvalp); /* flag first receive on or after SPA_VERSION_RECVD_PROPS */ -boolean_t dsl_prop_get_hasrecvd(objset_t *os); -void dsl_prop_set_hasrecvd(objset_t *os); -void dsl_prop_unset_hasrecvd(objset_t *os); +boolean_t dsl_prop_get_hasrecvd(const char *dsname); +int dsl_prop_set_hasrecvd(const char *dsname); +void dsl_prop_unset_hasrecvd(const char *dsname); void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); void dsl_prop_nvlist_add_string(nvlist_t *nv, diff --git a/uts/common/fs/zfs/sys/dsl_synctask.h b/uts/common/fs/zfs/sys/dsl_synctask.h index 9126290..ef86fb6 100644 --- a/uts/common/fs/zfs/sys/dsl_synctask.h +++ b/uts/common/fs/zfs/sys/dsl_synctask.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DSL_SYNCTASK_H @@ -34,43 +35,26 @@ extern "C" { struct dsl_pool; -typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); -typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); +typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *); typedef struct dsl_sync_task { - list_node_t dst_node; + txg_node_t dst_node; + struct dsl_pool *dst_pool; + uint64_t dst_txg; + int dst_space; dsl_checkfunc_t *dst_checkfunc; dsl_syncfunc_t *dst_syncfunc; - void *dst_arg1; - void *dst_arg2; - int dst_err; + void *dst_arg; + int dst_error; + boolean_t dst_nowaiter; } dsl_sync_task_t; -typedef struct dsl_sync_task_group { - txg_node_t dstg_node; - list_t dstg_tasks; - struct dsl_pool *dstg_pool; - uint64_t dstg_txg; - int dstg_err; - int dstg_space; - boolean_t dstg_nowaiter; -} dsl_sync_task_group_t; - -dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); -void dsl_sync_task_create(dsl_sync_task_group_t *dstg, - dsl_checkfunc_t *, dsl_syncfunc_t *, - void *arg1, void *arg2, int blocks_modified); -int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); -void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); -void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); -void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); - -int dsl_sync_task_do(struct dsl_pool *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified); -void dsl_sync_task_do_nowait(struct dsl_pool *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); +void dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx); +int dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified); +void dsl_sync_task_nowait(struct dsl_pool *dp, dsl_syncfunc_t *syncfunc, + void *arg, int blocks_modified, dmu_tx_t *tx); #ifdef 
__cplusplus } diff --git a/uts/common/fs/zfs/sys/dsl_userhold.h b/uts/common/fs/zfs/sys/dsl_userhold.h new file mode 100644 index 0000000..56c6c8f --- /dev/null +++ b/uts/common/fs/zfs/sys/dsl_userhold.h @@ -0,0 +1,57 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_DSL_USERHOLD_H +#define _SYS_DSL_USERHOLD_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; +struct dsl_dataset; +struct dmu_tx; + +int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, + nvlist_t *errlist); +int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist); +int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl); +void dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, + const char *htag); +int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag, + boolean_t temphold, struct dmu_tx *tx); +void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag, + minor_t minor, uint64_t now, struct dmu_tx *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_USERHOLD_H */ diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h index 2cf4d2b..d6c0bf4 100644 --- a/uts/common/fs/zfs/sys/metaslab.h +++ b/uts/common/fs/zfs/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_METASLAB_H @@ -56,6 +56,7 @@ extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); +extern void metaslab_check_free(spa_t *spa, const blkptr_t *bp); extern metaslab_class_t *metaslab_class_create(spa_t *spa, space_map_ops_t *ops); diff --git a/uts/common/fs/zfs/sys/refcount.h b/uts/common/fs/zfs/sys/refcount.h index 1dcd467..9efc5f1 100644 --- a/uts/common/fs/zfs/sys/refcount.h +++ b/uts/common/fs/zfs/sys/refcount.h @@ -51,15 +51,17 @@ typedef struct reference { typedef struct refcount { kmutex_t rc_mtx; + boolean_t rc_tracked; list_t rc_list; list_t rc_removed; uint64_t rc_count; uint64_t rc_removed_count; } refcount_t; -/* Note: refcount_t must be initialized with refcount_create() */ +/* Note: refcount_t must be initialized with refcount_create[_untracked]() */ void refcount_create(refcount_t *rc); +void refcount_create_untracked(refcount_t *rc); void refcount_destroy(refcount_t *rc); void refcount_destroy_many(refcount_t *rc, uint64_t number); int refcount_is_zero(refcount_t *rc); @@ -80,6 +82,7 @@ typedef struct refcount { } refcount_t; #define refcount_create(rc) ((rc)->rc_count = 0) +#define refcount_create_untracked(rc) ((rc)->rc_count = 0) #define refcount_destroy(rc) ((rc)->rc_count = 0) #define refcount_destroy_many(rc, number) ((rc)->rc_count = 0) #define refcount_is_zero(rc) ((rc)->rc_count == 0) diff --git a/uts/common/fs/zfs/sys/rrwlock.h b/uts/common/fs/zfs/sys/rrwlock.h index 239268b..e1e6d31 100644 --- a/uts/common/fs/zfs/sys/rrwlock.h +++ b/uts/common/fs/zfs/sys/rrwlock.h @@ -58,6 +58,7 @@ typedef struct rrwlock { refcount_t rr_anon_rcount; refcount_t rr_linked_rcount; boolean_t rr_writer_wanted; + boolean_t rr_track_all; } rrwlock_t; /* @@ -65,15 +66,19 @@ typedef struct rrwlock { * 'tag' must be the same in a rrw_enter() as in its * corresponding 
rrw_exit(). */ -void rrw_init(rrwlock_t *rrl); +void rrw_init(rrwlock_t *rrl, boolean_t track_all); void rrw_destroy(rrwlock_t *rrl); void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); +void rrw_enter_read(rrwlock_t *rrl, void *tag); +void rrw_enter_write(rrwlock_t *rrl); void rrw_exit(rrwlock_t *rrl, void *tag); boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); void rrw_tsd_destroy(void *arg); #define RRW_READ_HELD(x) rrw_held(x, RW_READER) #define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) +#define RRW_LOCK_HELD(x) \ + (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) #ifdef __cplusplus } diff --git a/uts/common/fs/zfs/sys/space_map.h b/uts/common/fs/zfs/sys/space_map.h index 2da50fb..64223da 100644 --- a/uts/common/fs/zfs/sys/space_map.h +++ b/uts/common/fs/zfs/sys/space_map.h @@ -149,6 +149,8 @@ extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); extern boolean_t space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern space_seg_t *space_map_find(space_map_t *sm, uint64_t start, + uint64_t size, avl_index_t *wherep); extern void space_map_swap(space_map_t **msrc, space_map_t **mdest); extern void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); diff --git a/uts/common/fs/zfs/sys/txg.h b/uts/common/fs/zfs/sys/txg.h index 1287f09..2df33f0 100644 --- a/uts/common/fs/zfs/sys/txg.h +++ b/uts/common/fs/zfs/sys/txg.h @@ -45,9 +45,6 @@ extern "C" { /* Number of txgs worth of frees we defer adding to in-core spacemaps */ #define TXG_DEFER_SIZE 2 -#define TXG_WAIT 1ULL -#define TXG_NOWAIT 2ULL - typedef struct tx_cpu tx_cpu_t; typedef struct txg_handle { @@ -119,11 +116,11 @@ extern boolean_t txg_sync_waiting(struct dsl_pool *dp); extern void txg_list_create(txg_list_t *tl, size_t offset); extern void txg_list_destroy(txg_list_t *tl); extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); -extern int 
txg_list_add(txg_list_t *tl, void *p, uint64_t txg); -extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); +extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); -extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); +extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg); extern void *txg_list_head(txg_list_t *tl, uint64_t txg); extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); diff --git a/uts/common/fs/zfs/sys/zfeature.h b/uts/common/fs/zfs/sys/zfeature.h index 481e85b..1a081e4 100644 --- a/uts/common/fs/zfs/sys/zfeature.h +++ b/uts/common/fs/zfs/sys/zfeature.h @@ -26,7 +26,6 @@ #ifndef _SYS_ZFEATURE_H #define _SYS_ZFEATURE_H -#include #include #include "zfeature_common.h" @@ -34,14 +33,18 @@ extern "C" { #endif -extern boolean_t feature_is_supported(objset_t *os, uint64_t obj, +struct spa; +struct dmu_tx; +struct objset; + +extern boolean_t feature_is_supported(struct objset *os, uint64_t obj, uint64_t desc_obj, nvlist_t *unsup_feat, nvlist_t *enabled_feat); -struct spa; -extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *); -extern void spa_feature_enable(struct spa *, zfeature_info_t *, dmu_tx_t *); -extern void spa_feature_incr(struct spa *, zfeature_info_t *, dmu_tx_t *); -extern void spa_feature_decr(struct spa *, zfeature_info_t *, dmu_tx_t *); +extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *); +extern void spa_feature_enable(struct spa *, zfeature_info_t *, + struct dmu_tx *); +extern void spa_feature_incr(struct spa *, zfeature_info_t *, struct dmu_tx *); +extern void spa_feature_decr(struct spa *, zfeature_info_t *, struct dmu_tx *); extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *); extern boolean_t 
spa_feature_is_active(struct spa *, zfeature_info_t *); diff --git a/uts/common/fs/zfs/sys/zfs_debug.h b/uts/common/fs/zfs/sys/zfs_debug.h index 9462622..14eb2ab 100644 --- a/uts/common/fs/zfs/sys/zfs_debug.h +++ b/uts/common/fs/zfs/sys/zfs_debug.h @@ -48,11 +48,13 @@ extern "C" { extern int zfs_flags; -#define ZFS_DEBUG_DPRINTF 0x0001 -#define ZFS_DEBUG_DBUF_VERIFY 0x0002 -#define ZFS_DEBUG_DNODE_VERIFY 0x0004 -#define ZFS_DEBUG_SNAPNAMES 0x0008 -#define ZFS_DEBUG_MODIFY 0x0010 +#define ZFS_DEBUG_DPRINTF (1<<0) +#define ZFS_DEBUG_DBUF_VERIFY (1<<1) +#define ZFS_DEBUG_DNODE_VERIFY (1<<2) +#define ZFS_DEBUG_SNAPNAMES (1<<3) +#define ZFS_DEBUG_MODIFY (1<<4) +#define ZFS_DEBUG_SPA (1<<5) +#define ZFS_DEBUG_ZIO_FREE (1<<6) #ifdef ZFS_DEBUG extern void __dprintf(const char *file, const char *func, diff --git a/uts/common/fs/zfs/sys/zfs_ioctl.h b/uts/common/fs/zfs/sys/zfs_ioctl.h index 86e901b..874d422 100644 --- a/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -293,7 +293,6 @@ typedef struct zfs_cmd { uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; - char zc_top_ds[MAXPATHLEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; @@ -345,7 +344,8 @@ extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); -extern int zfs_unmount_snap(const char *, void *); +extern void zfs_unmount_snap(const char *); +extern void zfs_destroy_unmount_origin(const char *); /* * ZFS minor numbers can refer to either a control device instance or diff --git a/uts/common/fs/zfs/sys/zfs_znode.h b/uts/common/fs/zfs/sys/zfs_znode.h index 3e9621a..cf0bbee 100644 --- a/uts/common/fs/zfs/sys/zfs_znode.h +++ b/uts/common/fs/zfs/sys/zfs_znode.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_FS_ZFS_ZNODE_H @@ -240,7 +241,7 @@ typedef struct znode { */ #define ZFS_ENTER(zfsvfs) \ { \ - rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \ + rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ if ((zfsvfs)->z_unmounted) { \ ZFS_EXIT(zfsvfs); \ return (EIO); \ diff --git a/uts/common/fs/zfs/sys/zil.h b/uts/common/fs/zfs/sys/zil.h index e52c65b..a212e4f 100644 --- a/uts/common/fs/zfs/sys/zil.h +++ b/uts/common/fs/zfs/sys/zil.h @@ -411,8 +411,8 @@ extern int zil_check_log_chain(const char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); -extern int zil_suspend(zilog_t *zilog); -extern void zil_resume(zilog_t *zilog); +extern int zil_suspend(const char *osname, void **cookiep); +extern void zil_resume(void *cookie); extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp); extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); diff --git a/uts/common/fs/zfs/txg.c b/uts/common/fs/zfs/txg.c index 4760387..58690e3 100644 --- a/uts/common/fs/zfs/txg.c +++ b/uts/common/fs/zfs/txg.c @@ -576,6 +576,8 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; + ASSERT(!dsl_pool_config_held(dp)); + mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) @@ -599,6 +601,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; + ASSERT(!dsl_pool_config_held(dp)); + mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) @@ -664,42 +668,43 @@ txg_list_empty(txg_list_t *tl, uint64_t txg) } /* - * Add an entry to the list. - * Returns 0 if it's a new entry, 1 if it's already there. + * Add an entry to the list (unless it's already on the list). + * Returns B_TRUE if it was actually added. 
*/ -int +boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - int already_on_list; + boolean_t add; mutex_enter(&tl->tl_lock); - already_on_list = tn->tn_member[t]; - if (!already_on_list) { + add = (tn->tn_member[t] == 0); + if (add) { tn->tn_member[t] = 1; tn->tn_next[t] = tl->tl_head[t]; tl->tl_head[t] = tn; } mutex_exit(&tl->tl_lock); - return (already_on_list); + return (add); } /* - * Add an entry to the end of the list (walks list to find end). - * Returns 0 if it's a new entry, 1 if it's already there. + * Add an entry to the end of the list, unless it's already on the list. + * (walks list to find end) + * Returns B_TRUE if it was actually added. */ -int +boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - int already_on_list; + boolean_t add; mutex_enter(&tl->tl_lock); - already_on_list = tn->tn_member[t]; - if (!already_on_list) { + add = (tn->tn_member[t] == 0); + if (add) { txg_node_t **tp; for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) @@ -711,7 +716,7 @@ txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) } mutex_exit(&tl->tl_lock); - return (already_on_list); + return (add); } /* @@ -762,13 +767,13 @@ txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) return (NULL); } -int +boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - return (tn->tn_member[t]); + return (tn->tn_member[t] != 0); } /* diff --git a/uts/common/fs/zfs/zfs_ctldir.c b/uts/common/fs/zfs/zfs_ctldir.c index d902ff6..ef9a561 100644 --- a/uts/common/fs/zfs/zfs_ctldir.c +++ b/uts/common/fs/zfs/zfs_ctldir.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -615,7 +616,7 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t 
*tdvp, char *tnm, zfsvfs_t *zfsvfs; avl_index_t where; char from[MAXNAMELEN], to[MAXNAMELEN]; - char real[MAXNAMELEN]; + char real[MAXNAMELEN], fsname[MAXNAMELEN]; int err; zfsvfs = sdvp->v_vfsp->vfs_data; @@ -634,12 +635,14 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, ZFS_EXIT(zfsvfs); + dmu_objset_name(zfsvfs->z_os, fsname); + err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); - if (!err) + if (err == 0) err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); - if (!err) + if (err == 0) err = zfs_secpolicy_rename_perms(from, to, cr); - if (err) + if (err != 0) return (err); /* @@ -659,7 +662,7 @@ zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, return (ENOENT); } - err = dmu_objset_rename(from, to, B_FALSE); + err = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); if (err == 0) zfsctl_rename_snap(sdp, sep, tnm); @@ -699,9 +702,9 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, ZFS_EXIT(zfsvfs); err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); - if (!err) + if (err == 0) err = zfs_secpolicy_destroy_perms(snapname, cr); - if (err) + if (err != 0) return (err); mutex_enter(&sdp->sd_lock); @@ -711,10 +714,10 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, if (sep) { avl_remove(&sdp->sd_snaps, sep); err = zfsctl_unmount_snap(sep, MS_FORCE, cr); - if (err) + if (err != 0) avl_add(&sdp->sd_snaps, sep); else - err = dmu_objset_destroy(snapname, B_FALSE); + err = dsl_destroy_snapshot(snapname, B_FALSE); } else { err = ENOENT; } @@ -746,12 +749,12 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, *vpp = NULL; err = zfs_secpolicy_snapshot_perms(name, cr); - if (err) + if (err != 0) return (err); if (err == 0) { err = dmu_objset_snapshot_one(name, dirname); - if (err) + if (err != 0) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); } @@ -831,7 +834,7 @@ zfsctl_snapdir_lookup(vnode_t 
*dvp, char *nm, vnode_t **vpp, pathname_t *pnp, *vpp = sep->se_root; VN_HOLD(*vpp); err = traverse(vpp); - if (err) { + if (err != 0) { VN_RELE(*vpp); *vpp = NULL; } else if (*vpp == sep->se_root) { @@ -857,7 +860,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, * The requested snapshot is not currently mounted, look it up. */ err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); - if (err) { + if (err != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); /* @@ -930,7 +933,7 @@ domount: * If we had an error, drop our hold on the vnode and * zfsctl_snapshot_inactive() will clean up. */ - if (err) { + if (err != 0) { VN_RELE(*vpp); *vpp = NULL; } @@ -982,8 +985,10 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, ZFS_ENTER(zfsvfs); cookie = *offp; + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, &cookie, &case_conflict); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error) { ZFS_EXIT(zfsvfs); if (error == ENOENT) { diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c index 38adc19..caad34c 100644 --- a/uts/common/fs/zfs/zfs_ioctl.c +++ b/uts/common/fs/zfs/zfs_ioctl.c @@ -156,6 +156,7 @@ #include #include #include +#include #include #include #include @@ -173,6 +174,9 @@ #include #include #include +#include +#include +#include #include #include "zfs_namecheck.h" @@ -237,11 +241,7 @@ static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); -static int zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature); -static int zfs_prop_activate_feature_check(void *arg1, void *arg2, - dmu_tx_t *tx); -static void zfs_prop_activate_feature_sync(void *arg1, void *arg2, - dmu_tx_t *tx); +static int zfs_prop_activate_feature(spa_t 
*spa, zfeature_info_t *feature); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void @@ -461,49 +461,48 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) { - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + if (dsl_prop_get_int_ds(ds, "zoned", &zoned)) return (ENOENT); - } - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int -zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, + const char *perm, cred_t *cr) { int error; - dsl_dataset_t *ds; - - error = dsl_dataset_hold(name, FTAG, &ds); - if (error != 0) - return (error); error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); - if (error) + if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } - - dsl_dataset_rele(ds, FTAG); return (error); } static int -zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, - const char *perm, cred_t *cr) +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; + dsl_dataset_t *ds; + dsl_pool_t *dp; - error = zfs_dozonecheck_ds(name, ds, cr); - if (error == 0) { - error = secpolicy_zfs(cr); - if (error) - error = dsl_deleg_access_impl(ds, perm, cr); + error = dsl_pool_hold(name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); } + + error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); + + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -525,7 +524,7 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) /* First get the existing dataset label. 
*/ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); - if (error) + if (error != 0) return (EPERM); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) @@ -575,7 +574,7 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, setsl_tag, &os); - if (error) + if (error != 0) return (EPERM); dmu_objset_disown(os, setsl_tag); @@ -663,7 +662,7 @@ zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) int error; error = zfs_dozonecheck(zc->zc_name, cr); - if (error) + if (error != 0) return (error); /* @@ -685,7 +684,6 @@ zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - spa_t *spa; dsl_pool_t *dp; dsl_dataset_t *ds; char *cp; @@ -698,23 +696,22 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (EINVAL); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error) + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) return (error); - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - if (error) + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); + } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -835,12 +832,21 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (EINVAL); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { + dsl_pool_t *dp; dsl_dataset_t *ds; + error = dsl_pool_hold(nvpair_name(pair), FTAG, &dp); + if (error != 0) + break; nextpair = nvlist_next_nvpair(snaps, pair); - error = 
dsl_dataset_hold(nvpair_name(pair), FTAG, &ds); - if (error == 0) { + error = dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds); + if (error == 0) dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + + if (error == 0) { + error = zfs_secpolicy_destroy_perms(nvpair_name(pair), + cr); } else if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider @@ -852,11 +858,7 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) */ fnvlist_remove_nvpair(snaps, pair); error = 0; - continue; - } else { - break; } - error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error != 0) break; } @@ -904,41 +906,47 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - char parentname[MAXNAMELEN]; - objset_t *clone; + dsl_pool_t *dp; + dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); - if (error) + if (error != 0) + return (error); + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) return (error); - error = dmu_objset_hold(zc->zc_name, FTAG, &clone); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { - dsl_dataset_t *pclone = NULL; + char parentname[MAXNAMELEN]; + dsl_dataset_t *origin = NULL; dsl_dir_t *dd; - dd = clone->os_dsl_dataset->ds_dir; + dd = clone->ds_dir; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, FTAG, &pclone); - rw_exit(&dd->dd_pool->dp_config_rwlock); - if (error) { - dmu_objset_rele(clone, FTAG); + dd->dd_phys->dd_origin_obj, FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(clone, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } - error = zfs_secpolicy_write_perms(zc->zc_name, + error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); - dsl_dataset_name(pclone, parentname); - dmu_objset_rele(clone, 
FTAG); - dsl_dataset_rele(pclone, FTAG); - if (error == 0) - error = zfs_secpolicy_write_perms(parentname, + dsl_dataset_name(origin, parentname); + if (error == 0) { + error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); + } + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(origin, FTAG); } + dsl_pool_rele(dp, FTAG); return (error); } @@ -1147,16 +1155,47 @@ zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_HOLD, cr)); + nvpair_t *pair; + nvlist_t *holds; + int error; + + error = nvlist_lookup_nvlist(innvl, "holds", &holds); + if (error != 0) + return (EINVAL); + + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + char fsname[MAXNAMELEN]; + error = dmu_fsname(nvpair_name(pair), fsname); + if (error != 0) + return (error); + error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_HOLD, cr); + if (error != 0) + return (error); + } + return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_RELEASE, cr)); + nvpair_t *pair; + int error; + + for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(innvl, pair)) { + char fsname[MAXNAMELEN]; + error = dmu_fsname(nvpair_name(pair), fsname); + if (error != 0) + return (error); + error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_RELEASE, cr); + if (error != 0) + return (error); + } + return (0); } /* @@ -1177,11 +1216,11 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); - if (!error) + if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); - if (!error) + if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); - if 
(!error) + if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); return (error); } @@ -1291,7 +1330,7 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp) int error; error = dmu_objset_hold(dsname, FTAG, &os); - if (error) + if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); @@ -1394,7 +1433,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); - if (error) + if (error != 0) goto pool_props_bad; } @@ -1667,12 +1706,7 @@ zfs_ioc_pool_reguid(zfs_cmd_t *zc) static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { - int error; - - if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)) - return (error); - - return (0); + return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* @@ -1988,15 +2022,14 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { - objset_t *os = NULL; + objset_t *os; int error; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) - return (error); - - error = zfs_ioc_objset_stats_impl(zc, os); - - dmu_objset_rele(os, FTAG); + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, os); + dmu_objset_rele(os, FTAG); + } return (error); } @@ -2017,30 +2050,23 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { - objset_t *os = NULL; - int error; + int error = 0; nvlist_t *nv; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) - return (error); - /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. 
*/ - if (!dsl_prop_get_hasrecvd(os)) { - dmu_objset_rele(os, FTAG); + if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (ENOTSUP); - } if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_received(os, &nv)) == 0) { + (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } - dmu_objset_rele(os, FTAG); return (error); } @@ -2155,20 +2181,6 @@ top: (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); - /* - * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 - * but is not declared void because its called by dmu_objset_find(). - */ - if (zc->zc_cookie == 0) { - uint64_t cookie = 0; - int len = sizeof (zc->zc_name) - (p - zc->zc_name); - - while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { - if (!dataset_name_hidden(zc->zc_name)) - (void) dmu_objset_prefetch(zc->zc_name, NULL); - } - } - do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, @@ -2211,14 +2223,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *os; int error; -top: - if (zc->zc_cookie == 0) - (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, - NULL, DS_FIND_SNAPSHOTS); - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error) + if (error != 0) { return (error == ENOENT ? ESRCH : error); + } /* * A dataset name of maximum length cannot have any snapshots, @@ -2238,24 +2246,8 @@ top: dsl_dataset_t *ds; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; - /* - * Since we probably don't have a hold on this snapshot, - * it's possible that the objsetid could have been destroyed - * and reused for a new objset. It's OK if this happens during - * a zfs send operation, since the new createtxg will be - * beyond the range we're interested in. 
- */ - rw_enter(&dp->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (error) { - if (error == ENOENT) { - /* Racing with destroy, get the next one. */ - *strchr(zc->zc_name, '@') = '\0'; - dmu_objset_rele(os, FTAG); - goto top; - } - } else { + if (error == 0) { objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); @@ -2269,7 +2261,7 @@ top: dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ - if (error) + if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } @@ -2359,13 +2351,13 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: - err = dsl_dataset_set_quota(dsname, source, intval); + err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: - err = dsl_dataset_set_reservation(dsname, source, intval); + err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); @@ -2396,19 +2388,16 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, zfeature_info_t *feature = &spa_feature_table[SPA_FEATURE_LZ4_COMPRESS]; spa_t *spa; - dsl_pool_t *dp; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); - dp = spa->spa_dsl_pool; - /* * Setting the LZ4 compression algorithm activates * the feature. 
*/ if (!spa_feature_is_active(spa, feature)) { - if ((err = zfs_prop_activate_feature(dp, + if ((err = zfs_prop_activate_feature(spa, feature)) != 0) { spa_close(spa, FTAG); return (err); @@ -2567,12 +2556,12 @@ retry: if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); - err = dsl_prop_set(dsname, propname, source, 1, - strlen(strval) + 1, strval); + err = dsl_prop_set_string(dsname, propname, + source, strval); } else { intval = fnvpair_value_uint64(propval); - err = dsl_prop_set(dsname, propname, source, 8, - 1, &intval); + err = dsl_prop_set_int(dsname, propname, source, + intval); } if (err != 0) { @@ -2638,7 +2627,7 @@ props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) } static int -clear_received_props(objset_t *os, const char *fs, nvlist_t *props, +clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; @@ -2650,8 +2639,8 @@ clear_received_props(objset_t *os, const char *fs, nvlist_t *props, * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | - (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); - err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); + (dsl_prop_get_hasrecvd(dsname) ? 
ZPROP_SRC_RECEIVED : 0)); + err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); @@ -2683,22 +2672,19 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) if (received) { nvlist_t *origprops; - objset_t *os; - - if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { - if (dsl_prop_get_received(os, &origprops) == 0) { - (void) clear_received_props(os, - zc->zc_name, origprops, nvl); - nvlist_free(origprops); - } - dsl_prop_set_hasrecvd(os); - dmu_objset_rele(os, FTAG); + if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { + (void) clear_received_props(zc->zc_name, + origprops, nvl); + nvlist_free(origprops); } + + error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); - error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); + if (error == 0) + error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != NULL && errors != NULL) { (void) put_nvlist(zc, errors); @@ -2781,7 +2767,7 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) } /* property name has been validated by zfs_secpolicy_inherit_prop() */ - return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); + return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); } static int @@ -2893,7 +2879,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) */ error = secpolicy_zfs(CRED()); - if (error) { + if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); @@ -3221,7 +3207,7 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) - (void) dmu_objset_destroy(fsname, B_FALSE); + (void) dsl_destroy_head(fsname); } return (error); } @@ -3240,7 +3226,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) int error = 0; nvlist_t *nvprops = NULL; char *origin_name; - dsl_dataset_t *origin; if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) return (EINVAL); 
@@ -3252,14 +3237,8 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (EINVAL); - - error = dsl_dataset_hold(origin_name, FTAG, &origin); - if (error) - return (error); - - error = dmu_objset_clone(fsname, origin, 0); - dsl_dataset_rele(origin, FTAG); - if (error) + error = dmu_objset_clone(fsname, origin_name); + if (error != 0) return (error); /* @@ -3269,7 +3248,7 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) - (void) dmu_objset_destroy(fsname, B_FALSE); + (void) dsl_destroy_head(fsname); } return (error); } @@ -3281,7 +3260,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) * } * * outnvl: snapshot -> error code (int32) - * */ static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) @@ -3331,7 +3309,7 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) } } - error = dmu_objset_snapshot(snaps, props, outnvl); + error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } @@ -3376,30 +3354,73 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) return (error); } -/* ARGSUSED */ -int -zfs_unmount_snap(const char *name, void *arg) +/* + * The dp_config_rwlock must not be held when calling this, because the + * unmount may need to write out data. + * + * This function is best-effort. Callers must deal gracefully if it + * remains mounted (or is remounted after this call). 
+ */ +void +zfs_unmount_snap(const char *snapname) { vfs_t *vfsp; - int err; + zfsvfs_t *zfsvfs; - if (strchr(name, '@') == NULL) - return (0); + if (strchr(snapname, '@') == NULL) + return; - vfsp = zfs_get_vfs(name); + vfsp = zfs_get_vfs(snapname); if (vfsp == NULL) - return (0); + return; - if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { + zfsvfs = vfsp->vfs_data; + ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); + + if (vn_vfswlock(vfsp->vfs_vnodecovered) != 0) { VFS_RELE(vfsp); - return (err); + return; } VFS_RELE(vfsp); /* * Always force the unmount for snapshots. */ - return (dounmount(vfsp, MS_FORCE, kcred)); + (void) dounmount(vfsp, MS_FORCE, kcred); +} + +/* ARGSUSED */ +static int +zfs_unmount_snap_cb(const char *snapname, void *arg) +{ + zfs_unmount_snap(snapname); + return (0); +} + +/* + * When a clone is destroyed, its origin may also need to be destroyed, + * in which case it must be unmounted. This routine will do that unmount + * if necessary. + */ +void +zfs_destroy_unmount_origin(const char *fsname) +{ + int error; + objset_t *os; + dsl_dataset_t *ds; + + error = dmu_objset_hold(fsname, FTAG, &os); + if (error != 0) + return; + ds = dmu_objset_ds(os); + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { + char originname[MAXNAMELEN]; + dsl_dataset_name(ds->ds_prev, originname); + dmu_objset_rele(os, FTAG); + zfs_unmount_snap(originname); + } else { + dmu_objset_rele(os, FTAG); + } } /* @@ -3435,14 +3456,10 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) (name[poollen] != '/' && name[poollen] != '@')) return (EXDEV); - /* - * Ignore failures to unmount; dmu_snapshots_destroy_nvl() - * will deal with this gracefully (by filling in outnvl). 
- */ - (void) zfs_unmount_snap(name, NULL); + zfs_unmount_snap(name); } - return (dmu_snapshots_destroy_nvl(snaps, defer, outnvl)); + return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* @@ -3457,13 +3474,13 @@ static int zfs_ioc_destroy(zfs_cmd_t *zc) { int err; - if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { - err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); - } + if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) + zfs_unmount_snap(zc->zc_name); - err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); + if (strchr(zc->zc_name, '@')) + err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); + else + err = dsl_destroy_head(zc->zc_name); if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) (void) zvol_remove_minor(zc->zc_name); return (err); @@ -3478,79 +3495,34 @@ zfs_ioc_destroy(zfs_cmd_t *zc) static int zfs_ioc_rollback(zfs_cmd_t *zc) { - dsl_dataset_t *ds, *clone; - int error; zfsvfs_t *zfsvfs; - char *clone_name; - - error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); - if (error) - return (error); - - /* must not be a snapshot */ - if (dsl_dataset_is_snapshot(ds)) { - dsl_dataset_rele(ds, FTAG); - return (EINVAL); - } - - /* must have a most recent snapshot */ - if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { - dsl_dataset_rele(ds, FTAG); - return (EINVAL); - } - - /* - * Create clone of most recent snapshot. - */ - clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); - error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); - if (error) - goto out; - - error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); - if (error) - goto out; + int error; - /* - * Do clone swap. 
- */ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; - if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { - error = dsl_dataset_clone_swap(clone, ds, - B_TRUE); - dsl_dataset_disown(ds, FTAG); - ds = NULL; - } else { - error = EBUSY; - } + error = dsl_dataset_rollback(zc->zc_name); resume_err = zfs_resume_fs(zfsvfs, zc->zc_name); error = error ? error : resume_err; } VFS_RELE(zfsvfs->z_vfs); } else { - if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { - error = dsl_dataset_clone_swap(clone, ds, B_TRUE); - dsl_dataset_disown(ds, FTAG); - ds = NULL; - } else { - error = EBUSY; - } + error = dsl_dataset_rollback(zc->zc_name); } + return (error); +} - /* - * Destroy clone (which also closes it). - */ - (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); +static int +recursive_unmount(const char *fsname, void *arg) +{ + const char *snapname = arg; + char fullname[MAXNAMELEN]; -out: - strfree(clone_name); - if (ds) - dsl_dataset_rele(ds, FTAG); - return (error); + (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); + zfs_unmount_snap(fullname); + return (0); } /* @@ -3565,26 +3537,33 @@ static int zfs_ioc_rename(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie & 1; + char *at; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '%')) return (EINVAL); - /* - * Unmount snapshot unless we're doing a recursive rename, - * in which case the dataset code figures out which snapshots - * to unmount. 
- */ - if (!recursive && strchr(zc->zc_name, '@') != NULL && - zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); + at = strchr(zc->zc_name, '@'); + if (at != NULL) { + /* snaps must be in same fs */ + if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) + return (EXDEV); + *at = '\0'; + if (zc->zc_objset_type == DMU_OST_ZFS) { + int error = dmu_objset_find(zc->zc_name, + recursive_unmount, at + 1, + recursive ? DS_FIND_CHILDREN : 0); + if (error != 0) + return (error); + } + return (dsl_dataset_rename_snapshot(zc->zc_name, + at + 1, strchr(zc->zc_value, '@') + 1, recursive)); + } else { + if (zc->zc_objset_type == DMU_OST_ZVOL) + (void) zvol_remove_minor(zc->zc_name); + return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } - if (zc->zc_objset_type == DMU_OST_ZVOL) - (void) zvol_remove_minor(zc->zc_name); - return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } static int @@ -3728,35 +3707,14 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } /* - * Activates a feature on a pool in response to a property setting. This - * creates a new sync task which modifies the pool to reflect the feature - * as being active. - */ -static int -zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature) -{ - int err; - - /* EBUSY here indicates that the feature is already active */ - err = dsl_sync_task_do(dp, zfs_prop_activate_feature_check, - zfs_prop_activate_feature_sync, dp->dp_spa, feature, 2); - - if (err != 0 && err != EBUSY) - return (err); - else - return (0); -} - -/* * Checks for a race condition to make sure we don't increment a feature flag * multiple times. 
*/ -/*ARGSUSED*/ static int -zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) +zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - zfeature_info_t *feature = arg2; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zfeature_info_t *feature = arg; if (!spa_feature_is_active(spa, feature)) return (0); @@ -3769,15 +3727,36 @@ zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) * zfs_prop_activate_feature. */ static void -zfs_prop_activate_feature_sync(void *arg1, void *arg2, dmu_tx_t *tx) +zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - zfeature_info_t *feature = arg2; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zfeature_info_t *feature = arg; spa_feature_incr(spa, feature, tx); } /* + * Activates a feature on a pool in response to a property setting. This + * creates a new sync task which modifies the pool to reflect the feature + * as being active. + */ +static int +zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature) +{ + int err; + + /* EBUSY here indicates that the feature is already active */ + err = dsl_sync_task(spa_name(spa), + zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, + feature, 2); + + if (err != 0 && err != EBUSY) + return (err); + else + return (0); +} + +/* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. 
@@ -3931,7 +3910,6 @@ static int zfs_ioc_recv(zfs_cmd_t *zc) { file_t *fp; - objset_t *os; dmu_recv_cookie_t drc; boolean_t force = (boolean_t)zc->zc_guid; int fd; @@ -3941,7 +3919,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) offset_t off; nvlist_t *props = NULL; /* sent properties */ nvlist_t *origprops = NULL; /* existing properties */ - objset_t *origin = NULL; + char *origin = NULL; char *tosnap; char tofs[ZFS_MAXNAMELEN]; boolean_t first_recvd_props = B_FALSE; @@ -3969,18 +3947,31 @@ zfs_ioc_recv(zfs_cmd_t *zc) VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { - if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && - !dsl_prop_get_hasrecvd(os)) { + if (zc->zc_string[0]) + origin = zc->zc_string; + + error = dmu_recv_begin(tofs, tosnap, + &zc->zc_begin_record, force, origin, &drc); + if (error != 0) + goto out; + + /* + * Set properties before we receive the stream so that they are applied + * to the new data. Note that we must call dmu_recv_stream() if + * dmu_recv_begin() succeeds. + */ + if (props != NULL && !drc.drc_newfs) { + if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= + SPA_VERSION_RECVD_PROPS && + !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; - } /* * If new received properties are supplied, they are to * completely replace the existing received properties, so stash * away the existing ones. 
*/ - if (dsl_prop_get_received(os, &origprops) == 0) { + if (dsl_prop_get_received(tofs, &origprops) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't @@ -3992,53 +3983,25 @@ zfs_ioc_recv(zfs_cmd_t *zc) */ if (!first_recvd_props) props_reduce(props, origprops); - if (zfs_check_clearable(tofs, origprops, - &errlist) != 0) + if (zfs_check_clearable(tofs, origprops, &errlist) != 0) (void) nvlist_merge(errors, errlist, 0); nvlist_free(errlist); - } - dmu_objset_rele(os, FTAG); - } - - if (zc->zc_string[0]) { - error = dmu_objset_hold(zc->zc_string, FTAG, &origin); - if (error) - goto out; - } - - error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, - &zc->zc_begin_record, force, origin, &drc); - if (origin) - dmu_objset_rele(origin, FTAG); - if (error) - goto out; - - /* - * Set properties before we receive the stream so that they are applied - * to the new data. Note that we must call dmu_recv_stream() if - * dmu_recv_begin() succeeds. - */ - if (props) { - if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { - if (drc.drc_newfs) { - if (spa_version(os->os_spa) >= - SPA_VERSION_RECVD_PROPS) - first_recvd_props = B_TRUE; - } else if (origprops != NULL) { - if (clear_received_props(os, tofs, origprops, - first_recvd_props ? NULL : props) != 0) - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } else { + if (clear_received_props(tofs, origprops, + first_recvd_props ? NULL : props) != 0) zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } - dsl_prop_set_hasrecvd(os); - } else if (!drc.drc_newfs) { + } else { zc->zc_obj |= ZPROP_ERR_NOCLEAR; } + } + + if (props != NULL) { + props_error = dsl_prop_set_hasrecvd(tofs); - (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, - props, errors); + if (props_error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + props, errors); + } } if (zc->zc_nvlist_dst_size != 0 && @@ -4090,22 +4053,16 @@ zfs_ioc_recv(zfs_cmd_t *zc) /* * On error, restore the original props. 
*/ - if (error && props) { - if (dmu_objset_hold(tofs, FTAG, &os) == 0) { - if (clear_received_props(os, tofs, props, NULL) != 0) { - /* - * We failed to clear the received properties. - * Since we may have left a $recvd value on the - * system, we can't clear the $hasrecvd flag. - */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } else if (first_recvd_props) { - dsl_prop_unset_hasrecvd(os); - } - dmu_objset_rele(os, FTAG); - } else if (!drc.drc_newfs) { - /* We failed to clear the received properties. */ + if (error != 0 && props != NULL && !drc.drc_newfs) { + if (clear_received_props(tofs, props, NULL) != 0) { + /* + * We failed to clear the received properties. + * Since we may have left a $recvd value on the + * system, we can't clear the $hasrecvd flag. + */ zc->zc_obj |= ZPROP_ERR_NORESTORE; + } else if (first_recvd_props) { + dsl_prop_unset_hasrecvd(tofs); } if (origprops == NULL && !drc.drc_newfs) { @@ -4157,100 +4114,75 @@ out: static int zfs_ioc_send(zfs_cmd_t *zc) { - objset_t *fromsnap = NULL; - objset_t *tosnap; int error; offset_t off; - dsl_dataset_t *ds; - dsl_dataset_t *dsfrom = NULL; - spa_t *spa; - dsl_pool_t *dp; boolean_t estimate = (zc->zc_guid != 0); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error) - return (error); + if (zc->zc_obj != 0) { + dsl_pool_t *dp; + dsl_dataset_t *tosnap; - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - if (error) - return (error); - - error = dmu_objset_from_ds(ds, &tosnap); - if (error) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - if (zc->zc_fromobj != 0) { - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom); - rw_exit(&dp->dp_config_rwlock); - if (error) { - dsl_dataset_rele(ds, FTAG); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) return (error); - } - error = 
dmu_objset_from_ds(dsfrom, &fromsnap); - if (error) { - dsl_dataset_rele(dsfrom, FTAG); - dsl_dataset_rele(ds, FTAG); + + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); } + + if (dsl_dir_is_clone(tosnap->ds_dir)) + zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj; + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); } - if (zc->zc_obj) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; + if (estimate) { + dsl_pool_t *dp; + dsl_dataset_t *tosnap; + dsl_dataset_t *fromsnap = NULL; + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); - if (fromsnap != NULL) { - dsl_dataset_rele(dsfrom, FTAG); - dsl_dataset_rele(ds, FTAG); - return (EINVAL); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); } - if (dsl_dir_is_clone(ds->ds_dir)) { - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &dsfrom); - rw_exit(&dp->dp_config_rwlock); - if (error) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - error = dmu_objset_from_ds(dsfrom, &fromsnap); - if (error) { - dsl_dataset_rele(dsfrom, FTAG); - dsl_dataset_rele(ds, FTAG); + if (zc->zc_fromobj != 0) { + error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, + FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } } - } - if (estimate) { error = dmu_send_estimate(tosnap, fromsnap, &zc->zc_objset_type); + + if (fromsnap != NULL) + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); } else { file_t *fp = getf(zc->zc_cookie); - if (fp == NULL) { - dsl_dataset_rele(ds, FTAG); - if (dsfrom) - dsl_dataset_rele(dsfrom, FTAG); + if (fp == NULL) return (EBADF); - } off = fp->f_offset; - error = dmu_send(tosnap, fromsnap, - zc->zc_cookie, fp->f_vnode, &off); + error = 
dmu_send_obj(zc->zc_name, zc->zc_sendobj, + zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(zc->zc_cookie); } - if (dsfrom) - dsl_dataset_rele(dsfrom, FTAG); - dsl_dataset_rele(ds, FTAG); return (error); } @@ -4265,12 +4197,20 @@ zfs_ioc_send(zfs_cmd_t *zc) static int zfs_ioc_send_progress(zfs_cmd_t *zc) { + dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendarg_t *dsp = NULL; int error; - if ((error = dsl_dataset_hold(zc->zc_name, FTAG, &ds)) != 0) + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); + } mutex_enter(&ds->ds_sendstream_lock); @@ -4294,6 +4234,7 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -4400,7 +4341,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) } } - if (error) + if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); @@ -4438,7 +4379,7 @@ zfs_ioc_pool_reopen(zfs_cmd_t *zc) int error; error = spa_open(zc->zc_name, &spa, FTAG); - if (error) + if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); @@ -4478,7 +4419,7 @@ zfs_ioc_promote(zfs_cmd_t *zc) if (cp) *cp = '\0'; (void) dmu_objset_find(zc->zc_value, - zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); + zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } @@ -4504,7 +4445,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) return (EINVAL); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); - if (error) + if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, @@ -4535,7 +4476,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc) return (ENOMEM); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); - if (error) + if (error != 0) return (error); void *buf = kmem_alloc(bufsize, KM_SLEEP); @@ 
-4585,7 +4526,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error) + if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); @@ -4658,7 +4599,7 @@ zfs_ioc_share(zfs_cmd_t *zc) return (ENOSYS); } error = zfs_init_sharefs(); - if (error) { + if (error != 0) { mutex_exit(&zfs_share_lock); return (ENOSYS); } @@ -4683,7 +4624,7 @@ zfs_ioc_share(zfs_cmd_t *zc) return (ENOSYS); } error = zfs_init_sharefs(); - if (error) { + if (error != 0) { mutex_exit(&zfs_share_lock); return (ENOSYS); } @@ -4748,7 +4689,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error) + if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, @@ -4771,25 +4712,26 @@ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; + char *hold_name; int error; + minor_t minor; - snap_name = kmem_asprintf("%s@%s-%016llx", zc->zc_name, zc->zc_value, - (u_longlong_t)ddi_get_lbolt64()); - - if (strlen(snap_name) >= MAXPATHLEN) { - strfree(snap_name); - return (E2BIG); - } - - error = dmu_objset_snapshot_tmp(snap_name, "%temp", zc->zc_cleanup_fd); - if (error != 0) { - strfree(snap_name); + error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (error != 0) return (error); - } - (void) strcpy(zc->zc_value, strchr(snap_name, '@') + 1); + snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, + (u_longlong_t)ddi_get_lbolt64()); + hold_name = kmem_asprintf("%%%s", zc->zc_value); + + error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, + hold_name); + if (error == 0) + (void) strcpy(zc->zc_value, snap_name); strfree(snap_name); - return (0); + strfree(hold_name); + zfs_onexit_fd_rele(zc->zc_cleanup_fd); + return (error); } /* @@ -4804,39 +4746,22 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) static int zfs_ioc_diff(zfs_cmd_t *zc) { - objset_t *fromsnap; - objset_t *tosnap; file_t *fp; 
offset_t off; int error; - error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); - if (error) - return (error); - - error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap); - if (error) { - dmu_objset_rele(tosnap, FTAG); - return (error); - } - fp = getf(zc->zc_cookie); - if (fp == NULL) { - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); + if (fp == NULL) return (EBADF); - } off = fp->f_offset; - error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off); + error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(zc->zc_cookie); - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); return (error); } @@ -4905,13 +4830,13 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) ZFS_SHARES_DIR); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { + if (error != 0) { dmu_tx_abort(tx); } else { error = zfs_create_share_dir(zfsvfs, tx); dmu_tx_commit(tx); } - if (error) { + if (error != 0) { mutex_exit(&zfsvfs->z_lock); VN_RELE(vp); ZFS_EXIT(zfsvfs); @@ -4990,124 +4915,82 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) } /* - * inputs: - * zc_name name of filesystem - * zc_value short name of snap - * zc_string user-supplied tag for this hold - * zc_cookie recursive flag - * zc_temphold set if hold is temporary - * zc_cleanup_fd cleanup-on-exit file descriptor for calling process - * zc_sendobj if non-zero, the objid for zc_name@zc_value - * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg + * innvl: { + * "holds" -> { snapname -> holdname (string), ... } + * (optional) "cleanup_fd" -> fd (int32) + * } * - * outputs: none + * outnvl: { + * snapname -> error value (int32) + * ... 
+ * } */ +/* ARGSUSED */ static int -zfs_ioc_hold(zfs_cmd_t *zc) +zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { - boolean_t recursive = zc->zc_cookie; - spa_t *spa; - dsl_pool_t *dp; - dsl_dataset_t *ds; + nvlist_t *holds; + int cleanup_fd = -1; int error; minor_t minor = 0; - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - - if (zc->zc_sendobj == 0) { - return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, - zc->zc_string, recursive, zc->zc_temphold, - zc->zc_cleanup_fd)); - } - - if (recursive) + error = nvlist_lookup_nvlist(args, "holds", &holds); + if (error != 0) return (EINVAL); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error) - return (error); - - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - if (error) - return (error); - - /* - * Until we have a hold on this snapshot, it's possible that - * zc_sendobj could've been destroyed and reused as part - * of a later txg. Make sure we're looking at the right object. 
- */ - if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) { - dsl_dataset_rele(ds, FTAG); - return (ENOENT); - } - - if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) { - error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); - if (error) { - dsl_dataset_rele(ds, FTAG); + if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { + error = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (error != 0) return (error); - } } - error = dsl_dataset_user_hold_for_send(ds, zc->zc_string, - zc->zc_temphold); - if (minor != 0) { - if (error == 0) { - dsl_register_onexit_hold_cleanup(ds, zc->zc_string, - minor); - } - zfs_onexit_fd_rele(zc->zc_cleanup_fd); - } - dsl_dataset_rele(ds, FTAG); - + error = dsl_dataset_user_hold(holds, minor, errlist); + if (minor != 0) + zfs_onexit_fd_rele(cleanup_fd); return (error); } /* - * inputs: - * zc_name name of dataset from which we're releasing a user hold - * zc_value short name of snap - * zc_string user-supplied tag for this hold - * zc_cookie recursive flag + * innvl is not used. * - * outputs: none + * outnvl: { + * holdname -> time added (uint64 seconds since epoch) + * ... + * } */ +/* ARGSUSED */ static int -zfs_ioc_release(zfs_cmd_t *zc) +zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { - boolean_t recursive = zc->zc_cookie; - - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - - return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, - zc->zc_string, recursive)); + return (dsl_dataset_get_holds(snapname, outnvl)); } /* - * inputs: - * zc_name name of filesystem + * innvl: { + * snapname -> { holdname, ... } + * ... + * } * - * outputs: - * zc_nvlist_src{_size} nvlist of snapshot holds + * outnvl: { + * snapname -> error value (int32) + * ... 
+ * } */ +/* ARGSUSED */ static int -zfs_ioc_get_holds(zfs_cmd_t *zc) +zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { - nvlist_t *nvp; - int error; + nvpair_t *pair; - if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { - error = put_nvlist(zc, nvp); - nvlist_free(nvp); - } + /* + * The release may cause the snapshot to be destroyed; make sure it + * is not mounted. + */ + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) + zfs_unmount_snap(nvpair_name(pair)); - return (error); + return (dsl_dataset_user_release(holds, errlist)); } /* @@ -5124,14 +5007,21 @@ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; + dsl_pool_t *dp; dsl_dataset_t *new, *old; - error = dsl_dataset_hold(zc->zc_name, FTAG, &new); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); - error = dsl_dataset_hold(zc->zc_value, FTAG, &old); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -5139,8 +5029,10 @@ zfs_ioc_space_written(zfs_cmd_t *zc) &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } + /* * innvl: { * "firstsnap" -> snapshot name @@ -5156,6 +5048,7 @@ static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; + dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; @@ -5163,18 +5056,26 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) return (EINVAL); - error = dsl_dataset_hold(lastsnap, FTAG, &new); + error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); - 
error = dsl_dataset_hold(firstsnap, FTAG, &old); + + error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); @@ -5193,47 +5094,27 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { - objset_t *fromsnap = NULL; - objset_t *tosnap; int error; offset_t off; - char *fromname; + char *fromname = NULL; int fd; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) return (EINVAL); - error = dmu_objset_hold(snapname, FTAG, &tosnap); - if (error) - return (error); - - error = nvlist_lookup_string(innvl, "fromsnap", &fromname); - if (error == 0) { - error = dmu_objset_hold(fromname, FTAG, &fromsnap); - if (error) { - dmu_objset_rele(tosnap, FTAG); - return (error); - } - } + (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); file_t *fp = getf(fd); - if (fp == NULL) { - dmu_objset_rele(tosnap, FTAG); - if (fromsnap != NULL) - dmu_objset_rele(fromsnap, FTAG); + if (fp == NULL) return (EBADF); - } off = fp->f_offset; - error = dmu_send(tosnap, fromsnap, fd, fp->f_vnode, &off); + error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(fd); - if (fromsnap != NULL) - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); return (error); } @@ -5252,21 +5133,29 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) static int 
zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { - objset_t *fromsnap = NULL; - objset_t *tosnap; + dsl_pool_t *dp; + dsl_dataset_t *fromsnap = NULL; + dsl_dataset_t *tosnap; int error; char *fromname; uint64_t space; - error = dmu_objset_hold(snapname, FTAG, &tosnap); - if (error) + error = dsl_pool_hold(snapname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); + } error = nvlist_lookup_string(innvl, "fromsnap", &fromname); if (error == 0) { - error = dmu_objset_hold(fromname, FTAG, &fromsnap); - if (error) { - dmu_objset_rele(tosnap, FTAG); + error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } } @@ -5275,8 +5164,9 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) fnvlist_add_uint64(outnvl, "space", space); if (fromsnap != NULL) - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -5421,6 +5311,17 @@ zfs_ioctl_init(void) zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("hold", ZFS_IOC_HOLD, + zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("release", ZFS_IOC_RELEASE, + zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, + zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, @@ -5498,8 +5399,6 @@ 
zfs_ioctl_init(void) zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); - zfs_ioctl_register_dataset_read(ZFS_IOC_GET_HOLDS, - zfs_ioc_get_holds); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, @@ -5542,10 +5441,6 @@ zfs_ioctl_init(void) zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); - zfs_ioctl_register_dataset_modify(ZFS_IOC_HOLD, zfs_ioc_hold, - zfs_secpolicy_hold); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RELEASE, zfs_ioc_release, - zfs_secpolicy_release); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, diff --git a/uts/common/fs/zfs/zfs_vfsops.c b/uts/common/fs/zfs/zfs_vfsops.c index 2a25017..42486ea 100644 --- a/uts/common/fs/zfs/zfs_vfsops.c +++ b/uts/common/fs/zfs/zfs_vfsops.c @@ -513,27 +513,31 @@ zfs_register_callbacks(vfs_t *vfsp) * overboard... */ ds = dmu_objset_ds(os); - error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "xattr", xattr_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "recordsize", blksz_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "readonly", readonly_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "devices", devices_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); error = error ? 
error : dsl_prop_register(ds, - "setuid", setuid_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "exec", exec_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "snapdir", snapdir_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "aclmode", acl_mode_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "aclinherit", acl_inherit_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); error = error ? error : dsl_prop_register(ds, - "vscan", vscan_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; @@ -563,28 +567,35 @@ unregister: * registered, but this is OK; it will simply return ENOMSG, * which we will ignore. 
*/ - (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, - zfsvfs); - (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME), + atime_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR), + xattr_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + blksz_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY), + readonly_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), + devices_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID), + setuid_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC), + exec_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), + snapdir_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), + acl_mode_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), + acl_inherit_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), + vscan_changed_cb, zfsvfs); return (error); - } static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, 
uint64_t *groupp) { - int error = 0; - /* * Is it a valid type of object to track? */ @@ -641,7 +652,7 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, *groupp = BSWAP_64(*groupp); } } - return (error); + return (0); } static void @@ -993,7 +1004,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp) mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - rrw_init(&zfsvfs->z_teardown_lock); + rrw_init(&zfsvfs->z_teardown_lock, B_FALSE); rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) @@ -1444,8 +1455,9 @@ zfs_mount_label_policy(vfs_t *vfsp, char *osname) char *str = NULL; if (l_to_str_internal(mnt_sl, &str) == 0 && - dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), - ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) + dsl_prop_set_string(osname, + zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, str) == 0) retv = 0; if (str != NULL) kmem_free(str, strlen(str) + 1); @@ -1856,7 +1868,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(zfsvfs->z_os); + dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c index 81d2bb5..e9616f8 100644 --- a/uts/common/fs/zfs/zil.c +++ b/uts/common/fs/zfs/zil.c @@ -235,7 +235,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, } } - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + VERIFY(arc_buf_remove_ref(abuf, &abuf)); } return (error); @@ -332,7 +332,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); - if (error) + if (error != 0) break; 
for (lrp = lrbuf; lrp < end; lrp += reclen) { @@ -467,7 +467,7 @@ zilog_dirty(zilog_t *zilog, uint64_t txg) if (dsl_dataset_is_snapshot(ds)) panic("dirtying snapshot!"); - if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) == 0) { + if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); } @@ -626,8 +626,8 @@ zil_claim(const char *osname, void *txarg) objset_t *os; int error; - error = dmu_objset_hold(osname, FTAG, &os); - if (error) { + error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os); + if (error != 0) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } @@ -640,7 +640,7 @@ zil_claim(const char *osname, void *txarg) zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); BP_ZERO(&zh->zh_log); dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_objset_rele(os, FTAG); + dmu_objset_disown(os, FTAG); return (0); } @@ -665,7 +665,7 @@ zil_claim(const char *osname, void *txarg) } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); - dmu_objset_rele(os, FTAG); + dmu_objset_disown(os, FTAG); return (0); } @@ -685,7 +685,7 @@ zil_check_log_chain(const char *osname, void *tx) ASSERT(tx == NULL); error = dmu_objset_hold(osname, FTAG, &os); - if (error) { + if (error != 0) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } @@ -973,7 +973,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) /* pass the old blkptr in order to spread log blocks across devs */ error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, USE_SLOG(zilog)); - if (!error) { + if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; @@ -1084,7 +1084,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } - if (error) { + if (error != 0) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); return (lwb); @@ 
-1708,6 +1708,9 @@ zil_free(zilog_t *zilog) { zilog->zl_stop_sync = 1; + ASSERT0(zilog->zl_suspend); + ASSERT0(zilog->zl_suspending); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); @@ -1803,32 +1806,100 @@ zil_close(zilog_t *zilog) mutex_exit(&zilog->zl_lock); } +static char *suspend_tag = "zil suspending"; + /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. - * We suspend the log briefly when taking a snapshot so that the snapshot - * contains all the data it's supposed to, and has an empty intent log. + * On old version pools, we suspend the log briefly when taking a + * snapshot so that it will have an empty intent log. + * + * Long holds are not really intended to be used the way we do here -- + * held for such a short time. A concurrent caller of dsl_dataset_long_held() + * could fail. Therefore we take pains to only put a long hold if it is + * actually necessary. Fortunately, it will only be necessary if the + * objset is currently mounted (or the ZVOL equivalent). In that case it + * will already have a long hold, so we are not really making things any worse. + * + * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or + * zvol_state_t), and use their mechanism to prevent their hold from being + * dropped (e.g. VFS_HOLD()). However, that would be even more pain for + * very little gain. + * + * if cookiep == NULL, this does both the suspend & resume. + * Otherwise, it returns with the dataset "long held", and the cookie + * should be passed into zil_resume(). 
*/ int -zil_suspend(zilog_t *zilog) +zil_suspend(const char *osname, void **cookiep) { - const zil_header_t *zh = zilog->zl_header; + objset_t *os; + zilog_t *zilog; + const zil_header_t *zh; + int error; + + error = dmu_objset_hold(osname, suspend_tag, &os); + if (error != 0) + return (error); + zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); + zh = zilog->zl_header; + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); return (EBUSY); } - if (zilog->zl_suspend++ != 0) { + + /* + * Don't put a long hold in the cases where we can avoid it. This + * is when there is no cookie so we are doing a suspend & resume + * (i.e. called from zil_vdev_offline()), and there's nothing to do + * for the suspend because it's already suspended, or there's no ZIL. + */ + if (cookiep == NULL && !zilog->zl_suspending && + (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { + mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); + return (0); + } + + dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); + dsl_pool_rele(dmu_objset_pool(os), suspend_tag); + + zilog->zl_suspend++; + + if (zilog->zl_suspend > 1) { /* - * Someone else already began a suspend. + * Someone else is already suspending it. * Just wait for them to finish. */ + while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); + + if (cookiep == NULL) + zil_resume(os); + else + *cookiep = os; + return (0); + } + + /* + * If there is no pointer to an on-disk block, this ZIL must not + * be active (e.g. filesystem not mounted), so there's nothing + * to clean up. 
+ */ + if (BP_IS_HOLE(&zh->zh_log)) { + ASSERT(cookiep != NULL); /* fast path already handled */ + + *cookiep = os; + mutex_exit(&zilog->zl_lock); return (0); } + zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); @@ -1841,16 +1912,25 @@ zil_suspend(zilog_t *zilog) cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); + if (cookiep == NULL) + zil_resume(os); + else + *cookiep = os; return (0); } void -zil_resume(zilog_t *zilog) +zil_resume(void *cookie) { + objset_t *os = cookie; + zilog_t *zilog = dmu_objset_zil(os); + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); + dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); + dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { @@ -1923,7 +2003,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); - if (error) + if (error != 0) return (zil_replay_error(zilog, lr, error)); } @@ -1944,7 +2024,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * is updated if we are in replay mode. 
*/ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); - if (error) { + if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with @@ -1954,7 +2034,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) */ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); - if (error) + if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); @@ -2026,19 +2106,10 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *tx) int zil_vdev_offline(const char *osname, void *arg) { - objset_t *os; - zilog_t *zilog; int error; - error = dmu_objset_hold(osname, FTAG, &os); - if (error) - return (error); - - zilog = dmu_objset_zil(os); - if (zil_suspend(zilog) != 0) - error = EEXIST; - else - zil_resume(zilog); - dmu_objset_rele(os, FTAG); - return (error); + error = zil_suspend(osname, NULL); + if (error != 0) + return (EEXIST); + return (0); } diff --git a/uts/common/fs/zfs/zio.c b/uts/common/fs/zfs/zio.c index 432a992..7940e20 100644 --- a/uts/common/fs/zfs/zio.c +++ b/uts/common/fs/zfs/zio.c @@ -697,6 +697,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { + metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } @@ -713,6 +714,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); + metaslab_check_free(spa, bp); + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); @@ -2010,7 +2013,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = EEXIST; - 
VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + VERIFY(arc_buf_remove_ref(abuf, &abuf)); } ddt_enter(ddt); @@ -2600,8 +2603,9 @@ zio_vdev_io_assess(zio_t *zio) * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && - vd != NULL && !vd->vdev_ops->vdev_op_leaf) + vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; + } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; diff --git a/uts/common/fs/zfs/zvol.c b/uts/common/fs/zfs/zvol.c index b413f5e..5911fd3 100644 --- a/uts/common/fs/zfs/zvol.c +++ b/uts/common/fs/zfs/zvol.c @@ -653,7 +653,7 @@ zvol_last_close(zvol_state_t *zv) if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) && !(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - (void) dmu_objset_evict_dbufs(zv->zv_objset); + dmu_objset_evict_dbufs(zv->zv_objset); dmu_objset_disown(zv->zv_objset, zvol_tag); zv->zv_objset = NULL; @@ -698,7 +698,7 @@ zvol_prealloc(zvol_state_t *zv) return (0); } -int +static int zvol_update_volsize(objset_t *os, uint64_t volsize) { dmu_tx_t *tx; @@ -749,13 +749,12 @@ zvol_remove_minors(const char *name) } static int -zvol_set_volsize_impl(objset_t *os, zvol_state_t *zv, uint64_t volsize) +zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize) { uint64_t old_volsize = 0ULL; - int error; + int error = 0; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); - error = zvol_update_volsize(os, volsize); /* * Reinitialize the dump area to the new size. If we @@ -764,27 +763,25 @@ zvol_set_volsize_impl(objset_t *os, zvol_state_t *zv, uint64_t volsize) * to calling dumpvp_resize() to ensure that the devices' * size(9P) is not visible by the dump subsystem. 
*/ - if (zv && error == 0) { - old_volsize = zv->zv_volsize; - zvol_size_changed(zv, volsize); - - if (zv->zv_flags & ZVOL_DUMPIFIED) { - if ((error = zvol_dumpify(zv)) != 0 || - (error = dumpvp_resize()) != 0) { - int dumpify_error; - - (void) zvol_update_volsize(os, old_volsize); - zvol_size_changed(zv, old_volsize); - dumpify_error = zvol_dumpify(zv); - error = dumpify_error ? dumpify_error : error; - } + old_volsize = zv->zv_volsize; + zvol_size_changed(zv, volsize); + + if (zv->zv_flags & ZVOL_DUMPIFIED) { + if ((error = zvol_dumpify(zv)) != 0 || + (error = dumpvp_resize()) != 0) { + int dumpify_error; + + (void) zvol_update_volsize(zv->zv_objset, old_volsize); + zvol_size_changed(zv, old_volsize); + dumpify_error = zvol_dumpify(zv); + error = dumpify_error ? dumpify_error : error; } } /* * Generate a LUN expansion event. */ - if (zv && error == 0) { + if (error == 0) { sysevent_id_t eid; nvlist_t *attr; char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); @@ -812,29 +809,45 @@ zvol_set_volsize(const char *name, uint64_t volsize) int error; dmu_object_info_t doi; uint64_t readonly; + boolean_t owned = B_FALSE; + + error = dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); + if (error != 0) + return (error); + if (readonly) + return (EROFS); mutex_enter(&zfsdev_state_lock); zv = zvol_minor_lookup(name); - if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { - mutex_exit(&zfsdev_state_lock); - return (error); + + if (zv == NULL || zv->zv_objset == NULL) { + if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, + FTAG, &os)) != 0) { + mutex_exit(&zfsdev_state_lock); + return (error); + } + owned = B_TRUE; + if (zv != NULL) + zv->zv_objset = os; + } else { + os = zv->zv_objset; } if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || - (error = zvol_check_volsize(volsize, - doi.doi_data_block_size)) != 0) + (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0) goto out; - VERIFY3U(dsl_prop_get_integer(name, - 
zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL), ==, 0); - if (readonly) { - error = EROFS; - goto out; - } + error = zvol_update_volsize(os, volsize); - error = zvol_set_volsize_impl(os, zv, volsize); + if (error == 0 && zv != NULL) + error = zvol_update_live_volsize(zv, volsize); out: - dmu_objset_rele(os, FTAG); + if (owned) { + dmu_objset_disown(os, FTAG); + if (zv != NULL) + zv->zv_objset = NULL; + } mutex_exit(&zfsdev_state_lock); return (error); } @@ -1155,6 +1168,9 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, ze = list_next(&zv->zv_extents, ze); } + if (ze == NULL) + return (EINVAL); + if (!ddi_in_panic()) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); @@ -1308,6 +1324,9 @@ zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks) if (zv == NULL) return (ENXIO); + if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0) + return (EINVAL); + boff = ldbtob(blkno); resid = ldbtob(nblocks); diff --git a/uts/common/sys/nvpair.h b/uts/common/sys/nvpair.h index ad25eff..e4d637b 100644 --- a/uts/common/sys/nvpair.h +++ b/uts/common/sys/nvpair.h @@ -284,6 +284,7 @@ void fnvlist_pack_free(char *, size_t); nvlist_t *fnvlist_unpack(char *, size_t); nvlist_t *fnvlist_dup(nvlist_t *); void fnvlist_merge(nvlist_t *, nvlist_t *); +size_t fnvlist_num_pairs(nvlist_t *); void fnvlist_add_boolean(nvlist_t *, const char *); void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); -- cgit v1.1 From a41bc8b667fc299a6a06b9f1fee73f738d54d8df Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 5 Mar 2013 22:51:11 +0000 Subject: libzfs_core depends on libnvpair, explicitly say this in Makefile.inc1. This fixes build with make -j. 
--- Makefile.inc1 | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.inc1 b/Makefile.inc1 index 6114bb7..20b3bd3 100644 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -1420,6 +1420,7 @@ _cddl_lib_libumem= cddl/lib/libumem _cddl_lib_libnvpair= cddl/lib/libnvpair _cddl_lib_libzfs_core= cddl/lib/libzfs_core _cddl_lib= cddl/lib +cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L .endif .if ${MK_CRYPT} != "no" -- cgit v1.1 From cf801867c9adf3ab9de7448db68218e7e3a97c2e Mon Sep 17 00:00:00 2001 From: delphij Date: Tue, 5 Mar 2013 22:58:53 +0000 Subject: Use adx2 instead of adx in the second vsprintf, this fixes a panic. --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c index 6ac8a93..15d3050 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -450,7 +450,7 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, va_copy(adx2, adx); msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); - (void) vsprintf(msg, fmt, adx); + (void) vsprintf(msg, fmt, adx2); fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); strfree(msg); -- cgit v1.1 From 09d4b602003ca7e48c48de04c63590957d6a6802 Mon Sep 17 00:00:00 2001 From: delphij Date: Wed, 6 Mar 2013 01:21:56 +0000 Subject: Diff reduction with Illumos --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index d2833c2..12c907b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -5801,11 +5801,11 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, 
caddr_t addr, int flag, { zfs_cmd_t *zc; uint_t vecnum; -#ifdef illumos int error, rc, len; +#ifdef illumos minor_t minor = getminor(dev); #else - int cflag, cmd, error, rc, len; + int cflag, cmd; cred_t *cr = td->td_ucred; #endif const zfs_ioc_vec_t *vec; @@ -5989,7 +5989,7 @@ out: zfs_cmd_compat_put(zc, addr, cflag); } - kmem_free(zc, sizeof(zfs_cmd_t)); + kmem_free(zc, sizeof (zfs_cmd_t)); return (error); } -- cgit v1.1 From 5791f5a0dc82a169a07f4e2ec7213a6d7575b4a1 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 6 Mar 2013 10:40:50 +0000 Subject: Move libzfs compat functions to libzfs_compat.h This header is used by both libzfs_core and libzfs libraries --- .../opensolaris/lib/libzfs/common/libzfs_compat.h | 94 ++++++++++++++++++++++ .../opensolaris/lib/libzfs/common/libzfs_impl.h | 60 +------------- .../lib/libzfs_core/common/libzfs_core.c | 2 +- .../lib/libzfs_core/common/libzfs_core.h | 1 + 4 files changed, 97 insertions(+), 60 deletions(-) create mode 100644 cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h new file mode 100644 index 0000000..b6654b4 --- /dev/null +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h @@ -0,0 +1,94 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. + */ + +#ifndef _LIBZFS_COMPAT_H +#define _LIBZFS_COMPAT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static int zfs_kernel_version = 0; +static int zfs_ioctl_version = 0; + +/* + * This is FreeBSD version of ioctl, because Solaris' ioctl() updates + * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an + * error is returned zc_nvlist_dst_size won't be updated. + */ +static __inline int +zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) +{ + unsigned long cmd; + size_t oldsize, zfs_kernel_version_size, zfs_ioctl_version_size; + int version, ret, cflag = ZFS_CMD_COMPAT_NONE; + + cmd = _IOWR('Z', request, struct zfs_cmd); + + zfs_ioctl_version_size = sizeof(zfs_ioctl_version); + if (zfs_ioctl_version == 0) { + sysctlbyname("vfs.zfs.version.ioctl", &zfs_ioctl_version, + &zfs_ioctl_version_size, NULL, 0); + } + + /* + * If vfs.zfs.version.ioctl is not defined, assume we have v28 + * compatible binaries and use vfs.zfs.version.spa to test for v15 + */ + if (zfs_ioctl_version < ZFS_IOCVER_DEADMAN) { + cflag = ZFS_CMD_COMPAT_V28; + zfs_kernel_version_size = sizeof(zfs_kernel_version); + + if (zfs_kernel_version == 0) { + sysctlbyname("vfs.zfs.version.spa", + &zfs_kernel_version, + &zfs_kernel_version_size, NULL, 0); + } + + if (zfs_kernel_version == SPA_VERSION_15 || + zfs_kernel_version == SPA_VERSION_14 || + zfs_kernel_version == SPA_VERSION_13) + cflag = ZFS_CMD_COMPAT_V15; + } + + oldsize = zc->zc_nvlist_dst_size; + ret = zcmd_ioctl_compat(fd, cmd, zc, cflag); + + if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { + ret = -1; + errno = ENOMEM; + } + + return (ret); +} +#define ioctl(fd, cmd, zc) zcmd_ioctl((fd), (cmd), (zc)) + +#ifdef 
__cplusplus +} +#endif + +#endif /* _LIBZFS_COMPAT_H */ diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h index d55e942..11a57a9 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h @@ -40,8 +40,7 @@ #include #include #include - -#include "zfs_ioctl_compat.h" +#include #ifdef __cplusplus extern "C" { @@ -215,63 +214,6 @@ extern int zfs_unshare_proto(zfs_handle_t *, extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); -#ifndef sun -static int zfs_kernel_version = 0; -static int zfs_ioctl_version = 0; - -/* - * This is FreeBSD version of ioctl, because Solaris' ioctl() updates - * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an - * error is returned zc_nvlist_dst_size won't be updated. - */ -static __inline int -zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) -{ - unsigned long cmd; - size_t oldsize, zfs_kernel_version_size, zfs_ioctl_version_size; - int version, ret, cflag = ZFS_CMD_COMPAT_NONE; - - cmd = _IOWR('Z', request, struct zfs_cmd); - - zfs_ioctl_version_size = sizeof(zfs_ioctl_version); - if (zfs_ioctl_version == 0) { - sysctlbyname("vfs.zfs.version.ioctl", &zfs_ioctl_version, - &zfs_ioctl_version_size, NULL, 0); - } - - /* - * If vfs.zfs.version.ioctl is not defined, assume we have v28 - * compatible binaries and use vfs.zfs.version.spa to test for v15 - */ - if (zfs_ioctl_version < ZFS_IOCVER_DEADMAN) { - cflag = ZFS_CMD_COMPAT_V28; - zfs_kernel_version_size = sizeof(zfs_kernel_version); - - if (zfs_kernel_version == 0) { - sysctlbyname("vfs.zfs.version.spa", - &zfs_kernel_version, - &zfs_kernel_version_size, NULL, 0); - } - - if (zfs_kernel_version == SPA_VERSION_15 || - zfs_kernel_version == SPA_VERSION_14 || - zfs_kernel_version == SPA_VERSION_13) - cflag = ZFS_CMD_COMPAT_V15; - } - - oldsize = zc->zc_nvlist_dst_size; - ret = zcmd_ioctl_compat(fd, cmd, zc, cflag); - - if 
(ret == 0 && oldsize < zc->zc_nvlist_dst_size) { - ret = -1; - errno = ENOMEM; - } - - return (ret); -} -#define ioctl(fd, cmd, zc) zcmd_ioctl((fd), (cmd), (zc)) -#endif /* !sun */ - #ifdef __cplusplus } #endif diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index 364d92a..b6c28be 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -84,7 +84,7 @@ #include #include #include -#include +#include static int g_fd; static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h index 9edc884..c8bfbef 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Martin Matuska . All rights reserved. 
*/ #ifndef _LIBZFS_CORE_H -- cgit v1.1 From 400c064719f9ae85f95d8a8afeb773e4744985c3 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 6 Mar 2013 11:33:25 +0000 Subject: Add missing init functions Reduce diff to illumos --- .../contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 12c907b..6ba9192 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -5796,7 +5796,7 @@ zfsdev_close(void *data) } static int -zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t addr, int flag, +zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, struct thread *td) { zfs_cmd_t *zc; @@ -5856,18 +5856,14 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t addr, int flag, zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); bzero(zc, sizeof(zfs_cmd_t)); -#ifdef illumos error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); -#else - error = ddi_copyin((void *)addr, zc, sizeof (zfs_cmd_t), flag); -#endif if (error != 0) { error = EFAULT; goto out; } if (cflag != ZFS_CMD_COMPAT_NONE) { - zfs_cmd_compat_get(zc, addr, cflag); + zfs_cmd_compat_get(zc, arg, cflag); zfs_ioctl_compat_pre(zc, &vecnum, cflag); } @@ -5986,7 +5982,7 @@ out: if (cflag != ZFS_CMD_COMPAT_NONE) { zfs_ioctl_compat_post(zc, cmd, cflag); - zfs_cmd_compat_put(zc, addr, cflag); + zfs_cmd_compat_put(zc, arg, cflag); } kmem_free(zc, sizeof (zfs_cmd_t)); @@ -6216,9 +6212,11 @@ zfs_modevent(module_t mod, int type, void *unused __unused) spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); + zfs_ioctl_init(); tsd_create(&zfs_fsyncer_key, NULL); - tsd_create(&rrw_tsd_key, NULL); + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); + tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); printf("ZFS storage pool version: features support (" 
SPA_VERSION_STRING ")\n"); root_mount_rel(zfs_root_token); @@ -6239,6 +6237,7 @@ zfs_modevent(module_t mod, int type, void *unused __unused) tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); + tsd_destroy(&zfs_allow_log_key); mutex_destroy(&zfs_share_lock); break; -- cgit v1.1 From 53680f658c3877f4378f68e059f04e00b38207cb Mon Sep 17 00:00:00 2001 From: mm Date: Thu, 7 Mar 2013 23:45:16 +0000 Subject: Comment out unfeasible illumos copyin code and restore previous behavior. --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 6ba9192..66b710d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -5853,6 +5853,7 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, return (EINVAL); vec = &zfs_ioc_vec[vecnum]; +#ifdef illumos zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); bzero(zc, sizeof(zfs_cmd_t)); @@ -5861,11 +5862,15 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, error = EFAULT; goto out; } +#endif if (cflag != ZFS_CMD_COMPAT_NONE) { + zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); + bzero(zc, sizeof(zfs_cmd_t)); zfs_cmd_compat_get(zc, arg, cflag); zfs_ioctl_compat_pre(zc, &vecnum, cflag); - } + } else + zc = (void *)arg; zc->zc_iflags = flag & FKIOCTL; if (zc->zc_nvlist_src_size != 0) { @@ -5983,9 +5988,12 @@ out: if (cflag != ZFS_CMD_COMPAT_NONE) { zfs_ioctl_compat_post(zc, cmd, cflag); zfs_cmd_compat_put(zc, arg, cflag); + kmem_free(zc, sizeof (zfs_cmd_t)); } +#ifdef illumos kmem_free(zc, sizeof (zfs_cmd_t)); +#endif return (error); } -- cgit v1.1 From b9a4266c5c62b694f294026dcd9c7d02b41da67e Mon Sep 17 00:00:00 2001 From: mm Date: Sat, 16 Mar 2013 20:28:38 +0000 Subject: Initialize "error" variable where illumos does. 
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 66b710d..38c386e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -5862,6 +5862,8 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, error = EFAULT; goto out; } +#else + error = 0; #endif if (cflag != ZFS_CMD_COMPAT_NONE) { -- cgit v1.1 From 6c7511b96eb323641755bcec17f25f6f6f220a35 Mon Sep 17 00:00:00 2001 From: mm Date: Sun, 17 Mar 2013 10:57:04 +0000 Subject: libzfs_core: - provide complete backwards compatibility (old utility, new kernel) - add zfs_cmd_t compatibility mapping in both directions - determine ioctl address in zfs_ioctl_compat.c --- .../opensolaris/lib/libzfs/common/libzfs_compat.h | 10 +- .../opensolaris/common/zfs/zfs_ioctl_compat.c | 258 +++++++++++++++++++-- .../opensolaris/common/zfs/zfs_ioctl_compat.h | 50 +++- .../opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 23 +- 4 files changed, 314 insertions(+), 27 deletions(-) diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h index b6654b4..881737d 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h @@ -43,18 +43,18 @@ static int zfs_ioctl_version = 0; static __inline int zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) { - unsigned long cmd; size_t oldsize, zfs_kernel_version_size, zfs_ioctl_version_size; int version, ret, cflag = ZFS_CMD_COMPAT_NONE; - cmd = _IOWR('Z', request, struct zfs_cmd); - zfs_ioctl_version_size = sizeof(zfs_ioctl_version); if (zfs_ioctl_version == 0) { sysctlbyname("vfs.zfs.version.ioctl", &zfs_ioctl_version, &zfs_ioctl_version_size, NULL, 0); } + if (zfs_ioctl_version == 
ZFS_IOCVER_DEADMAN) + cflag = ZFS_CMD_COMPAT_DEADMAN; + /* * If vfs.zfs.version.ioctl is not defined, assume we have v28 * compatible binaries and use vfs.zfs.version.spa to test for v15 @@ -76,7 +76,7 @@ zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) } oldsize = zc->zc_nvlist_dst_size; - ret = zcmd_ioctl_compat(fd, cmd, zc, cflag); + ret = zcmd_ioctl_compat(fd, request, zc, cflag); if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { ret = -1; @@ -85,7 +85,7 @@ zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) return (ret); } -#define ioctl(fd, cmd, zc) zcmd_ioctl((fd), (cmd), (zc)) +#define ioctl(fd, ioc, zc) zcmd_ioctl((fd), (ioc), (zc)) #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c index 0463e9a..926ba26 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c @@ -33,6 +33,7 @@ #include #include #include +#include "zfs_namecheck.h" #include "zfs_ioctl_compat.h" static int zfs_version_ioctl = ZFS_IOCVER_CURRENT; @@ -49,8 +50,53 @@ zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) { zfs_cmd_v15_t *zc_c; zfs_cmd_v28_t *zc28_c; + zfs_cmd_deadman_t *zcdm_c; switch (cflag) { + case ZFS_CMD_COMPAT_DEADMAN: + zcdm_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, zcdm_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, zcdm_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, zcdm_c->zc_string, MAXPATHLEN); + strlcpy(zc->zc_top_ds, zcdm_c->zc_top_ds, MAXPATHLEN); + zc->zc_guid = zcdm_c->zc_guid; + zc->zc_nvlist_conf = zcdm_c->zc_nvlist_conf; + zc->zc_nvlist_conf_size = zcdm_c->zc_nvlist_conf_size; + zc->zc_nvlist_src = zcdm_c->zc_nvlist_src; + zc->zc_nvlist_src_size = zcdm_c->zc_nvlist_src_size; + zc->zc_nvlist_dst = zcdm_c->zc_nvlist_dst; + zc->zc_nvlist_dst_size = zcdm_c->zc_nvlist_dst_size; + zc->zc_cookie = zcdm_c->zc_cookie; + zc->zc_objset_type = 
zcdm_c->zc_objset_type; + zc->zc_perm_action = zcdm_c->zc_perm_action; + zc->zc_history = zcdm_c->zc_history; + zc->zc_history_len = zcdm_c->zc_history_len; + zc->zc_history_offset = zcdm_c->zc_history_offset; + zc->zc_obj = zcdm_c->zc_obj; + zc->zc_iflags = zcdm_c->zc_iflags; + zc->zc_share = zcdm_c->zc_share; + zc->zc_jailid = zcdm_c->zc_jailid; + zc->zc_objset_stats = zcdm_c->zc_objset_stats; + zc->zc_begin_record = zcdm_c->zc_begin_record; + zc->zc_defer_destroy = zcdm_c->zc_defer_destroy; + zc->zc_temphold = zcdm_c->zc_temphold; + zc->zc_action_handle = zcdm_c->zc_action_handle; + zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd; + zc->zc_simple = zcdm_c->zc_simple; + bcopy(zcdm_c->zc_pad, zc->zc_pad, sizeof(zc->zc_pad)); + zc->zc_sendobj = zcdm_c->zc_sendobj; + zc->zc_fromobj = zcdm_c->zc_fromobj; + zc->zc_createtxg = zcdm_c->zc_createtxg; + zc->zc_stat = zcdm_c->zc_stat; + + /* zc_inject_record doesn't change in libzfs_core */ + zcdm_c->zc_inject_record = zc->zc_inject_record; + + /* we always assume zc_nvlist_dst_filled is true */ + zc->zc_nvlist_dst_filled = B_TRUE; + break; + case ZFS_CMD_COMPAT_V28: zc28_c = (void *)addr; @@ -178,8 +224,51 @@ zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int cflag) { zfs_cmd_v15_t *zc_c; zfs_cmd_v28_t *zc28_c; + zfs_cmd_deadman_t *zcdm_c; switch (cflag) { + case ZFS_CMD_COMPAT_DEADMAN: + zcdm_c = (void *)addr; + + strlcpy(zcdm_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(zcdm_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(zcdm_c->zc_string, zc->zc_string, MAXPATHLEN); + strlcpy(zcdm_c->zc_top_ds, zc->zc_top_ds, MAXPATHLEN); + zcdm_c->zc_guid = zc->zc_guid; + zcdm_c->zc_nvlist_conf = zc->zc_nvlist_conf; + zcdm_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; + zcdm_c->zc_nvlist_src = zc->zc_nvlist_src; + zcdm_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; + zcdm_c->zc_nvlist_dst = zc->zc_nvlist_dst; + zcdm_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; + zcdm_c->zc_cookie = zc->zc_cookie; + 
zcdm_c->zc_objset_type = zc->zc_objset_type; + zcdm_c->zc_perm_action = zc->zc_perm_action; + zcdm_c->zc_history = zc->zc_history; + zcdm_c->zc_history_len = zc->zc_history_len; + zcdm_c->zc_history_offset = zc->zc_history_offset; + zcdm_c->zc_obj = zc->zc_obj; + zcdm_c->zc_iflags = zc->zc_iflags; + zcdm_c->zc_share = zc->zc_share; + zcdm_c->zc_jailid = zc->zc_jailid; + zcdm_c->zc_objset_stats = zc->zc_objset_stats; + zcdm_c->zc_begin_record = zc->zc_begin_record; + zcdm_c->zc_defer_destroy = zc->zc_defer_destroy; + zcdm_c->zc_temphold = zc->zc_temphold; + zcdm_c->zc_action_handle = zc->zc_action_handle; + zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd; + zcdm_c->zc_simple = zc->zc_simple; + bcopy(zc->zc_pad, zcdm_c->zc_pad, sizeof(zcdm_c->zc_pad)); + zcdm_c->zc_sendobj = zc->zc_sendobj; + zcdm_c->zc_fromobj = zc->zc_fromobj; + zcdm_c->zc_createtxg = zc->zc_createtxg; + zcdm_c->zc_stat = zc->zc_stat; + + /* zc_inject_record doesn't change in libzfs_core */ + zc->zc_inject_record = zcdm_c->zc_inject_record; + + break; + case ZFS_CMD_COMPAT_V28: zc28_c = (void *)addr; @@ -476,7 +565,7 @@ zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc) #ifndef _KERNEL int -zcmd_ioctl_compat(int fd, unsigned long cmd, zfs_cmd_t *zc, const int cflag) +zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) { int nc, ret; void *zc_c; @@ -484,16 +573,21 @@ zcmd_ioctl_compat(int fd, unsigned long cmd, zfs_cmd_t *zc, const int cflag) switch (cflag) { case ZFS_CMD_COMPAT_NONE: - ret = ioctl(fd, cmd, zc); + ncmd = _IOWR('Z', request, struct zfs_cmd); + ret = ioctl(fd, ncmd, zc); return (ret); + case ZFS_CMD_COMPAT_DEADMAN: + zc_c = malloc(sizeof(zfs_cmd_deadman_t)); + ncmd = _IOWR('Z', request, struct zfs_cmd_deadman); + break; case ZFS_CMD_COMPAT_V28: zc_c = malloc(sizeof(zfs_cmd_v28_t)); - ncmd = _IOWR('Z', ZFS_IOCREQ(cmd), struct zfs_cmd_v28); + ncmd = _IOWR('Z', request, struct zfs_cmd_v28); break; case ZFS_CMD_COMPAT_V15: - nc = zfs_ioctl_v28_to_v15[ZFS_IOCREQ(cmd)]; + nc = 
zfs_ioctl_v28_to_v15[request]; zc_c = malloc(sizeof(zfs_cmd_v15_t)); - ncmd = _IOWR('Z', nc, struct zfs_cmd_v15); + ncmd = _IOWR('Z', request, struct zfs_cmd_v15); break; default: return (EINVAL); @@ -505,18 +599,18 @@ zcmd_ioctl_compat(int fd, unsigned long cmd, zfs_cmd_t *zc, const int cflag) zfs_cmd_compat_put(zc, (caddr_t)zc_c, cflag); ret = ioctl(fd, ncmd, zc_c); if (cflag == ZFS_CMD_COMPAT_V15 && - nc == 2 /* ZFS_IOC_POOL_IMPORT */) - ret = ioctl(fd, _IOWR('Z', 4 /* ZFS_IOC_POOL_CONFIGS */, + nc == ZFS_IOC_POOL_IMPORT) + ret = ioctl(fd, _IOWR('Z', ZFS_IOC_POOL_CONFIGS, struct zfs_cmd_v15), zc_c); zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag); free(zc_c); if (cflag == ZFS_CMD_COMPAT_V15) { switch (nc) { - case 2: /* ZFS_IOC_POOL_IMPORT */ - case 4: /* ZFS_IOC_POOL_CONFIGS */ - case 5: /* ZFS_IOC_POOL_STATS */ - case 6: /* ZFS_IOC_POOL_TRYIMPORT */ + case ZFS_IOC_POOL_IMPORT: + case ZFS_IOC_POOL_CONFIGS: + case ZFS_IOC_POOL_STATS: + case ZFS_IOC_POOL_TRYIMPORT: zfs_ioctl_compat_fix_stats(zc, nc); break; case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ @@ -528,16 +622,25 @@ zcmd_ioctl_compat(int fd, unsigned long cmd, zfs_cmd_t *zc, const int cflag) return (ret); } #else /* _KERNEL */ -void +int zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag) { - if (cflag == ZFS_CMD_COMPAT_V15) + int error = 0; + + /* are we creating a clone? 
*/ + if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0') + *vec = ZFS_IOC_CLONE; + + if (cflag == ZFS_CMD_COMPAT_V15) { switch (*vec) { case 7: /* ZFS_IOC_POOL_SCRUB (v15) */ zc->zc_cookie = POOL_SCAN_SCRUB; break; } + } + + return (error); } void @@ -545,9 +648,9 @@ zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) { if (cflag == ZFS_CMD_COMPAT_V15) { switch (vec) { - case 4: /* ZFS_IOC_POOL_CONFIGS */ - case 5: /* ZFS_IOC_POOL_STATS */ - case 6: /* ZFS_IOC_POOL_TRYIMPORT */ + case ZFS_IOC_POOL_CONFIGS: + case ZFS_IOC_POOL_STATS: + case ZFS_IOC_POOL_TRYIMPORT: zfs_ioctl_compat_fix_stats(zc, vec); break; case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ @@ -556,4 +659,127 @@ zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) } } } + +nvlist_t * +zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t * innvl, const int vec, + const int cflag) +{ + nvlist_t *nvl, *tmpnvl; + char *poolname, *snapname; + int err; + + if (cflag == ZFS_CMD_COMPAT_NONE) + goto out; + + switch (vec) { + case ZFS_IOC_CREATE: + nvl = fnvlist_alloc(); + fnvlist_add_int32(nvl, "type", zc->zc_objset_type); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "props", innvl); + nvlist_free(innvl); + } + return (nvl); + break; + case ZFS_IOC_CLONE: + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, "origin", zc->zc_value); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "props", innvl); + nvlist_free(innvl); + } + return (nvl); + break; + case ZFS_IOC_SNAPSHOT: + if (innvl == NULL) + goto out; + nvl = fnvlist_alloc(); + fnvlist_add_nvlist(nvl, "props", innvl); + tmpnvl = fnvlist_alloc(); + snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + fnvlist_add_boolean(tmpnvl, snapname); + kmem_free(snapname, strlen(snapname + 1)); + /* check if we are doing a recursive snapshot */ + if (zc->zc_cookie) + dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, + tmpnvl); + fnvlist_add_nvlist(nvl, "snaps", tmpnvl); + fnvlist_free(tmpnvl); + nvlist_free(innvl); + /* strip dataset part 
from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_SPACE_SNAPS: + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, "firstsnap", zc->zc_value); + if (innvl != NULL) + nvlist_free(innvl); + return (nvl); + break; + case ZFS_IOC_DESTROY_SNAPS: + if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN) + goto out; + nvl = fnvlist_alloc(); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "snaps", innvl); + } else { + /* + * We are probably called by even older binaries, + * allocate and populate nvlist with recursive + * snapshots + */ + if (snapshot_namecheck(zc->zc_value, NULL, + NULL) == 0) { + tmpnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, tmpnvl) == 0) + fnvlist_add_nvlist(nvl, "snaps", + tmpnvl); + nvlist_free(tmpnvl); + } + } + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + } +out: + return (innvl); +} + +nvlist_t * +zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t * outnvl, const int vec, + const int cflag) +{ + nvlist_t *tmpnvl; + + if (cflag == ZFS_CMD_COMPAT_NONE) + return (outnvl); + + switch (vec) { + case ZFS_IOC_SPACE_SNAPS: + (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie); + (void) nvlist_lookup_uint64(outnvl, "compressed", + &zc->zc_objset_type); + (void) nvlist_lookup_uint64(outnvl, "uncompressed", + &zc->zc_perm_action); + nvlist_free(outnvl); + /* return empty outnvl */ + tmpnvl = fnvlist_alloc(); + return (tmpnvl); + break; + case ZFS_IOC_CREATE: + case ZFS_IOC_CLONE: + nvlist_free(outnvl); + /* return empty outnvl */ + tmpnvl = fnvlist_alloc(); + return (tmpnvl); + break; + } + + return (outnvl); +} #endif /* KERNEL */ diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h index 6e897b8..2ec2242 100644 --- 
a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h @@ -45,13 +45,15 @@ extern "C" { */ /* ioctl versions for vfs.zfs.version.ioctl */ +#define ZFS_IOCVER_LZC 2 #define ZFS_IOCVER_DEADMAN 1 -#define ZFS_IOCVER_CURRENT ZFS_IOCVER_DEADMAN +#define ZFS_IOCVER_CURRENT ZFS_IOCVER_LZC /* compatibility conversion flag */ #define ZFS_CMD_COMPAT_NONE 0 #define ZFS_CMD_COMPAT_V15 1 #define ZFS_CMD_COMPAT_V28 2 +#define ZFS_CMD_COMPAT_DEADMAN 3 #define ZFS_IOC_COMPAT_PASS 254 #define ZFS_IOC_COMPAT_FAIL 255 @@ -150,6 +152,44 @@ typedef struct zfs_cmd_v28 { zfs_stat_t zc_stat; } zfs_cmd_v28_t; +typedef struct zfs_cmd_deadman { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + /* zc_inject_record doesn't change in libzfs_core */ + zinject_record_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_deadman_t; + #ifdef _KERNEL unsigned static long zfs_ioctl_v15_to_v28[] = { 0, /* 0 ZFS_IOC_POOL_CREATE */ @@ -274,10 +314,14 @@ unsigned static long zfs_ioctl_v28_to_v15[] = { #endif /* ! 
_KERNEL */ #ifdef _KERNEL -void zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); +int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); +nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, + const int); #else -int zcmd_ioctl_compat(int, unsigned long, zfs_cmd_t *, const int); +int zcmd_ioctl_compat(int, int, zfs_cmd_t *, const int); #endif /* _KERNEL */ void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index 38c386e..bb2ead3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -5805,7 +5805,7 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, #ifdef illumos minor_t minor = getminor(dev); #else - int cflag, cmd; + int cflag, cmd, oldvecnum; cred_t *cr = td->td_ucred; #endif const zfs_ioc_vec_t *vec; @@ -5821,7 +5821,10 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, * and translate zfs_cmd if necessary */ if (len < sizeof(zfs_cmd_t)) - if (len == sizeof(zfs_cmd_v28_t)) { + if (len == sizeof(zfs_cmd_deadman_t)) { + cflag = ZFS_CMD_COMPAT_DEADMAN; + vecnum = cmd; + } else if (len == sizeof(zfs_cmd_v28_t)) { cflag = ZFS_CMD_COMPAT_V28; vecnum = cmd; } else if (len == sizeof(zfs_cmd_v15_t)) { @@ -5870,7 +5873,12 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); bzero(zc, sizeof(zfs_cmd_t)); zfs_cmd_compat_get(zc, arg, cflag); - zfs_ioctl_compat_pre(zc, &vecnum, cflag); + oldvecnum = vecnum; + error = zfs_ioctl_compat_pre(zc, &vecnum, cflag); + if (error != 0) + goto out; + if (oldvecnum != vecnum) + vec = 
&zfs_ioc_vec[vecnum]; } else zc = (void *)arg; @@ -5882,6 +5890,10 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, goto out; } + /* rewrite innvl for backwards compatibility */ + if (cflag != ZFS_CMD_COMPAT_NONE) + innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag); + /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. @@ -5955,6 +5967,11 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, } fnvlist_free(lognv); + /* rewrite outnvl for backwards compatibility */ + if (cflag != ZFS_CMD_COMPAT_NONE) + outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum, + cflag); + if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { -- cgit v1.1 From 4e6fd9f85c16fb556fb77626827afcaac9104ac7 Mon Sep 17 00:00:00 2001 From: mm Date: Sun, 17 Mar 2013 17:28:06 +0000 Subject: Fix accidentally changed ioc variable for old v15 compatibility --- sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c index 926ba26..f0318b0 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c @@ -587,7 +587,7 @@ zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) case ZFS_CMD_COMPAT_V15: nc = zfs_ioctl_v28_to_v15[request]; zc_c = malloc(sizeof(zfs_cmd_v15_t)); - ncmd = _IOWR('Z', request, struct zfs_cmd_v15); + ncmd = _IOWR('Z', nc, struct zfs_cmd_v15); break; default: return (EINVAL); -- cgit v1.1 From 3d2567990a126eeb3af2fca1367c9749e6bb945f Mon Sep 17 00:00:00 2001 From: mm Date: Sun, 17 Mar 2013 18:33:06 +0000 Subject: Add forwards compatibility for libzfs_core Unsupported: creation of multiple snapshots including "zfs snapshot -r" ---
.../lib/libzfs_core/common/libzfs_core.c | 33 ++++- .../lib/libzfs_core/common/libzfs_core.h | 3 + .../lib/libzfs_core/common/libzfs_core_compat.c | 134 +++++++++++++++++++++ .../lib/libzfs_core/common/libzfs_core_compat.h | 47 ++++++++ cddl/lib/libzfs_core/Makefile | 2 +- 5 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c create mode 100644 cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index b6c28be..c8b401e 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -86,6 +86,10 @@ #include #include +#ifdef __FreeBSD__ +extern int zfs_ioctl_version; +#endif + static int g_fd; static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; static int g_refcount; @@ -124,12 +128,24 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, zfs_cmd_t zc = { 0 }; int error = 0; char *packed; +#ifdef __FreeBSD__ + nvlist_t *oldsource; +#endif size_t size; ASSERT3S(g_refcount, >, 0); (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); +#ifdef __FreeBSD__ + if (zfs_ioctl_version < ZFS_IOCVER_LZC) { + oldsource = source; + error = lzc_compat_pre(&zc, &ioc, &source); + if (error) + return (error); + } +#endif + packed = fnvlist_pack(source, &size); zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; zc.zc_nvlist_src_size = size; @@ -167,14 +183,29 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, break; } } + +#ifdef __FreeBSD__ + if (zfs_ioctl_version < ZFS_IOCVER_LZC) + lzc_compat_post(&zc, ioc); +#endif if (zc.zc_nvlist_dst_filled) { *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, zc.zc_nvlist_dst_size); } else if (resultp != NULL) { *resultp = NULL; } - +#ifdef __FreeBSD__ + if (zfs_ioctl_version < ZFS_IOCVER_LZC) + lzc_compat_outnvl(&zc, ioc, 
resultp); +#endif out: +#ifdef __FreeBSD__ + if (zfs_ioctl_version < ZFS_IOCVER_LZC) { + if (source != oldsource) + nvlist_free(source); + source = oldsource; + } +#endif fnvlist_pack_free(packed, size); free((void *)(uintptr_t)zc.zc_nvlist_dst); return (error); diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h index c8bfbef..ee19d17 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h @@ -31,6 +31,9 @@ #include #include #include +#ifdef __FreeBSD__ +#include "libzfs_core_compat.h" +#endif #ifdef __cplusplus extern "C" { diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c new file mode 100644 index 0000000..e5eb7ae --- /dev/null +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c @@ -0,0 +1,134 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. 
+ */ + +#include +#include + +extern int zfs_ioctl_version; + +int +lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source) +{ + nvlist_t *nvl = NULL; + nvpair_t *pair; + char *buf; + zfs_ioc_t vecnum; + uint32_t type32; + int error = 0; + int pos; + + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + return (0); + + vecnum = *ioc; + + switch (vecnum) { + case ZFS_IOC_CREATE: + type32 = fnvlist_lookup_int32(*source, "type"); + zc->zc_objset_type = (uint64_t)type32; + nvlist_lookup_nvlist(*source, "props", &nvl); + *source = nvl; + break; + case ZFS_IOC_CLONE: + buf = fnvlist_lookup_string(*source, "origin"); + strlcpy(zc->zc_value, buf, MAXPATHLEN); + nvlist_lookup_nvlist(*source, "props", &nvl); + *ioc = ZFS_IOC_CREATE; + *source = nvl; + break; + case ZFS_IOC_SNAPSHOT: + nvl = fnvlist_lookup_nvlist(*source, "snaps"); + pair = nvlist_next_nvpair(nvl, NULL); + if (pair != NULL) { + buf = nvpair_name(pair); + pos = strcspn(buf, "@"); + strlcpy(zc->zc_name, buf, pos + 1); + strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN); + } else + error = EOPNOTSUPP; + /* old kernel cannot create multiple snapshots */ + if (!error && nvlist_next_nvpair(nvl, pair) != NULL) + error = EOPNOTSUPP; + nvlist_free(nvl); + nvl = NULL; + nvlist_lookup_nvlist(*source, "props", &nvl); + *source = nvl; + break; + case ZFS_IOC_SPACE_SNAPS: + buf = fnvlist_lookup_string(*source, "firstsnap"); + strlcpy(zc->zc_value, buf, MAXPATHLEN); + break; + case ZFS_IOC_DESTROY_SNAPS: + nvl = fnvlist_lookup_nvlist(*source, "snaps"); + pair = nvlist_next_nvpair(nvl, NULL); + if (pair != NULL) { + buf = nvpair_name(pair); + pos = strcspn(buf, "@"); + strlcpy(zc->zc_name, buf, pos + 1); + } + *source = nvl; + break; + } + + return (error); +} + +void +lzc_compat_post(zfs_cmd_t *zc, const zfs_ioc_t ioc) +{ + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + return; + + switch (ioc) { + case ZFS_IOC_CREATE: + case ZFS_IOC_CLONE: + case ZFS_IOC_SNAPSHOT: + case ZFS_IOC_SPACE_SNAPS: + case ZFS_IOC_DESTROY_SNAPS: + 
zc->zc_nvlist_dst_filled = B_FALSE; + break; + } +} + +int +lzc_compat_outnvl(zfs_cmd_t *zc, const zfs_ioc_t ioc, nvlist_t **outnvl) +{ + nvlist_t *nvl; + + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + return (0); + + switch (ioc) { + case ZFS_IOC_SPACE_SNAPS: + nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, "used", zc->zc_cookie); + fnvlist_add_uint64(nvl, "compressed", zc->zc_objset_type); + fnvlist_add_uint64(nvl, "uncompressed", zc->zc_perm_action); + *outnvl = nvl; + break; + } + + return (0); +} diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h new file mode 100644 index 0000000..6527c4b --- /dev/null +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Martin Matuska . All rights reserved. 
+ */ + +#ifndef _LIBZFS_CORE_COMPAT_H +#define _LIBZFS_CORE_COMPAT_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int lzc_compat_pre(zfs_cmd_t *, zfs_ioc_t *, nvlist_t **); +void lzc_compat_post(zfs_cmd_t *, const zfs_ioc_t); +int lzc_compat_outnvl(zfs_cmd_t *, const zfs_ioc_t, nvlist_t **); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_CORE_COMPAT_H */ diff --git a/cddl/lib/libzfs_core/Makefile b/cddl/lib/libzfs_core/Makefile index 5f2fc57..d0993689 100644 --- a/cddl/lib/libzfs_core/Makefile +++ b/cddl/lib/libzfs_core/Makefile @@ -9,7 +9,7 @@ LIB= zfs_core DPADD= ${LIBNVPAIR} LDADD= -lnvpair -SRCS= libzfs_core.c +SRCS= libzfs_core.c libzfs_core_compat.c WARNS?= 0 CSTD= c99 -- cgit v1.1 From 71bf489e1dedab6b58142f0d7e5b523b8685ab9a Mon Sep 17 00:00:00 2001 From: mm Date: Sun, 17 Mar 2013 18:49:11 +0000 Subject: Merge libzfs_core part of r239388 Illumos ZFS issues: 3085 zfs diff panics, then panics in a loop on booting References: https://www.illumos.org/issues/3085 --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c index 15d3050..0fe5f71 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c @@ -444,8 +444,10 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, * initialized yet, so don't bother logging the internal events. * Likewise if the pool is not writeable. 
*/ - if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) + if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { + fnvlist_free(nvl); return; + } va_copy(adx2, adx); -- cgit v1.1 From da897e350b0958d2948223afca3a685866ac6e9c Mon Sep 17 00:00:00 2001 From: mm Date: Sun, 17 Mar 2013 22:24:08 +0000 Subject: Fix working with zfs_ioctl_version in libzfs_compat.h and include mirror lzc_ioctl_version in libzfs_core --- .../opensolaris/lib/libzfs/common/libzfs_compat.h | 26 ++++++++++++++++------ .../lib/libzfs_core/common/libzfs_core.c | 17 +++++++++----- .../lib/libzfs_core/common/libzfs_core.h | 3 --- .../lib/libzfs_core/common/libzfs_core_compat.c | 11 ++++----- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h index 881737d..400c814 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h @@ -32,8 +32,23 @@ extern "C" { #endif +static int zfs_ioctl_version = -1; static int zfs_kernel_version = 0; -static int zfs_ioctl_version = 0; + +/* + * Get zfs_ioctl_version + */ +static __inline int +get_zfs_ioctl_version(void) +{ + size_t ver_size; + int ver = 0; + + ver_size = sizeof(ver); + sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); + + return (ver); +} /* * This is FreeBSD version of ioctl, because Solaris' ioctl() updates @@ -43,14 +58,11 @@ static int zfs_ioctl_version = 0; static __inline int zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) { - size_t oldsize, zfs_kernel_version_size, zfs_ioctl_version_size; + size_t oldsize, zfs_kernel_version_size; int version, ret, cflag = ZFS_CMD_COMPAT_NONE; - zfs_ioctl_version_size = sizeof(zfs_ioctl_version); - if (zfs_ioctl_version == 0) { - sysctlbyname("vfs.zfs.version.ioctl", &zfs_ioctl_version, - &zfs_ioctl_version_size, NULL, 0); - } + if (zfs_ioctl_version == -1) + zfs_ioctl_version = 
get_zfs_ioctl_version(); if (zfs_ioctl_version == ZFS_IOCVER_DEADMAN) cflag = ZFS_CMD_COMPAT_DEADMAN; diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index c8b401e..3319d20 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -84,10 +84,11 @@ #include #include #include -#include +#include "libzfs_core_compat.h" +#include "libzfs_compat.h" #ifdef __FreeBSD__ -extern int zfs_ioctl_version; +int lzc_ioctl_version = -1; #endif static int g_fd; @@ -107,6 +108,7 @@ libzfs_core_init(void) } g_refcount++; (void) pthread_mutex_unlock(&g_lock); + return (0); } @@ -138,7 +140,10 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); #ifdef __FreeBSD__ - if (zfs_ioctl_version < ZFS_IOCVER_LZC) { + if (lzc_ioctl_version == -1) + lzc_ioctl_version = get_zfs_ioctl_version(); + + if (lzc_ioctl_version < ZFS_IOCVER_LZC) { oldsource = source; error = lzc_compat_pre(&zc, &ioc, &source); if (error) @@ -185,7 +190,7 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, } #ifdef __FreeBSD__ - if (zfs_ioctl_version < ZFS_IOCVER_LZC) + if (lzc_ioctl_version < ZFS_IOCVER_LZC) lzc_compat_post(&zc, ioc); #endif if (zc.zc_nvlist_dst_filled) { @@ -195,12 +200,12 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, *resultp = NULL; } #ifdef __FreeBSD__ - if (zfs_ioctl_version < ZFS_IOCVER_LZC) + if (lzc_ioctl_version < ZFS_IOCVER_LZC) lzc_compat_outnvl(&zc, ioc, resultp); #endif out: #ifdef __FreeBSD__ - if (zfs_ioctl_version < ZFS_IOCVER_LZC) { + if (lzc_ioctl_version < ZFS_IOCVER_LZC) { if (source != oldsource) nvlist_free(source); source = oldsource; diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h index ee19d17..c8bfbef 100644 --- 
a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h @@ -31,9 +31,6 @@ #include #include #include -#ifdef __FreeBSD__ -#include "libzfs_core_compat.h" -#endif #ifdef __cplusplus extern "C" { diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c index e5eb7ae..0ab12b6 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c @@ -24,9 +24,10 @@ */ #include -#include +#include +#include "libzfs_core_compat.h" -extern int zfs_ioctl_version; +extern int lzc_ioctl_version; int lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source) @@ -39,7 +40,7 @@ lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source) int error = 0; int pos; - if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + if (lzc_ioctl_version >= ZFS_IOCVER_LZC) return (0); vecnum = *ioc; @@ -98,7 +99,7 @@ lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source) void lzc_compat_post(zfs_cmd_t *zc, const zfs_ioc_t ioc) { - if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + if (lzc_ioctl_version >= ZFS_IOCVER_LZC) return; switch (ioc) { @@ -117,7 +118,7 @@ lzc_compat_outnvl(zfs_cmd_t *zc, const zfs_ioc_t ioc, nvlist_t **outnvl) { nvlist_t *nvl; - if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + if (lzc_ioctl_version >= ZFS_IOCVER_LZC) return (0); switch (ioc) { -- cgit v1.1 From 713c2d790d75eb897ffb4bf09b789cf8940e6ce4 Mon Sep 17 00:00:00 2001 From: mm Date: Mon, 18 Mar 2013 09:32:29 +0000 Subject: Move common zfs ioctl compatibility functions (userland) into libzfs_compat.c Introduce additional constants for zfs ioctl versions --- .../opensolaris/lib/libzfs/common/libzfs_compat.c | 103 +++++++++++++++++++++ .../opensolaris/lib/libzfs/common/libzfs_compat.h | 66 +------------ .../lib/libzfs_core/common/libzfs_core.c | 14 +-- 
.../lib/libzfs_core/common/libzfs_core_compat.c | 8 +- cddl/lib/libzfs/Makefile | 1 + cddl/lib/libzfs_core/Makefile | 6 +- .../opensolaris/common/zfs/zfs_ioctl_compat.h | 4 +- 7 files changed, 125 insertions(+), 77 deletions(-) create mode 100644 cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c new file mode 100644 index 0000000..2a2ae76 --- /dev/null +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. 
+ */ + +#include "libzfs_compat.h" + +int zfs_ioctl_version = ZFS_IOCVER_UNDEF; +static int zfs_spa_version = -1; + +/* + * Get zfs_ioctl_version + */ +int +get_zfs_ioctl_version(void) +{ + size_t ver_size; + int ver = ZFS_IOCVER_NONE; + + ver_size = sizeof(ver); + sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); + + return (ver); +} + +/* + * Get the SPA version + */ +static int +get_zfs_spa_version(void) +{ + size_t ver_size; + int ver = 0; + + ver_size = sizeof(ver); + sysctlbyname("vfs.zfs.version.spa", &ver, &ver_size, NULL, 0); + + return (ver); +} + +/* + * This is FreeBSD version of ioctl, because Solaris' ioctl() updates + * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an + * error is returned zc_nvlist_dst_size won't be updated. + */ +int +zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) +{ + size_t oldsize; + int ret, cflag = ZFS_CMD_COMPAT_NONE; + + if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) + zfs_ioctl_version = get_zfs_ioctl_version(); + + if (zfs_ioctl_version == ZFS_IOCVER_DEADMAN) + cflag = ZFS_CMD_COMPAT_DEADMAN; + + /* + * If vfs.zfs.version.ioctl is not defined, assume we have v28 + * compatible binaries and use vfs.zfs.version.spa to test for v15 + */ + if (zfs_ioctl_version < ZFS_IOCVER_DEADMAN) { + cflag = ZFS_CMD_COMPAT_V28; + + if (zfs_spa_version < 0) + zfs_spa_version = get_zfs_spa_version(); + + if (zfs_spa_version == SPA_VERSION_15 || + zfs_spa_version == SPA_VERSION_14 || + zfs_spa_version == SPA_VERSION_13) + cflag = ZFS_CMD_COMPAT_V15; + } + + oldsize = zc->zc_nvlist_dst_size; + ret = zcmd_ioctl_compat(fd, request, zc, cflag); + + if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { + ret = -1; + errno = ENOMEM; + } + + return (ret); +} diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h index 400c814..3761668 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h +++ 
b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.h @@ -32,71 +32,9 @@ extern "C" { #endif -static int zfs_ioctl_version = -1; -static int zfs_kernel_version = 0; +int get_zfs_ioctl_version(void); +int zcmd_ioctl(int fd, int request, zfs_cmd_t *zc); -/* - * Get zfs_ioctl_version - */ -static __inline int -get_zfs_ioctl_version(void) -{ - size_t ver_size; - int ver = 0; - - ver_size = sizeof(ver); - sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); - - return (ver); -} - -/* - * This is FreeBSD version of ioctl, because Solaris' ioctl() updates - * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an - * error is returned zc_nvlist_dst_size won't be updated. - */ -static __inline int -zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) -{ - size_t oldsize, zfs_kernel_version_size; - int version, ret, cflag = ZFS_CMD_COMPAT_NONE; - - if (zfs_ioctl_version == -1) - zfs_ioctl_version = get_zfs_ioctl_version(); - - if (zfs_ioctl_version == ZFS_IOCVER_DEADMAN) - cflag = ZFS_CMD_COMPAT_DEADMAN; - - /* - * If vfs.zfs.version.ioctl is not defined, assume we have v28 - * compatible binaries and use vfs.zfs.version.spa to test for v15 - */ - if (zfs_ioctl_version < ZFS_IOCVER_DEADMAN) { - cflag = ZFS_CMD_COMPAT_V28; - zfs_kernel_version_size = sizeof(zfs_kernel_version); - - if (zfs_kernel_version == 0) { - sysctlbyname("vfs.zfs.version.spa", - &zfs_kernel_version, - &zfs_kernel_version_size, NULL, 0); - } - - if (zfs_kernel_version == SPA_VERSION_15 || - zfs_kernel_version == SPA_VERSION_14 || - zfs_kernel_version == SPA_VERSION_13) - cflag = ZFS_CMD_COMPAT_V15; - } - - oldsize = zc->zc_nvlist_dst_size; - ret = zcmd_ioctl_compat(fd, request, zc, cflag); - - if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { - ret = -1; - errno = ENOMEM; - } - - return (ret); -} #define ioctl(fd, ioc, zc) zcmd_ioctl((fd), (ioc), (zc)) #ifdef __cplusplus diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c 
b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c index 3319d20..83d0296 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c @@ -88,7 +88,7 @@ #include "libzfs_compat.h" #ifdef __FreeBSD__ -int lzc_ioctl_version = -1; +extern int zfs_ioctl_version; #endif static int g_fd; @@ -140,10 +140,10 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); #ifdef __FreeBSD__ - if (lzc_ioctl_version == -1) - lzc_ioctl_version = get_zfs_ioctl_version(); + if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) + zfs_ioctl_version = get_zfs_ioctl_version(); - if (lzc_ioctl_version < ZFS_IOCVER_LZC) { + if (zfs_ioctl_version < ZFS_IOCVER_LZC) { oldsource = source; error = lzc_compat_pre(&zc, &ioc, &source); if (error) @@ -190,7 +190,7 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, } #ifdef __FreeBSD__ - if (lzc_ioctl_version < ZFS_IOCVER_LZC) + if (zfs_ioctl_version < ZFS_IOCVER_LZC) lzc_compat_post(&zc, ioc); #endif if (zc.zc_nvlist_dst_filled) { @@ -200,12 +200,12 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, *resultp = NULL; } #ifdef __FreeBSD__ - if (lzc_ioctl_version < ZFS_IOCVER_LZC) + if (zfs_ioctl_version < ZFS_IOCVER_LZC) lzc_compat_outnvl(&zc, ioc, resultp); #endif out: #ifdef __FreeBSD__ - if (lzc_ioctl_version < ZFS_IOCVER_LZC) { + if (zfs_ioctl_version < ZFS_IOCVER_LZC) { if (source != oldsource) nvlist_free(source); source = oldsource; diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c index 0ab12b6..c19be1f 100644 --- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c +++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core_compat.c @@ -27,7 +27,7 @@ #include #include "libzfs_core_compat.h" -extern int lzc_ioctl_version; +extern int zfs_ioctl_version; int lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, 
nvlist_t **source) @@ -40,7 +40,7 @@ lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source) int error = 0; int pos; - if (lzc_ioctl_version >= ZFS_IOCVER_LZC) + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) return (0); vecnum = *ioc; @@ -99,7 +99,7 @@ lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source) void lzc_compat_post(zfs_cmd_t *zc, const zfs_ioc_t ioc) { - if (lzc_ioctl_version >= ZFS_IOCVER_LZC) + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) return; switch (ioc) { @@ -118,7 +118,7 @@ lzc_compat_outnvl(zfs_cmd_t *zc, const zfs_ioc_t ioc, nvlist_t **outnvl) { nvlist_t *nvl; - if (lzc_ioctl_version >= ZFS_IOCVER_LZC) + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) return (0); switch (ioc) { diff --git a/cddl/lib/libzfs/Makefile b/cddl/lib/libzfs/Makefile index 7397cae..5b6b47d 100644 --- a/cddl/lib/libzfs/Makefile +++ b/cddl/lib/libzfs/Makefile @@ -18,6 +18,7 @@ SRCS= deviceid.c \ zone.c SRCS+= libzfs_changelist.c \ + libzfs_compat.c \ libzfs_config.c \ libzfs_dataset.c \ libzfs_diff.c \ diff --git a/cddl/lib/libzfs_core/Makefile b/cddl/lib/libzfs_core/Makefile index d0993689..a470fbc 100644 --- a/cddl/lib/libzfs_core/Makefile +++ b/cddl/lib/libzfs_core/Makefile @@ -4,12 +4,16 @@ .PATH: ${.CURDIR}/../../../sys/cddl/contrib/opensolaris/common/zfs .PATH: ${.CURDIR}/../../../sys/cddl/contrib/opensolaris/uts/common/fs/zfs .PATH: ${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs_core/common +.PATH: ${.CURDIR}/../../../cddl/contrib/opensolaris/lib/libzfs/common LIB= zfs_core DPADD= ${LIBNVPAIR} LDADD= -lnvpair -SRCS= libzfs_core.c libzfs_core_compat.c +SRCS= libzfs_core.c \ + libzfs_core_compat.c + +SRCS+= libzfs_compat.c WARNS?= 0 CSTD= c99 diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h index 2ec2242..57c2909 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h @@ -45,8 +45,10 @@ 
extern "C" { */ /* ioctl versions for vfs.zfs.version.ioctl */ -#define ZFS_IOCVER_LZC 2 +#define ZFS_IOCVER_UNDEF -1 +#define ZFS_IOCVER_NONE 0 #define ZFS_IOCVER_DEADMAN 1 +#define ZFS_IOCVER_LZC 2 #define ZFS_IOCVER_CURRENT ZFS_IOCVER_LZC /* compatibility conversion flag */ -- cgit v1.1 From 3a10a36ee8940816a53ed53f8b1ca562b85b68a9 Mon Sep 17 00:00:00 2001 From: mm Date: Mon, 18 Mar 2013 20:22:40 +0000 Subject: Add missing zvol_create_minors() on zfs_ioc_create() --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index bb2ead3..96c4ec8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -3268,6 +3268,10 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (error != 0) (void) dmu_objset_destroy(fsname, B_FALSE); } +#ifdef __FreeBSD__ + if (error == 0 && type == DMU_OST_ZVOL) + zvol_create_minors(fsname); +#endif return (error); } -- cgit v1.1 From e3be864a3f6e9be93a76b0a36ba9596d73ce6f61 Mon Sep 17 00:00:00 2001 From: mm Date: Tue, 19 Mar 2013 22:14:50 +0000 Subject: Run zvol_create_minors() on snapshot creation --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index aee5407..d224326 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -1287,6 +1287,15 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) fnvlist_free(suspended); } +#ifdef __FreeBSD__ +#ifdef _KERNEL + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, 
pair)) { + char *snapname = nvpair_name(pair); + zvol_create_minors(snapname); + } +#endif +#endif return (error); } -- cgit v1.1 From 18a8e19d913846ad35de8779b4d4c1bfb73e911a Mon Sep 17 00:00:00 2001 From: mm Date: Tue, 19 Mar 2013 22:27:15 +0000 Subject: Run zvol_create_minors() only if in non-error case --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c index d224326..1a6a060 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c @@ -1289,10 +1289,12 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) #ifdef __FreeBSD__ #ifdef _KERNEL - for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; - pair = nvlist_next_nvpair(snaps, pair)) { - char *snapname = nvpair_name(pair); - zvol_create_minors(snapname); + if (error == 0) { + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char *snapname = nvpair_name(pair); + zvol_create_minors(snapname); + } } #endif #endif -- cgit v1.1 From 181ede09a39a4bcb3012d6e3d54af3230a5ce7c3 Mon Sep 17 00:00:00 2001 From: mm Date: Wed, 20 Mar 2013 09:56:20 +0000 Subject: Release hold on pool before calling zvol_create_minor() --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 4e54abc..659805d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -2233,13 +2233,16 @@ zvol_create_minors(const char *name) return (error); } if (dmu_objset_type(os) == DMU_OST_ZVOL) { + 
dsl_dataset_long_hold(os->os_dsl_dataset, FTAG); + dsl_pool_rele(dmu_objset_pool(os), FTAG); if ((error = zvol_create_minor(name)) == 0) error = zvol_create_snapshots(os, name); else { printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n", name, error); } - dmu_objset_rele(os, FTAG); + dsl_dataset_long_rele(os->os_dsl_dataset, FTAG); + dsl_dataset_rele(os->os_dsl_dataset, FTAG); return (error); } if (dmu_objset_type(os) != DMU_OST_ZFS) { -- cgit v1.1